/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Ben Widawsky
 *    Michel Thierry
 *    Thomas Daniel
 *    Oscar Mateo
 *
 */

/**
 * DOC: Logical Rings, Logical Ring Contexts and Execlists
 *
 * Motivation:
 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
 * These expanded contexts enable a number of new abilities, especially
 * "Execlists" (also implemented in this file).
 *
 * One of the main differences with the legacy HW contexts is that logical
 * ring contexts incorporate many more things into the context's state, like
 * PDPs or ringbuffer control registers:
 *
 * The reason why PDPs are included in the context is straightforward: as
 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
 * contained there means you don't need to do a ppgtt->switch_mm yourself;
 * instead, the GPU will do it for you on the context switch.
 *
 * But what about the ringbuffer control registers (head, tail, etc.)?
 * Shouldn't we just need a set of those per engine command streamer? This is
 * where the name "Logical Rings" starts to make sense: by virtualizing the
 * rings, the engine cs shifts to a new "ring buffer" with every context
 * switch. When you want to submit a workload to the GPU you: A) choose your
 * context, B) find its appropriate virtualized ring, C) write commands to it
 * and then, finally, D) tell the GPU to switch to that context.
 *
 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
 * to a context is via a context execution list, ergo "Execlists".
 *
 * LRC implementation:
 * Regarding the creation of contexts, we have:
 *
 * - One global default context.
 * - One local default context for each opened fd.
 * - One local extra context for each context create ioctl call.
 *
 * Now that ringbuffers belong per-context (and not per-engine, like before)
 * and that contexts are uniquely tied to a given engine (and not reusable,
 * like before) we need:
 *
 * - One ringbuffer per-engine inside each context.
 * - One backing object per-engine inside each context.
 *
 * The global default context starts its life with these new objects fully
 * allocated and populated. The local default context for each opened fd is
 * more complex, because we don't know at creation time which engine is going
 * to use them. To handle this, we have implemented a deferred creation of LR
 * contexts:
 *
 * The local context starts its life as a hollow or blank holder, that only
 * gets populated for a given engine once we receive an execbuffer. If later
 * on we receive another execbuffer ioctl for the same context but a different
 * engine, we allocate/populate a new ringbuffer and context backing object and
 * so on.
 *
 * Finally, regarding local contexts created using the ioctl call: as they are
 * only allowed with the render ring, we can allocate & populate them right
 * away (no need to defer anything, at least for now).
 *
 * Execlists implementation:
 * Execlists are the new method by which, on gen8+ hardware, workloads are
 * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
 * This method works as follows:
 *
 * When a request is committed, its commands (the BB start and any leading or
 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
 * for the appropriate context. The tail pointer in the hardware context is not
 * updated at this time, but instead, kept by the driver in the ringbuffer
 * structure. A structure representing this request is added to a request queue
 * for the appropriate engine: this structure contains a copy of the context's
 * tail after the request was written to the ring buffer and a pointer to the
 * context itself.
 *
 * If the engine's request queue was empty before the request was added, the
 * queue is processed immediately. Otherwise the queue will be processed during
 * a context switch interrupt. In any case, elements on the queue will get sent
 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
 * globally unique 20-bit submission ID.
 *
 * When execution of a request completes, the GPU updates the context status
 * buffer with a context complete event and generates a context switch interrupt.
 * During the interrupt handling, the driver examines the events in the buffer:
 * for each context complete event, if the announced ID matches that on the head
 * of the request queue, then that request is retired and removed from the queue.
 *
 * After processing, if any requests were retired and the queue is not empty
 * then a new execution list can be submitted. The two requests at the front of
 * the queue are next to be submitted but since a context may not occur twice in
 * an execution list, if subsequent requests have the same ID as the first then
 * the two requests must be combined. This is done simply by discarding requests
 * at the head of the queue until either only one request is left (in which case
 * we use a NULL second context) or the first two requests have unique IDs.
 *
 * By always executing the first two requests in the queue the driver ensures
 * that the GPU is kept as busy as possible. In the case where a single context
 * completes but a second context is still executing, the request for this second
 * context will be at the head of the queue when we remove the first one. This
 * request will then be resubmitted along with a new request for a different context,
 * which will cause the hardware to continue executing the second request and queue
 * the new request (the GPU detects the condition of a context getting preempted
 * with the same context and optimizes the context switch flow by not doing
 * preemption, but just sampling the new tail pointer).
 *
 */

#include <drm/drmP.h>
#include <drm/i915_drm.h>
#include "i915_drv.h"
#include "intel_mocs.h"

#define GEN9_LR_CONTEXT_RENDER_SIZE (22 * PAGE_SIZE)
#define GEN8_LR_CONTEXT_RENDER_SIZE (20 * PAGE_SIZE)
#define GEN8_LR_CONTEXT_OTHER_SIZE (2 * PAGE_SIZE)

#define RING_EXECLIST_QFULL		(1 << 0x2)
#define RING_EXECLIST1_VALID		(1 << 0x3)
#define RING_EXECLIST0_VALID		(1 << 0x4)
#define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
#define RING_EXECLIST1_ACTIVE		(1 << 0x11)
#define RING_EXECLIST0_ACTIVE		(1 << 0x12)

#define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
#define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
#define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
#define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
#define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
#define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)

#define CTX_LRI_HEADER_0		0x01
#define CTX_CONTEXT_CONTROL		0x02
#define CTX_RING_HEAD			0x04
#define CTX_RING_TAIL			0x06
#define CTX_RING_BUFFER_START		0x08
#define CTX_RING_BUFFER_CONTROL		0x0a
#define CTX_BB_HEAD_U			0x0c
#define CTX_BB_HEAD_L			0x0e
#define CTX_BB_STATE			0x10
#define CTX_SECOND_BB_HEAD_U		0x12
#define CTX_SECOND_BB_HEAD_L		0x14
#define CTX_SECOND_BB_STATE		0x16
#define CTX_BB_PER_CTX_PTR		0x18
#define CTX_RCS_INDIRECT_CTX		0x1a
#define CTX_RCS_INDIRECT_CTX_OFFSET	0x1c
#define CTX_LRI_HEADER_1		0x21
#define CTX_CTX_TIMESTAMP		0x22
#define CTX_PDP3_UDW			0x24
#define CTX_PDP3_LDW			0x26
#define CTX_PDP2_UDW			0x28
#define CTX_PDP2_LDW			0x2a
#define CTX_PDP1_UDW			0x2c
#define CTX_PDP1_LDW			0x2e
#define CTX_PDP0_UDW			0x30
#define CTX_PDP0_LDW			0x32
#define CTX_LRI_HEADER_2		0x41
#define CTX_R_PWR_CLK_STATE		0x42
#define CTX_GPGPU_CSR_BASE_ADDRESS	0x44

#define GEN8_CTX_VALID (1<<0)
#define GEN8_CTX_FORCE_PD_RESTORE (1<<1)
#define GEN8_CTX_FORCE_RESTORE (1<<2)
#define GEN8_CTX_L3LLC_COHERENT (1<<5)
#define GEN8_CTX_PRIVILEGE (1<<8)

#define ASSIGN_CTX_REG(reg_state, pos, reg, val) do { \
	(reg_state)[(pos)+0] = i915_mmio_reg_offset(reg); \
	(reg_state)[(pos)+1] = (val); \
} while (0)

#define ASSIGN_CTX_PDP(ppgtt, reg_state, n) do {		\
	const u64 _addr = i915_page_dir_dma_addr((ppgtt), (n));	\
	reg_state[CTX_PDP ## n ## _UDW+1] = upper_32_bits(_addr); \
	reg_state[CTX_PDP ## n ## _LDW+1] = lower_32_bits(_addr); \
} while (0)

#define ASSIGN_CTX_PML4(ppgtt, reg_state) do { \
	reg_state[CTX_PDP0_UDW + 1] = upper_32_bits(px_dma(&ppgtt->pml4)); \
	reg_state[CTX_PDP0_LDW + 1] = lower_32_bits(px_dma(&ppgtt->pml4)); \
} while (0)
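
/*
 * Usage sketch: ASSIGN_CTX_PDP(ppgtt, reg_state, 3) expands to two stores
 * that place the upper and lower 32 bits of page directory 3's DMA address
 * into reg_state[CTX_PDP3_UDW + 1] and reg_state[CTX_PDP3_LDW + 1], i.e. the
 * value slots of the corresponding LRI pairs in the context image. See
 * execlists_update_context() below for ASSIGN_CTX_PDP in action.
 */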

enum {
	ADVANCED_CONTEXT = 0,
	LEGACY_32B_CONTEXT,
	ADVANCED_AD_CONTEXT,
	LEGACY_64B_CONTEXT
};
#define GEN8_CTX_ADDRESSING_MODE_SHIFT 3
#define GEN8_CTX_ADDRESSING_MODE(dev)  (USES_FULL_48BIT_PPGTT(dev) ?\
		LEGACY_64B_CONTEXT :\
		LEGACY_32B_CONTEXT)
enum {
	FAULT_AND_HANG = 0,
	FAULT_AND_HALT, /* Debug only */
	FAULT_AND_STREAM,
	FAULT_AND_CONTINUE /* Unsupported */
};
#define GEN8_CTX_ID_SHIFT 32
#define GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT	0x17
#define GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT	0x26

static int intel_lr_context_pin(struct intel_context *ctx,
				struct intel_engine_cs *engine);
static void lrc_setup_hardware_status_page(struct intel_engine_cs *ring,
		struct drm_i915_gem_object *default_ctx_obj);


/**
 * intel_sanitize_enable_execlists() - sanitize i915.enable_execlists
 * @dev: DRM device.
 * @enable_execlists: value of i915.enable_execlists module parameter.
 *
 * Only certain platforms support Execlists (the prerequisites being
 * support for Logical Ring Contexts and Aliasing PPGTT or better).
 *
 * Return: 1 if Execlists is supported and has to be enabled.
 */
int intel_sanitize_enable_execlists(struct drm_device *dev, int enable_execlists)
{
	WARN_ON(i915.enable_ppgtt == -1);

	/* On platforms with execlist available, vGPU will only
	 * support execlist mode, no ring buffer mode.
	 */
	if (HAS_LOGICAL_RING_CONTEXTS(dev) && intel_vgpu_active(dev))
		return 1;

	if (INTEL_INFO(dev)->gen >= 9)
		return 1;

	if (enable_execlists == 0)
		return 0;

	if (HAS_LOGICAL_RING_CONTEXTS(dev) && USES_PPGTT(dev) &&
	    i915.use_mmio_flip >= 0)
		return 1;

	return 0;
}

static void
logical_ring_init_platform_invariants(struct intel_engine_cs *ring)
{
	struct drm_device *dev = ring->dev;

	ring->disable_lite_restore_wa = (IS_SKL_REVID(dev, 0, SKL_REVID_B0) ||
					IS_BXT_REVID(dev, 0, BXT_REVID_A1)) &&
					(ring->id == VCS || ring->id == VCS2);

	ring->ctx_desc_template = GEN8_CTX_VALID;
	ring->ctx_desc_template |= GEN8_CTX_ADDRESSING_MODE(dev) <<
				   GEN8_CTX_ADDRESSING_MODE_SHIFT;
	if (IS_GEN8(dev))
		ring->ctx_desc_template |= GEN8_CTX_L3LLC_COHERENT;
	ring->ctx_desc_template |= GEN8_CTX_PRIVILEGE;

	/* TODO: WaDisableLiteRestore when we start using semaphore
	 * signalling between Command Streamers */
	/* ring->ctx_desc_template |= GEN8_CTX_FORCE_RESTORE; */

	/* WaEnableForceRestoreInCtxtDescForVCS:skl */
	/* WaEnableForceRestoreInCtxtDescForVCS:bxt */
	if (ring->disable_lite_restore_wa)
		ring->ctx_desc_template |= GEN8_CTX_FORCE_RESTORE;
}

/**
 * intel_lr_context_descriptor_update() - calculate & cache the descriptor
 * 					  for a pinned context
 *
 * @ctx: Context to work on
 * @ring: Engine the descriptor will be used with
 *
 * The context descriptor encodes various attributes of a context,
 * including its GTT address and some flags. Because it's fairly
 * expensive to calculate, we'll just do it once and cache the result,
 * which remains valid until the context is unpinned.
 *
 * This is what a descriptor looks like, from LSB to MSB:
 *    bits 0-11:    flags, GEN8_CTX_* (cached in ctx_desc_template)
 *    bits 12-31:   LRCA, GTT address of (the HWSP of) this context
 *    bits 32-51:   ctx ID, a globally unique tag (the LRCA again!)
 *    bits 52-63:   reserved, may encode the engine ID (for GuC)
 */
static void
intel_lr_context_descriptor_update(struct intel_context *ctx,
				   struct intel_engine_cs *ring)
{
	uint64_t lrca, desc;

	lrca = ctx->engine[ring->id].lrc_vma->node.start +
	       LRC_PPHWSP_PN * PAGE_SIZE;

	desc = ring->ctx_desc_template;			   /* bits  0-11 */
	desc |= lrca;					   /* bits 12-31 */
	desc |= (lrca >> PAGE_SHIFT) << GEN8_CTX_ID_SHIFT; /* bits 32-51 */

	ctx->engine[ring->id].lrc_desc = desc;
}

uint64_t intel_lr_context_descriptor(struct intel_context *ctx,
				     struct intel_engine_cs *ring)
{
	return ctx->engine[ring->id].lrc_desc;
}

/**
 * intel_execlists_ctx_id() - get the Execlists Context ID
 * @ctx: Context to get the ID for
 * @ring: Engine to get the ID for
 *
 * Do not confuse with ctx->id! Unfortunately we have a name overload
 * here: the old context ID we pass to userspace as a handle so that
 * they can refer to a context, and the new context ID we pass to the
 * ELSP so that the GPU can inform us of the context status via
 * interrupts.
 *
 * The context ID is a portion of the context descriptor, so we can
 * just extract the required part from the cached descriptor.
 *
 * Return: 20-bit globally unique context ID.
 */
u32 intel_execlists_ctx_id(struct intel_context *ctx,
			   struct intel_engine_cs *ring)
{
	return intel_lr_context_descriptor(ctx, ring) >> GEN8_CTX_ID_SHIFT;
}
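
/*
 * Worked example (assuming 4K pages, i.e. PAGE_SHIFT == 12): a context whose
 * LRCA, the GGTT address cached in bits 12-31 of the descriptor, is
 * 0x12345000 gets ctx ID 0x12345000 >> 12 == 0x12345 placed in bits 32-51,
 * so intel_execlists_ctx_id() returns 0x12345 and that is the ID the GPU
 * reports back through the context status buffer.
 */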

static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
				 struct drm_i915_gem_request *rq1)
{

	struct intel_engine_cs *ring = rq0->ring;
	struct drm_device *dev = ring->dev;
	struct drm_i915_private *dev_priv = dev->dev_private;
	uint64_t desc[2];

	if (rq1) {
		desc[1] = intel_lr_context_descriptor(rq1->ctx, rq1->ring);
		rq1->elsp_submitted++;
	} else {
		desc[1] = 0;
	}

	desc[0] = intel_lr_context_descriptor(rq0->ctx, rq0->ring);
	rq0->elsp_submitted++;

	/* You must always write both descriptors in the order below. */
	spin_lock(&dev_priv->uncore.lock);
	intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL);
	I915_WRITE_FW(RING_ELSP(ring), upper_32_bits(desc[1]));
	I915_WRITE_FW(RING_ELSP(ring), lower_32_bits(desc[1]));

	I915_WRITE_FW(RING_ELSP(ring), upper_32_bits(desc[0]));
	/* The context is automatically loaded after the following */
	I915_WRITE_FW(RING_ELSP(ring), lower_32_bits(desc[0]));

	/* ELSP is a wo register, use another nearby reg for posting */
	POSTING_READ_FW(RING_EXECLIST_STATUS_LO(ring));
	intel_uncore_forcewake_put__locked(dev_priv, FORCEWAKE_ALL);
	spin_unlock(&dev_priv->uncore.lock);
}

static int execlists_update_context(struct drm_i915_gem_request *rq)
{
	struct intel_engine_cs *ring = rq->ring;
	struct i915_hw_ppgtt *ppgtt = rq->ctx->ppgtt;
	uint32_t *reg_state = rq->ctx->engine[ring->id].lrc_reg_state;

	reg_state[CTX_RING_TAIL+1] = rq->tail;

	if (ppgtt && !USES_FULL_48BIT_PPGTT(ppgtt->base.dev)) {
		/* True 32b PPGTT with dynamic page allocation: update PDP
		 * registers and point the unallocated PDPs to scratch page.
		 * PML4 is allocated during ppgtt init, so this is not needed
		 * in 48-bit mode.
		 */
		ASSIGN_CTX_PDP(ppgtt, reg_state, 3);
		ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
		ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
		ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
	}

	return 0;
}

static void execlists_submit_requests(struct drm_i915_gem_request *rq0,
				      struct drm_i915_gem_request *rq1)
{
	execlists_update_context(rq0);

	if (rq1)
		execlists_update_context(rq1);

	execlists_elsp_write(rq0, rq1);
}

static void execlists_context_unqueue(struct intel_engine_cs *ring)
{
	struct drm_i915_gem_request *req0 = NULL, *req1 = NULL;
	struct drm_i915_gem_request *cursor = NULL, *tmp = NULL;

	assert_spin_locked(&ring->execlist_lock);

	/*
	 * If irqs are not active generate a warning as batches that finish
	 * without the irqs may get lost and a GPU Hang may occur.
	 */
	WARN_ON(!intel_irqs_enabled(ring->dev->dev_private));

	if (list_empty(&ring->execlist_queue))
		return;

	/* Try to read in pairs */
	list_for_each_entry_safe(cursor, tmp, &ring->execlist_queue,
				 execlist_link) {
		if (!req0) {
			req0 = cursor;
		} else if (req0->ctx == cursor->ctx) {
			/* Same ctx: ignore first request, as second request
			 * will update tail past first request's workload */
			cursor->elsp_submitted = req0->elsp_submitted;
			list_move_tail(&req0->execlist_link,
				       &ring->execlist_retired_req_list);
			req0 = cursor;
		} else {
			req1 = cursor;
			break;
		}
	}

	if (IS_GEN8(ring->dev) || IS_GEN9(ring->dev)) {
		/*
		 * WaIdleLiteRestore: make sure we never cause a lite
		 * restore with HEAD==TAIL
		 */
		if (req0->elsp_submitted) {
			/*
			 * Apply the wa NOOPS to prevent ring:HEAD == req:TAIL
			 * as we resubmit the request. See gen8_emit_request()
			 * for where we prepare the padding after the end of the
			 * request.
			 */
			struct intel_ringbuffer *ringbuf;

			ringbuf = req0->ctx->engine[ring->id].ringbuf;
			req0->tail += 8;
			req0->tail &= ringbuf->size - 1;
		}
	}

	WARN_ON(req1 && req1->elsp_submitted);

	execlists_submit_requests(req0, req1);
}

static bool execlists_check_remove_request(struct intel_engine_cs *ring,
					   u32 request_id)
{
	struct drm_i915_gem_request *head_req;

	assert_spin_locked(&ring->execlist_lock);

	head_req = list_first_entry_or_null(&ring->execlist_queue,
					    struct drm_i915_gem_request,
					    execlist_link);

	if (head_req != NULL) {
		if (intel_execlists_ctx_id(head_req->ctx, ring) == request_id) {
			WARN(head_req->elsp_submitted == 0,
			     "Never submitted head request\n");

			if (--head_req->elsp_submitted <= 0) {
				list_move_tail(&head_req->execlist_link,
					       &ring->execlist_retired_req_list);
				return true;
			}
		}
	}

	return false;
}

static void get_context_status(struct intel_engine_cs *ring,
			       u8 read_pointer,
			       u32 *status, u32 *context_id)
{
	struct drm_i915_private *dev_priv = ring->dev->dev_private;

	if (WARN_ON(read_pointer >= GEN8_CSB_ENTRIES))
		return;

	*status = I915_READ(RING_CONTEXT_STATUS_BUF_LO(ring, read_pointer));
	*context_id = I915_READ(RING_CONTEXT_STATUS_BUF_HI(ring, read_pointer));
}

/**
 * intel_lrc_irq_handler() - handle Context Switch interrupts
 * @ring: Engine Command Streamer to handle.
 *
 * Check the unread Context Status Buffers and manage the submission of new
 * contexts to the ELSP accordingly.
 */
void intel_lrc_irq_handler(struct intel_engine_cs *ring)
{
	struct drm_i915_private *dev_priv = ring->dev->dev_private;
	u32 status_pointer;
	u8 read_pointer;
	u8 write_pointer;
	u32 status = 0;
	u32 status_id;
	u32 submit_contexts = 0;

	status_pointer = I915_READ(RING_CONTEXT_STATUS_PTR(ring));

	read_pointer = ring->next_context_status_buffer;
	write_pointer = GEN8_CSB_WRITE_PTR(status_pointer);
	if (read_pointer > write_pointer)
		write_pointer += GEN8_CSB_ENTRIES;

	spin_lock(&ring->execlist_lock);

	while (read_pointer < write_pointer) {

		get_context_status(ring, ++read_pointer % GEN8_CSB_ENTRIES,
				   &status, &status_id);

		if (status & GEN8_CTX_STATUS_IDLE_ACTIVE)
			continue;

		if (status & GEN8_CTX_STATUS_PREEMPTED) {
			if (status & GEN8_CTX_STATUS_LITE_RESTORE) {
				if (execlists_check_remove_request(ring, status_id))
					WARN(1, "Lite Restored request removed from queue\n");
			} else
				WARN(1, "Preemption without Lite Restore\n");
		}

		if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) ||
		    (status & GEN8_CTX_STATUS_ELEMENT_SWITCH)) {
			if (execlists_check_remove_request(ring, status_id))
				submit_contexts++;
		}
	}

	if (ring->disable_lite_restore_wa) {
		/* Prevent a ctx from preempting itself */
		if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) &&
		    (submit_contexts != 0))
			execlists_context_unqueue(ring);
	} else if (submit_contexts != 0) {
		execlists_context_unqueue(ring);
	}

	spin_unlock(&ring->execlist_lock);

	if (unlikely(submit_contexts > 2))
		DRM_ERROR("More than two context complete events?\n");

	ring->next_context_status_buffer = write_pointer % GEN8_CSB_ENTRIES;

	/* Update the read pointer to the old write pointer. Manual ringbuffer
	 * management ftw  */
	I915_WRITE(RING_CONTEXT_STATUS_PTR(ring),
		   _MASKED_FIELD(GEN8_CSB_READ_PTR_MASK,
				 ring->next_context_status_buffer << 8));
}
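
/*
 * Example of the read/write pointer arithmetic above (GEN8_CSB_ENTRIES is
 * six on these parts): if next_context_status_buffer is 5 and the hardware
 * write pointer reads back as 1, write_pointer is bumped to 7 and the loop
 * consumes CSB entries (6 % 6) == 0 and (7 % 6) == 1, after which the read
 * pointer is written back as 7 % 6 == 1.
 */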

static int execlists_context_queue(struct drm_i915_gem_request *request)
{
	struct intel_engine_cs *ring = request->ring;
	struct drm_i915_gem_request *cursor;
	int num_elements = 0;

	if (request->ctx != request->i915->kernel_context)
		intel_lr_context_pin(request->ctx, ring);

	i915_gem_request_reference(request);

	spin_lock_irq(&ring->execlist_lock);

	list_for_each_entry(cursor, &ring->execlist_queue, execlist_link)
		if (++num_elements > 2)
			break;

	if (num_elements > 2) {
		struct drm_i915_gem_request *tail_req;

		tail_req = list_last_entry(&ring->execlist_queue,
					   struct drm_i915_gem_request,
					   execlist_link);

		if (request->ctx == tail_req->ctx) {
			WARN(tail_req->elsp_submitted != 0,
				"More than 2 already-submitted reqs queued\n");
			list_move_tail(&tail_req->execlist_link,
				       &ring->execlist_retired_req_list);
		}
	}

	list_add_tail(&request->execlist_link, &ring->execlist_queue);
	if (num_elements == 0)
		execlists_context_unqueue(ring);

	spin_unlock_irq(&ring->execlist_lock);

	return 0;
}

static int logical_ring_invalidate_all_caches(struct drm_i915_gem_request *req)
{
	struct intel_engine_cs *ring = req->ring;
	uint32_t flush_domains;
	int ret;

	flush_domains = 0;
	if (ring->gpu_caches_dirty)
		flush_domains = I915_GEM_GPU_DOMAINS;

	ret = ring->emit_flush(req, I915_GEM_GPU_DOMAINS, flush_domains);
	if (ret)
		return ret;

	ring->gpu_caches_dirty = false;
	return 0;
}

static int execlists_move_to_gpu(struct drm_i915_gem_request *req,
				 struct list_head *vmas)
{
	const unsigned other_rings = ~intel_ring_flag(req->ring);
	struct i915_vma *vma;
	uint32_t flush_domains = 0;
	bool flush_chipset = false;
	int ret;

	list_for_each_entry(vma, vmas, exec_list) {
		struct drm_i915_gem_object *obj = vma->obj;

		if (obj->active & other_rings) {
			ret = i915_gem_object_sync(obj, req->ring, &req);
			if (ret)
				return ret;
		}

		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
			flush_chipset |= i915_gem_clflush_object(obj, false);

		flush_domains |= obj->base.write_domain;
	}

	if (flush_domains & I915_GEM_DOMAIN_GTT)
		wmb();

	/* Unconditionally invalidate gpu caches and ensure that we do flush
	 * any residual writes from the previous batch.
	 */
	return logical_ring_invalidate_all_caches(req);
}

int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request)
{
	int ret = 0;

	request->ringbuf = request->ctx->engine[request->ring->id].ringbuf;

	if (i915.enable_guc_submission) {
		/*
		 * Check that the GuC has space for the request before
		 * going any further, as the i915_add_request() call
		 * later on mustn't fail ...
		 */
		struct intel_guc *guc = &request->i915->guc;

		ret = i915_guc_wq_check_space(guc->execbuf_client);
		if (ret)
			return ret;
	}

	if (request->ctx != request->i915->kernel_context)
		ret = intel_lr_context_pin(request->ctx, request->ring);

	return ret;
}

static int logical_ring_wait_for_space(struct drm_i915_gem_request *req,
				       int bytes)
{
	struct intel_ringbuffer *ringbuf = req->ringbuf;
	struct intel_engine_cs *ring = req->ring;
	struct drm_i915_gem_request *target;
	unsigned space;
	int ret;

	if (intel_ring_space(ringbuf) >= bytes)
		return 0;

	/* The whole point of reserving space is to not wait! */
	WARN_ON(ringbuf->reserved_in_use);

	list_for_each_entry(target, &ring->request_list, list) {
		/*
		 * The request queue is per-engine, so can contain requests
		 * from multiple ringbuffers. Here, we must ignore any that
		 * aren't from the ringbuffer we're considering.
		 */
		if (target->ringbuf != ringbuf)
			continue;

		/* Would completion of this request free enough space? */
		space = __intel_ring_space(target->postfix, ringbuf->tail,
					   ringbuf->size);
		if (space >= bytes)
			break;
	}

	if (WARN_ON(&target->list == &ring->request_list))
		return -ENOSPC;

	ret = i915_wait_request(target);
	if (ret)
		return ret;

	ringbuf->space = space;
	return 0;
}

/*
 * intel_logical_ring_advance_and_submit() - advance the tail and submit the workload
 * @request: Request to advance the logical ringbuffer of.
 *
 * The tail is updated in our logical ringbuffer struct, not in the actual context. What
 * really happens during submission is that the context and current tail will be placed
 * on a queue waiting for the ELSP to be ready to accept a new context submission. At that
 * point, the tail *inside* the context is updated and the ELSP written to.
 */
static int
intel_logical_ring_advance_and_submit(struct drm_i915_gem_request *request)
{
	struct intel_ringbuffer *ringbuf = request->ringbuf;
	struct drm_i915_private *dev_priv = request->i915;
	struct intel_engine_cs *engine = request->ring;

	intel_logical_ring_advance(ringbuf);
	request->tail = ringbuf->tail;

	/*
	 * Here we add two extra NOOPs as padding to avoid
	 * lite restore of a context with HEAD==TAIL.
	 *
	 * Caller must reserve WA_TAIL_DWORDS for us!
	 */
	intel_logical_ring_emit(ringbuf, MI_NOOP);
	intel_logical_ring_emit(ringbuf, MI_NOOP);
	intel_logical_ring_advance(ringbuf);

	if (intel_ring_stopped(engine))
		return 0;

	if (engine->last_context != request->ctx) {
		if (engine->last_context)
			intel_lr_context_unpin(engine->last_context, engine);
		if (request->ctx != request->i915->kernel_context) {
			intel_lr_context_pin(request->ctx, engine);
			engine->last_context = request->ctx;
		} else {
			engine->last_context = NULL;
		}
	}

	if (dev_priv->guc.execbuf_client)
		i915_guc_submit(dev_priv->guc.execbuf_client, request);
	else
		execlists_context_queue(request);

	return 0;
}

static void __wrap_ring_buffer(struct intel_ringbuffer *ringbuf)
{
	uint32_t __iomem *virt;
	int rem = ringbuf->size - ringbuf->tail;

	virt = ringbuf->virtual_start + ringbuf->tail;
	rem /= 4;
	while (rem--)
		iowrite32(MI_NOOP, virt++);

	ringbuf->tail = 0;
	intel_ring_update_space(ringbuf);
}

static int logical_ring_prepare(struct drm_i915_gem_request *req, int bytes)
{
	struct intel_ringbuffer *ringbuf = req->ringbuf;
	int remain_usable = ringbuf->effective_size - ringbuf->tail;
	int remain_actual = ringbuf->size - ringbuf->tail;
	int ret, total_bytes, wait_bytes = 0;
	bool need_wrap = false;

	if (ringbuf->reserved_in_use)
		total_bytes = bytes;
	else
		total_bytes = bytes + ringbuf->reserved_size;

	if (unlikely(bytes > remain_usable)) {
		/*
		 * Not enough space for the basic request. So need to flush
		 * out the remainder and then wait for base + reserved.
		 */
		wait_bytes = remain_actual + total_bytes;
		need_wrap = true;
	} else {
		if (unlikely(total_bytes > remain_usable)) {
			/*
			 * The base request will fit but the reserved space
			 * falls off the end. So don't need an immediate wrap
			 * and only need to effectively wait for the reserved
			 * size space from the start of ringbuffer.
			 */
			wait_bytes = remain_actual + ringbuf->reserved_size;
		} else if (total_bytes > ringbuf->space) {
			/* No wrapping required, just waiting. */
			wait_bytes = total_bytes;
		}
	}

	if (wait_bytes) {
		ret = logical_ring_wait_for_space(req, wait_bytes);
		if (unlikely(ret))
			return ret;

		if (need_wrap)
			__wrap_ring_buffer(ringbuf);
	}

	return 0;
}
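
/*
 * Worked example of the sizing logic above (numbers are illustrative only):
 * with size = 4096, effective_size = 4064, tail = 4000, reserved_size = 160
 * and no reservation in use, a request for bytes = 128 sees
 * remain_usable = 64 and remain_actual = 96; since bytes > remain_usable we
 * must wrap, so we wait for remain_actual + bytes + reserved_size =
 * 96 + 288 = 384 bytes and then NOOP-pad the tail back to the start of the
 * buffer.
 */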

/**
 * intel_logical_ring_begin() - prepare the logical ringbuffer to accept some commands
 *
 * @req: The request to start some new work for
 * @num_dwords: number of DWORDs that we plan to write to the ringbuffer.
 *
 * The ringbuffer might not be ready to accept the commands right away (maybe it needs to
 * be wrapped, or wait a bit for the tail to be updated). This function takes care of that
 * and also preallocates a request (every workload submission is still mediated through
 * requests, same as it did with legacy ringbuffer submission).
 *
 * Return: non-zero if the ringbuffer is not ready to be written to.
 */
int intel_logical_ring_begin(struct drm_i915_gem_request *req, int num_dwords)
{
	struct drm_i915_private *dev_priv;
	int ret;

	WARN_ON(req == NULL);
	dev_priv = req->ring->dev->dev_private;

	ret = i915_gem_check_wedge(&dev_priv->gpu_error,
				   dev_priv->mm.interruptible);
	if (ret)
		return ret;

	ret = logical_ring_prepare(req, num_dwords * sizeof(uint32_t));
	if (ret)
		return ret;

	req->ringbuf->space -= num_dwords * sizeof(uint32_t);
	return 0;
}
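
/*
 * Typical usage sketch: begin for N dwords, emit exactly N dwords, then
 * advance, e.g.
 *
 *	ret = intel_logical_ring_begin(req, 2);
 *	if (ret)
 *		return ret;
 *	intel_logical_ring_emit(ringbuf, MI_NOOP);
 *	intel_logical_ring_emit(ringbuf, MI_NOOP);
 *	intel_logical_ring_advance(ringbuf);
 *
 * See intel_execlists_submission() and
 * intel_logical_ring_workarounds_emit() below for real call sites.
 */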

int intel_logical_ring_reserve_space(struct drm_i915_gem_request *request)
{
	/*
	 * The first call merely notes the reserve request and is common for
	 * all back ends. The subsequent localised _begin() call actually
	 * ensures that the reservation is available. Without the begin, if
	 * the request creator immediately submitted the request without
	 * adding any commands to it then there might not actually be
	 * sufficient room for the submission commands.
	 */
	intel_ring_reserved_space_reserve(request->ringbuf, MIN_SPACE_FOR_ADD_REQUEST);

	return intel_logical_ring_begin(request, 0);
}

/**
 * execlists_submission() - submit a batchbuffer for execution, Execlists style
 * @dev: DRM device.
 * @file: DRM file.
 * @ring: Engine Command Streamer to submit to.
 * @ctx: Context to employ for this submission.
 * @args: execbuffer call arguments.
 * @vmas: list of vmas.
 * @batch_obj: the batchbuffer to submit.
 * @exec_start: batchbuffer start virtual address pointer.
 * @dispatch_flags: translated execbuffer call flags.
 *
 * This is the evil twin version of i915_gem_ringbuffer_submission. It abstracts
 * away the submission details of the execbuffer ioctl call.
 *
 * Return: non-zero if the submission fails.
 */
int intel_execlists_submission(struct i915_execbuffer_params *params,
			       struct drm_i915_gem_execbuffer2 *args,
			       struct list_head *vmas)
{
	struct drm_device       *dev = params->dev;
	struct intel_engine_cs  *ring = params->ring;
	struct drm_i915_private *dev_priv = dev->dev_private;
	struct intel_ringbuffer *ringbuf = params->ctx->engine[ring->id].ringbuf;
	u64 exec_start;
	int instp_mode;
	u32 instp_mask;
	int ret;

	instp_mode = args->flags & I915_EXEC_CONSTANTS_MASK;
	instp_mask = I915_EXEC_CONSTANTS_MASK;
	switch (instp_mode) {
	case I915_EXEC_CONSTANTS_REL_GENERAL:
	case I915_EXEC_CONSTANTS_ABSOLUTE:
	case I915_EXEC_CONSTANTS_REL_SURFACE:
		if (instp_mode != 0 && ring != &dev_priv->ring[RCS]) {
			DRM_DEBUG("non-0 rel constants mode on non-RCS\n");
			return -EINVAL;
		}

		if (instp_mode != dev_priv->relative_constants_mode) {
			if (instp_mode == I915_EXEC_CONSTANTS_REL_SURFACE) {
				DRM_DEBUG("rel surface constants mode invalid on gen5+\n");
				return -EINVAL;
			}

			/* The HW changed the meaning on this bit on gen6 */
			instp_mask &= ~I915_EXEC_CONSTANTS_REL_SURFACE;
		}
		break;
	default:
		DRM_DEBUG("execbuf with unknown constants: %d\n", instp_mode);
		return -EINVAL;
	}

	if (args->flags & I915_EXEC_GEN7_SOL_RESET) {
		DRM_DEBUG("sol reset is gen7 only\n");
		return -EINVAL;
	}

	ret = execlists_move_to_gpu(params->request, vmas);
	if (ret)
		return ret;

	if (ring == &dev_priv->ring[RCS] &&
	    instp_mode != dev_priv->relative_constants_mode) {
		ret = intel_logical_ring_begin(params->request, 4);
		if (ret)
			return ret;

		intel_logical_ring_emit(ringbuf, MI_NOOP);
		intel_logical_ring_emit(ringbuf, MI_LOAD_REGISTER_IMM(1));
		intel_logical_ring_emit_reg(ringbuf, INSTPM);
		intel_logical_ring_emit(ringbuf, instp_mask << 16 | instp_mode);
		intel_logical_ring_advance(ringbuf);

		dev_priv->relative_constants_mode = instp_mode;
	}

	exec_start = params->batch_obj_vm_offset +
		     args->batch_start_offset;

	ret = ring->emit_bb_start(params->request, exec_start, params->dispatch_flags);
	if (ret)
		return ret;

	trace_i915_gem_ring_dispatch(params->request, params->dispatch_flags);

	i915_gem_execbuffer_move_to_active(vmas, params->request);
	i915_gem_execbuffer_retire_commands(params);

	return 0;
}

void intel_execlists_retire_requests(struct intel_engine_cs *ring)
{
	struct drm_i915_gem_request *req, *tmp;
	struct list_head retired_list;

	WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex));
	if (list_empty(&ring->execlist_retired_req_list))
		return;

	INIT_LIST_HEAD(&retired_list);
	spin_lock_irq(&ring->execlist_lock);
	list_replace_init(&ring->execlist_retired_req_list, &retired_list);
	spin_unlock_irq(&ring->execlist_lock);

	list_for_each_entry_safe(req, tmp, &retired_list, execlist_link) {
		struct intel_context *ctx = req->ctx;
		struct drm_i915_gem_object *ctx_obj =
				ctx->engine[ring->id].state;

		if (ctx_obj && (ctx != req->i915->kernel_context))
			intel_lr_context_unpin(ctx, ring);

		list_del(&req->execlist_link);
		i915_gem_request_unreference(req);
	}
}

void intel_logical_ring_stop(struct intel_engine_cs *ring)
{
	struct drm_i915_private *dev_priv = ring->dev->dev_private;
	int ret;

	if (!intel_ring_initialized(ring))
		return;

	ret = intel_ring_idle(ring);
	if (ret && !i915_reset_in_progress(&to_i915(ring->dev)->gpu_error))
		DRM_ERROR("failed to quiesce %s whilst cleaning up: %d\n",
			  ring->name, ret);

	/* TODO: Is this correct with Execlists enabled? */
	I915_WRITE_MODE(ring, _MASKED_BIT_ENABLE(STOP_RING));
	if (wait_for_atomic((I915_READ_MODE(ring) & MODE_IDLE) != 0, 1000)) {
		DRM_ERROR("%s :timed out trying to stop ring\n", ring->name);
		return;
	}
	I915_WRITE_MODE(ring, _MASKED_BIT_DISABLE(STOP_RING));
}

int logical_ring_flush_all_caches(struct drm_i915_gem_request *req)
{
	struct intel_engine_cs *ring = req->ring;
	int ret;

	if (!ring->gpu_caches_dirty)
		return 0;

	ret = ring->emit_flush(req, 0, I915_GEM_GPU_DOMAINS);
	if (ret)
		return ret;

	ring->gpu_caches_dirty = false;
	return 0;
}

static int intel_lr_context_do_pin(struct intel_context *ctx,
				   struct intel_engine_cs *ring)
{
	struct drm_device *dev = ring->dev;
	struct drm_i915_private *dev_priv = dev->dev_private;
	struct drm_i915_gem_object *ctx_obj = ctx->engine[ring->id].state;
	struct intel_ringbuffer *ringbuf = ctx->engine[ring->id].ringbuf;
	struct page *lrc_state_page;
	uint32_t *lrc_reg_state;
	int ret;

	WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex));

	ret = i915_gem_obj_ggtt_pin(ctx_obj, GEN8_LR_CONTEXT_ALIGN,
			PIN_OFFSET_BIAS | GUC_WOPCM_TOP);
	if (ret)
		return ret;

	lrc_state_page = i915_gem_object_get_dirty_page(ctx_obj, LRC_STATE_PN);
	if (WARN_ON(!lrc_state_page)) {
		ret = -ENODEV;
		goto unpin_ctx_obj;
	}

	ret = intel_pin_and_map_ringbuffer_obj(ring->dev, ringbuf);
	if (ret)
		goto unpin_ctx_obj;

	ctx->engine[ring->id].lrc_vma = i915_gem_obj_to_ggtt(ctx_obj);
	intel_lr_context_descriptor_update(ctx, ring);
	lrc_reg_state = kmap(lrc_state_page);
	lrc_reg_state[CTX_RING_BUFFER_START+1] = ringbuf->vma->node.start;
	ctx->engine[ring->id].lrc_reg_state = lrc_reg_state;
	ctx_obj->dirty = true;

	/* Invalidate GuC TLB. */
	if (i915.enable_guc_submission)
		I915_WRITE(GEN8_GTCR, GEN8_GTCR_INVALIDATE);

	return ret;

unpin_ctx_obj:
	i915_gem_object_ggtt_unpin(ctx_obj);

	return ret;
}

static int intel_lr_context_pin(struct intel_context *ctx,
				struct intel_engine_cs *engine)
{
	int ret = 0;

	if (ctx->engine[engine->id].pin_count++ == 0) {
		ret = intel_lr_context_do_pin(ctx, engine);
		if (ret)
			goto reset_pin_count;

		i915_gem_context_reference(ctx);
	}
	return ret;

reset_pin_count:
	ctx->engine[engine->id].pin_count = 0;
	return ret;
}

void intel_lr_context_unpin(struct intel_context *ctx,
			    struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *ctx_obj = ctx->engine[engine->id].state;

	WARN_ON(!mutex_is_locked(&ctx->i915->dev->struct_mutex));
	if (--ctx->engine[engine->id].pin_count == 0) {
//		kunmap(kmap_to_page(ctx->engine[engine->id].lrc_reg_state));
		intel_unpin_ringbuffer_obj(ctx->engine[engine->id].ringbuf);
		i915_gem_object_ggtt_unpin(ctx_obj);
		ctx->engine[engine->id].lrc_vma = NULL;
		ctx->engine[engine->id].lrc_desc = 0;
		ctx->engine[engine->id].lrc_reg_state = NULL;

		i915_gem_context_unreference(ctx);
	}
}
1159
 
6084 serge 1160
static int intel_logical_ring_workarounds_emit(struct drm_i915_gem_request *req)
5354 serge 1161
{
6084 serge 1162
	int ret, i;
1163
	struct intel_engine_cs *ring = req->ring;
1164
	struct intel_ringbuffer *ringbuf = req->ringbuf;
1165
	struct drm_device *dev = ring->dev;
1166
	struct drm_i915_private *dev_priv = dev->dev_private;
1167
	struct i915_workarounds *w = &dev_priv->workarounds;
5354 serge 1168
 
7144 serge 1169
	if (w->count == 0)
5354 serge 1170
		return 0;
1171
 
6084 serge 1172
	ring->gpu_caches_dirty = true;
1173
	ret = logical_ring_flush_all_caches(req);
1174
	if (ret)
1175
		return ret;
5354 serge 1176
 
6084 serge 1177
	ret = intel_logical_ring_begin(req, w->count * 2 + 2);
1178
	if (ret)
1179
		return ret;
5354 serge 1180
 
6084 serge 1181
	intel_logical_ring_emit(ringbuf, MI_LOAD_REGISTER_IMM(w->count));
1182
	for (i = 0; i < w->count; i++) {
6937 serge 1183
		intel_logical_ring_emit_reg(ringbuf, w->reg[i].addr);
6084 serge 1184
		intel_logical_ring_emit(ringbuf, w->reg[i].value);
1185
	}
1186
	intel_logical_ring_emit(ringbuf, MI_NOOP);
5354 serge 1187
 
6084 serge 1188
	intel_logical_ring_advance(ringbuf);
5354 serge 1189
 
6084 serge 1190
	ring->gpu_caches_dirty = true;
1191
	ret = logical_ring_flush_all_caches(req);
1192
	if (ret)
1193
		return ret;
5354 serge 1194
 
6084 serge 1195
	return 0;
5354 serge 1196
}
1197
 
6084 serge 1198
#define wa_ctx_emit(batch, index, cmd)					\
1199
	do {								\
1200
		int __index = (index)++;				\
1201
		if (WARN_ON(__index >= (PAGE_SIZE / sizeof(uint32_t)))) { \
1202
			return -ENOSPC;					\
1203
		}							\
1204
		batch[__index] = (cmd);					\
1205
	} while (0)
1206
 
6937 serge 1207
#define wa_ctx_emit_reg(batch, index, reg) \
1208
	wa_ctx_emit((batch), (index), i915_mmio_reg_offset(reg))
6084 serge 1209
 
1210
/*
1211
 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
1212
 * PIPE_CONTROL instruction. This is required for the flush to happen correctly
1213
 * but there is a slight complication as this is applied in WA batch where the
1214
 * values are only initialized once so we cannot take register value at the
1215
 * beginning and reuse it further; hence we save its value to memory, upload a
1216
 * constant value with bit21 set and then we restore it back with the saved value.
1217
 * To simplify the WA, a constant value is formed by using the default value
1218
 * of this register. This shouldn't be a problem because we are only modifying
1219
 * it for a short period and this batch in non-premptible. We can ofcourse
1220
 * use additional instructions that read the actual value of the register
1221
 * at that time and set our bit of interest but it makes the WA complicated.
1222
 *
1223
 * This WA is also required for Gen9 so extracting as a function avoids
1224
 * code duplication.
1225
 */
1226
static inline int gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *ring,
1227
						uint32_t *const batch,
1228
						uint32_t index)
5354 serge 1229
{
6084 serge 1230
	uint32_t l3sqc4_flush = (0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES);
5354 serge 1231
 
6084 serge 1232
	/*
1233
	 * WaDisableLSQCROPERFforOCL:skl
1234
	 * This WA is implemented in skl_init_clock_gating() but since
1235
	 * this batch updates GEN8_L3SQCREG4 with default value we need to
1236
	 * set this bit here to retain the WA during flush.
1237
	 */
6937 serge 1238
	if (IS_SKL_REVID(ring->dev, 0, SKL_REVID_E0))
6084 serge 1239
		l3sqc4_flush |= GEN8_LQSC_RO_PERF_DIS;
5354 serge 1240
 
6084 serge 1241
	wa_ctx_emit(batch, index, (MI_STORE_REGISTER_MEM_GEN8 |
1242
				   MI_SRM_LRM_GLOBAL_GTT));
6937 serge 1243
	wa_ctx_emit_reg(batch, index, GEN8_L3SQCREG4);
6084 serge 1244
	wa_ctx_emit(batch, index, ring->scratch.gtt_offset + 256);
1245
	wa_ctx_emit(batch, index, 0);
5354 serge 1246
 
6084 serge 1247
	wa_ctx_emit(batch, index, MI_LOAD_REGISTER_IMM(1));
6937 serge 1248
	wa_ctx_emit_reg(batch, index, GEN8_L3SQCREG4);
6084 serge 1249
	wa_ctx_emit(batch, index, l3sqc4_flush);
5354 serge 1250
 
6084 serge 1251
	wa_ctx_emit(batch, index, GFX_OP_PIPE_CONTROL(6));
1252
	wa_ctx_emit(batch, index, (PIPE_CONTROL_CS_STALL |
1253
				   PIPE_CONTROL_DC_FLUSH_ENABLE));
1254
	wa_ctx_emit(batch, index, 0);
1255
	wa_ctx_emit(batch, index, 0);
1256
	wa_ctx_emit(batch, index, 0);
1257
	wa_ctx_emit(batch, index, 0);
5354 serge 1258
 
6084 serge 1259
	wa_ctx_emit(batch, index, (MI_LOAD_REGISTER_MEM_GEN8 |
1260
				   MI_SRM_LRM_GLOBAL_GTT));
6937 serge 1261
	wa_ctx_emit_reg(batch, index, GEN8_L3SQCREG4);
6084 serge 1262
	wa_ctx_emit(batch, index, ring->scratch.gtt_offset + 256);
1263
	wa_ctx_emit(batch, index, 0);
5354 serge 1264
 
6084 serge 1265
	return index;
1266
}
5354 serge 1267
 
6084 serge 1268
static inline uint32_t wa_ctx_start(struct i915_wa_ctx_bb *wa_ctx,
1269
				    uint32_t offset,
1270
				    uint32_t start_alignment)
1271
{
1272
	return wa_ctx->offset = ALIGN(offset, start_alignment);
1273
}
1274
 
1275
static inline int wa_ctx_end(struct i915_wa_ctx_bb *wa_ctx,
1276
			     uint32_t offset,
1277
			     uint32_t size_alignment)
1278
{
1279
	wa_ctx->size = offset - wa_ctx->offset;
1280
 
1281
	WARN(wa_ctx->size % size_alignment,
1282
	     "wa_ctx_bb failed sanity checks: size %d is not aligned to %d\n",
1283
	     wa_ctx->size, size_alignment);
5354 serge 1284
	return 0;
1285
}
1286
 
6084 serge 1287
/**
1288
 * gen8_init_indirectctx_bb() - initialize indirect ctx batch with WA
1289
 *
1290
 * @ring: only applicable for RCS
1291
 * @wa_ctx: structure representing wa_ctx
1292
 *  offset: specifies start of the batch, should be cache-aligned. This is updated
1293
 *    with the offset value received as input.
1294
 *  size: size of the batch in DWORDS but HW expects in terms of cachelines
1295
 * @batch: page in which WA are loaded
1296
 * @offset: This field specifies the start of the batch, it should be
1297
 *  cache-aligned otherwise it is adjusted accordingly.
1298
 *  Typically we only have one indirect_ctx and per_ctx batch buffer which are
1299
 *  initialized at the beginning and shared across all contexts but this field
1300
 *  helps us to have multiple batches at different offsets and select them based
1301
 *  on a criteria. At the moment this batch always start at the beginning of the page
1302
 *  and at this point we don't have multiple wa_ctx batch buffers.
1303
 *
1304
 *  The number of WA applied are not known at the beginning; we use this field
1305
 *  to return the no of DWORDS written.
1306
 *
1307
 *  It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
1308
 *  so it adds NOOPs as padding to make it cacheline aligned.
1309
 *  MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
1310
 *  makes a complete batch buffer.
1311
 *
1312
 * Return: non-zero if we exceed the PAGE_SIZE limit.
1313
 */
1314
 
1315
static int gen8_init_indirectctx_bb(struct intel_engine_cs *ring,
1316
				    struct i915_wa_ctx_bb *wa_ctx,
1317
				    uint32_t *const batch,
1318
				    uint32_t *offset)
5354 serge 1319
{
6084 serge 1320
	uint32_t scratch_addr;
1321
	uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS);
5354 serge 1322
 
6084 serge 1323
	/* WaDisableCtxRestoreArbitration:bdw,chv */
1324
	wa_ctx_emit(batch, index, MI_ARB_ON_OFF | MI_ARB_DISABLE);
5354 serge 1325
 
6084 serge 1326
	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
1327
	if (IS_BROADWELL(ring->dev)) {
1328
		int rc = gen8_emit_flush_coherentl3_wa(ring, batch, index);
1329
		if (rc < 0)
1330
			return rc;
1331
		index = rc;
1332
	}
5354 serge 1333
 
6084 serge 1334
	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
1335
	/* Actual scratch location is at 128 bytes offset */
1336
	scratch_addr = ring->scratch.gtt_offset + 2*CACHELINE_BYTES;
5354 serge 1337
 
6084 serge 1338
	wa_ctx_emit(batch, index, GFX_OP_PIPE_CONTROL(6));
1339
	wa_ctx_emit(batch, index, (PIPE_CONTROL_FLUSH_L3 |
1340
				   PIPE_CONTROL_GLOBAL_GTT_IVB |
1341
				   PIPE_CONTROL_CS_STALL |
1342
				   PIPE_CONTROL_QW_WRITE));
1343
	wa_ctx_emit(batch, index, scratch_addr);
1344
	wa_ctx_emit(batch, index, 0);
1345
	wa_ctx_emit(batch, index, 0);
1346
	wa_ctx_emit(batch, index, 0);
5354 serge 1347
 
6084 serge 1348
	/* Pad to end of cacheline */
1349
	while (index % CACHELINE_DWORDS)
1350
		wa_ctx_emit(batch, index, MI_NOOP);
5354 serge 1351
 
6084 serge 1352
	/*
1353
	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
1354
	 * execution depends on the length specified in terms of cache lines
1355
	 * in the register CTX_RCS_INDIRECT_CTX
1356
	 */
5354 serge 1357
 
6084 serge 1358
	return wa_ctx_end(wa_ctx, *offset = index, CACHELINE_DWORDS);
1359
}
5354 serge 1360
 
6084 serge 1361
/**
1362
 * gen8_init_perctx_bb() - initialize per ctx batch with WA
1363
 *
1364
 * @ring: only applicable for RCS
1365
 * @wa_ctx: structure representing wa_ctx
1366
 *  offset: specifies start of the batch, should be cache-aligned.
1367
 *  size: size of the batch in DWORDS but HW expects in terms of cachelines
1368
 * @batch: page in which WA are loaded
1369
 * @offset: This field specifies the start of this batch.
1370
 *   This batch is started immediately after indirect_ctx batch. Since we ensure
1371
 *   that indirect_ctx ends on a cacheline this batch is aligned automatically.
1372
 *
1373
 *   The number of DWORDS written are returned using this field.
1374
 *
1375
 *  This batch is terminated with MI_BATCH_BUFFER_END and so we need not add padding
1376
 *  to align it with cacheline as padding after MI_BATCH_BUFFER_END is redundant.
1377
 */
1378
static int gen8_init_perctx_bb(struct intel_engine_cs *ring,
1379
			       struct i915_wa_ctx_bb *wa_ctx,
1380
			       uint32_t *const batch,
1381
			       uint32_t *offset)
1382
{
1383
	uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS);
1384
 
1385
	/* WaDisableCtxRestoreArbitration:bdw,chv */
1386
	wa_ctx_emit(batch, index, MI_ARB_ON_OFF | MI_ARB_ENABLE);
1387
 
1388
	wa_ctx_emit(batch, index, MI_BATCH_BUFFER_END);
1389
 
1390
	return wa_ctx_end(wa_ctx, *offset = index, 1);
5354 serge 1391
}
1392
 
6084 serge 1393
static int gen9_init_indirectctx_bb(struct intel_engine_cs *ring,
1394
				    struct i915_wa_ctx_bb *wa_ctx,
1395
				    uint32_t *const batch,
1396
				    uint32_t *offset)
5354 serge 1397
{
6084 serge 1398
	int ret;
1399
	struct drm_device *dev = ring->dev;
1400
	uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS);
5354 serge 1401
 
6084 serge 1402
	/* WaDisableCtxRestoreArbitration:skl,bxt */
6937 serge 1403
	if (IS_SKL_REVID(dev, 0, SKL_REVID_D0) ||
1404
	    IS_BXT_REVID(dev, 0, BXT_REVID_A1))
6084 serge 1405
		wa_ctx_emit(batch, index, MI_ARB_ON_OFF | MI_ARB_DISABLE);
5354 serge 1406
 
6084 serge 1407
	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt */
1408
	ret = gen8_emit_flush_coherentl3_wa(ring, batch, index);
1409
	if (ret < 0)
1410
		return ret;
1411
	index = ret;
5354 serge 1412
 
6084 serge 1413
	/* Pad to end of cacheline */
1414
	while (index % CACHELINE_DWORDS)
1415
		wa_ctx_emit(batch, index, MI_NOOP);
5354 serge 1416
 
6084 serge 1417
	return wa_ctx_end(wa_ctx, *offset = index, CACHELINE_DWORDS);
5354 serge 1418
}
1419
 
6084 serge 1420
static int gen9_init_perctx_bb(struct intel_engine_cs *ring,
1421
			       struct i915_wa_ctx_bb *wa_ctx,
1422
			       uint32_t *const batch,
1423
			       uint32_t *offset)
5354 serge 1424
{
6084 serge 1425
	struct drm_device *dev = ring->dev;
1426
	uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS);
5354 serge 1427
 
6084 serge 1428
	/* WaSetDisablePixMaskCammingAndRhwoInCommonSliceChicken:skl,bxt */
6937 serge 1429
	if (IS_SKL_REVID(dev, 0, SKL_REVID_B0) ||
1430
	    IS_BXT_REVID(dev, 0, BXT_REVID_A1)) {
6084 serge 1431
		wa_ctx_emit(batch, index, MI_LOAD_REGISTER_IMM(1));
6937 serge 1432
		wa_ctx_emit_reg(batch, index, GEN9_SLICE_COMMON_ECO_CHICKEN0);
6084 serge 1433
		wa_ctx_emit(batch, index,
1434
			    _MASKED_BIT_ENABLE(DISABLE_PIXEL_MASK_CAMMING));
1435
		wa_ctx_emit(batch, index, MI_NOOP);
5354 serge 1436
	}
1437
 
6084 serge 1438
	/* WaDisableCtxRestoreArbitration:skl,bxt */
6937 serge 1439
	if (IS_SKL_REVID(dev, 0, SKL_REVID_D0) ||
1440
	    IS_BXT_REVID(dev, 0, BXT_REVID_A1))
6084 serge 1441
		wa_ctx_emit(batch, index, MI_ARB_ON_OFF | MI_ARB_ENABLE);
5354 serge 1442
 
6084 serge 1443
	wa_ctx_emit(batch, index, MI_BATCH_BUFFER_END);
1444
 
1445
	return wa_ctx_end(wa_ctx, *offset = index, 1);
5354 serge 1446
}
1447
 
6084 serge 1448
static int lrc_setup_wa_ctx_obj(struct intel_engine_cs *ring, u32 size)
5354 serge 1449
{
1450
	int ret;
1451
 
6084 serge 1452
	ring->wa_ctx.obj = i915_gem_alloc_object(ring->dev, PAGE_ALIGN(size));
1453
	if (!ring->wa_ctx.obj) {
1454
		DRM_DEBUG_DRIVER("alloc LRC WA ctx backing obj failed.\n");
1455
		return -ENOMEM;
1456
	}
5354 serge 1457
 
6084 serge 1458
	ret = i915_gem_obj_ggtt_pin(ring->wa_ctx.obj, PAGE_SIZE, 0);
1459
	if (ret) {
1460
		DRM_DEBUG_DRIVER("pin LRC WA ctx backing obj failed: %d\n",
1461
				 ret);
1462
		drm_gem_object_unreference(&ring->wa_ctx.obj->base);
5354 serge 1463
		return ret;
6084 serge 1464
	}
5354 serge 1465
 
1466
	return 0;
1467
}
1468
 
6084 serge 1469
static void lrc_destroy_wa_ctx_obj(struct intel_engine_cs *ring)
5354 serge 1470
{
6084 serge 1471
	if (ring->wa_ctx.obj) {
1472
		i915_gem_object_ggtt_unpin(ring->wa_ctx.obj);
1473
		drm_gem_object_unreference(&ring->wa_ctx.obj->base);
1474
		ring->wa_ctx.obj = NULL;
1475
	}
1476
}
5354 serge 1477
 
6084 serge 1478
static int intel_init_workaround_bb(struct intel_engine_cs *ring)
1479
{
1480
	int ret;
1481
	uint32_t *batch;
1482
	uint32_t offset;
1483
	struct page *page;
1484
	struct i915_ctx_workarounds *wa_ctx = &ring->wa_ctx;
1485
 
1486
	WARN_ON(ring->id != RCS);
1487
 
1488
	/* update this when WA for higher Gen are added */
1489
	if (INTEL_INFO(ring->dev)->gen > 9) {
1490
		DRM_ERROR("WA batch buffer is not initialized for Gen%d\n",
1491
			  INTEL_INFO(ring->dev)->gen);
5354 serge 1492
		return 0;
6084 serge 1493
	}
5354 serge 1494
 
6084 serge 1495
	/* some WA perform writes to scratch page, ensure it is valid */
1496
	if (ring->scratch.obj == NULL) {
1497
		DRM_ERROR("scratch page not allocated for %s\n", ring->name);
1498
		return -EINVAL;
1499
	}
5354 serge 1500
 
6084 serge 1501
	ret = lrc_setup_wa_ctx_obj(ring, PAGE_SIZE);
1502
	if (ret) {
1503
		DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
5354 serge 1504
		return ret;
6084 serge 1505
	}
5354 serge 1506
 
6937 serge 1507
	page = i915_gem_object_get_dirty_page(wa_ctx->obj, 0);
6084 serge 1508
	batch = kmap_atomic(page);
1509
	offset = 0;
1510
 
1511
	if (INTEL_INFO(ring->dev)->gen == 8) {
1512
		ret = gen8_init_indirectctx_bb(ring,
1513
					       &wa_ctx->indirect_ctx,
1514
					       batch,
1515
					       &offset);
1516
		if (ret)
1517
			goto out;
1518
 
1519
		ret = gen8_init_perctx_bb(ring,
1520
					  &wa_ctx->per_ctx,
1521
					  batch,
1522
					  &offset);
1523
		if (ret)
1524
			goto out;
1525
	} else if (INTEL_INFO(ring->dev)->gen == 9) {
1526
		ret = gen9_init_indirectctx_bb(ring,
1527
					       &wa_ctx->indirect_ctx,
1528
					       batch,
1529
					       &offset);
1530
		if (ret)
1531
			goto out;
1532
 
1533
		ret = gen9_init_perctx_bb(ring,
1534
					  &wa_ctx->per_ctx,
1535
					  batch,
1536
					  &offset);
1537
		if (ret)
1538
			goto out;
5354 serge 1539
	}
1540
 
6084 serge 1541
out:
1542
	kunmap_atomic(batch);
5354 serge 1543
	if (ret)
6084 serge 1544
		lrc_destroy_wa_ctx_obj(ring);
5354 serge 1545
 
6084 serge 1546
	return ret;
5354 serge 1547
}
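/*
 * Editorial sketch (not in the original source): from the helpers above, the
 * single WA context page is assumed to end up laid out roughly as
 *
 *   [ indirect_ctx batch | MI_NOOP padding up to a cacheline boundary |
 *     per_ctx batch | MI_BATCH_BUFFER_END ]
 *
 * with wa_ctx->indirect_ctx.{offset,size} and wa_ctx->per_ctx.{offset,size}
 * recording where each batch starts and how many DWORDS it occupies.
 */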
1548
 
1549
static int gen8_init_common_ring(struct intel_engine_cs *ring)
1550
{
1551
	struct drm_device *dev = ring->dev;
1552
	struct drm_i915_private *dev_priv = dev->dev_private;
6084 serge 1553
	u8 next_context_status_buffer_hw;
5354 serge 1554
 
6084 serge 1555
	lrc_setup_hardware_status_page(ring,
7144 serge 1556
				dev_priv->kernel_context->engine[ring->id].state);
6084 serge 1557
 
5354 serge 1558
	I915_WRITE_IMR(ring, ~(ring->irq_enable_mask | ring->irq_keep_mask));
1559
	I915_WRITE(RING_HWSTAM(ring->mmio_base), 0xffffffff);
1560
 
1561
	I915_WRITE(RING_MODE_GEN7(ring),
1562
		   _MASKED_BIT_DISABLE(GFX_REPLAY_MODE) |
1563
		   _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE));
1564
	POSTING_READ(RING_MODE_GEN7(ring));
6084 serge 1565
 
1566
	/*
1567
	 * Instead of resetting the Context Status Buffer (CSB) read pointer to
1568
	 * zero, we need to read the write pointer from hardware and use its
1569
	 * value because "this register is power context save restored".
1570
	 * Effectively, these states have been observed:
1571
	 *
1572
	 *      | Suspend-to-idle (freeze) | Suspend-to-RAM (mem) |
1573
	 * BDW  | CSB regs not reset       | CSB regs reset       |
1574
	 * CHT  | CSB regs not reset       | CSB regs not reset   |
7144 serge 1575
	 * SKL  |         ?                |         ?            |
1576
	 * BXT  |         ?                |         ?            |
6084 serge 1577
	 */
7144 serge 1578
	next_context_status_buffer_hw =
1579
		GEN8_CSB_WRITE_PTR(I915_READ(RING_CONTEXT_STATUS_PTR(ring)));
6084 serge 1580
 
1581
	/*
1582
	 * When the CSB registers are reset (also after power-up / gpu reset),
1583
	 * CSB write pointer is set to all 1's, which is not valid, use '5' in
1584
	 * this special case, so the first element read is CSB[0].
1585
	 */
1586
	if (next_context_status_buffer_hw == GEN8_CSB_PTR_MASK)
1587
		next_context_status_buffer_hw = (GEN8_CSB_ENTRIES - 1);
1588
 
1589
	ring->next_context_status_buffer = next_context_status_buffer_hw;
5354 serge 1590
	DRM_DEBUG_DRIVER("Execlists enabled for %s\n", ring->name);
1591
 
1592
	memset(&ring->hangcheck, 0, sizeof(ring->hangcheck));
1593
 
1594
	return 0;
1595
}
1596
 
1597
static int gen8_init_render_ring(struct intel_engine_cs *ring)
1598
{
1599
	struct drm_device *dev = ring->dev;
1600
	struct drm_i915_private *dev_priv = dev->dev_private;
1601
	int ret;
1602
 
1603
	ret = gen8_init_common_ring(ring);
1604
	if (ret)
1605
		return ret;
1606
 
1607
	/* We need to disable the AsyncFlip performance optimisations in order
1608
	 * to use MI_WAIT_FOR_EVENT within the CS. It should already be
1609
	 * programmed to '1' on all products.
1610
	 *
1611
	 * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv,bdw,chv
1612
	 */
1613
	I915_WRITE(MI_MODE, _MASKED_BIT_ENABLE(ASYNC_FLIP_PERF_DISABLE));
1614
 
6084 serge 1615
	I915_WRITE(INSTPM, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));
1616
 
1617
	return init_workarounds_ring(ring);
1618
}
1619
 
1620
static int gen9_init_render_ring(struct intel_engine_cs *ring)
1621
{
1622
	int ret;
1623
 
1624
	ret = gen8_init_common_ring(ring);
5354 serge 1625
	if (ret)
1626
		return ret;
1627
 
1628
	return init_workarounds_ring(ring);
1629
}
1630
 
6084 serge 1631
static int intel_logical_ring_emit_pdps(struct drm_i915_gem_request *req)
5354 serge 1632
{
6084 serge 1633
	struct i915_hw_ppgtt *ppgtt = req->ctx->ppgtt;
1634
	struct intel_engine_cs *ring = req->ring;
1635
	struct intel_ringbuffer *ringbuf = req->ringbuf;
1636
	const int num_lri_cmds = GEN8_LEGACY_PDPES * 2;
1637
	int i, ret;
1638
 
1639
	ret = intel_logical_ring_begin(req, num_lri_cmds * 2 + 2);
1640
	if (ret)
1641
		return ret;
1642
 
1643
	intel_logical_ring_emit(ringbuf, MI_LOAD_REGISTER_IMM(num_lri_cmds));
1644
	for (i = GEN8_LEGACY_PDPES - 1; i >= 0; i--) {
1645
		const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
1646
 
6937 serge 1647
		intel_logical_ring_emit_reg(ringbuf, GEN8_RING_PDP_UDW(ring, i));
6084 serge 1648
		intel_logical_ring_emit(ringbuf, upper_32_bits(pd_daddr));
6937 serge 1649
		intel_logical_ring_emit_reg(ringbuf, GEN8_RING_PDP_LDW(ring, i));
6084 serge 1650
		intel_logical_ring_emit(ringbuf, lower_32_bits(pd_daddr));
1651
	}
1652
 
1653
	intel_logical_ring_emit(ringbuf, MI_NOOP);
1654
	intel_logical_ring_advance(ringbuf);
1655
 
1656
	return 0;
1657
}
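/*
 * Worked example (editorial, not part of the original source): with
 * GEN8_LEGACY_PDPES == 4 (the 32-bit PPGTT case), num_lri_cmds is 8 and the
 * intel_logical_ring_begin() request of num_lri_cmds * 2 + 2 = 18 DWORDS is
 * consumed exactly by one MI_LOAD_REGISTER_IMM header, four
 * (PDP_UDW reg, value, PDP_LDW reg, value) quadruples (16 DWORDS) and the
 * trailing MI_NOOP.
 */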
1658
 
1659
static int gen8_emit_bb_start(struct drm_i915_gem_request *req,
1660
			      u64 offset, unsigned dispatch_flags)
1661
{
1662
	struct intel_ringbuffer *ringbuf = req->ringbuf;
1663
	bool ppgtt = !(dispatch_flags & I915_DISPATCH_SECURE);
5354 serge 1664
	int ret;
1665
 
6084 serge 1666
	/* Don't rely on the hw updating PDPs, especially in lite-restore.
1667
	 * Ideally, we should set Force PD Restore in ctx descriptor,
1668
	 * but we can't. Force Restore would be a second option, but
1669
	 * it is unsafe in case of lite-restore (because the ctx is
1670
	 * not idle). PML4 is allocated during ppgtt init so this is
1671
	 * not needed in 48-bit. */
1672
	if (req->ctx->ppgtt &&
1673
	    (intel_ring_flag(req->ring) & req->ctx->ppgtt->pd_dirty_rings)) {
1674
		if (!USES_FULL_48BIT_PPGTT(req->i915) &&
1675
		    !intel_vgpu_active(req->i915->dev)) {
1676
			ret = intel_logical_ring_emit_pdps(req);
1677
			if (ret)
1678
				return ret;
1679
		}
1680
 
1681
		req->ctx->ppgtt->pd_dirty_rings &= ~intel_ring_flag(req->ring);
1682
	}
1683
 
1684
	ret = intel_logical_ring_begin(req, 4);
5354 serge 1685
	if (ret)
1686
		return ret;
1687
 
1688
	/* FIXME(BDW): Address space and security selectors. */
6084 serge 1689
	intel_logical_ring_emit(ringbuf, MI_BATCH_BUFFER_START_GEN8 |
1690
				(ppgtt<<8) |
1691
				(dispatch_flags & I915_DISPATCH_RS ?
1692
				 MI_BATCH_RESOURCE_STREAMER : 0));
5354 serge 1693
	intel_logical_ring_emit(ringbuf, lower_32_bits(offset));
1694
	intel_logical_ring_emit(ringbuf, upper_32_bits(offset));
1695
	intel_logical_ring_emit(ringbuf, MI_NOOP);
1696
	intel_logical_ring_advance(ringbuf);
1697
 
1698
	return 0;
1699
}
1700
 
1701
static bool gen8_logical_ring_get_irq(struct intel_engine_cs *ring)
1702
{
1703
	struct drm_device *dev = ring->dev;
1704
	struct drm_i915_private *dev_priv = dev->dev_private;
1705
	unsigned long flags;
1706
 
1707
	if (WARN_ON(!intel_irqs_enabled(dev_priv)))
1708
		return false;
1709
 
1710
	spin_lock_irqsave(&dev_priv->irq_lock, flags);
1711
	if (ring->irq_refcount++ == 0) {
1712
		I915_WRITE_IMR(ring, ~(ring->irq_enable_mask | ring->irq_keep_mask));
1713
		POSTING_READ(RING_IMR(ring->mmio_base));
1714
	}
1715
	spin_unlock_irqrestore(&dev_priv->irq_lock, flags);
1716
 
1717
	return true;
1718
}
1719
 
1720
static void gen8_logical_ring_put_irq(struct intel_engine_cs *ring)
1721
{
1722
	struct drm_device *dev = ring->dev;
1723
	struct drm_i915_private *dev_priv = dev->dev_private;
1724
	unsigned long flags;
1725
 
1726
	spin_lock_irqsave(&dev_priv->irq_lock, flags);
1727
	if (--ring->irq_refcount == 0) {
1728
		I915_WRITE_IMR(ring, ~ring->irq_keep_mask);
1729
		POSTING_READ(RING_IMR(ring->mmio_base));
1730
	}
1731
	spin_unlock_irqrestore(&dev_priv->irq_lock, flags);
1732
}
1733
 
6084 serge 1734
static int gen8_emit_flush(struct drm_i915_gem_request *request,
5354 serge 1735
			   u32 invalidate_domains,
1736
			   u32 unused)
1737
{
6084 serge 1738
	struct intel_ringbuffer *ringbuf = request->ringbuf;
5354 serge 1739
	struct intel_engine_cs *ring = ringbuf->ring;
1740
	struct drm_device *dev = ring->dev;
1741
	struct drm_i915_private *dev_priv = dev->dev_private;
1742
	uint32_t cmd;
1743
	int ret;
1744
 
6084 serge 1745
	ret = intel_logical_ring_begin(request, 4);
5354 serge 1746
	if (ret)
1747
		return ret;
1748
 
1749
	cmd = MI_FLUSH_DW + 1;
1750
 
6084 serge 1751
	/* We always require a command barrier so that subsequent
1752
	 * commands, such as breadcrumb interrupts, are strictly ordered
1753
	 * wrt the contents of the write cache being flushed to memory
1754
	 * (and thus being coherent from the CPU).
1755
	 */
1756
	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
1757
 
1758
	if (invalidate_domains & I915_GEM_GPU_DOMAINS) {
1759
		cmd |= MI_INVALIDATE_TLB;
1760
		if (ring == &dev_priv->ring[VCS])
1761
			cmd |= MI_INVALIDATE_BSD;
5354 serge 1762
	}
1763
 
1764
	intel_logical_ring_emit(ringbuf, cmd);
1765
	intel_logical_ring_emit(ringbuf,
1766
				I915_GEM_HWS_SCRATCH_ADDR |
1767
				MI_FLUSH_DW_USE_GTT);
1768
	intel_logical_ring_emit(ringbuf, 0); /* upper addr */
1769
	intel_logical_ring_emit(ringbuf, 0); /* value */
1770
	intel_logical_ring_advance(ringbuf);
1771
 
1772
	return 0;
1773
}
1774
 
6084 serge 1775
static int gen8_emit_flush_render(struct drm_i915_gem_request *request,
5354 serge 1776
				  u32 invalidate_domains,
1777
				  u32 flush_domains)
1778
{
6084 serge 1779
	struct intel_ringbuffer *ringbuf = request->ringbuf;
5354 serge 1780
	struct intel_engine_cs *ring = ringbuf->ring;
1781
	u32 scratch_addr = ring->scratch.gtt_offset + 2 * CACHELINE_BYTES;
7144 serge 1782
	bool vf_flush_wa = false;
5354 serge 1783
	u32 flags = 0;
1784
	int ret;
1785
 
1786
	flags |= PIPE_CONTROL_CS_STALL;
1787
 
1788
	if (flush_domains) {
1789
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
1790
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
6320 serge 1791
		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
6084 serge 1792
		flags |= PIPE_CONTROL_FLUSH_ENABLE;
5354 serge 1793
	}
1794
 
1795
	if (invalidate_domains) {
1796
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
1797
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
1798
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
1799
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
1800
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
1801
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
1802
		flags |= PIPE_CONTROL_QW_WRITE;
1803
		flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
7144 serge 1804
 
1805
		/*
1806
		 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
1807
		 * pipe control.
1808
		 */
1809
		if (IS_GEN9(ring->dev))
1810
			vf_flush_wa = true;
5354 serge 1811
	}
1812
 
6084 serge 1813
	ret = intel_logical_ring_begin(request, vf_flush_wa ? 12 : 6);
5354 serge 1814
	if (ret)
1815
		return ret;
1816
 
6084 serge 1817
	if (vf_flush_wa) {
1818
		intel_logical_ring_emit(ringbuf, GFX_OP_PIPE_CONTROL(6));
1819
		intel_logical_ring_emit(ringbuf, 0);
1820
		intel_logical_ring_emit(ringbuf, 0);
1821
		intel_logical_ring_emit(ringbuf, 0);
1822
		intel_logical_ring_emit(ringbuf, 0);
1823
		intel_logical_ring_emit(ringbuf, 0);
1824
	}
1825
 
5354 serge 1826
	intel_logical_ring_emit(ringbuf, GFX_OP_PIPE_CONTROL(6));
1827
	intel_logical_ring_emit(ringbuf, flags);
1828
	intel_logical_ring_emit(ringbuf, scratch_addr);
1829
	intel_logical_ring_emit(ringbuf, 0);
1830
	intel_logical_ring_emit(ringbuf, 0);
1831
	intel_logical_ring_emit(ringbuf, 0);
1832
	intel_logical_ring_advance(ringbuf);
1833
 
1834
	return 0;
1835
}
1836
 
1837
static u32 gen8_get_seqno(struct intel_engine_cs *ring, bool lazy_coherency)
1838
{
1839
	return intel_read_status_page(ring, I915_GEM_HWS_INDEX);
1840
}
1841
 
1842
static void gen8_set_seqno(struct intel_engine_cs *ring, u32 seqno)
1843
{
1844
	intel_write_status_page(ring, I915_GEM_HWS_INDEX, seqno);
1845
}
1846
 
6084 serge 1847
static u32 bxt_a_get_seqno(struct intel_engine_cs *ring, bool lazy_coherency)
5354 serge 1848
{
6084 serge 1849
 
1850
	/*
1851
	 * On BXT A steppings there is a HW coherency issue whereby the
1852
	 * MI_STORE_DATA_IMM storing the completed request's seqno
1853
	 * occasionally doesn't invalidate the CPU cache. Work around this by
1854
	 * clflushing the corresponding cacheline whenever the caller wants
1855
	 * the coherency to be guaranteed. Note that this cacheline is known
1856
	 * to be clean at this point, since we only write it in
1857
	 * bxt_a_set_seqno(), where we also do a clflush after the write. So
1858
	 * this clflush in practice becomes an invalidate operation.
1859
	 */
1860
 
1861
	if (!lazy_coherency)
1862
		intel_flush_status_page(ring, I915_GEM_HWS_INDEX);
1863
 
1864
	return intel_read_status_page(ring, I915_GEM_HWS_INDEX);
1865
}
1866
 
1867
static void bxt_a_set_seqno(struct intel_engine_cs *ring, u32 seqno)
1868
{
1869
	intel_write_status_page(ring, I915_GEM_HWS_INDEX, seqno);
1870
 
1871
	/* See bxt_a_get_seqno() explaining the reason for the clflush. */
1872
	intel_flush_status_page(ring, I915_GEM_HWS_INDEX);
1873
}
1874
 
7144 serge 1875
/*
1876
 * Reserve space for 2 NOOPs at the end of each request to be
1877
 * used as a workaround for not being allowed to do lite
1878
 * restore with HEAD==TAIL (WaIdleLiteRestore).
1879
 */
1880
#define WA_TAIL_DWORDS 2
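/*
 * Editorial note (assumption, not in the original source): the submission
 * path earlier in this file is expected to consume this reservation by
 * emitting two MI_NOOPs after the request tail, roughly
 *
 *     intel_logical_ring_emit(ringbuf, MI_NOOP);
 *     intel_logical_ring_emit(ringbuf, MI_NOOP);
 *
 * so that a lite-restore of the same context never sees HEAD == TAIL.
 */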
1881
 
1882
static inline u32 hws_seqno_address(struct intel_engine_cs *engine)
1883
{
1884
	return engine->status_page.gfx_addr + I915_GEM_HWS_INDEX_ADDR;
1885
}
1886
 
6084 serge 1887
static int gen8_emit_request(struct drm_i915_gem_request *request)
1888
{
1889
	struct intel_ringbuffer *ringbuf = request->ringbuf;
5354 serge 1890
	int ret;
1891
 
7144 serge 1892
	ret = intel_logical_ring_begin(request, 6 + WA_TAIL_DWORDS);
5354 serge 1893
	if (ret)
1894
		return ret;
1895
 
7144 serge 1896
	/* w/a: bit 5 needs to be zero for MI_FLUSH_DW address. */
1897
	BUILD_BUG_ON(I915_GEM_HWS_INDEX_ADDR & (1 << 5));
5354 serge 1898
 
1899
	intel_logical_ring_emit(ringbuf,
7144 serge 1900
				(MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW);
1901
	intel_logical_ring_emit(ringbuf,
1902
				hws_seqno_address(request->ring) |
1903
				MI_FLUSH_DW_USE_GTT);
5354 serge 1904
	intel_logical_ring_emit(ringbuf, 0);
6084 serge 1905
	intel_logical_ring_emit(ringbuf, i915_gem_request_get_seqno(request));
5354 serge 1906
	intel_logical_ring_emit(ringbuf, MI_USER_INTERRUPT);
1907
	intel_logical_ring_emit(ringbuf, MI_NOOP);
7144 serge 1908
	return intel_logical_ring_advance_and_submit(request);
1909
}
5354 serge 1910
 
7144 serge 1911
static int gen8_emit_request_render(struct drm_i915_gem_request *request)
1912
{
1913
	struct intel_ringbuffer *ringbuf = request->ringbuf;
1914
	int ret;
1915
 
1916
	ret = intel_logical_ring_begin(request, 8 + WA_TAIL_DWORDS);
1917
	if (ret)
1918
		return ret;
1919
 
1920
	/* We're using qword write, seqno should be aligned to 8 bytes. */
1921
	BUILD_BUG_ON(I915_GEM_HWS_INDEX & 1);
1922
 
1923
	/* w/a: for post-sync ops following a GPGPU operation we
1924
	 * need a prior CS_STALL, which is emitted by the flush
1925
	 * following the batch.
6084 serge 1926
	 */
7144 serge 1927
	intel_logical_ring_emit(ringbuf, GFX_OP_PIPE_CONTROL(6));
1928
	intel_logical_ring_emit(ringbuf,
1929
				(PIPE_CONTROL_GLOBAL_GTT_IVB |
1930
				 PIPE_CONTROL_CS_STALL |
1931
				 PIPE_CONTROL_QW_WRITE));
1932
	intel_logical_ring_emit(ringbuf, hws_seqno_address(request->ring));
1933
	intel_logical_ring_emit(ringbuf, 0);
1934
	intel_logical_ring_emit(ringbuf, i915_gem_request_get_seqno(request));
1935
	/* We're thrashing one dword of HWS. */
1936
	intel_logical_ring_emit(ringbuf, 0);
1937
	intel_logical_ring_emit(ringbuf, MI_USER_INTERRUPT);
6084 serge 1938
	intel_logical_ring_emit(ringbuf, MI_NOOP);
7144 serge 1939
	return intel_logical_ring_advance_and_submit(request);
5354 serge 1940
}
1941
 
6084 serge 1942
static int intel_lr_context_render_state_init(struct drm_i915_gem_request *req)
1943
{
1944
	struct render_state so;
1945
	int ret;
1946
 
1947
	ret = i915_gem_render_state_prepare(req->ring, &so);
1948
	if (ret)
1949
		return ret;
1950
 
1951
	if (so.rodata == NULL)
1952
		return 0;
1953
 
1954
	ret = req->ring->emit_bb_start(req, so.ggtt_offset,
1955
				       I915_DISPATCH_SECURE);
1956
	if (ret)
1957
		goto out;
1958
 
1959
	ret = req->ring->emit_bb_start(req,
1960
				       (so.ggtt_offset + so.aux_batch_offset),
1961
				       I915_DISPATCH_SECURE);
1962
	if (ret)
1963
		goto out;
1964
 
1965
	i915_vma_move_to_active(i915_gem_obj_to_ggtt(so.obj), req);
1966
 
1967
out:
1968
	i915_gem_render_state_fini(&so);
1969
	return ret;
1970
}
1971
 
1972
static int gen8_init_rcs_context(struct drm_i915_gem_request *req)
1973
{
1974
	int ret;
1975
 
1976
	ret = intel_logical_ring_workarounds_emit(req);
1977
	if (ret)
1978
		return ret;
1979
 
1980
	ret = intel_rcs_context_init_mocs(req);
1981
	/*
1982
	 * Failing to program the MOCS is non-fatal. The system will not
1983
	 * run at peak performance. So generate an error and carry on.
1984
	 */
1985
	if (ret)
1986
		DRM_ERROR("MOCS failed to program: expect performance issues.\n");
1987
 
1988
	return intel_lr_context_render_state_init(req);
1989
}
1990
 
5354 serge 1991
/**
1992
 * intel_logical_ring_cleanup() - deallocate the Engine Command Streamer
1993
 *
1994
 * @ring: Engine Command Streamer.
1995
 *
1996
 */
1997
void intel_logical_ring_cleanup(struct intel_engine_cs *ring)
1998
{
1999
	struct drm_i915_private *dev_priv;
2000
 
2001
	if (!intel_ring_initialized(ring))
2002
		return;
2003
 
2004
	dev_priv = ring->dev->dev_private;
2005
 
6937 serge 2006
	if (ring->buffer) {
7144 serge 2007
		intel_logical_ring_stop(ring);
2008
		WARN_ON((I915_READ_MODE(ring) & MODE_IDLE) == 0);
6937 serge 2009
	}
5354 serge 2010
 
2011
	if (ring->cleanup)
2012
		ring->cleanup(ring);
2013
 
2014
	i915_cmd_parser_fini_ring(ring);
6084 serge 2015
	i915_gem_batch_pool_fini(&ring->batch_pool);
5354 serge 2016
 
2017
	if (ring->status_page.obj) {
2018
		kunmap(sg_page(ring->status_page.obj->pages->sgl));
2019
		ring->status_page.obj = NULL;
2020
	}
6084 serge 2021
 
7144 serge 2022
	ring->disable_lite_restore_wa = false;
2023
	ring->ctx_desc_template = 0;
2024
 
6084 serge 2025
	lrc_destroy_wa_ctx_obj(ring);
6937 serge 2026
	ring->dev = NULL;
5354 serge 2027
}
2028
 
7144 serge 2029
static void
2030
logical_ring_default_vfuncs(struct drm_device *dev,
2031
			    struct intel_engine_cs *ring)
5354 serge 2032
{
7144 serge 2033
	/* Default vfuncs which can be overridden by each engine. */
2034
	ring->init_hw = gen8_init_common_ring;
2035
	ring->emit_request = gen8_emit_request;
2036
	ring->emit_flush = gen8_emit_flush;
2037
	ring->irq_get = gen8_logical_ring_get_irq;
2038
	ring->irq_put = gen8_logical_ring_put_irq;
2039
	ring->emit_bb_start = gen8_emit_bb_start;
2040
	if (IS_BXT_REVID(dev, 0, BXT_REVID_A1)) {
2041
		ring->get_seqno = bxt_a_get_seqno;
2042
		ring->set_seqno = bxt_a_set_seqno;
2043
	} else {
2044
		ring->get_seqno = gen8_get_seqno;
2045
		ring->set_seqno = gen8_set_seqno;
2046
	}
2047
}
2048
 
2049
static inline void
2050
logical_ring_default_irqs(struct intel_engine_cs *ring, unsigned shift)
2051
{
2052
	ring->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
2053
	ring->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
2054
}
2055
 
2056
static int
2057
logical_ring_init(struct drm_device *dev, struct intel_engine_cs *ring)
2058
{
2059
	struct intel_context *dctx = to_i915(dev)->kernel_context;
5354 serge 2060
	int ret;
2061
 
2062
	/* Intentionally left blank. */
2063
	ring->buffer = NULL;
2064
 
2065
	ring->dev = dev;
2066
	INIT_LIST_HEAD(&ring->active_list);
2067
	INIT_LIST_HEAD(&ring->request_list);
6084 serge 2068
	i915_gem_batch_pool_init(dev, &ring->batch_pool);
5354 serge 2069
	init_waitqueue_head(&ring->irq_queue);
2070
 
6937 serge 2071
	INIT_LIST_HEAD(&ring->buffers);
5354 serge 2072
	INIT_LIST_HEAD(&ring->execlist_queue);
2073
	INIT_LIST_HEAD(&ring->execlist_retired_req_list);
2074
	spin_lock_init(&ring->execlist_lock);
2075
 
7144 serge 2076
	logical_ring_init_platform_invariants(ring);
2077
 
5354 serge 2078
	ret = i915_cmd_parser_init_ring(ring);
2079
	if (ret)
6937 serge 2080
		goto error;
5354 serge 2081
 
7144 serge 2082
	ret = intel_lr_context_deferred_alloc(dctx, ring);
6084 serge 2083
	if (ret)
6937 serge 2084
		goto error;
6084 serge 2085
 
2086
	/* As this is the default context, always pin it */
7144 serge 2087
	ret = intel_lr_context_do_pin(dctx, ring);
6084 serge 2088
	if (ret) {
2089
		DRM_ERROR(
2090
			"Failed to pin and map ringbuffer %s: %d\n",
2091
			ring->name, ret);
6937 serge 2092
		goto error;
5354 serge 2093
	}
2094
 
6937 serge 2095
	return 0;
2096
 
2097
error:
2098
	intel_logical_ring_cleanup(ring);
5354 serge 2099
	return ret;
2100
}
2101
 
2102
static int logical_render_ring_init(struct drm_device *dev)
2103
{
2104
	struct drm_i915_private *dev_priv = dev->dev_private;
2105
	struct intel_engine_cs *ring = &dev_priv->ring[RCS];
6084 serge 2106
	int ret;
5354 serge 2107
 
2108
	ring->name = "render ring";
2109
	ring->id = RCS;
7144 serge 2110
	ring->exec_id = I915_EXEC_RENDER;
2111
	ring->guc_id = GUC_RENDER_ENGINE;
5354 serge 2112
	ring->mmio_base = RENDER_RING_BASE;
7144 serge 2113
 
2114
	logical_ring_default_irqs(ring, GEN8_RCS_IRQ_SHIFT);
5354 serge 2115
	if (HAS_L3_DPF(dev))
2116
		ring->irq_keep_mask |= GT_RENDER_L3_PARITY_ERROR_INTERRUPT;
2117
 
7144 serge 2118
	logical_ring_default_vfuncs(dev, ring);
2119
 
2120
	/* Override some for render ring. */
6084 serge 2121
	if (INTEL_INFO(dev)->gen >= 9)
2122
		ring->init_hw = gen9_init_render_ring;
2123
	else
2124
		ring->init_hw = gen8_init_render_ring;
2125
	ring->init_context = gen8_init_rcs_context;
5354 serge 2126
	ring->cleanup = intel_fini_pipe_control;
2127
	ring->emit_flush = gen8_emit_flush_render;
7144 serge 2128
	ring->emit_request = gen8_emit_request_render;
5354 serge 2129
 
6084 serge 2130
	ring->dev = dev;
2131
 
2132
	ret = intel_init_pipe_control(ring);
2133
	if (ret)
2134
		return ret;
2135
 
2136
	ret = intel_init_workaround_bb(ring);
2137
	if (ret) {
2138
		/*
2139
		 * We continue even if we fail to initialize the WA batch
2140
		 * because we only expect rare glitches, nothing critical
2141
		 * enough to prevent us from using the GPU
2142
		 */
2143
		DRM_ERROR("WA batch buffer initialization failed: %d\n",
2144
			  ret);
2145
	}
2146
 
2147
	ret = logical_ring_init(dev, ring);
2148
	if (ret) {
2149
		lrc_destroy_wa_ctx_obj(ring);
2150
	}
2151
 
2152
	return ret;
5354 serge 2153
}
2154
 
2155
static int logical_bsd_ring_init(struct drm_device *dev)
2156
{
2157
	struct drm_i915_private *dev_priv = dev->dev_private;
2158
	struct intel_engine_cs *ring = &dev_priv->ring[VCS];
2159
 
2160
	ring->name = "bsd ring";
2161
	ring->id = VCS;
7144 serge 2162
	ring->exec_id = I915_EXEC_BSD;
2163
	ring->guc_id = GUC_VIDEO_ENGINE;
5354 serge 2164
	ring->mmio_base = GEN6_BSD_RING_BASE;
2165
 
7144 serge 2166
	logical_ring_default_irqs(ring, GEN8_VCS1_IRQ_SHIFT);
2167
	logical_ring_default_vfuncs(dev, ring);
5354 serge 2168
 
2169
	return logical_ring_init(dev, ring);
2170
}
2171
 
2172
static int logical_bsd2_ring_init(struct drm_device *dev)
2173
{
2174
	struct drm_i915_private *dev_priv = dev->dev_private;
2175
	struct intel_engine_cs *ring = &dev_priv->ring[VCS2];
2176
 
7144 serge 2177
	ring->name = "bsd2 ring";
5354 serge 2178
	ring->id = VCS2;
7144 serge 2179
	ring->exec_id = I915_EXEC_BSD;
2180
	ring->guc_id = GUC_VIDEO_ENGINE2;
5354 serge 2181
	ring->mmio_base = GEN8_BSD2_RING_BASE;
2182
 
7144 serge 2183
	logical_ring_default_irqs(ring, GEN8_VCS2_IRQ_SHIFT);
2184
	logical_ring_default_vfuncs(dev, ring);
5354 serge 2185
 
2186
	return logical_ring_init(dev, ring);
2187
}
2188
 
2189
static int logical_blt_ring_init(struct drm_device *dev)
2190
{
2191
	struct drm_i915_private *dev_priv = dev->dev_private;
2192
	struct intel_engine_cs *ring = &dev_priv->ring[BCS];
2193
 
2194
	ring->name = "blitter ring";
2195
	ring->id = BCS;
7144 serge 2196
	ring->exec_id = I915_EXEC_BLT;
2197
	ring->guc_id = GUC_BLITTER_ENGINE;
5354 serge 2198
	ring->mmio_base = BLT_RING_BASE;
2199
 
7144 serge 2200
	logical_ring_default_irqs(ring, GEN8_BCS_IRQ_SHIFT);
2201
	logical_ring_default_vfuncs(dev, ring);
5354 serge 2202
 
2203
	return logical_ring_init(dev, ring);
2204
}
2205
 
2206
static int logical_vebox_ring_init(struct drm_device *dev)
2207
{
2208
	struct drm_i915_private *dev_priv = dev->dev_private;
2209
	struct intel_engine_cs *ring = &dev_priv->ring[VECS];
2210
 
2211
	ring->name = "video enhancement ring";
2212
	ring->id = VECS;
7144 serge 2213
	ring->exec_id = I915_EXEC_VEBOX;
2214
	ring->guc_id = GUC_VIDEOENHANCE_ENGINE;
5354 serge 2215
	ring->mmio_base = VEBOX_RING_BASE;
2216
 
7144 serge 2217
	logical_ring_default_irqs(ring, GEN8_VECS_IRQ_SHIFT);
2218
	logical_ring_default_vfuncs(dev, ring);
5354 serge 2219
 
2220
	return logical_ring_init(dev, ring);
2221
}
2222
 
2223
/**
2224
 * intel_logical_rings_init() - allocate, populate and init the Engine Command Streamers
2225
 * @dev: DRM device.
2226
 *
2227
 * This function inits the engines for an Execlists submission style (the equivalent in the
2228
 * legacy ringbuffer submission world would be i915_gem_init_rings). It does so only for
2229
 * those engines that are present in the hardware.
2230
 *
2231
 * Return: non-zero if the initialization failed.
2232
 */
2233
int intel_logical_rings_init(struct drm_device *dev)
2234
{
2235
	struct drm_i915_private *dev_priv = dev->dev_private;
2236
	int ret;
2237
 
2238
	ret = logical_render_ring_init(dev);
2239
	if (ret)
2240
		return ret;
2241
 
2242
	if (HAS_BSD(dev)) {
2243
		ret = logical_bsd_ring_init(dev);
2244
		if (ret)
2245
			goto cleanup_render_ring;
2246
	}
2247
 
2248
	if (HAS_BLT(dev)) {
2249
		ret = logical_blt_ring_init(dev);
2250
		if (ret)
2251
			goto cleanup_bsd_ring;
2252
	}
2253
 
2254
	if (HAS_VEBOX(dev)) {
2255
		ret = logical_vebox_ring_init(dev);
2256
		if (ret)
2257
			goto cleanup_blt_ring;
2258
	}
2259
 
2260
	if (HAS_BSD2(dev)) {
2261
		ret = logical_bsd2_ring_init(dev);
2262
		if (ret)
2263
			goto cleanup_vebox_ring;
2264
	}
2265
 
2266
	return 0;
2267
 
2268
cleanup_vebox_ring:
2269
	intel_logical_ring_cleanup(&dev_priv->ring[VECS]);
2270
cleanup_blt_ring:
2271
	intel_logical_ring_cleanup(&dev_priv->ring[BCS]);
2272
cleanup_bsd_ring:
2273
	intel_logical_ring_cleanup(&dev_priv->ring[VCS]);
2274
cleanup_render_ring:
2275
	intel_logical_ring_cleanup(&dev_priv->ring[RCS]);
2276
 
2277
	return ret;
2278
}
2279
 
6084 serge 2280
static u32
2281
make_rpcs(struct drm_device *dev)
5354 serge 2282
{
6084 serge 2283
	u32 rpcs = 0;
5354 serge 2284
 
6084 serge 2285
	/*
2286
	 * No explicit RPCS request is needed to ensure full
2287
	 * slice/subslice/EU enablement prior to Gen9.
2288
	*/
2289
	if (INTEL_INFO(dev)->gen < 9)
5354 serge 2290
		return 0;
2291
 
6084 serge 2292
	/*
2293
	 * Starting in Gen9, render power gating can leave
2294
	 * slice/subslice/EU in a partially enabled state. We
2295
	 * must make an explicit request through RPCS for full
2296
	 * enablement.
2297
	*/
2298
	if (INTEL_INFO(dev)->has_slice_pg) {
2299
		rpcs |= GEN8_RPCS_S_CNT_ENABLE;
2300
		rpcs |= INTEL_INFO(dev)->slice_total <<
2301
			GEN8_RPCS_S_CNT_SHIFT;
2302
		rpcs |= GEN8_RPCS_ENABLE;
2303
	}
5354 serge 2304
 
6084 serge 2305
	if (INTEL_INFO(dev)->has_subslice_pg) {
2306
		rpcs |= GEN8_RPCS_SS_CNT_ENABLE;
2307
		rpcs |= INTEL_INFO(dev)->subslice_per_slice <<
2308
			GEN8_RPCS_SS_CNT_SHIFT;
2309
		rpcs |= GEN8_RPCS_ENABLE;
2310
	}
5354 serge 2311
 
6084 serge 2312
	if (INTEL_INFO(dev)->has_eu_pg) {
2313
		rpcs |= INTEL_INFO(dev)->eu_per_subslice <<
2314
			GEN8_RPCS_EU_MIN_SHIFT;
2315
		rpcs |= INTEL_INFO(dev)->eu_per_subslice <<
2316
			GEN8_RPCS_EU_MAX_SHIFT;
2317
		rpcs |= GEN8_RPCS_ENABLE;
2318
	}
2319
 
2320
	return rpcs;
5354 serge 2321
}
2322
 
7144 serge 2323
static u32 intel_lr_indirect_ctx_offset(struct intel_engine_cs *ring)
2324
{
2325
	u32 indirect_ctx_offset;
2326
 
2327
	switch (INTEL_INFO(ring->dev)->gen) {
2328
	default:
2329
		MISSING_CASE(INTEL_INFO(ring->dev)->gen);
2330
		/* fall through */
2331
	case 9:
2332
		indirect_ctx_offset =
2333
			GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
2334
		break;
2335
	case 8:
2336
		indirect_ctx_offset =
2337
			GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
2338
		break;
2339
	}
2340
 
2341
	return indirect_ctx_offset;
2342
}
2343
 
5354 serge 2344
static int
2345
populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_obj,
2346
		    struct intel_engine_cs *ring, struct intel_ringbuffer *ringbuf)
2347
{
2348
	struct drm_device *dev = ring->dev;
2349
	struct drm_i915_private *dev_priv = dev->dev_private;
2350
	struct i915_hw_ppgtt *ppgtt = ctx->ppgtt;
2351
	struct page *page;
2352
	uint32_t *reg_state;
2353
	int ret;
2354
 
2355
	if (!ppgtt)
2356
		ppgtt = dev_priv->mm.aliasing_ppgtt;
2357
 
2358
	ret = i915_gem_object_set_to_cpu_domain(ctx_obj, true);
2359
	if (ret) {
2360
		DRM_DEBUG_DRIVER("Could not set to CPU domain\n");
2361
		return ret;
2362
	}
2363
 
2364
	ret = i915_gem_object_get_pages(ctx_obj);
2365
	if (ret) {
2366
		DRM_DEBUG_DRIVER("Could not get object pages\n");
2367
		return ret;
2368
	}
2369
 
2370
	i915_gem_object_pin_pages(ctx_obj);
2371
 
2372
	/* The second page of the context object contains some fields which must
2373
	 * be set up prior to the first execution. */
6937 serge 2374
	page = i915_gem_object_get_dirty_page(ctx_obj, LRC_STATE_PN);
5354 serge 2375
	reg_state = kmap_atomic(page);
2376
 
2377
	/* A context is actually a big batch buffer with several MI_LOAD_REGISTER_IMM
2378
	 * commands followed by (reg, value) pairs. The values we are setting here are
2379
	 * only for the first context restore: on a subsequent save, the GPU will
2380
	 * recreate this batchbuffer with new values (including all the missing
2381
	 * MI_LOAD_REGISTER_IMM commands that we are not initializing here). */
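	/*
	 * Editorial aside (illustrative, not part of the original source):
	 * each ASSIGN_CTX_REG(reg_state, pos, reg, val) below is assumed to
	 * expand to a plain (register offset, value) pair in the image, i.e.
	 *
	 *     reg_state[pos]     = i915_mmio_reg_offset(reg);
	 *     reg_state[pos + 1] = val;
	 *
	 * matching the MI_LOAD_REGISTER_IMM payload format described above.
	 */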
6937 serge 2382
	reg_state[CTX_LRI_HEADER_0] =
2383
		MI_LOAD_REGISTER_IMM(ring->id == RCS ? 14 : 11) | MI_LRI_FORCE_POSTED;
2384
	ASSIGN_CTX_REG(reg_state, CTX_CONTEXT_CONTROL, RING_CONTEXT_CONTROL(ring),
7144 serge 2385
		       _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
2386
					  CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
2387
					  (HAS_RESOURCE_STREAMER(dev) ?
2388
					    CTX_CTRL_RS_CTX_ENABLE : 0)));
6937 serge 2389
	ASSIGN_CTX_REG(reg_state, CTX_RING_HEAD, RING_HEAD(ring->mmio_base), 0);
2390
	ASSIGN_CTX_REG(reg_state, CTX_RING_TAIL, RING_TAIL(ring->mmio_base), 0);
5354 serge 2391
	/* Ring buffer start address is not known until the buffer is pinned.
2392
	 * It is written to the context image in execlists_update_context()
2393
	 */
6937 serge 2394
	ASSIGN_CTX_REG(reg_state, CTX_RING_BUFFER_START, RING_START(ring->mmio_base), 0);
2395
	ASSIGN_CTX_REG(reg_state, CTX_RING_BUFFER_CONTROL, RING_CTL(ring->mmio_base),
2396
		       ((ringbuf->size - PAGE_SIZE) & RING_NR_PAGES) | RING_VALID);
2397
	ASSIGN_CTX_REG(reg_state, CTX_BB_HEAD_U, RING_BBADDR_UDW(ring->mmio_base), 0);
2398
	ASSIGN_CTX_REG(reg_state, CTX_BB_HEAD_L, RING_BBADDR(ring->mmio_base), 0);
2399
	ASSIGN_CTX_REG(reg_state, CTX_BB_STATE, RING_BBSTATE(ring->mmio_base),
2400
		       RING_BB_PPGTT);
2401
	ASSIGN_CTX_REG(reg_state, CTX_SECOND_BB_HEAD_U, RING_SBBADDR_UDW(ring->mmio_base), 0);
2402
	ASSIGN_CTX_REG(reg_state, CTX_SECOND_BB_HEAD_L, RING_SBBADDR(ring->mmio_base), 0);
2403
	ASSIGN_CTX_REG(reg_state, CTX_SECOND_BB_STATE, RING_SBBSTATE(ring->mmio_base), 0);
5354 serge 2404
	if (ring->id == RCS) {
6937 serge 2405
		ASSIGN_CTX_REG(reg_state, CTX_BB_PER_CTX_PTR, RING_BB_PER_CTX_PTR(ring->mmio_base), 0);
2406
		ASSIGN_CTX_REG(reg_state, CTX_RCS_INDIRECT_CTX, RING_INDIRECT_CTX(ring->mmio_base), 0);
2407
		ASSIGN_CTX_REG(reg_state, CTX_RCS_INDIRECT_CTX_OFFSET, RING_INDIRECT_CTX_OFFSET(ring->mmio_base), 0);
6084 serge 2408
		if (ring->wa_ctx.obj) {
2409
			struct i915_ctx_workarounds *wa_ctx = &ring->wa_ctx;
2410
			uint32_t ggtt_offset = i915_gem_obj_ggtt_offset(wa_ctx->obj);
2411
 
2412
			reg_state[CTX_RCS_INDIRECT_CTX+1] =
2413
				(ggtt_offset + wa_ctx->indirect_ctx.offset * sizeof(uint32_t)) |
2414
				(wa_ctx->indirect_ctx.size / CACHELINE_DWORDS);
2415
 
2416
			reg_state[CTX_RCS_INDIRECT_CTX_OFFSET+1] =
7144 serge 2417
				intel_lr_indirect_ctx_offset(ring) << 6;
6084 serge 2418
 
2419
			reg_state[CTX_BB_PER_CTX_PTR+1] =
2420
				(ggtt_offset + wa_ctx->per_ctx.offset * sizeof(uint32_t)) |
2421
				0x01;
2422
		}
5354 serge 2423
	}
6937 serge 2424
	reg_state[CTX_LRI_HEADER_1] = MI_LOAD_REGISTER_IMM(9) | MI_LRI_FORCE_POSTED;
2425
	ASSIGN_CTX_REG(reg_state, CTX_CTX_TIMESTAMP, RING_CTX_TIMESTAMP(ring->mmio_base), 0);
2426
	/* PDP values will be assigned later if needed */
2427
	ASSIGN_CTX_REG(reg_state, CTX_PDP3_UDW, GEN8_RING_PDP_UDW(ring, 3), 0);
2428
	ASSIGN_CTX_REG(reg_state, CTX_PDP3_LDW, GEN8_RING_PDP_LDW(ring, 3), 0);
2429
	ASSIGN_CTX_REG(reg_state, CTX_PDP2_UDW, GEN8_RING_PDP_UDW(ring, 2), 0);
2430
	ASSIGN_CTX_REG(reg_state, CTX_PDP2_LDW, GEN8_RING_PDP_LDW(ring, 2), 0);
2431
	ASSIGN_CTX_REG(reg_state, CTX_PDP1_UDW, GEN8_RING_PDP_UDW(ring, 1), 0);
2432
	ASSIGN_CTX_REG(reg_state, CTX_PDP1_LDW, GEN8_RING_PDP_LDW(ring, 1), 0);
2433
	ASSIGN_CTX_REG(reg_state, CTX_PDP0_UDW, GEN8_RING_PDP_UDW(ring, 0), 0);
2434
	ASSIGN_CTX_REG(reg_state, CTX_PDP0_LDW, GEN8_RING_PDP_LDW(ring, 0), 0);
6084 serge 2435
 
2436
	if (USES_FULL_48BIT_PPGTT(ppgtt->base.dev)) {
2437
		/* 64b PPGTT (48bit canonical)
2438
		 * PDP0_DESCRIPTOR contains the base address to PML4 and
2439
		 * other PDP Descriptors are ignored.
2440
		 */
2441
		ASSIGN_CTX_PML4(ppgtt, reg_state);
2442
	} else {
2443
		/* 32b PPGTT
2444
		 * PDP*_DESCRIPTOR contains the base address of space supported.
2445
		 * With dynamic page allocation, PDPs may not be allocated at
2446
		 * this point. Point the unallocated PDPs to the scratch page
2447
		 */
2448
		ASSIGN_CTX_PDP(ppgtt, reg_state, 3);
2449
		ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
2450
		ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
2451
		ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
2452
	}
2453
 
5354 serge 2454
	if (ring->id == RCS) {
2455
		reg_state[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1);
6937 serge 2456
		ASSIGN_CTX_REG(reg_state, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE,
2457
			       make_rpcs(dev));
5354 serge 2458
	}
2459
 
2460
	kunmap_atomic(reg_state);
6088 serge 2461
	i915_gem_object_unpin_pages(ctx_obj);
5354 serge 2462
 
2463
	return 0;
2464
}
2465
 
2466
/**
2467
 * intel_lr_context_free() - free the LRC specific bits of a context
2468
 * @ctx: the LR context to free.
2469
 *
2470
 * The real context freeing is done in i915_gem_context_free: this only
2471
 * takes care of the bits that are LRC related: the per-engine backing
2472
 * objects and the logical ringbuffer.
2473
 */
2474
void intel_lr_context_free(struct intel_context *ctx)
2475
{
2476
	int i;
2477
 
7144 serge 2478
	for (i = I915_NUM_RINGS; --i >= 0; ) {
2479
		struct intel_ringbuffer *ringbuf = ctx->engine[i].ringbuf;
5354 serge 2480
		struct drm_i915_gem_object *ctx_obj = ctx->engine[i].state;
2481
 
7144 serge 2482
		if (!ctx_obj)
2483
			continue;
5354 serge 2484
 
7144 serge 2485
		if (ctx == ctx->i915->kernel_context) {
2486
			intel_unpin_ringbuffer_obj(ringbuf);
2487
			i915_gem_object_ggtt_unpin(ctx_obj);
5354 serge 2488
		}
7144 serge 2489
 
2490
		WARN_ON(ctx->engine[i].pin_count);
2491
		intel_ringbuffer_free(ringbuf);
2492
		drm_gem_object_unreference(&ctx_obj->base);
5354 serge 2493
	}
2494
}
2495
 
7144 serge 2496
/**
2497
 * intel_lr_context_size() - return the size of the context for an engine
2498
 * @ring: which engine to find the context size for
2499
 *
2500
 * Each engine may require a different amount of space for a context image,
2501
 * so when allocating (or copying) an image, this function can be used to
2502
 * find the right size for the specific engine.
2503
 *
2504
 * Return: size (in bytes) of an engine-specific context image
2505
 *
2506
 * Note: this size includes the HWSP, which is part of the context image
2507
 * in LRC mode, but does not include the "shared data page" used with
2508
 * GuC submission. The caller should account for this if using the GuC.
2509
 */
2510
uint32_t intel_lr_context_size(struct intel_engine_cs *ring)
5354 serge 2511
{
2512
	int ret = 0;
2513
 
2514
	WARN_ON(INTEL_INFO(ring->dev)->gen < 8);
2515
 
2516
	switch (ring->id) {
2517
	case RCS:
2518
		if (INTEL_INFO(ring->dev)->gen >= 9)
2519
			ret = GEN9_LR_CONTEXT_RENDER_SIZE;
2520
		else
2521
			ret = GEN8_LR_CONTEXT_RENDER_SIZE;
2522
		break;
2523
	case VCS:
2524
	case BCS:
2525
	case VECS:
2526
	case VCS2:
2527
		ret = GEN8_LR_CONTEXT_OTHER_SIZE;
2528
		break;
2529
	}
2530
 
2531
	return ret;
2532
}
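/*
 * Usage sketch (editorial; it mirrors intel_lr_context_deferred_alloc() below
 * rather than adding anything new): a caller sizing the full backing object
 * would do roughly
 *
 *     context_size = round_up(intel_lr_context_size(ring), 4096);
 *     context_size += PAGE_SIZE * LRC_PPHWSP_PN;   extra page shared with GuC
 *     ctx_obj = i915_gem_alloc_object(dev, context_size);
 */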
2533
 
2534
static void lrc_setup_hardware_status_page(struct intel_engine_cs *ring,
2535
		struct drm_i915_gem_object *default_ctx_obj)
2536
{
2537
	struct drm_i915_private *dev_priv = ring->dev->dev_private;
6084 serge 2538
	struct page *page;
5354 serge 2539
 
6084 serge 2540
	/* The HWSP is part of the default context object in LRC mode. */
2541
	ring->status_page.gfx_addr = i915_gem_obj_ggtt_offset(default_ctx_obj)
2542
			+ LRC_PPHWSP_PN * PAGE_SIZE;
2543
	page = i915_gem_object_get_page(default_ctx_obj, LRC_PPHWSP_PN);
2544
	ring->status_page.page_addr = kmap(page);
5354 serge 2545
	ring->status_page.obj = default_ctx_obj;
2546
 
2547
	I915_WRITE(RING_HWS_PGA(ring->mmio_base),
2548
			(u32)ring->status_page.gfx_addr);
2549
	POSTING_READ(RING_HWS_PGA(ring->mmio_base));
2550
}
2551
 
2552
/**
6084 serge 2553
 * intel_lr_context_deferred_alloc() - create the LRC specific bits of a context
5354 serge 2554
 * @ctx: LR context to create.
2555
 * @ring: engine to be used with the context.
2556
 *
2557
 * This function can be called more than once, with different engines, if we plan
2558
 * to use the context with them. The context backing objects and the ringbuffers
2559
 * (especially the ringbuffer backing objects) consume a lot of memory, and that's why
2560
 * the creation is a deferred call: it's better to make sure first that we need to use
2561
 * a given ring with the context.
2562
 *
2563
 * Return: non-zero on error.
2564
 */
6084 serge 2565
 
2566
int intel_lr_context_deferred_alloc(struct intel_context *ctx,
7144 serge 2567
				    struct intel_engine_cs *ring)
5354 serge 2568
{
2569
	struct drm_device *dev = ring->dev;
2570
	struct drm_i915_gem_object *ctx_obj;
2571
	uint32_t context_size;
2572
	struct intel_ringbuffer *ringbuf;
2573
	int ret;
2574
 
2575
	WARN_ON(ctx->legacy_hw_ctx.rcs_state != NULL);
6084 serge 2576
	WARN_ON(ctx->engine[ring->id].state);
5354 serge 2577
 
7144 serge 2578
	context_size = round_up(intel_lr_context_size(ring), 4096);
5354 serge 2579
 
6084 serge 2580
	/* One extra page as the sharing data between driver and GuC */
2581
	context_size += PAGE_SIZE * LRC_PPHWSP_PN;
5354 serge 2582
 
6084 serge 2583
	ctx_obj = i915_gem_alloc_object(dev, context_size);
2584
	if (!ctx_obj) {
2585
		DRM_DEBUG_DRIVER("Alloc LRC backing obj failed.\n");
2586
		return -ENOMEM;
5354 serge 2587
	}
2588
 
6084 serge 2589
	ringbuf = intel_engine_create_ringbuffer(ring, 4 * PAGE_SIZE);
2590
	if (IS_ERR(ringbuf)) {
2591
		ret = PTR_ERR(ringbuf);
2592
		goto error_deref_obj;
5354 serge 2593
	}
2594
 
2595
	ret = populate_lr_context(ctx, ctx_obj, ring, ringbuf);
2596
	if (ret) {
2597
		DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
6084 serge 2598
		goto error_ringbuf;
5354 serge 2599
	}
2600
 
2601
	ctx->engine[ring->id].ringbuf = ringbuf;
2602
	ctx->engine[ring->id].state = ctx_obj;
2603
 
7144 serge 2604
	if (ctx != ctx->i915->kernel_context && ring->init_context) {
6084 serge 2605
		struct drm_i915_gem_request *req;
5354 serge 2606
 
7144 serge 2607
		req = i915_gem_request_alloc(ring, ctx);
2608
		if (IS_ERR(req)) {
2609
			ret = PTR_ERR(req);
2610
			DRM_ERROR("ring create req: %d\n", ret);
6084 serge 2611
			goto error_ringbuf;
5354 serge 2612
		}
2613
 
6084 serge 2614
		ret = ring->init_context(req);
5354 serge 2615
		if (ret) {
6084 serge 2616
			DRM_ERROR("ring init context: %d\n",
2617
				ret);
2618
			i915_gem_request_cancel(req);
2619
			goto error_ringbuf;
5354 serge 2620
		}
6084 serge 2621
		i915_add_request_no_flush(req);
5354 serge 2622
	}
2623
	return 0;
2624
 
6084 serge 2625
error_ringbuf:
2626
	intel_ringbuffer_free(ringbuf);
2627
error_deref_obj:
5354 serge 2628
	drm_gem_object_unreference(&ctx_obj->base);
6084 serge 2629
	ctx->engine[ring->id].ringbuf = NULL;
2630
	ctx->engine[ring->id].state = NULL;
5354 serge 2631
	return ret;
2632
}
6084 serge 2633
 
2634
void intel_lr_context_reset(struct drm_device *dev,
2635
			struct intel_context *ctx)
2636
{
2637
	struct drm_i915_private *dev_priv = dev->dev_private;
2638
	struct intel_engine_cs *ring;
2639
	int i;
2640
 
2641
	for_each_ring(ring, dev_priv, i) {
2642
		struct drm_i915_gem_object *ctx_obj =
2643
				ctx->engine[ring->id].state;
2644
		struct intel_ringbuffer *ringbuf =
2645
				ctx->engine[ring->id].ringbuf;
2646
		uint32_t *reg_state;
2647
		struct page *page;
2648
 
2649
		if (!ctx_obj)
2650
			continue;
2651
 
2652
		if (i915_gem_object_get_pages(ctx_obj)) {
2653
			WARN(1, "Failed get_pages for context obj\n");
2654
			continue;
2655
		}
6937 serge 2656
		page = i915_gem_object_get_dirty_page(ctx_obj, LRC_STATE_PN);
6084 serge 2657
		reg_state = kmap_atomic(page);
2658
 
2659
		reg_state[CTX_RING_HEAD+1] = 0;
2660
		reg_state[CTX_RING_TAIL+1] = 0;
2661
 
2662
		kunmap_atomic(reg_state);
2663
 
2664
		ringbuf->head = 0;
2665
		ringbuf->tail = 0;
2666
	}
2667
}