/**************************************************************************
 *
 * Copyright 2006 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

#include "intel_batchbuffer.h"
#include "intel_buffer_objects.h"
#include "intel_reg.h"
#include "intel_bufmgr.h"
#include "intel_buffers.h"
#include "intel_fbo.h"
#include "brw_context.h"

#include <xf86drm.h>
#include <i915_drm.h>

static void
intel_batchbuffer_reset(struct brw_context *brw);

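/* One-time per-context setup: allocate the workaround BO that PIPE_CONTROL
 * writes target on gen6+, and a CPU shadow of the batch on non-LLC
 * platforms, where the batch BO is not kept mapped.
 */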
void
intel_batchbuffer_init(struct brw_context *brw)
{
   intel_batchbuffer_reset(brw);

   if (brw->gen >= 6) {
      /* We can't just use brw_state_batch to get a chunk of space for
       * the gen6 workaround because it involves actually writing to
       * the buffer, and the kernel doesn't let us write to the batch.
       */
      brw->batch.workaround_bo = drm_intel_bo_alloc(brw->bufmgr,
                                                    "pipe_control workaround",
                                                    4096, 4096);
   }

   if (!brw->has_llc) {
      brw->batch.cpu_map = malloc(BATCH_SZ);
      brw->batch.map = brw->batch.cpu_map;
   }
}

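/* Start a fresh batch: keep a reference to the previous batch BO in
 * last_bo, allocate a new one (mapped directly on LLC platforms), and
 * reset the bookkeeping fields.
 */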
static void
intel_batchbuffer_reset(struct brw_context *brw)
{
   if (brw->batch.last_bo != NULL) {
      drm_intel_bo_unreference(brw->batch.last_bo);
      brw->batch.last_bo = NULL;
   }
   brw->batch.last_bo = brw->batch.bo;

   brw_render_cache_set_clear(brw);

   brw->batch.bo = drm_intel_bo_alloc(brw->bufmgr, "batchbuffer",
                                      BATCH_SZ, 4096);
   if (brw->has_llc) {
      drm_intel_bo_map(brw->batch.bo, true);
      brw->batch.map = brw->batch.bo->virtual;
   }

   brw->batch.reserved_space = BATCH_RESERVED;
   brw->batch.state_batch_offset = brw->batch.bo->size;
   brw->batch.used = 0;
   brw->batch.needs_sol_reset = false;
   brw->batch.pipe_controls_since_last_cs_stall = 0;

   /* We don't know what ring the new batch will be sent to until we see the
    * first BEGIN_BATCH or BEGIN_BATCH_BLT.  Mark it as unknown.
    */
   brw->batch.ring = UNKNOWN_RING;
}

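/* Snapshot how much batch space and how many relocations are in use, so
 * that a partially-built command sequence can be rolled back with
 * intel_batchbuffer_reset_to_saved().
 */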
void
intel_batchbuffer_save_state(struct brw_context *brw)
{
   brw->batch.saved.used = brw->batch.used;
   brw->batch.saved.reloc_count =
      drm_intel_gem_bo_get_reloc_count(brw->batch.bo);
}

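/* Roll the batch back to the last intel_batchbuffer_save_state() snapshot,
 * discarding any commands and relocations emitted since then.
 */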
void
intel_batchbuffer_reset_to_saved(struct brw_context *brw)
{
   drm_intel_gem_bo_clear_relocs(brw->batch.bo, brw->batch.saved.reloc_count);

   brw->batch.used = brw->batch.saved.used;
   if (brw->batch.used == 0)
      brw->batch.ring = UNKNOWN_RING;
}

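/* Context teardown: release the CPU shadow and drop our references to the
 * batch, last-batch, and workaround BOs.
 */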
void
intel_batchbuffer_free(struct brw_context *brw)
{
   free(brw->batch.cpu_map);
   drm_intel_bo_unreference(brw->batch.last_bo);
   drm_intel_bo_unreference(brw->batch.bo);
   drm_intel_bo_unreference(brw->batch.workaround_bo);
}

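/* Decode and print the batch to stderr (for INTEL_DEBUG=batch), falling
 * back to the local copy of the commands if the batch BO cannot be mapped.
 */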
static void
do_batch_dump(struct brw_context *brw)
{
   struct drm_intel_decode *decode;
   struct intel_batchbuffer *batch = &brw->batch;
   int ret;

   decode = drm_intel_decode_context_alloc(brw->intelScreen->deviceID);
   if (!decode)
      return;

   ret = drm_intel_bo_map(batch->bo, false);
   if (ret == 0) {
      drm_intel_decode_set_batch_pointer(decode,
                                         batch->bo->virtual,
                                         batch->bo->offset64,
                                         batch->used);
   } else {
      fprintf(stderr,
              "WARNING: failed to map batchbuffer (%s), "
              "dumping uploaded data instead.\n", strerror(ret));

      drm_intel_decode_set_batch_pointer(decode,
                                         batch->map,
                                         batch->bo->offset64,
                                         batch->used);
   }

   drm_intel_decode_set_output_file(decode, stderr);
   drm_intel_decode(decode);

   drm_intel_decode_context_free(decode);

   if (ret == 0) {
      drm_intel_bo_unmap(batch->bo);

      brw_debug_batch(brw);
   }
}

void
intel_batchbuffer_emit_render_ring_prelude(struct brw_context *brw)
{
   /* We may need to enable and snapshot OA counters. */
   brw_perf_monitor_new_batch(brw);
}

/**
 * Called when starting a new batch buffer.
 */
static void
brw_new_batch(struct brw_context *brw)
{
   /* Create a new batchbuffer and reset the associated state: */
   drm_intel_gem_bo_clear_relocs(brw->batch.bo, 0);
   intel_batchbuffer_reset(brw);

   /* If the kernel supports hardware contexts, then most hardware state is
    * preserved between batches; we only need to re-emit state that is required
    * to be in every batch.  Otherwise we need to re-emit all the state that
    * would otherwise be stored in the context (which for all intents and
    * purposes means everything).
    */
   if (brw->hw_ctx == NULL)
      brw->ctx.NewDriverState |= BRW_NEW_CONTEXT;

   brw->ctx.NewDriverState |= BRW_NEW_BATCH;

   brw->state_batch_count = 0;

   brw->ib.type = -1;

   /* We need to periodically reap the shader time results, because rollover
    * happens every few seconds.  We also want to see results every once in a
    * while, because many programs won't cleanly destroy our context, so the
    * end-of-run printout may not happen.
    */
   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      brw_collect_and_report_shader_time(brw);

   if (INTEL_DEBUG & DEBUG_PERFMON)
      brw_dump_perf_monitors(brw);
}

/**
 * Called from intel_batchbuffer_flush before emitting MI_BATCHBUFFER_END and
 * sending it off.
 *
 * This function can emit state (say, to preserve registers that aren't saved
 * between batches).  All of this state MUST fit in the reserved space at the
 * end of the batchbuffer.  If you add more GPU state, increase the reserved
 * space by updating the BATCH_RESERVED macro.
 */
static void
brw_finish_batch(struct brw_context *brw)
{
   /* Capture the closing pipeline statistics register values necessary to
    * support query objects (in the non-hardware context world).
    */
   brw_emit_query_end(brw);

   /* We may also need to snapshot and disable OA counters. */
   if (brw->batch.ring == RENDER_RING)
      brw_perf_monitor_finish_batch(brw);

   /* Mark that the current program cache BO has been used by the GPU.
    * It will be reallocated if we need to put new programs in for the
    * next batch.
    */
   brw->cache.bo_used_by_gpu = true;
}

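/* Rate-limit rendering against frames the GPU has actually finished: wait
 * on an earlier frame's batch when a swap is pending, otherwise fall back
 * to the kernel's DRM_I915_GEM_THROTTLE ioctl.
 */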
static void
throttle(struct brw_context *brw)
{
   /* Wait for the swapbuffers before the one we just emitted, so we
    * don't get too many swaps outstanding for apps that are GPU-heavy
    * but not CPU-heavy.
    *
    * We're using intelDRI2Flush (called from the loader before
    * swapbuffer) and glFlush (for front buffer rendering) as the
    * indicator that a frame is done and then throttle when we get
    * here as we prepare to render the next frame.  At this point the
    * round trips for swap/copy and getting new buffers are done and
    * we'll spend less time waiting on the GPU.
    *
    * Unfortunately, we don't have a handle to the batch containing
    * the swap, and getting our hands on that doesn't seem worth it,
    * so we just use the first batch we emitted after the last swap.
    */
   if (brw->need_swap_throttle && brw->throttle_batch[0]) {
      if (brw->throttle_batch[1]) {
         if (!brw->disable_throttling)
            drm_intel_bo_wait_rendering(brw->throttle_batch[1]);
         drm_intel_bo_unreference(brw->throttle_batch[1]);
      }
      brw->throttle_batch[1] = brw->throttle_batch[0];
      brw->throttle_batch[0] = NULL;
      brw->need_swap_throttle = false;
      /* Throttling here is more precise than the throttle ioctl, so skip it */
      brw->need_flush_throttle = false;
   }

   if (brw->need_flush_throttle) {
      __DRIscreen *psp = brw->intelScreen->driScrnPriv;
      drmCommandNone(psp->fd, DRM_I915_GEM_THROTTLE);
      brw->need_flush_throttle = false;
   }
}

/* TODO: Push this whole function into bufmgr.
 */
static int
do_flush_locked(struct brw_context *brw)
{
   struct intel_batchbuffer *batch = &brw->batch;
   int ret = 0;

   if (brw->has_llc) {
      drm_intel_bo_unmap(batch->bo);
   } else {
      ret = drm_intel_bo_subdata(batch->bo, 0, 4*batch->used, batch->map);
      if (ret == 0 && batch->state_batch_offset != batch->bo->size) {
         ret = drm_intel_bo_subdata(batch->bo,
                                    batch->state_batch_offset,
                                    batch->bo->size - batch->state_batch_offset,
                                    (char *)batch->map + batch->state_batch_offset);
      }
   }

   if (!brw->intelScreen->no_hw) {
      int flags;

      if (brw->gen >= 6 && batch->ring == BLT_RING) {
         flags = I915_EXEC_BLT;
      } else {
         flags = I915_EXEC_RENDER;
      }
      if (batch->needs_sol_reset)
         flags |= I915_EXEC_GEN7_SOL_RESET;

      if (ret == 0) {
         if (unlikely(INTEL_DEBUG & DEBUG_AUB))
            brw_annotate_aub(brw);

         if (brw->hw_ctx == NULL || batch->ring != RENDER_RING) {
            ret = drm_intel_bo_mrb_exec(batch->bo, 4 * batch->used, NULL, 0, 0,
                                        flags);
         } else {
            ret = drm_intel_gem_bo_context_exec(batch->bo, brw->hw_ctx,
                                                4 * batch->used, flags);
         }
      }

      throttle(brw);
   }

   if (unlikely(INTEL_DEBUG & DEBUG_BATCH))
      do_batch_dump(brw);

   if (ret != 0) {
      fprintf(stderr, "intel_do_flush_locked failed: %s\n", strerror(-ret));
      exit(1);
   }

   return ret;
}

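/* Finish and submit the current batch, then start a new one.  The file and
 * line arguments identify the caller (normally supplied by the
 * intel_batchbuffer_flush() wrapper macro) in INTEL_DEBUG=batch statistics.
 */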
int
_intel_batchbuffer_flush(struct brw_context *brw,
                         const char *file, int line)
{
   int ret;

   if (brw->batch.used == 0)
      return 0;

   if (brw->throttle_batch[0] == NULL) {
      brw->throttle_batch[0] = brw->batch.bo;
      drm_intel_bo_reference(brw->throttle_batch[0]);
   }

   if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
      int bytes_for_commands = 4 * brw->batch.used;
      int bytes_for_state = brw->batch.bo->size - brw->batch.state_batch_offset;
      int total_bytes = bytes_for_commands + bytes_for_state;
      fprintf(stderr, "%s:%d: Batchbuffer flush with %4db (pkt) + "
              "%4db (state) = %4db (%0.1f%%)\n", file, line,
              bytes_for_commands, bytes_for_state,
              total_bytes,
              100.0f * total_bytes / BATCH_SZ);
   }

   brw->batch.reserved_space = 0;

   brw_finish_batch(brw);

   /* Mark the end of the buffer. */
   intel_batchbuffer_emit_dword(brw, MI_BATCH_BUFFER_END);
   if (brw->batch.used & 1) {
      /* Round batchbuffer usage to 2 DWORDs. */
      intel_batchbuffer_emit_dword(brw, MI_NOOP);
   }

   intel_upload_finish(brw);

   /* Check that we didn't just wrap our batchbuffer at a bad time. */
   assert(!brw->no_batch_wrap);

   ret = do_flush_locked(brw);

   if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) {
      fprintf(stderr, "waiting for idle\n");
      drm_intel_bo_wait_rendering(brw->batch.bo);
   }

   /* Start a new batch buffer. */
   brw_new_batch(brw);

   return ret;
}


/* This is the only way buffers get added to the validate list.
 */
bool
intel_batchbuffer_emit_reloc(struct brw_context *brw,
                             drm_intel_bo *buffer,
                             uint32_t read_domains, uint32_t write_domain,
                             uint32_t delta)
{
   int ret;

   ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
                                 buffer, delta,
                                 read_domains, write_domain);
   assert(ret == 0);
   (void)ret;

   /* Using the old buffer offset, write in what the right data would be, in
    * case the buffer doesn't move and we can short-circuit the relocation
    * processing in the kernel.
    */
   intel_batchbuffer_emit_dword(brw, buffer->offset64 + delta);

   return true;
}

bool
intel_batchbuffer_emit_reloc64(struct brw_context *brw,
                               drm_intel_bo *buffer,
                               uint32_t read_domains, uint32_t write_domain,
                               uint32_t delta)
{
   int ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
                                     buffer, delta,
                                     read_domains, write_domain);
   assert(ret == 0);
   (void) ret;

   /* Using the old buffer offset, write in what the right data would be, in
    * case the buffer doesn't move and we can short-circuit the relocation
    * processing in the kernel.
    */
   uint64_t offset = buffer->offset64 + delta;
   intel_batchbuffer_emit_dword(brw, offset);
   intel_batchbuffer_emit_dword(brw, offset >> 32);

   return true;
}


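/* Copy a pre-built block of commands into the batch.  The byte count must
 * be a whole number of DWords; space (and any needed ring switch) is
 * reserved first.
 */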
void
intel_batchbuffer_data(struct brw_context *brw,
                       const void *data, GLuint bytes, enum brw_gpu_ring ring)
{
   assert((bytes & 3) == 0);
   intel_batchbuffer_require_space(brw, bytes, ring);
   memcpy(brw->batch.map + brw->batch.used, data, bytes);
   brw->batch.used += bytes >> 2;
}

/**
 * According to the latest documentation, any PIPE_CONTROL with the
 * "Command Streamer Stall" bit set must also have another bit set,
 * with five different options:
 *
 *  - Render Target Cache Flush
 *  - Depth Cache Flush
 *  - Stall at Pixel Scoreboard
 *  - Post-Sync Operation
 *  - Depth Stall
 *
 * I chose "Stall at Pixel Scoreboard" since we've used it effectively
 * in the past, but the choice is fairly arbitrary.
 */
static void
gen8_add_cs_stall_workaround_bits(uint32_t *flags)
{
   uint32_t wa_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH |
                      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                      PIPE_CONTROL_WRITE_IMMEDIATE |
                      PIPE_CONTROL_WRITE_DEPTH_COUNT |
                      PIPE_CONTROL_WRITE_TIMESTAMP |
                      PIPE_CONTROL_STALL_AT_SCOREBOARD |
                      PIPE_CONTROL_DEPTH_STALL;

   /* If we're doing a CS stall, and don't already have one of the
    * workaround bits set, add "Stall at Pixel Scoreboard."
    */
   if ((*flags & PIPE_CONTROL_CS_STALL) != 0 && (*flags & wa_bits) == 0)
      *flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
}

/* Implement the WaCsStallAtEveryFourthPipecontrol workaround on IVB, BYT:
 *
 * "Every 4th PIPE_CONTROL command, not counting the PIPE_CONTROL with
 *  only read-cache-invalidate bit(s) set, must have a CS_STALL bit set."
 *
 * Note that the kernel does CS stalls between batches, so we only need
 * to count them within a batch.
 */
static uint32_t
gen7_cs_stall_every_four_pipe_controls(struct brw_context *brw, uint32_t flags)
{
   if (brw->gen == 7 && !brw->is_haswell) {
      if (flags & PIPE_CONTROL_CS_STALL) {
         /* If we're doing a CS stall, reset the counter and carry on. */
         brw->batch.pipe_controls_since_last_cs_stall = 0;
         return 0;
      }

      /* If this is the fourth pipe control without a CS stall, do one now. */
      if (++brw->batch.pipe_controls_since_last_cs_stall == 4) {
         brw->batch.pipe_controls_since_last_cs_stall = 0;
         return PIPE_CONTROL_CS_STALL;
      }
   }
   return 0;
}

/**
 * Emit a PIPE_CONTROL with various flushing flags.
 *
 * The caller is responsible for deciding what flags are appropriate for the
 * given generation.
 */
void
brw_emit_pipe_control_flush(struct brw_context *brw, uint32_t flags)
{
   if (brw->gen >= 8) {
      gen8_add_cs_stall_workaround_bits(&flags);

      BEGIN_BATCH(6);
      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2));
      OUT_BATCH(flags);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   } else if (brw->gen >= 6) {
      flags |= gen7_cs_stall_every_four_pipe_controls(brw, flags);

      BEGIN_BATCH(5);
      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2));
      OUT_BATCH(flags);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(4);
      OUT_BATCH(_3DSTATE_PIPE_CONTROL | flags | (4 - 2));
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   }
}

/**
 * Emit a PIPE_CONTROL that writes to a buffer object.
 *
 * \p flags should contain one of the following items:
 *  - PIPE_CONTROL_WRITE_IMMEDIATE
 *  - PIPE_CONTROL_WRITE_TIMESTAMP
 *  - PIPE_CONTROL_WRITE_DEPTH_COUNT
 */
void
brw_emit_pipe_control_write(struct brw_context *brw, uint32_t flags,
                            drm_intel_bo *bo, uint32_t offset,
                            uint32_t imm_lower, uint32_t imm_upper)
{
   if (brw->gen >= 8) {
      gen8_add_cs_stall_workaround_bits(&flags);

      BEGIN_BATCH(6);
      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2));
      OUT_BATCH(flags);
      OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  offset);
      OUT_BATCH(imm_lower);
      OUT_BATCH(imm_upper);
      ADVANCE_BATCH();
   } else if (brw->gen >= 6) {
      flags |= gen7_cs_stall_every_four_pipe_controls(brw, flags);

      /* PPGTT/GGTT is selected by DW2 bit 2 on Sandybridge, but DW1 bit 24
       * on later platforms.  We always use PPGTT on Gen7+.
       */
      unsigned gen6_gtt = brw->gen == 6 ? PIPE_CONTROL_GLOBAL_GTT_WRITE : 0;

      BEGIN_BATCH(5);
      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2));
      OUT_BATCH(flags);
      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                gen6_gtt | offset);
      OUT_BATCH(imm_lower);
      OUT_BATCH(imm_upper);
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(4);
      OUT_BATCH(_3DSTATE_PIPE_CONTROL | flags | (4 - 2));
      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                PIPE_CONTROL_GLOBAL_GTT_WRITE | offset);
      OUT_BATCH(imm_lower);
      OUT_BATCH(imm_upper);
      ADVANCE_BATCH();
   }
}

/**
 * Restriction [DevSNB, DevIVB]:
 *
 * Prior to changing Depth/Stencil Buffer state (i.e. any combination of
 * 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS, 3DSTATE_STENCIL_BUFFER,
 * 3DSTATE_HIER_DEPTH_BUFFER) SW must first issue a pipelined depth stall
 * (PIPE_CONTROL with Depth Stall bit set), followed by a pipelined depth
 * cache flush (PIPE_CONTROL with Depth Flush Bit set), followed by
 * another pipelined depth stall (PIPE_CONTROL with Depth Stall bit set),
 * unless SW can otherwise guarantee that the pipeline from WM onwards is
 * already flushed (e.g., via a preceding MI_FLUSH).
 */
void
intel_emit_depth_stall_flushes(struct brw_context *brw)
{
   assert(brw->gen >= 6 && brw->gen <= 9);

   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_STALL);
   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_CACHE_FLUSH);
   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_STALL);
}

/**
 * From the Ivybridge PRM, Volume 2 Part 1, Section 3.2 (VS Stage Input):
 * "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth
 *  stall needs to be sent just prior to any 3DSTATE_VS, 3DSTATE_URB_VS,
 *  3DSTATE_CONSTANT_VS, 3DSTATE_BINDING_TABLE_POINTER_VS,
 *  3DSTATE_SAMPLER_STATE_POINTER_VS command.  Only one PIPE_CONTROL needs
 *  to be sent before any combination of VS associated 3DSTATE."
 */
void
gen7_emit_vs_workaround_flush(struct brw_context *brw)
{
   assert(brw->gen == 7);
   brw_emit_pipe_control_write(brw,
                               PIPE_CONTROL_WRITE_IMMEDIATE
                               | PIPE_CONTROL_DEPTH_STALL,
                               brw->batch.workaround_bo, 0,
                               0, 0);
}


/**
 * Emit a PIPE_CONTROL command for gen7 with the CS Stall bit set.
 */
void
gen7_emit_cs_stall_flush(struct brw_context *brw)
{
   brw_emit_pipe_control_write(brw,
                               PIPE_CONTROL_CS_STALL
                               | PIPE_CONTROL_WRITE_IMMEDIATE,
                               brw->batch.workaround_bo, 0,
                               0, 0);
}


/**
 * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
 * implementing two workarounds on gen6.  From section 1.4.7.1
 * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
 *
 *   [DevSNB-C+{W/A}] Before any depth stall flush (including those
 *   produced by non-pipelined state commands), software needs to first
 *   send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
 *   0.
 *
 *   [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
 *   =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
 *
 * And the workaround for these two requires this workaround first:
 *
 *   [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
 *   BEFORE the pipe-control with a post-sync op and no write-cache
 *   flushes.
 *
 * And this last workaround is tricky because of the requirements on
 * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
 * volume 2 part 1:
 *
 *     "1 of the following must also be set:
 *      - Render Target Cache Flush Enable ([12] of DW1)
 *      - Depth Cache Flush Enable ([0] of DW1)
 *      - Stall at Pixel Scoreboard ([1] of DW1)
 *      - Depth Stall ([13] of DW1)
 *      - Post-Sync Operation ([13] of DW1)
 *      - Notify Enable ([8] of DW1)"
 *
 * The cache flushes require the workaround flush that triggered this
 * one, so we can't use it.  Depth stall would trigger the same.
 * Post-sync nonzero is what triggered this second workaround, so we
 * can't use that one either.  Notify enable is IRQs, which aren't
 * really our business.  That leaves only stall at scoreboard.
 */
void
intel_emit_post_sync_nonzero_flush(struct brw_context *brw)
{
   brw_emit_pipe_control_flush(brw,
                               PIPE_CONTROL_CS_STALL |
                               PIPE_CONTROL_STALL_AT_SCOREBOARD);

   brw_emit_pipe_control_write(brw, PIPE_CONTROL_WRITE_IMMEDIATE,
                               brw->batch.workaround_bo, 0, 0, 0);
}

/* Emit a pipelined flush to either flush render and texture cache for
 * reading from a FBO-drawn texture, or flush so that frontbuffer
 * render appears on the screen in DRI1.
 *
 * This is also used for the always_flush_cache driconf debug option.
 */
void
intel_batchbuffer_emit_mi_flush(struct brw_context *brw)
{
   if (brw->batch.ring == BLT_RING && brw->gen >= 6) {
      BEGIN_BATCH_BLT(4);
      OUT_BATCH(MI_FLUSH_DW);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   } else {
      int flags = PIPE_CONTROL_NO_WRITE | PIPE_CONTROL_RENDER_TARGET_FLUSH;
      if (brw->gen >= 6) {
         if (brw->gen == 9) {
            /* Hardware workaround: SKL
             *
             * Emit Pipe Control with all bits set to zero before emitting
             * a Pipe Control with VF Cache Invalidate set.
             */
            brw_emit_pipe_control_flush(brw, 0);
         }

         flags |= PIPE_CONTROL_INSTRUCTION_INVALIDATE |
                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                  PIPE_CONTROL_VF_CACHE_INVALIDATE |
                  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
                  PIPE_CONTROL_CS_STALL;

         if (brw->gen == 6) {
            /* Hardware workaround: SNB B-Spec says:
             *
             *   [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache
             *   Flush Enable =1, a PIPE_CONTROL with any non-zero
             *   post-sync-op is required.
             */
            intel_emit_post_sync_nonzero_flush(brw);
         }
      }
      brw_emit_pipe_control_flush(brw, flags);
   }

   brw_render_cache_set_clear(brw);
}

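/* Load `size` consecutive 32-bit registers from a buffer object.  Each
 * MI_LOAD_REGISTER_MEM loads a single register, so one command is emitted
 * per DWord.
 */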
static void
load_sized_register_mem(struct brw_context *brw,
                        uint32_t reg,
                        drm_intel_bo *bo,
                        uint32_t read_domains, uint32_t write_domain,
                        uint32_t offset,
                        int size)
{
   int i;

   /* MI_LOAD_REGISTER_MEM only exists on Gen7+. */
   assert(brw->gen >= 7);

   if (brw->gen >= 8) {
      BEGIN_BATCH(4 * size);
      for (i = 0; i < size; i++) {
         OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (4 - 2));
         OUT_BATCH(reg + i * 4);
         OUT_RELOC64(bo, read_domains, write_domain, offset + i * 4);
      }
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(3 * size);
      for (i = 0; i < size; i++) {
         OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (3 - 2));
         OUT_BATCH(reg + i * 4);
         OUT_RELOC(bo, read_domains, write_domain, offset + i * 4);
      }
      ADVANCE_BATCH();
   }
}

void
brw_load_register_mem(struct brw_context *brw,
                      uint32_t reg,
                      drm_intel_bo *bo,
                      uint32_t read_domains, uint32_t write_domain,
                      uint32_t offset)
{
   load_sized_register_mem(brw, reg, bo, read_domains, write_domain, offset, 1);
}

void
brw_load_register_mem64(struct brw_context *brw,
                        uint32_t reg,
                        drm_intel_bo *bo,
                        uint32_t read_domains, uint32_t write_domain,
                        uint32_t offset)
{
   load_sized_register_mem(brw, reg, bo, read_domains, write_domain, offset, 2);
}