Kolibri OS Subversion repository, rev. 5564 (committed by serge)
/**************************************************************************
 *
 * Copyright 2006 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

#include "intel_batchbuffer.h"
#include "intel_buffer_objects.h"
#include "intel_reg.h"
#include "intel_bufmgr.h"
#include "intel_buffers.h"
#include "intel_fbo.h"
#include "brw_context.h"

/* libdrm headers: drmCommandNone() and the DRM_I915_GEM_THROTTLE /
 * I915_EXEC_* definitions used below come from these.
 */
#include <xf86drm.h>
#include <i915_drm.h>

static void
intel_batchbuffer_reset(struct brw_context *brw);

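/**
 * One-time per-context batchbuffer setup: allocate the PIPE_CONTROL
 * workaround BO on Gen6+, and on non-LLC platforms a malloc'd CPU shadow
 * of the batch that gets uploaded to the BO at flush time.
 */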
void
intel_batchbuffer_init(struct brw_context *brw)
{
   intel_batchbuffer_reset(brw);

   if (brw->gen >= 6) {
      /* We can't just use brw_state_batch to get a chunk of space for
       * the gen6 workaround because it involves actually writing to
       * the buffer, and the kernel doesn't let us write to the batch.
       */
      brw->batch.workaround_bo = drm_intel_bo_alloc(brw->bufmgr,
                                                    "pipe_control workaround",
                                                    4096, 4096);
   }

   if (!brw->has_llc) {
      brw->batch.cpu_map = malloc(BATCH_SZ);
      brw->batch.map = brw->batch.cpu_map;
   }
}

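/**
 * Drop the old batch BO (keeping a reference to it in last_bo), allocate a
 * fresh one, and reset the bookkeeping (used, reserved space, ring, etc.)
 * for a new batch.
 */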
static void
intel_batchbuffer_reset(struct brw_context *brw)
{
   if (brw->batch.last_bo != NULL) {
      drm_intel_bo_unreference(brw->batch.last_bo);
      brw->batch.last_bo = NULL;
   }
   brw->batch.last_bo = brw->batch.bo;

   brw_render_cache_set_clear(brw);

   brw->batch.bo = drm_intel_bo_alloc(brw->bufmgr, "batchbuffer",
                                      BATCH_SZ, 4096);
   if (brw->has_llc) {
      drm_intel_bo_map(brw->batch.bo, true);
      brw->batch.map = brw->batch.bo->virtual;
   }

   brw->batch.reserved_space = BATCH_RESERVED;
   brw->batch.state_batch_offset = brw->batch.bo->size;
   brw->batch.used = 0;
   brw->batch.needs_sol_reset = false;
   brw->batch.pipe_controls_since_last_cs_stall = 0;

   /* We don't know what ring the new batch will be sent to until we see the
    * first BEGIN_BATCH or BEGIN_BATCH_BLT.  Mark it as unknown.
    */
   brw->batch.ring = UNKNOWN_RING;
}

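/**
 * Snapshot the current amount of batch data and relocations so a subsequent
 * intel_batchbuffer_reset_to_saved() can roll back a partially built command.
 */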
void
intel_batchbuffer_save_state(struct brw_context *brw)
{
   brw->batch.saved.used = brw->batch.used;
   brw->batch.saved.reloc_count =
      drm_intel_gem_bo_get_reloc_count(brw->batch.bo);
}

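/**
 * Roll the batch back to the state captured by intel_batchbuffer_save_state(),
 * discarding any commands and relocations emitted since then.
 */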
void
intel_batchbuffer_reset_to_saved(struct brw_context *brw)
{
   drm_intel_gem_bo_clear_relocs(brw->batch.bo, brw->batch.saved.reloc_count);

   brw->batch.used = brw->batch.saved.used;
   if (brw->batch.used == 0)
      brw->batch.ring = UNKNOWN_RING;
}

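/** Release the batchbuffer BOs and the CPU shadow map at context teardown. */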
void
intel_batchbuffer_free(struct brw_context *brw)
{
   free(brw->batch.cpu_map);
   drm_intel_bo_unreference(brw->batch.last_bo);
   drm_intel_bo_unreference(brw->batch.bo);
   drm_intel_bo_unreference(brw->batch.workaround_bo);
}

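/**
 * Decode and print the batch contents to stderr (INTEL_DEBUG=batch), mapping
 * the BO if possible and falling back to the CPU copy otherwise.
 */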
static void
do_batch_dump(struct brw_context *brw)
{
   struct drm_intel_decode *decode;
   struct intel_batchbuffer *batch = &brw->batch;
   int ret;

   decode = drm_intel_decode_context_alloc(brw->intelScreen->deviceID);
   if (!decode)
      return;

   ret = drm_intel_bo_map(batch->bo, false);
   if (ret == 0) {
      drm_intel_decode_set_batch_pointer(decode,
                                         batch->bo->virtual,
                                         batch->bo->offset64,
                                         batch->used);
   } else {
      fprintf(stderr,
              "WARNING: failed to map batchbuffer (%s), "
              "dumping uploaded data instead.\n", strerror(ret));

      drm_intel_decode_set_batch_pointer(decode,
                                         batch->map,
                                         batch->bo->offset64,
                                         batch->used);
   }

   drm_intel_decode_set_output_file(decode, stderr);
   drm_intel_decode(decode);

   drm_intel_decode_context_free(decode);

   if (ret == 0) {
      drm_intel_bo_unmap(batch->bo);

      brw_debug_batch(brw);
   }
}

void
intel_batchbuffer_emit_render_ring_prelude(struct brw_context *brw)
{
   /* We may need to enable and snapshot OA counters. */
   brw_perf_monitor_new_batch(brw);
}

/**
 * Called when starting a new batch buffer.
 */
static void
brw_new_batch(struct brw_context *brw)
{
   /* Create a new batchbuffer and reset the associated state: */
   drm_intel_gem_bo_clear_relocs(brw->batch.bo, 0);
   intel_batchbuffer_reset(brw);

   /* If the kernel supports hardware contexts, then most hardware state is
    * preserved between batches; we only need to re-emit state that is required
    * to be in every batch.  Otherwise we need to re-emit all the state that
    * would otherwise be stored in the context (which for all intents and
    * purposes means everything).
    */
   if (brw->hw_ctx == NULL)
      brw->ctx.NewDriverState |= BRW_NEW_CONTEXT;

   brw->ctx.NewDriverState |= BRW_NEW_BATCH;

   brw->state_batch_count = 0;

   brw->ib.type = -1;

   /* We need to periodically reap the shader time results, because rollover
    * happens every few seconds.  We also want to see results every once in a
    * while, because many programs won't cleanly destroy our context, so the
    * end-of-run printout may not happen.
    */
   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      brw_collect_and_report_shader_time(brw);

   if (INTEL_DEBUG & DEBUG_PERFMON)
      brw_dump_perf_monitors(brw);
}

/**
 * Called from intel_batchbuffer_flush before emitting MI_BATCH_BUFFER_END and
 * sending it off.
 *
 * This function can emit state (say, to preserve registers that aren't saved
 * between batches).  All of this state MUST fit in the reserved space at the
 * end of the batchbuffer.  If you add more GPU state, increase the reserved
 * space by updating the BATCH_RESERVED macro.
 */
static void
brw_finish_batch(struct brw_context *brw)
{
   /* Capture the closing pipeline statistics register values necessary to
    * support query objects (in the non-hardware context world).
    */
   brw_emit_query_end(brw);

   /* We may also need to snapshot and disable OA counters. */
   if (brw->batch.ring == RENDER_RING)
      brw_perf_monitor_finish_batch(brw);

   /* Mark that the current program cache BO has been used by the GPU.
    * It will be reallocated if we need to put new programs in for the
    * next batch.
    */
   brw->cache.bo_used_by_gpu = true;
}

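/**
 * Throttle after submitting a batch: wait on an earlier frame's batch when
 * swap-throttling is requested, or fall back to the kernel's
 * DRM_I915_GEM_THROTTLE ioctl for plain flush throttling.
 */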
static void
throttle(struct brw_context *brw)
{
   /* Wait for the swapbuffers before the one we just emitted, so we
    * don't get too many swaps outstanding for apps that are GPU-heavy
    * but not CPU-heavy.
    *
    * We're using intelDRI2Flush (called from the loader before
    * swapbuffer) and glFlush (for front buffer rendering) as the
    * indicator that a frame is done and then throttle when we get
    * here as we prepare to render the next frame.  At this point the
    * round trips for swap/copy and getting new buffers are done and
    * we'll spend less time waiting on the GPU.
    *
    * Unfortunately, we don't have a handle to the batch containing
    * the swap, and getting our hands on that doesn't seem worth it,
    * so we just use the first batch we emitted after the last swap.
    */
   if (brw->need_swap_throttle && brw->throttle_batch[0]) {
      if (brw->throttle_batch[1]) {
         if (!brw->disable_throttling)
            drm_intel_bo_wait_rendering(brw->throttle_batch[1]);
         drm_intel_bo_unreference(brw->throttle_batch[1]);
      }
      brw->throttle_batch[1] = brw->throttle_batch[0];
      brw->throttle_batch[0] = NULL;
      brw->need_swap_throttle = false;
      /* Throttling here is more precise than the throttle ioctl, so skip it */
      brw->need_flush_throttle = false;
   }

   if (brw->need_flush_throttle) {
      __DRIscreen *psp = brw->intelScreen->driScrnPriv;
      drmCommandNone(psp->fd, DRM_I915_GEM_THROTTLE);
      brw->need_flush_throttle = false;
   }
}

/* TODO: Push this whole function into bufmgr.
 */
static int
do_flush_locked(struct brw_context *brw)
{
   struct intel_batchbuffer *batch = &brw->batch;
   int ret = 0;

   if (brw->has_llc) {
      drm_intel_bo_unmap(batch->bo);
   } else {
      ret = drm_intel_bo_subdata(batch->bo, 0, 4*batch->used, batch->map);
      if (ret == 0 && batch->state_batch_offset != batch->bo->size) {
         ret = drm_intel_bo_subdata(batch->bo,
                                    batch->state_batch_offset,
                                    batch->bo->size - batch->state_batch_offset,
                                    (char *)batch->map + batch->state_batch_offset);
      }
   }

   if (!brw->intelScreen->no_hw) {
      int flags;

      if (brw->gen >= 6 && batch->ring == BLT_RING) {
         flags = I915_EXEC_BLT;
      } else {
         flags = I915_EXEC_RENDER;
      }
      if (batch->needs_sol_reset)
         flags |= I915_EXEC_GEN7_SOL_RESET;

      if (ret == 0) {
         if (unlikely(INTEL_DEBUG & DEBUG_AUB))
            brw_annotate_aub(brw);

         if (brw->hw_ctx == NULL || batch->ring != RENDER_RING) {
            ret = drm_intel_bo_mrb_exec(batch->bo, 4 * batch->used, NULL, 0, 0,
                                        flags);
         } else {
            ret = drm_intel_gem_bo_context_exec(batch->bo, brw->hw_ctx,
                                                4 * batch->used, flags);
         }
      }

      throttle(brw);
   }

   if (unlikely(INTEL_DEBUG & DEBUG_BATCH))
      do_batch_dump(brw);

   if (ret != 0) {
      fprintf(stderr, "intel_do_flush_locked failed: %s\n", strerror(-ret));
      exit(1);
   }

   return ret;
}

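/**
 * Close out the current batch: emit MI_BATCH_BUFFER_END (padded with MI_NOOP
 * to an even number of DWords), submit it to the kernel via do_flush_locked(),
 * and start a new batch.  The file/line arguments only feed the DEBUG_BATCH
 * flush report.
 */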
int
_intel_batchbuffer_flush(struct brw_context *brw,
                         const char *file, int line)
{
   int ret;

   if (brw->batch.used == 0)
      return 0;

   if (brw->throttle_batch[0] == NULL) {
      brw->throttle_batch[0] = brw->batch.bo;
      drm_intel_bo_reference(brw->throttle_batch[0]);
   }

   if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
      int bytes_for_commands = 4 * brw->batch.used;
      int bytes_for_state = brw->batch.bo->size - brw->batch.state_batch_offset;
      int total_bytes = bytes_for_commands + bytes_for_state;
      fprintf(stderr, "%s:%d: Batchbuffer flush with %4db (pkt) + "
              "%4db (state) = %4db (%0.1f%%)\n", file, line,
              bytes_for_commands, bytes_for_state,
              total_bytes,
              100.0f * total_bytes / BATCH_SZ);
   }

   brw->batch.reserved_space = 0;

   brw_finish_batch(brw);

   /* Mark the end of the buffer. */
   intel_batchbuffer_emit_dword(brw, MI_BATCH_BUFFER_END);
   if (brw->batch.used & 1) {
      /* Round batchbuffer usage to 2 DWORDs. */
      intel_batchbuffer_emit_dword(brw, MI_NOOP);
   }

   intel_upload_finish(brw);

   /* Check that we didn't just wrap our batchbuffer at a bad time. */
   assert(!brw->no_batch_wrap);

   ret = do_flush_locked(brw);

   if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) {
      fprintf(stderr, "waiting for idle\n");
      drm_intel_bo_wait_rendering(brw->batch.bo);
   }

   /* Start a new batch buffer. */
   brw_new_batch(brw);

   return ret;
}


/*  This is the only way buffers get added to the validate list.
 */
bool
intel_batchbuffer_emit_reloc(struct brw_context *brw,
                             drm_intel_bo *buffer,
                             uint32_t read_domains, uint32_t write_domain,
                             uint32_t delta)
{
   int ret;

   ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
                                 buffer, delta,
                                 read_domains, write_domain);
   assert(ret == 0);
   (void)ret;

   /* Using the old buffer offset, write in what the right data would be, in
    * case the buffer doesn't move and we can short-circuit the relocation
    * processing in the kernel
    */
   intel_batchbuffer_emit_dword(brw, buffer->offset64 + delta);

   return true;
}

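/**
 * 64-bit variant of intel_batchbuffer_emit_reloc(): emits the presumed
 * address as two DWords for commands that take full 64-bit graphics
 * addresses (used via OUT_RELOC64 on Gen8+).
 */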
bool
intel_batchbuffer_emit_reloc64(struct brw_context *brw,
                               drm_intel_bo *buffer,
                               uint32_t read_domains, uint32_t write_domain,
                               uint32_t delta)
{
   int ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
                                     buffer, delta,
                                     read_domains, write_domain);
   assert(ret == 0);
   (void) ret;

   /* Using the old buffer offset, write in what the right data would be, in
    * case the buffer doesn't move and we can short-circuit the relocation
    * processing in the kernel
    */
   uint64_t offset = buffer->offset64 + delta;
   intel_batchbuffer_emit_dword(brw, offset);
   intel_batchbuffer_emit_dword(brw, offset >> 32);

   return true;
}


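/**
 * Copy a block of pre-built, DWord-aligned commands into the batch,
 * reserving space (and selecting the ring) first.
 */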
void
intel_batchbuffer_data(struct brw_context *brw,
                       const void *data, GLuint bytes, enum brw_gpu_ring ring)
{
   assert((bytes & 3) == 0);
   intel_batchbuffer_require_space(brw, bytes, ring);
   memcpy(brw->batch.map + brw->batch.used, data, bytes);
   brw->batch.used += bytes >> 2;
}

/**
 * According to the latest documentation, any PIPE_CONTROL with the
 * "Command Streamer Stall" bit set must also have another bit set,
 * with five different options:
 *
 *  - Render Target Cache Flush
 *  - Depth Cache Flush
 *  - Stall at Pixel Scoreboard
 *  - Post-Sync Operation
 *  - Depth Stall
 *
 * I chose "Stall at Pixel Scoreboard" since we've used it effectively
 * in the past, but the choice is fairly arbitrary.
 */
static void
gen8_add_cs_stall_workaround_bits(uint32_t *flags)
{
   uint32_t wa_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH |
                      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                      PIPE_CONTROL_WRITE_IMMEDIATE |
                      PIPE_CONTROL_WRITE_DEPTH_COUNT |
                      PIPE_CONTROL_WRITE_TIMESTAMP |
                      PIPE_CONTROL_STALL_AT_SCOREBOARD |
                      PIPE_CONTROL_DEPTH_STALL;

   /* If we're doing a CS stall, and don't already have one of the
    * workaround bits set, add "Stall at Pixel Scoreboard."
    */
   if ((*flags & PIPE_CONTROL_CS_STALL) != 0 && (*flags & wa_bits) == 0)
      *flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
}

/* Implement the WaCsStallAtEveryFourthPipecontrol workaround on IVB, BYT:
 *
 * "Every 4th PIPE_CONTROL command, not counting the PIPE_CONTROL with
 *  only read-cache-invalidate bit(s) set, must have a CS_STALL bit set."
 *
 * Note that the kernel does CS stalls between batches, so we only need
 * to count them within a batch.
 */
static uint32_t
gen7_cs_stall_every_four_pipe_controls(struct brw_context *brw, uint32_t flags)
{
   if (brw->gen == 7 && !brw->is_haswell) {
      if (flags & PIPE_CONTROL_CS_STALL) {
         /* If we're doing a CS stall, reset the counter and carry on. */
         brw->batch.pipe_controls_since_last_cs_stall = 0;
         return 0;
      }

      /* If this is the fourth pipe control without a CS stall, do one now. */
      if (++brw->batch.pipe_controls_since_last_cs_stall == 4) {
         brw->batch.pipe_controls_since_last_cs_stall = 0;
         return PIPE_CONTROL_CS_STALL;
      }
   }
   return 0;
}

/**
 * Emit a PIPE_CONTROL with various flushing flags.
 *
 * The caller is responsible for deciding what flags are appropriate for the
 * given generation.
 */
void
brw_emit_pipe_control_flush(struct brw_context *brw, uint32_t flags)
{
   if (brw->gen >= 8) {
      gen8_add_cs_stall_workaround_bits(&flags);

      BEGIN_BATCH(6);
      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2));
      OUT_BATCH(flags);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   } else if (brw->gen >= 6) {
      flags |= gen7_cs_stall_every_four_pipe_controls(brw, flags);

      BEGIN_BATCH(5);
      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2));
      OUT_BATCH(flags);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(4);
      OUT_BATCH(_3DSTATE_PIPE_CONTROL | flags | (4 - 2));
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   }
}

/**
 * Emit a PIPE_CONTROL that writes to a buffer object.
 *
 * \p flags should contain one of the following items:
 *  - PIPE_CONTROL_WRITE_IMMEDIATE
 *  - PIPE_CONTROL_WRITE_TIMESTAMP
 *  - PIPE_CONTROL_WRITE_DEPTH_COUNT
 */
void
brw_emit_pipe_control_write(struct brw_context *brw, uint32_t flags,
                            drm_intel_bo *bo, uint32_t offset,
                            uint32_t imm_lower, uint32_t imm_upper)
{
   if (brw->gen >= 8) {
      gen8_add_cs_stall_workaround_bits(&flags);

      BEGIN_BATCH(6);
      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2));
      OUT_BATCH(flags);
      OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  offset);
      OUT_BATCH(imm_lower);
      OUT_BATCH(imm_upper);
      ADVANCE_BATCH();
   } else if (brw->gen >= 6) {
      flags |= gen7_cs_stall_every_four_pipe_controls(brw, flags);

      /* PPGTT/GGTT is selected by DW2 bit 2 on Sandybridge, but DW1 bit 24
       * on later platforms.  We always use PPGTT on Gen7+.
       */
      unsigned gen6_gtt = brw->gen == 6 ? PIPE_CONTROL_GLOBAL_GTT_WRITE : 0;

      BEGIN_BATCH(5);
      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2));
      OUT_BATCH(flags);
      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                gen6_gtt | offset);
      OUT_BATCH(imm_lower);
      OUT_BATCH(imm_upper);
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(4);
      OUT_BATCH(_3DSTATE_PIPE_CONTROL | flags | (4 - 2));
      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                PIPE_CONTROL_GLOBAL_GTT_WRITE | offset);
      OUT_BATCH(imm_lower);
      OUT_BATCH(imm_upper);
      ADVANCE_BATCH();
   }
}

/**
 * Restriction [DevSNB, DevIVB]:
 *
 * Prior to changing Depth/Stencil Buffer state (i.e. any combination of
 * 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS, 3DSTATE_STENCIL_BUFFER,
 * 3DSTATE_HIER_DEPTH_BUFFER) SW must first issue a pipelined depth stall
 * (PIPE_CONTROL with Depth Stall bit set), followed by a pipelined depth
 * cache flush (PIPE_CONTROL with Depth Flush Bit set), followed by
 * another pipelined depth stall (PIPE_CONTROL with Depth Stall bit set),
 * unless SW can otherwise guarantee that the pipeline from WM onwards is
 * already flushed (e.g., via a preceding MI_FLUSH).
 */
void
intel_emit_depth_stall_flushes(struct brw_context *brw)
{
   assert(brw->gen >= 6 && brw->gen <= 9);

   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_STALL);
   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_CACHE_FLUSH);
   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_STALL);
}

/**
 * From the Ivybridge PRM, Volume 2 Part 1, Section 3.2 (VS Stage Input):
 * "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth
 *  stall needs to be sent just prior to any 3DSTATE_VS, 3DSTATE_URB_VS,
 *  3DSTATE_CONSTANT_VS, 3DSTATE_BINDING_TABLE_POINTER_VS,
 *  3DSTATE_SAMPLER_STATE_POINTER_VS command.  Only one PIPE_CONTROL needs
 *  to be sent before any combination of VS associated 3DSTATE."
 */
void
gen7_emit_vs_workaround_flush(struct brw_context *brw)
{
   assert(brw->gen == 7);
   brw_emit_pipe_control_write(brw,
                               PIPE_CONTROL_WRITE_IMMEDIATE
                               | PIPE_CONTROL_DEPTH_STALL,
                               brw->batch.workaround_bo, 0,
                               0, 0);
}


/**
 * Emit a PIPE_CONTROL command for gen7 with the CS Stall bit set.
 */
void
gen7_emit_cs_stall_flush(struct brw_context *brw)
{
   brw_emit_pipe_control_write(brw,
                               PIPE_CONTROL_CS_STALL
                               | PIPE_CONTROL_WRITE_IMMEDIATE,
                               brw->batch.workaround_bo, 0,
                               0, 0);
}


/**
 * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
 * implementing two workarounds on gen6.  From section 1.4.7.1
 * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
 *
 * [DevSNB-C+{W/A}] Before any depth stall flush (including those
 * produced by non-pipelined state commands), software needs to first
 * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
 * 0.
 *
 * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
 * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
 *
 * And the workaround for these two requires this workaround first:
 *
 * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
 * BEFORE the pipe-control with a post-sync op and no write-cache
 * flushes.
 *
 * And this last workaround is tricky because of the requirements on
 * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
 * volume 2 part 1:
 *
 *     "1 of the following must also be set:
 *      - Render Target Cache Flush Enable ([12] of DW1)
 *      - Depth Cache Flush Enable ([0] of DW1)
 *      - Stall at Pixel Scoreboard ([1] of DW1)
 *      - Depth Stall ([13] of DW1)
 *      - Post-Sync Operation ([13] of DW1)
 *      - Notify Enable ([8] of DW1)"
 *
 * The cache flushes require the workaround flush that triggered this
 * one, so we can't use it.  Depth stall would trigger the same.
 * Post-sync nonzero is what triggered this second workaround, so we
 * can't use that one either.  Notify enable is IRQs, which aren't
 * really our business.  That leaves only stall at scoreboard.
 */
void
intel_emit_post_sync_nonzero_flush(struct brw_context *brw)
{
   brw_emit_pipe_control_flush(brw,
                               PIPE_CONTROL_CS_STALL |
                               PIPE_CONTROL_STALL_AT_SCOREBOARD);

   brw_emit_pipe_control_write(brw, PIPE_CONTROL_WRITE_IMMEDIATE,
                               brw->batch.workaround_bo, 0, 0, 0);
}

/* Emit a pipelined flush to either flush render and texture cache for
 * reading from a FBO-drawn texture, or flush so that frontbuffer
 * render appears on the screen in DRI1.
 *
 * This is also used for the always_flush_cache driconf debug option.
 */
void
intel_batchbuffer_emit_mi_flush(struct brw_context *brw)
{
   if (brw->batch.ring == BLT_RING && brw->gen >= 6) {
      BEGIN_BATCH_BLT(4);
      OUT_BATCH(MI_FLUSH_DW);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   } else {
      int flags = PIPE_CONTROL_NO_WRITE | PIPE_CONTROL_RENDER_TARGET_FLUSH;
      if (brw->gen >= 6) {
         if (brw->gen == 9) {
            /* Hardware workaround: SKL
             *
             * Emit Pipe Control with all bits set to zero before emitting
             * a Pipe Control with VF Cache Invalidate set.
             */
            brw_emit_pipe_control_flush(brw, 0);
         }

         flags |= PIPE_CONTROL_INSTRUCTION_INVALIDATE |
                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                  PIPE_CONTROL_VF_CACHE_INVALIDATE |
                  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
                  PIPE_CONTROL_CS_STALL;

         if (brw->gen == 6) {
            /* Hardware workaround: SNB B-Spec says:
             *
             * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache
             * Flush Enable =1, a PIPE_CONTROL with any non-zero
             * post-sync-op is required.
             */
            intel_emit_post_sync_nonzero_flush(brw);
         }
      }
      brw_emit_pipe_control_flush(brw, flags);
   }

   brw_render_cache_set_clear(brw);
}

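/**
 * Emit MI_LOAD_REGISTER_MEM commands loading `size` consecutive 32-bit
 * registers, starting at `reg`, from successive DWords of `bo` at `offset`.
 */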
static void
load_sized_register_mem(struct brw_context *brw,
                        uint32_t reg,
                        drm_intel_bo *bo,
                        uint32_t read_domains, uint32_t write_domain,
                        uint32_t offset,
                        int size)
{
   int i;

   /* MI_LOAD_REGISTER_MEM only exists on Gen7+. */
   assert(brw->gen >= 7);

   if (brw->gen >= 8) {
      BEGIN_BATCH(4 * size);
      for (i = 0; i < size; i++) {
         OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (4 - 2));
         OUT_BATCH(reg + i * 4);
         OUT_RELOC64(bo, read_domains, write_domain, offset + i * 4);
      }
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(3 * size);
      for (i = 0; i < size; i++) {
         OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (3 - 2));
         OUT_BATCH(reg + i * 4);
         OUT_RELOC(bo, read_domains, write_domain, offset + i * 4);
      }
      ADVANCE_BATCH();
   }
}

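/** Load a single 32-bit register from a buffer object. */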
void
brw_load_register_mem(struct brw_context *brw,
                      uint32_t reg,
                      drm_intel_bo *bo,
                      uint32_t read_domains, uint32_t write_domain,
                      uint32_t offset)
{
   load_sized_register_mem(brw, reg, bo, read_domains, write_domain, offset, 1);
}

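/** Load a 64-bit value from a buffer object into a consecutive register pair. */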
void
brw_load_register_mem64(struct brw_context *brw,
                        uint32_t reg,
                        drm_intel_bo *bo,
                        uint32_t read_domains, uint32_t write_domain,
                        uint32_t offset)
{
   load_sized_register_mem(brw, reg, bo, read_domains, write_domain, offset, 2);
}