/**************************************************************************
 *
 * Copyright 2006 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

#include "intel_batchbuffer.h"
#include "intel_buffer_objects.h"
#include "intel_reg.h"
#include "intel_bufmgr.h"
#include "intel_buffers.h"
#include "intel_fbo.h"
#include "brw_context.h"

#include <xf86drm.h>
#include <i915_drm.h>

static void
intel_batchbuffer_reset(struct brw_context *brw);

void
intel_batchbuffer_init(struct brw_context *brw)
{
   intel_batchbuffer_reset(brw);

   if (brw->gen >= 6) {
      /* We can't just use brw_state_batch to get a chunk of space for
       * the gen6 workaround because it involves actually writing to
       * the buffer, and the kernel doesn't let us write to the batch.
       */
      brw->batch.workaround_bo = drm_intel_bo_alloc(brw->bufmgr,
                                                    "pipe_control workaround",
                                                    4096, 4096);
   }

   if (!brw->has_llc) {
      brw->batch.cpu_map = malloc(BATCH_SZ);
      brw->batch.map = brw->batch.cpu_map;
   }
}

static void
intel_batchbuffer_reset(struct brw_context *brw)
{
   if (brw->batch.last_bo != NULL) {
      drm_intel_bo_unreference(brw->batch.last_bo);
      brw->batch.last_bo = NULL;
   }
   brw->batch.last_bo = brw->batch.bo;

   brw_render_cache_set_clear(brw);

   brw->batch.bo = drm_intel_bo_alloc(brw->bufmgr, "batchbuffer",
                                      BATCH_SZ, 4096);
   if (brw->has_llc) {
      drm_intel_bo_map(brw->batch.bo, true);
      brw->batch.map = brw->batch.bo->virtual;
   }

   brw->batch.reserved_space = BATCH_RESERVED;
   brw->batch.state_batch_offset = brw->batch.bo->size;
   brw->batch.used = 0;
   brw->batch.needs_sol_reset = false;
   brw->batch.pipe_controls_since_last_cs_stall = 0;

   /* We don't know what ring the new batch will be sent to until we see the
    * first BEGIN_BATCH or BEGIN_BATCH_BLT.  Mark it as unknown.
    */
   brw->batch.ring = UNKNOWN_RING;
}

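/**
 * Save the current batch write offset and relocation count so that a
 * partially assembled command sequence can be rolled back later with
 * intel_batchbuffer_reset_to_saved().
 */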
void
intel_batchbuffer_save_state(struct brw_context *brw)
{
   brw->batch.saved.used = brw->batch.used;
   brw->batch.saved.reloc_count =
      drm_intel_gem_bo_get_reloc_count(brw->batch.bo);
}

void
intel_batchbuffer_reset_to_saved(struct brw_context *brw)
{
   drm_intel_gem_bo_clear_relocs(brw->batch.bo, brw->batch.saved.reloc_count);

   brw->batch.used = brw->batch.saved.used;
   if (brw->batch.used == 0)
      brw->batch.ring = UNKNOWN_RING;
}

void
intel_batchbuffer_free(struct brw_context *brw)
{
   free(brw->batch.cpu_map);
   drm_intel_bo_unreference(brw->batch.last_bo);
   drm_intel_bo_unreference(brw->batch.bo);
   drm_intel_bo_unreference(brw->batch.workaround_bo);
}

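/**
 * Decode the current batchbuffer and print the disassembled commands to
 * stderr (used when INTEL_DEBUG & DEBUG_BATCH is set).
 */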
static void
do_batch_dump(struct brw_context *brw)
{
   struct drm_intel_decode *decode;
   struct intel_batchbuffer *batch = &brw->batch;
   int ret;

   decode = drm_intel_decode_context_alloc(brw->intelScreen->deviceID);
   if (!decode)
      return;

   ret = drm_intel_bo_map(batch->bo, false);
   if (ret == 0) {
      drm_intel_decode_set_batch_pointer(decode,
                                         batch->bo->virtual,
                                         batch->bo->offset64,
                                         batch->used);
   } else {
      fprintf(stderr,
              "WARNING: failed to map batchbuffer (%s), "
              "dumping uploaded data instead.\n", strerror(ret));

      drm_intel_decode_set_batch_pointer(decode,
                                         batch->map,
                                         batch->bo->offset64,
                                         batch->used);
   }

   drm_intel_decode_set_output_file(decode, stderr);
   drm_intel_decode(decode);

   drm_intel_decode_context_free(decode);

   if (ret == 0) {
      drm_intel_bo_unmap(batch->bo);

      brw_debug_batch(brw);
   }
}

void
intel_batchbuffer_emit_render_ring_prelude(struct brw_context *brw)
{
   /* We may need to enable and snapshot OA counters. */
   brw_perf_monitor_new_batch(brw);
}

/**
 * Called when starting a new batch buffer.
 */
static void
brw_new_batch(struct brw_context *brw)
{
   /* Create a new batchbuffer and reset the associated state: */
   drm_intel_gem_bo_clear_relocs(brw->batch.bo, 0);
   intel_batchbuffer_reset(brw);

   /* If the kernel supports hardware contexts, then most hardware state is
    * preserved between batches; we only need to re-emit state that is required
    * to be in every batch.  Otherwise we need to re-emit all the state that
    * would otherwise be stored in the context (which for all intents and
    * purposes means everything).
    */
   if (brw->hw_ctx == NULL)
      brw->ctx.NewDriverState |= BRW_NEW_CONTEXT;

   brw->ctx.NewDriverState |= BRW_NEW_BATCH;

   brw->state_batch_count = 0;

   brw->ib.type = -1;

   /* We need to periodically reap the shader time results, because rollover
    * happens every few seconds.  We also want to see results every once in a
    * while, because many programs won't cleanly destroy our context, so the
    * end-of-run printout may not happen.
    */
   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      brw_collect_and_report_shader_time(brw);

   if (INTEL_DEBUG & DEBUG_PERFMON)
      brw_dump_perf_monitors(brw);
}

/**
 * Called from intel_batchbuffer_flush before emitting MI_BATCHBUFFER_END and
 * sending it off.
 *
 * This function can emit state (say, to preserve registers that aren't saved
 * between batches).  All of this state MUST fit in the reserved space at the
 * end of the batchbuffer.  If you add more GPU state, increase the reserved
 * space by updating the BATCH_RESERVED macro.
 */
static void
brw_finish_batch(struct brw_context *brw)
{
   /* Capture the closing pipeline statistics register values necessary to
    * support query objects (in the non-hardware context world).
    */
   brw_emit_query_end(brw);

   /* We may also need to snapshot and disable OA counters. */
   if (brw->batch.ring == RENDER_RING)
      brw_perf_monitor_finish_batch(brw);

   /* Mark that the current program cache BO has been used by the GPU.
    * It will be reallocated if we need to put new programs in for the
    * next batch.
    */
   brw->cache.bo_used_by_gpu = true;
}

static void
throttle(struct brw_context *brw)
{
   /* Wait for the swapbuffers before the one we just emitted, so we
    * don't get too many swaps outstanding for apps that are GPU-heavy
    * but not CPU-heavy.
    *
    * We're using intelDRI2Flush (called from the loader before
    * swapbuffer) and glFlush (for front buffer rendering) as the
    * indicator that a frame is done and then throttle when we get
    * here as we prepare to render the next frame.  At this point, the
    * round trips for swap/copy and getting new buffers are done, and
    * we'll spend less time waiting on the GPU.
    *
    * Unfortunately, we don't have a handle to the batch containing
    * the swap, and getting our hands on that doesn't seem worth it,
    * so we just use the first batch we emitted after the last swap.
    */
   if (brw->need_swap_throttle && brw->throttle_batch[0]) {
      if (brw->throttle_batch[1]) {
         if (!brw->disable_throttling)
            drm_intel_bo_wait_rendering(brw->throttle_batch[1]);
         drm_intel_bo_unreference(brw->throttle_batch[1]);
      }
      brw->throttle_batch[1] = brw->throttle_batch[0];
      brw->throttle_batch[0] = NULL;
      brw->need_swap_throttle = false;
      /* Throttling here is more precise than the throttle ioctl, so skip it */
      brw->need_flush_throttle = false;
   }

   if (brw->need_flush_throttle) {
      __DRIscreen *psp = brw->intelScreen->driScrnPriv;
      drmCommandNone(psp->fd, DRM_I915_GEM_THROTTLE);
      brw->need_flush_throttle = false;
   }
}

/* TODO: Push this whole function into bufmgr.
 */
static int
do_flush_locked(struct brw_context *brw)
{
   struct intel_batchbuffer *batch = &brw->batch;
   int ret = 0;

   if (brw->has_llc) {
      drm_intel_bo_unmap(batch->bo);
   } else {
      ret = drm_intel_bo_subdata(batch->bo, 0, 4*batch->used, batch->map);
      if (ret == 0 && batch->state_batch_offset != batch->bo->size) {
         ret = drm_intel_bo_subdata(batch->bo,
                                    batch->state_batch_offset,
                                    batch->bo->size - batch->state_batch_offset,
                                    (char *)batch->map + batch->state_batch_offset);
      }
   }

   if (!brw->intelScreen->no_hw) {
      int flags;

      if (brw->gen >= 6 && batch->ring == BLT_RING) {
         flags = I915_EXEC_BLT;
      } else {
         flags = I915_EXEC_RENDER;
      }
      if (batch->needs_sol_reset)
         flags |= I915_EXEC_GEN7_SOL_RESET;

      if (ret == 0) {
         if (unlikely(INTEL_DEBUG & DEBUG_AUB))
            brw_annotate_aub(brw);

         if (brw->hw_ctx == NULL || batch->ring != RENDER_RING) {
            ret = drm_intel_bo_mrb_exec(batch->bo, 4 * batch->used, NULL, 0, 0,
                                        flags);
         } else {
            ret = drm_intel_gem_bo_context_exec(batch->bo, brw->hw_ctx,
                                                4 * batch->used, flags);
         }
      }

      throttle(brw);
   }

   if (unlikely(INTEL_DEBUG & DEBUG_BATCH))
      do_batch_dump(brw);

   if (ret != 0) {
      fprintf(stderr, "intel_do_flush_locked failed: %s\n", strerror(-ret));
      exit(1);
   }

   return ret;
}

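/**
 * Finish the current batch, submit it to the kernel, and start a new one.
 *
 * The file/line arguments identify the caller and are only used for the
 * DEBUG_BATCH flush statistics printout.
 */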
int
_intel_batchbuffer_flush(struct brw_context *brw,
                         const char *file, int line)
{
   int ret;

   if (brw->batch.used == 0)
      return 0;

   if (brw->throttle_batch[0] == NULL) {
      brw->throttle_batch[0] = brw->batch.bo;
      drm_intel_bo_reference(brw->throttle_batch[0]);
   }

   if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
      int bytes_for_commands = 4 * brw->batch.used;
      int bytes_for_state = brw->batch.bo->size - brw->batch.state_batch_offset;
      int total_bytes = bytes_for_commands + bytes_for_state;
      fprintf(stderr, "%s:%d: Batchbuffer flush with %4db (pkt) + "
              "%4db (state) = %4db (%0.1f%%)\n", file, line,
              bytes_for_commands, bytes_for_state,
              total_bytes,
              100.0f * total_bytes / BATCH_SZ);
   }

   brw->batch.reserved_space = 0;

   brw_finish_batch(brw);

   /* Mark the end of the buffer. */
   intel_batchbuffer_emit_dword(brw, MI_BATCH_BUFFER_END);
   if (brw->batch.used & 1) {
      /* Round batchbuffer usage to 2 DWORDs. */
      intel_batchbuffer_emit_dword(brw, MI_NOOP);
   }

   intel_upload_finish(brw);

   /* Check that we didn't just wrap our batchbuffer at a bad time. */
   assert(!brw->no_batch_wrap);

   ret = do_flush_locked(brw);

   if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) {
      fprintf(stderr, "waiting for idle\n");
      drm_intel_bo_wait_rendering(brw->batch.bo);
   }

   /* Start a new batch buffer. */
   brw_new_batch(brw);

   return ret;
}


/*  This is the only way buffers get added to the validate list.
 */
bool
intel_batchbuffer_emit_reloc(struct brw_context *brw,
                             drm_intel_bo *buffer,
                             uint32_t read_domains, uint32_t write_domain,
                             uint32_t delta)
{
   int ret;

   ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
                                 buffer, delta,
                                 read_domains, write_domain);
   assert(ret == 0);
   (void)ret;

   /* Using the old buffer offset, write in what the right data would be, in
    * case the buffer doesn't move and we can short-circuit the relocation
    * processing in the kernel
    */
   intel_batchbuffer_emit_dword(brw, buffer->offset64 + delta);

   return true;
}

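/**
 * Like intel_batchbuffer_emit_reloc, but writes the presumed 64-bit offset
 * into the batch as two DWORDs (low half, then high half).
 */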
bool
intel_batchbuffer_emit_reloc64(struct brw_context *brw,
                               drm_intel_bo *buffer,
                               uint32_t read_domains, uint32_t write_domain,
                               uint32_t delta)
{
   int ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
                                     buffer, delta,
                                     read_domains, write_domain);
   assert(ret == 0);
   (void) ret;

   /* Using the old buffer offset, write in what the right data would be, in
    * case the buffer doesn't move and we can short-circuit the relocation
    * processing in the kernel
    */
   uint64_t offset = buffer->offset64 + delta;
   intel_batchbuffer_emit_dword(brw, offset);
   intel_batchbuffer_emit_dword(brw, offset >> 32);

   return true;
}


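/**
 * Copy a block of pre-built, DWORD-aligned commands into the batch, after
 * making sure there is enough space for them on the requested ring.
 */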
void
intel_batchbuffer_data(struct brw_context *brw,
                       const void *data, GLuint bytes, enum brw_gpu_ring ring)
{
   assert((bytes & 3) == 0);
   intel_batchbuffer_require_space(brw, bytes, ring);
   memcpy(brw->batch.map + brw->batch.used, data, bytes);
   brw->batch.used += bytes >> 2;
}

/**
 * According to the latest documentation, any PIPE_CONTROL with the
 * "Command Streamer Stall" bit set must also have another bit set,
 * with five different options:
 *
 *  - Render Target Cache Flush
 *  - Depth Cache Flush
 *  - Stall at Pixel Scoreboard
 *  - Post-Sync Operation
 *  - Depth Stall
 *
 * I chose "Stall at Pixel Scoreboard" since we've used it effectively
 * in the past, but the choice is fairly arbitrary.
 */
static void
gen8_add_cs_stall_workaround_bits(uint32_t *flags)
{
   uint32_t wa_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH |
                      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                      PIPE_CONTROL_WRITE_IMMEDIATE |
                      PIPE_CONTROL_WRITE_DEPTH_COUNT |
                      PIPE_CONTROL_WRITE_TIMESTAMP |
                      PIPE_CONTROL_STALL_AT_SCOREBOARD |
                      PIPE_CONTROL_DEPTH_STALL;

   /* If we're doing a CS stall, and don't already have one of the
    * workaround bits set, add "Stall at Pixel Scoreboard."
    */
   if ((*flags & PIPE_CONTROL_CS_STALL) != 0 && (*flags & wa_bits) == 0)
      *flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
}

/* Implement the WaCsStallAtEveryFourthPipecontrol workaround on IVB, BYT:
 *
 * "Every 4th PIPE_CONTROL command, not counting the PIPE_CONTROL with
 *  only read-cache-invalidate bit(s) set, must have a CS_STALL bit set."
 *
 * Note that the kernel does CS stalls between batches, so we only need
 * to count them within a batch.
 */
static uint32_t
gen7_cs_stall_every_four_pipe_controls(struct brw_context *brw, uint32_t flags)
{
   if (brw->gen == 7 && !brw->is_haswell) {
      if (flags & PIPE_CONTROL_CS_STALL) {
         /* If we're doing a CS stall, reset the counter and carry on. */
         brw->batch.pipe_controls_since_last_cs_stall = 0;
         return 0;
      }

      /* If this is the fourth pipe control without a CS stall, do one now. */
      if (++brw->batch.pipe_controls_since_last_cs_stall == 4) {
         brw->batch.pipe_controls_since_last_cs_stall = 0;
         return PIPE_CONTROL_CS_STALL;
      }
   }
   return 0;
}

/**
 * Emit a PIPE_CONTROL with various flushing flags.
 *
 * The caller is responsible for deciding what flags are appropriate for the
 * given generation.
 */
void
brw_emit_pipe_control_flush(struct brw_context *brw, uint32_t flags)
{
   if (brw->gen >= 8) {
      gen8_add_cs_stall_workaround_bits(&flags);

      BEGIN_BATCH(6);
      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2));
      OUT_BATCH(flags);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   } else if (brw->gen >= 6) {
      flags |= gen7_cs_stall_every_four_pipe_controls(brw, flags);

      BEGIN_BATCH(5);
      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2));
      OUT_BATCH(flags);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(4);
      OUT_BATCH(_3DSTATE_PIPE_CONTROL | flags | (4 - 2));
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   }
}

/**
 * Emit a PIPE_CONTROL that writes to a buffer object.
 *
 * \p flags should contain one of the following items:
 *  - PIPE_CONTROL_WRITE_IMMEDIATE
 *  - PIPE_CONTROL_WRITE_TIMESTAMP
 *  - PIPE_CONTROL_WRITE_DEPTH_COUNT
 */
void
brw_emit_pipe_control_write(struct brw_context *brw, uint32_t flags,
                            drm_intel_bo *bo, uint32_t offset,
                            uint32_t imm_lower, uint32_t imm_upper)
{
   if (brw->gen >= 8) {
      gen8_add_cs_stall_workaround_bits(&flags);

      BEGIN_BATCH(6);
      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2));
      OUT_BATCH(flags);
      OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  offset);
      OUT_BATCH(imm_lower);
      OUT_BATCH(imm_upper);
      ADVANCE_BATCH();
   } else if (brw->gen >= 6) {
      flags |= gen7_cs_stall_every_four_pipe_controls(brw, flags);

      /* PPGTT/GGTT is selected by DW2 bit 2 on Sandybridge, but DW1 bit 24
       * on later platforms.  We always use PPGTT on Gen7+.
       */
      unsigned gen6_gtt = brw->gen == 6 ? PIPE_CONTROL_GLOBAL_GTT_WRITE : 0;

      BEGIN_BATCH(5);
      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2));
      OUT_BATCH(flags);
      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                gen6_gtt | offset);
      OUT_BATCH(imm_lower);
      OUT_BATCH(imm_upper);
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(4);
      OUT_BATCH(_3DSTATE_PIPE_CONTROL | flags | (4 - 2));
      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                PIPE_CONTROL_GLOBAL_GTT_WRITE | offset);
      OUT_BATCH(imm_lower);
      OUT_BATCH(imm_upper);
      ADVANCE_BATCH();
   }
}

/**
 * Restriction [DevSNB, DevIVB]:
 *
 * Prior to changing Depth/Stencil Buffer state (i.e. any combination of
 * 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS, 3DSTATE_STENCIL_BUFFER,
 * 3DSTATE_HIER_DEPTH_BUFFER) SW must first issue a pipelined depth stall
 * (PIPE_CONTROL with Depth Stall bit set), followed by a pipelined depth
 * cache flush (PIPE_CONTROL with Depth Flush Bit set), followed by
 * another pipelined depth stall (PIPE_CONTROL with Depth Stall bit set),
 * unless SW can otherwise guarantee that the pipeline from WM onwards is
 * already flushed (e.g., via a preceding MI_FLUSH).
 */
void
intel_emit_depth_stall_flushes(struct brw_context *brw)
{
   assert(brw->gen >= 6 && brw->gen <= 9);

   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_STALL);
   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_CACHE_FLUSH);
   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_STALL);
}

/**
 * From the Ivybridge PRM, Volume 2 Part 1, Section 3.2 (VS Stage Input):
 * "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth
 *  stall needs to be sent just prior to any 3DSTATE_VS, 3DSTATE_URB_VS,
 *  3DSTATE_CONSTANT_VS, 3DSTATE_BINDING_TABLE_POINTER_VS,
 *  3DSTATE_SAMPLER_STATE_POINTER_VS command.  Only one PIPE_CONTROL needs
 *  to be sent before any combination of VS associated 3DSTATE."
 */
void
gen7_emit_vs_workaround_flush(struct brw_context *brw)
{
   assert(brw->gen == 7);
   brw_emit_pipe_control_write(brw,
                               PIPE_CONTROL_WRITE_IMMEDIATE
                               | PIPE_CONTROL_DEPTH_STALL,
                               brw->batch.workaround_bo, 0,
                               0, 0);
}


/**
 * Emit a PIPE_CONTROL command for gen7 with the CS Stall bit set.
 */
void
gen7_emit_cs_stall_flush(struct brw_context *brw)
{
   brw_emit_pipe_control_write(brw,
                               PIPE_CONTROL_CS_STALL
                               | PIPE_CONTROL_WRITE_IMMEDIATE,
                               brw->batch.workaround_bo, 0,
                               0, 0);
}


/**
 * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
 * implementing two workarounds on gen6.  From section 1.4.7.1
 * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
 *
 * [DevSNB-C+{W/A}] Before any depth stall flush (including those
 * produced by non-pipelined state commands), software needs to first
 * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
 * 0.
 *
 * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
 * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
 *
 * And the workaround for these two requires this workaround first:
 *
 * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
 * BEFORE the pipe-control with a post-sync op and no write-cache
 * flushes.
 *
 * And this last workaround is tricky because of the requirements on
 * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
 * volume 2 part 1:
 *
 *     "1 of the following must also be set:
 *      - Render Target Cache Flush Enable ([12] of DW1)
 *      - Depth Cache Flush Enable ([0] of DW1)
 *      - Stall at Pixel Scoreboard ([1] of DW1)
 *      - Depth Stall ([13] of DW1)
 *      - Post-Sync Operation ([13] of DW1)
 *      - Notify Enable ([8] of DW1)"
 *
 * The cache flushes require the workaround flush that triggered this
 * one, so we can't use it.  Depth stall would trigger the same.
 * Post-sync nonzero is what triggered this second workaround, so we
 * can't use that one either.  Notify enable is IRQs, which aren't
 * really our business.  That leaves only stall at scoreboard.
 */
void
intel_emit_post_sync_nonzero_flush(struct brw_context *brw)
{
   brw_emit_pipe_control_flush(brw,
                               PIPE_CONTROL_CS_STALL |
                               PIPE_CONTROL_STALL_AT_SCOREBOARD);

   brw_emit_pipe_control_write(brw, PIPE_CONTROL_WRITE_IMMEDIATE,
                               brw->batch.workaround_bo, 0, 0, 0);
}

/* Emit a pipelined flush to either flush render and texture cache for
 * reading from a FBO-drawn texture, or flush so that frontbuffer
 * render appears on the screen in DRI1.
 *
 * This is also used for the always_flush_cache driconf debug option.
 */
void
intel_batchbuffer_emit_mi_flush(struct brw_context *brw)
{
   if (brw->batch.ring == BLT_RING && brw->gen >= 6) {
      BEGIN_BATCH_BLT(4);
      OUT_BATCH(MI_FLUSH_DW);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   } else {
      int flags = PIPE_CONTROL_NO_WRITE | PIPE_CONTROL_RENDER_TARGET_FLUSH;
      if (brw->gen >= 6) {
         if (brw->gen == 9) {
            /* Hardware workaround: SKL
             *
             * Emit Pipe Control with all bits set to zero before emitting
             * a Pipe Control with VF Cache Invalidate set.
             */
            brw_emit_pipe_control_flush(brw, 0);
         }

         flags |= PIPE_CONTROL_INSTRUCTION_INVALIDATE |
                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                  PIPE_CONTROL_VF_CACHE_INVALIDATE |
                  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
                  PIPE_CONTROL_CS_STALL;

         if (brw->gen == 6) {
            /* Hardware workaround: SNB B-Spec says:
             *
             * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache
             * Flush Enable =1, a PIPE_CONTROL with any non-zero
             * post-sync-op is required.
             */
            intel_emit_post_sync_nonzero_flush(brw);
         }
      }
      brw_emit_pipe_control_flush(brw, flags);
   }

   brw_render_cache_set_clear(brw);
}

static void
load_sized_register_mem(struct brw_context *brw,
                        uint32_t reg,
                        drm_intel_bo *bo,
                        uint32_t read_domains, uint32_t write_domain,
                        uint32_t offset,
                        int size)
{
   int i;

   /* MI_LOAD_REGISTER_MEM only exists on Gen7+. */
   assert(brw->gen >= 7);

   if (brw->gen >= 8) {
      BEGIN_BATCH(4 * size);
      for (i = 0; i < size; i++) {
         OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (4 - 2));
         OUT_BATCH(reg + i * 4);
         OUT_RELOC64(bo, read_domains, write_domain, offset + i * 4);
      }
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(3 * size);
      for (i = 0; i < size; i++) {
         OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (3 - 2));
         OUT_BATCH(reg + i * 4);
         OUT_RELOC(bo, read_domains, write_domain, offset + i * 4);
      }
      ADVANCE_BATCH();
   }
}

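/**
 * Load a 32-bit register from a buffer object with MI_LOAD_REGISTER_MEM.
 */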
void
brw_load_register_mem(struct brw_context *brw,
                      uint32_t reg,
                      drm_intel_bo *bo,
                      uint32_t read_domains, uint32_t write_domain,
                      uint32_t offset)
{
   load_sized_register_mem(brw, reg, bo, read_domains, write_domain, offset, 1);
}

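/**
 * Load a 64-bit register as two consecutive MI_LOAD_REGISTER_MEM commands.
 */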
void
brw_load_register_mem64(struct brw_context *brw,
                        uint32_t reg,
                        drm_intel_bo *bo,
                        uint32_t read_domains, uint32_t write_domain,
                        uint32_t offset)
{
   load_sized_register_mem(brw, reg, bo, read_domains, write_domain, offset, 2);
}