Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Blame | Last modification | View Log | RSS feed

  1. /*
  2.  * Copyright © 2008 Intel Corporation
  3.  *
  4.  * Permission is hereby granted, free of charge, to any person obtaining a
  5.  * copy of this software and associated documentation files (the "Software"),
  6.  * to deal in the Software without restriction, including without limitation
  7.  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  8.  * and/or sell copies of the Software, and to permit persons to whom the
  9.  * Software is furnished to do so, subject to the following conditions:
  10.  *
  11.  * The above copyright notice and this permission notice (including the next
  12.  * paragraph) shall be included in all copies or substantial portions of the
  13.  * Software.
  14.  *
  15.  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16.  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17.  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18.  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19.  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20.  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21.  * IN THE SOFTWARE.
  22.  *
  23.  * Authors:
  24.  *    Eric Anholt <eric@anholt.net>
  25.  *
  26.  */
  27.  
  28. /** @file brw_queryobj.c
  29.  *
  30.  * Support for query objects (GL_ARB_occlusion_query, GL_ARB_timer_query,
  31.  * GL_EXT_transform_feedback, and friends).
  32.  *
  33.  * The hardware provides a PIPE_CONTROL command that can report the number of
  34.  * fragments that passed the depth test, or the hardware timer.  They are
  35.  * appropriately synced with the stage of the pipeline for our extensions'
  36.  * needs.
  37.  */
  38. #include "main/imports.h"
  39.  
  40. #include "brw_context.h"
  41. #include "brw_defines.h"
  42. #include "brw_state.h"
  43. #include "intel_batchbuffer.h"
  44. #include "intel_reg.h"
  45.  
/**
 * Emit PIPE_CONTROLs to write the current GPU timestamp into a buffer.
 *
 * The timestamp lands in the 64-bit slot at byte offset idx *
 * sizeof(uint64_t) within query_bo.
 */
static void
write_timestamp(struct brw_context *brw, drm_intel_bo *query_bo, int idx)
{
   if (brw->gen >= 6) {
      /* Emit workaround flushes: */
      if (brw->gen == 6) {
         /* The timestamp write below is a non-zero post-sync op, which on
          * Gen6 necessitates a CS stall.  CS stalls need stall at scoreboard
          * set.  See the comments for intel_emit_post_sync_nonzero_flush().
          */
         BEGIN_BATCH(4);
         OUT_BATCH(_3DSTATE_PIPE_CONTROL | (4 - 2));
         OUT_BATCH(PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD);
         OUT_BATCH(0);
         OUT_BATCH(0);
         ADVANCE_BATCH();
      }

      /* Gen6+ five-dword form: control flags go in their own dword,
       * followed by the relocated destination address.
       */
      BEGIN_BATCH(5);
      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2));
      OUT_BATCH(PIPE_CONTROL_WRITE_TIMESTAMP);
      OUT_RELOC(query_bo,
                I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                PIPE_CONTROL_GLOBAL_GTT_WRITE |
                idx * sizeof(uint64_t));
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   } else {
      /* Pre-Gen6 four-dword form: the write-timestamp flag is packed into
       * the command dword itself.
       */
      BEGIN_BATCH(4);
      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (4 - 2) |
                PIPE_CONTROL_WRITE_TIMESTAMP);
      OUT_RELOC(query_bo,
                I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                PIPE_CONTROL_GLOBAL_GTT_WRITE |
                idx * sizeof(uint64_t));
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   }
}
  90.  
/**
 * Emit PIPE_CONTROLs to write the PS_DEPTH_COUNT register into a buffer.
 *
 * The value lands in the 64-bit slot at byte offset idx * sizeof(uint64_t)
 * within query_bo.  Only used on Gen4-5; Gen6+ takes other paths.
 */
static void
write_depth_count(struct brw_context *brw, drm_intel_bo *query_bo, int idx)
{
   assert(brw->gen < 6);

   /* DEPTH_STALL ensures all outstanding fragments have updated the
    * counter before it is sampled.
    */
   BEGIN_BATCH(4);
   OUT_BATCH(_3DSTATE_PIPE_CONTROL | (4 - 2) |
             PIPE_CONTROL_DEPTH_STALL | PIPE_CONTROL_WRITE_DEPTH_COUNT);
   /* This object could be mapped cacheable, but we don't have an exposed
    * mechanism to support that.  Since it's going uncached, tell GEM that
    * we're writing to it.  The usual clflush should be all that's required
    * to pick up the results.
    */
   OUT_RELOC(query_bo,
             I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
             PIPE_CONTROL_GLOBAL_GTT_WRITE |
             (idx * sizeof(uint64_t)));
   OUT_BATCH(0);
   OUT_BATCH(0);
   ADVANCE_BATCH();
}
  115.  
/**
 * Wait on the query object's BO and calculate the final result.
 *
 * Flushes any batch still referencing the BO, maps it, folds the recorded
 * values into query->Base.Result according to the query target, and then
 * releases the BO (query->bo is NULL afterwards).
 */
static void
brw_queryobj_get_results(struct gl_context *ctx,
                         struct brw_query_object *query)
{
   struct brw_context *brw = brw_context(ctx);

   int i;
   uint64_t *results;

   assert(brw->gen < 6);

   /* No BO means no data was ever recorded (or it was already gathered). */
   if (query->bo == NULL)
      return;

   /* If the application has requested the query result, but this batch is
    * still contributing to it, flush it now so the results will be present
    * when mapped.
    */
   if (drm_intel_bo_references(brw->batch.bo, query->bo))
      intel_batchbuffer_flush(brw);

   if (unlikely(brw->perf_debug)) {
      if (drm_intel_bo_busy(query->bo)) {
         perf_debug("Stalling on the GPU waiting for a query object.\n");
      }
   }

   /* Read-only map; this blocks until the GPU is done writing the BO. */
   drm_intel_bo_map(query->bo, false);
   results = query->bo->virtual;
   switch (query->Base.Target) {
   case GL_TIME_ELAPSED_EXT:
      /* The query BO contains the starting and ending timestamps.
       * Subtract the two and convert to nanoseconds.
       *
       * Only the high 32 bits of each 64-bit slot are used; note the
       * accumulation (+=) so results survive a BO reallocation.
       */
      query->Base.Result += 1000 * ((results[1] >> 32) - (results[0] >> 32));
      break;

   case GL_TIMESTAMP:
      /* The query BO contains a single timestamp value in results[0].
       * This is an absolute reading, so assignment (=) rather than
       * accumulation is intentional here.
       */
      query->Base.Result = 1000 * (results[0] >> 32);
      break;

   case GL_SAMPLES_PASSED_ARB:
      /* Loop over pairs of values from the BO, which are the PS_DEPTH_COUNT
       * value at the start and end of the batchbuffer.  Subtract them to
       * get the number of fragments which passed the depth test in each
       * individual batch, and add those differences up to get the number
       * of fragments for the entire query.
       *
       * Note that query->Base.Result may already be non-zero.  We may have
       * run out of space in the query's BO and allocated a new one.  If so,
       * this function was already called to accumulate the results so far.
       */
      for (i = 0; i < query->last_index; i++) {
         query->Base.Result += results[i * 2 + 1] - results[i * 2];
      }
      break;

   case GL_ANY_SAMPLES_PASSED:
   case GL_ANY_SAMPLES_PASSED_CONSERVATIVE:
      /* If the starting and ending PS_DEPTH_COUNT from any of the batches
       * differ, then some fragments passed the depth test.
       */
      for (i = 0; i < query->last_index; i++) {
         if (results[i * 2 + 1] != results[i * 2]) {
            query->Base.Result = GL_TRUE;
            break;
         }
      }
      break;

   default:
      assert(!"Unrecognized query target in brw_queryobj_get_results()");
      break;
   }
   drm_intel_bo_unmap(query->bo);

   /* Now that we've processed the data stored in the query's buffer object,
    * we can release it.
    */
   drm_intel_bo_unreference(query->bo);
   query->bo = NULL;
}
  202.  
  203. /**
  204.  * The NewQueryObject() driver hook.
  205.  *
  206.  * Allocates and initializes a new query object.
  207.  */
  208. static struct gl_query_object *
  209. brw_new_query_object(struct gl_context *ctx, GLuint id)
  210. {
  211.    struct brw_query_object *query;
  212.  
  213.    query = calloc(1, sizeof(struct brw_query_object));
  214.  
  215.    query->Base.Id = id;
  216.    query->Base.Result = 0;
  217.    query->Base.Active = false;
  218.    query->Base.Ready = true;
  219.  
  220.    return &query->Base;
  221. }
  222.  
  223. /**
  224.  * The DeleteQuery() driver hook.
  225.  */
  226. static void
  227. brw_delete_query(struct gl_context *ctx, struct gl_query_object *q)
  228. {
  229.    struct brw_query_object *query = (struct brw_query_object *)q;
  230.  
  231.    drm_intel_bo_unreference(query->bo);
  232.    free(query);
  233. }
  234.  
/**
 * Gen4-5 driver hook for glBeginQuery().
 *
 * Initializes driver structures and emits any GPU commands required to begin
 * recording data for the query.
 */
static void
brw_begin_query(struct gl_context *ctx, struct gl_query_object *q)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_query_object *query = (struct brw_query_object *)q;

   assert(brw->gen < 6);

   switch (query->Base.Target) {
   case GL_TIME_ELAPSED_EXT:
      /* For timestamp queries, we record the starting time right away so that
       * we measure the full time between BeginQuery and EndQuery.  There's
       * some debate about whether this is the right thing to do.  Our decision
       * is based on the following text from the ARB_timer_query extension:
       *
       * "(5) Should the extension measure total time elapsed between the full
       *      completion of the BeginQuery and EndQuery commands, or just time
       *      spent in the graphics library?
       *
       *  RESOLVED:  This extension will measure the total time elapsed
       *  between the full completion of these commands.  Future extensions
       *  may implement a query to determine time elapsed at different stages
       *  of the graphics pipeline."
       *
       * We write a starting timestamp now (at index 0).  At EndQuery() time,
       * we'll write a second timestamp (at index 1), and subtract the two to
       * obtain the time elapsed.  Notably, this includes time elapsed while
       * the system was doing other work, such as running other applications.
       */
      drm_intel_bo_unreference(query->bo);
      /* A single page is ample for the two 64-bit timestamp slots used. */
      query->bo = drm_intel_bo_alloc(brw->bufmgr, "timer query", 4096, 4096);
      write_timestamp(brw, query->bo, 0);
      break;

   case GL_ANY_SAMPLES_PASSED:
   case GL_ANY_SAMPLES_PASSED_CONSERVATIVE:
   case GL_SAMPLES_PASSED_ARB:
      /* For occlusion queries, we delay taking an initial sample until the
       * first drawing occurs in this batch.  See the reasoning in the comments
       * for brw_emit_query_begin() below.
       *
       * Since we're starting a new query, we need to be sure to throw away
       * any previous occlusion query results.
       */
      drm_intel_bo_unreference(query->bo);
      query->bo = NULL;
      /* -1 flags "no snapshot pair recorded yet"; ensure_bo_has_space()
       * resets it to 0 when the BO is (re)allocated.
       */
      query->last_index = -1;

      brw->query.obj = query;

      /* Depth statistics on Gen4 require strange workarounds, so we try to
       * avoid them when necessary.  They're required for occlusion queries,
       * so turn them on now.
       */
      brw->stats_wm++;
      brw->state.dirty.brw |= BRW_NEW_STATS_WM;
      break;

   default:
      assert(!"Unrecognized query target in brw_begin_query()");
      break;
   }
}
  304.  
/**
 * Gen4-5 driver hook for glEndQuery().
 *
 * Emits GPU commands to record a final query value, ending any data capturing.
 * However, the final result isn't necessarily available until the GPU processes
 * those commands.  brw_queryobj_get_results() processes the captured data to
 * produce the final result.
 */
static void
brw_end_query(struct gl_context *ctx, struct gl_query_object *q)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_query_object *query = (struct brw_query_object *)q;

   assert(brw->gen < 6);

   switch (query->Base.Target) {
   case GL_TIME_ELAPSED_EXT:
      /* Write the final timestamp. */
      write_timestamp(brw, query->bo, 1);
      break;

   case GL_ANY_SAMPLES_PASSED:
   case GL_ANY_SAMPLES_PASSED_CONSERVATIVE:
   case GL_SAMPLES_PASSED_ARB:

      /* No query->bo means that EndQuery was called after BeginQuery with no
       * intervening drawing. Rather than doing nothing at all here in this
       * case, we emit the query_begin and query_end state to the
       * hardware. This is to guarantee that waiting on the result of this
       * empty state will cause all previous queries to complete at all, as
       * required by the specification:
       *
       *        It must always be true that if any query object
       *        returns a result available of TRUE, all queries of the
       *        same type issued prior to that query must also return
       *        TRUE. [Open GL 4.3 (Core Profile) Section 4.2.1]
       */
      if (!query->bo) {
         /* Allocates the BO and records a starting PS_DEPTH_COUNT. */
         brw_emit_query_begin(brw);
      }

      assert(query->bo);

      brw_emit_query_end(brw);

      /* Undo the bookkeeping done in brw_begin_query(). */
      brw->query.obj = NULL;

      brw->stats_wm--;
      brw->state.dirty.brw |= BRW_NEW_STATS_WM;
      break;

   default:
      assert(!"Unrecognized query target in brw_end_query()");
      break;
   }
}
  362.  
  363. /**
  364.  * The Gen4-5 WaitQuery() driver hook.
  365.  *
  366.  * Wait for a query result to become available and return it.  This is the
  367.  * backing for glGetQueryObjectiv() with the GL_QUERY_RESULT pname.
  368.  */
  369. static void brw_wait_query(struct gl_context *ctx, struct gl_query_object *q)
  370. {
  371.    struct brw_query_object *query = (struct brw_query_object *)q;
  372.  
  373.    assert(brw_context(ctx)->gen < 6);
  374.  
  375.    brw_queryobj_get_results(ctx, query);
  376.    query->Base.Ready = true;
  377. }
  378.  
  379. /**
  380.  * The Gen4-5 CheckQuery() driver hook.
  381.  *
  382.  * Checks whether a query result is ready yet.  If not, flushes.
  383.  * This is the backing for glGetQueryObjectiv()'s QUERY_RESULT_AVAILABLE pname.
  384.  */
  385. static void brw_check_query(struct gl_context *ctx, struct gl_query_object *q)
  386. {
  387.    struct brw_context *brw = brw_context(ctx);
  388.    struct brw_query_object *query = (struct brw_query_object *)q;
  389.  
  390.    assert(brw->gen < 6);
  391.  
  392.    /* From the GL_ARB_occlusion_query spec:
  393.     *
  394.     *     "Instead of allowing for an infinite loop, performing a
  395.     *      QUERY_RESULT_AVAILABLE_ARB will perform a flush if the result is
  396.     *      not ready yet on the first time it is queried.  This ensures that
  397.     *      the async query will return true in finite time.
  398.     */
  399.    if (query->bo && drm_intel_bo_references(brw->batch.bo, query->bo))
  400.       intel_batchbuffer_flush(brw);
  401.  
  402.    if (query->bo == NULL || !drm_intel_bo_busy(query->bo)) {
  403.       brw_queryobj_get_results(ctx, query);
  404.       query->Base.Ready = true;
  405.    }
  406. }
  407.  
  408. /**
  409.  * Ensure there query's BO has enough space to store a new pair of values.
  410.  *
  411.  * If not, gather the existing BO's results and create a new buffer of the
  412.  * same size.
  413.  */
  414. static void
  415. ensure_bo_has_space(struct gl_context *ctx, struct brw_query_object *query)
  416. {
  417.    struct brw_context *brw = brw_context(ctx);
  418.  
  419.    assert(brw->gen < 6);
  420.  
  421.    if (!query->bo || query->last_index * 2 + 1 >= 4096 / sizeof(uint64_t)) {
  422.  
  423.       if (query->bo != NULL) {
  424.          /* The old query BO did not have enough space, so we allocated a new
  425.           * one.  Gather the results so far (adding up the differences) and
  426.           * release the old BO.
  427.           */
  428.          brw_queryobj_get_results(ctx, query);
  429.       }
  430.  
  431.       query->bo = drm_intel_bo_alloc(brw->bufmgr, "query", 4096, 1);
  432.       query->last_index = 0;
  433.    }
  434. }
  435.  
  436. /**
  437.  * Record the PS_DEPTH_COUNT value (for occlusion queries) just before
  438.  * primitive drawing.
  439.  *
  440.  * In a pre-hardware context world, the single PS_DEPTH_COUNT register is
  441.  * shared among all applications using the GPU.  However, our query value
  442.  * needs to only include fragments generated by our application/GL context.
  443.  *
  444.  * To accommodate this, we record PS_DEPTH_COUNT at the start and end of
  445.  * each batchbuffer (technically, the first primitive drawn and flush time).
  446.  * Subtracting each pair of values calculates the change in PS_DEPTH_COUNT
  447.  * caused by a batchbuffer.  Since there is no preemption inside batches,
  448.  * this is guaranteed to only measure the effects of our current application.
  449.  *
  450.  * Adding each of these differences (in case drawing is done over many batches)
  451.  * produces the final expected value.
  452.  *
  453.  * In a world with hardware contexts, PS_DEPTH_COUNT is saved and restored
  454.  * as part of the context state, so this is unnecessary, and skipped.
  455.  */
  456. void
  457. brw_emit_query_begin(struct brw_context *brw)
  458. {
  459.    struct gl_context *ctx = &brw->ctx;
  460.    struct brw_query_object *query = brw->query.obj;
  461.  
  462.    if (brw->hw_ctx)
  463.       return;
  464.  
  465.    /* Skip if we're not doing any queries, or we've already recorded the
  466.     * initial query value for this batchbuffer.
  467.     */
  468.    if (!query || brw->query.begin_emitted)
  469.       return;
  470.  
  471.    ensure_bo_has_space(ctx, query);
  472.  
  473.    write_depth_count(brw, query->bo, query->last_index * 2);
  474.  
  475.    brw->query.begin_emitted = true;
  476. }
  477.  
  478. /**
  479.  * Called at batchbuffer flush to get an ending PS_DEPTH_COUNT
  480.  * (for non-hardware context platforms).
  481.  *
  482.  * See the explanation in brw_emit_query_begin().
  483.  */
  484. void
  485. brw_emit_query_end(struct brw_context *brw)
  486. {
  487.    struct brw_query_object *query = brw->query.obj;
  488.  
  489.    if (brw->hw_ctx)
  490.       return;
  491.  
  492.    if (!brw->query.begin_emitted)
  493.       return;
  494.  
  495.    write_depth_count(brw, query->bo, query->last_index * 2 + 1);
  496.  
  497.    brw->query.begin_emitted = false;
  498.    query->last_index++;
  499. }
  500.  
  501. /**
  502.  * Driver hook for glQueryCounter().
  503.  *
  504.  * This handles GL_TIMESTAMP queries, which perform a pipelined read of the
  505.  * current GPU time.  This is unlike GL_TIME_ELAPSED, which measures the
  506.  * time while the query is active.
  507.  */
  508. static void
  509. brw_query_counter(struct gl_context *ctx, struct gl_query_object *q)
  510. {
  511.    struct brw_context *brw = brw_context(ctx);
  512.    struct brw_query_object *query = (struct brw_query_object *) q;
  513.  
  514.    assert(q->Target == GL_TIMESTAMP);
  515.  
  516.    drm_intel_bo_unreference(query->bo);
  517.    query->bo = drm_intel_bo_alloc(brw->bufmgr, "timestamp query", 4096, 4096);
  518.    write_timestamp(brw, query->bo, 0);
  519. }
  520.  
  521. /**
  522.  * Read the TIMESTAMP register immediately (in a non-pipelined fashion).
  523.  *
  524.  * This is used to implement the GetTimestamp() driver hook.
  525.  */
  526. static uint64_t
  527. brw_get_timestamp(struct gl_context *ctx)
  528. {
  529.    struct brw_context *brw = brw_context(ctx);
  530.    uint64_t result = 0;
  531.  
  532.    drm_intel_reg_read(brw->bufmgr, TIMESTAMP, &result);
  533.  
  534.    /* See logic in brw_queryobj_get_results() */
  535.    result = result >> 32;
  536.    result *= 80;
  537.    result &= (1ull << 36) - 1;
  538.  
  539.    return result;
  540. }
  541.  
  542. /* Initialize query object functions used on all generations. */
  543. void brw_init_common_queryobj_functions(struct dd_function_table *functions)
  544. {
  545.    functions->NewQueryObject = brw_new_query_object;
  546.    functions->DeleteQuery = brw_delete_query;
  547.    functions->QueryCounter = brw_query_counter;
  548.    functions->GetTimestamp = brw_get_timestamp;
  549. }
  550.  
  551. /* Initialize Gen4/5-specific query object functions. */
  552. void gen4_init_queryobj_functions(struct dd_function_table *functions)
  553. {
  554.    functions->BeginQuery = brw_begin_query;
  555.    functions->EndQuery = brw_end_query;
  556.    functions->CheckQuery = brw_check_query;
  557.    functions->WaitQuery = brw_wait_query;
  558. }
  559.