/*
 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Jerome Glisse
 */
#include "r600_hw_context_priv.h"
#include "radeonsi_pm4.h"
#include "radeonsi_pipe.h"
#include "sid.h"
#include "util/u_memory.h"
#include <errno.h>

#define GROUP_FORCE_NEW_BLOCK   0

/* Get backends mask */
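/*
 * Newer kernels report the backend map directly; otherwise a ZPASS_DONE
 * event is written into a scratch buffer and the backends that answered
 * are detected, falling back to setting the lowest num_backends bits.
 */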
void si_get_backend_mask(struct r600_context *ctx)
{
        struct radeon_winsys_cs *cs = ctx->cs;
        struct si_resource *buffer;
        uint32_t *results;
        unsigned num_backends = ctx->screen->info.r600_num_backends;
        unsigned i, mask = 0;

        /* if backend_map query is supported by the kernel */
        if (ctx->screen->info.r600_backend_map_valid) {
                unsigned num_tile_pipes = ctx->screen->info.r600_num_tile_pipes;
                unsigned backend_map = ctx->screen->info.r600_backend_map;
                unsigned item_width = 4, item_mask = 0x7;

                while(num_tile_pipes--) {
                        i = backend_map & item_mask;
                        mask |= (1<<i);
                        backend_map >>= item_width;
                }
                if (mask != 0) {
                        ctx->backend_mask = mask;
                        return;
                }
        }

        /* otherwise, use the fallback path for older kernels */

        /* create buffer for event data */
        buffer = si_resource_create_custom(&ctx->screen->screen,
                                           PIPE_USAGE_STAGING,
                                           ctx->max_db*16);
        if (!buffer)
                goto err;

        /* initialize buffer with zeroes */
        results = ctx->ws->buffer_map(buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
        if (results) {
                uint64_t va = 0;

                memset(results, 0, ctx->max_db * 4 * 4);
                ctx->ws->buffer_unmap(buffer->cs_buf);

                /* emit EVENT_WRITE for ZPASS_DONE */
                va = r600_resource_va(&ctx->screen->screen, (void *)buffer);
                cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
                cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
                cs->buf[cs->cdw++] = va;
                cs->buf[cs->cdw++] = va >> 32;

                cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
                cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, buffer, RADEON_USAGE_WRITE);

                /* analyze results */
                results = ctx->ws->buffer_map(buffer->cs_buf, ctx->cs, PIPE_TRANSFER_READ);
                if (results) {
                        for(i = 0; i < ctx->max_db; i++) {
                                /* at least the highest bit will be set if the backend is used */
                                if (results[i*4 + 1])
                                        mask |= (1<<i);
                        }
                        ctx->ws->buffer_unmap(buffer->cs_buf);
                }
        }

        si_resource_reference(&buffer, NULL);

        if (mask != 0) {
                ctx->backend_mask = mask;
                return;
        }

err:
        /* fall back to the old method - set the num_backends lowest bits to 1 */
        ctx->backend_mask = (~((uint32_t)0))>>(32-num_backends);
        return;
}

/* Reserve space for num_dw more dwords in the CS, flushing it first if the
 * command buffer would overflow. */
void si_need_cs_space(struct r600_context *ctx, unsigned num_dw,
                        boolean count_draw_in)
{
        /* The number of dwords we already used in the CS so far. */
        num_dw += ctx->cs->cdw;

        if (count_draw_in) {
                /* The number of dwords all the dirty states would take. */
                num_dw += ctx->pm4_dirty_cdwords;

                /* The upper-bound of how much a draw command would take. */
                num_dw += SI_MAX_DRAW_CS_DWORDS;
        }

        /* Count in queries_suspend. */
        num_dw += ctx->num_cs_dw_queries_suspend;

        /* Count in streamout_end at the end of CS. */
        num_dw += ctx->num_cs_dw_streamout_end;

        /* Count in render_condition(NULL) at the end of CS. */
        if (ctx->predicate_drawing) {
                num_dw += 3;
        }

        /* Count in framebuffer cache flushes at the end of CS. */
        num_dw += 7; /* one SURFACE_SYNC and CACHE_FLUSH_AND_INV (r6xx-only) */

        /* Save 16 dwords for the fence mechanism. */
        num_dw += 16;

#if R600_TRACE_CS
        if (ctx->screen->trace_bo) {
                num_dw += R600_TRACE_CS_DWORDS;
        }
#endif

        /* Flush if there's not enough space. */
        if (num_dw > RADEON_MAX_CMDBUF_DWORDS) {
                radeonsi_flush(&ctx->context, NULL, RADEON_FLUSH_ASYNC);
        }
}

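/*
 * Flush the framebuffer caches: emit a SURFACE_SYNC covering all eight
 * color buffers and the depth buffer, but only when the context has the
 * R600_CONTEXT_DST_CACHES_DIRTY flag set.
 */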
static void r600_flush_framebuffer(struct r600_context *ctx)
{
        struct si_pm4_state *pm4;

        if (!(ctx->flags & R600_CONTEXT_DST_CACHES_DIRTY))
                return;

        pm4 = si_pm4_alloc_state(ctx);

        if (pm4 == NULL)
                return;

        si_cmd_surface_sync(pm4, S_0085F0_CB0_DEST_BASE_ENA(1) |
                                S_0085F0_CB1_DEST_BASE_ENA(1) |
                                S_0085F0_CB2_DEST_BASE_ENA(1) |
                                S_0085F0_CB3_DEST_BASE_ENA(1) |
                                S_0085F0_CB4_DEST_BASE_ENA(1) |
                                S_0085F0_CB5_DEST_BASE_ENA(1) |
                                S_0085F0_CB6_DEST_BASE_ENA(1) |
                                S_0085F0_CB7_DEST_BASE_ENA(1) |
                                S_0085F0_DB_ACTION_ENA(1) |
                                S_0085F0_DB_DEST_BASE_ENA(1));
        si_pm4_emit(ctx, pm4);
        si_pm4_free_state(ctx, pm4, ~0);

        ctx->flags &= ~R600_CONTEXT_DST_CACHES_DIRTY;
}

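/*
 * Flush the current command stream: suspend active queries, flush the
 * framebuffer caches, emit a PS partial flush, submit the CS, then resume
 * the queries and mark all PM4 state as needing re-emission.
 */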
void si_context_flush(struct r600_context *ctx, unsigned flags)
{
        struct radeon_winsys_cs *cs = ctx->cs;
        bool queries_suspended = false;

#if 0
        bool streamout_suspended = false;
#endif

        if (!cs->cdw)
                return;

        /* suspend queries */
        if (ctx->num_cs_dw_queries_suspend) {
                r600_context_queries_suspend(ctx);
                queries_suspended = true;
        }

#if 0
        if (ctx->num_cs_dw_streamout_end) {
                r600_context_streamout_end(ctx);
                streamout_suspended = true;
        }
#endif

        r600_flush_framebuffer(ctx);

        /* partial flush is needed to avoid lockups on some chips with user fences */
        cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
        cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);

        /* force to keep tiling flags */
        flags |= RADEON_FLUSH_KEEP_TILING_FLAGS;

#if R600_TRACE_CS
        if (ctx->screen->trace_bo) {
                struct r600_screen *rscreen = ctx->screen;
                unsigned i;

                for (i = 0; i < cs->cdw; i++) {
                        fprintf(stderr, "[%4d] [%5d] 0x%08x\n", rscreen->cs_count, i, cs->buf[i]);
                }
                rscreen->cs_count++;
        }
#endif

        /* Flush the CS. */
        ctx->ws->cs_flush(ctx->cs, flags, 0);

#if R600_TRACE_CS
        if (ctx->screen->trace_bo) {
                struct r600_screen *rscreen = ctx->screen;
                unsigned i;

                for (i = 0; i < 10; i++) {
                        usleep(5);
                        if (!ctx->ws->buffer_is_busy(rscreen->trace_bo->buf, RADEON_USAGE_READWRITE)) {
                                break;
                        }
                }
                if (i == 10) {
                        fprintf(stderr, "timeout on cs lockup likely happen at cs %d dw %d\n",
                                rscreen->trace_ptr[1], rscreen->trace_ptr[0]);
                } else {
                        fprintf(stderr, "cs %d executed in %dms\n", rscreen->trace_ptr[1], i * 5);
                }
        }
#endif

        ctx->pm4_dirty_cdwords = 0;
        ctx->flags = 0;

#if 0
        if (streamout_suspended) {
                ctx->streamout_start = TRUE;
                ctx->streamout_append_bitmask = ~0;
        }
#endif

        /* resume queries */
        if (queries_suspended) {
                r600_context_queries_resume(ctx);
        }

        /* Set all valid groups as dirty so they get re-emitted on the
         * next draw command.
         */
        si_pm4_reset_emitted(ctx);
}

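/*
 * Write a 32-bit fence value into fence_bo at the given dword offset using
 * an EVENT_WRITE_EOP packet, preceded by a PS partial flush.
 */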
void si_context_emit_fence(struct r600_context *ctx, struct si_resource *fence_bo, unsigned offset, unsigned value)
{
        struct radeon_winsys_cs *cs = ctx->cs;
        uint64_t va;

        si_need_cs_space(ctx, 10, FALSE);

        va = r600_resource_va(&ctx->screen->screen, (void*)fence_bo);
        va = va + (offset << 2);

        cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
        cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);
        cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
        cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
        cs->buf[cs->cdw++] = va & 0xFFFFFFFFUL;       /* ADDRESS_LO */
        /* DATA_SEL | INT_EN | ADDRESS_HI */
        cs->buf[cs->cdw++] = (1 << 29) | (0 << 24) | ((va >> 32UL) & 0xFF);
        cs->buf[cs->cdw++] = value;                   /* DATA_LO */
        cs->buf[cs->cdw++] = 0;                       /* DATA_HI */
        cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
        cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, fence_bo, RADEON_USAGE_WRITE);
}

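/*
 * Read one start/end counter pair from a mapped query buffer and return the
 * difference. When test_status_bit is set, the pair only counts if the top
 * bit of both counters is set, i.e. the backend actually wrote them.
 */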
static unsigned r600_query_read_result(char *map, unsigned start_index, unsigned end_index,
                                       bool test_status_bit)
{
        uint32_t *current_result = (uint32_t*)map;
        uint64_t start, end;

        start = (uint64_t)current_result[start_index] |
                (uint64_t)current_result[start_index+1] << 32;
        end = (uint64_t)current_result[end_index] |
              (uint64_t)current_result[end_index+1] << 32;

        if (!test_status_bit ||
            ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) {
                return end - start;
        }
        return 0;
}

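/*
 * Fold every result block between results_start and results_end into
 * query->result, mapping the query buffer (without blocking when wait is
 * false) and walking the ring of blocks.
 */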
static boolean r600_query_result(struct r600_context *ctx, struct r600_query *query, boolean wait)
{
        unsigned results_base = query->results_start;
        char *map;

        map = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs,
                                  PIPE_TRANSFER_READ |
                                  (wait ? 0 : PIPE_TRANSFER_DONTBLOCK));
        if (!map)
                return FALSE;

        /* count all results across all data blocks */
        switch (query->type) {
        case PIPE_QUERY_OCCLUSION_COUNTER:
                while (results_base != query->results_end) {
                        query->result.u64 +=
                                r600_query_read_result(map + results_base, 0, 2, true);
                        results_base = (results_base + 16) % query->buffer->b.b.width0;
                }
                break;
        case PIPE_QUERY_OCCLUSION_PREDICATE:
                while (results_base != query->results_end) {
                        query->result.b = query->result.b ||
                                r600_query_read_result(map + results_base, 0, 2, true) != 0;
                        results_base = (results_base + 16) % query->buffer->b.b.width0;
                }
                break;
        case PIPE_QUERY_TIME_ELAPSED:
                while (results_base != query->results_end) {
                        query->result.u64 +=
                                r600_query_read_result(map + results_base, 0, 2, false);
                        results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
                }
                break;
        case PIPE_QUERY_PRIMITIVES_EMITTED:
                /* SAMPLE_STREAMOUTSTATS stores this structure:
                 * {
                 *    u64 NumPrimitivesWritten;
                 *    u64 PrimitiveStorageNeeded;
                 * }
                 * We only need NumPrimitivesWritten here. */
                while (results_base != query->results_end) {
                        query->result.u64 +=
                                r600_query_read_result(map + results_base, 2, 6, true);
                        results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
                }
                break;
        case PIPE_QUERY_PRIMITIVES_GENERATED:
                /* Here we read PrimitiveStorageNeeded. */
                while (results_base != query->results_end) {
                        query->result.u64 +=
                                r600_query_read_result(map + results_base, 0, 4, true);
                        results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
                }
                break;
        case PIPE_QUERY_SO_STATISTICS:
                while (results_base != query->results_end) {
                        query->result.so.num_primitives_written +=
                                r600_query_read_result(map + results_base, 2, 6, true);
                        query->result.so.primitives_storage_needed +=
                                r600_query_read_result(map + results_base, 0, 4, true);
                        results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
                }
                break;
        case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
                while (results_base != query->results_end) {
                        query->result.b = query->result.b ||
                                r600_query_read_result(map + results_base, 2, 6, true) !=
                                r600_query_read_result(map + results_base, 0, 4, true);
                        results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
                }
                break;
        default:
                assert(0);
        }

        query->results_start = query->results_end;
        ctx->ws->buffer_unmap(query->buffer->cs_buf);
        return TRUE;
}

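/*
 * Emit the "begin" half of a query: reserve CS space, collect old results
 * if the ring of result blocks is full, zero the next block (pre-marking
 * unused backends for occlusion queries), and emit the event that writes
 * the start counters for the query type.
 */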
void r600_query_begin(struct r600_context *ctx, struct r600_query *query)
{
        struct radeon_winsys_cs *cs = ctx->cs;
        unsigned new_results_end, i;
        uint32_t *results;
        uint64_t va;

        si_need_cs_space(ctx, query->num_cs_dw * 2, TRUE);

        new_results_end = (query->results_end + query->result_size) % query->buffer->b.b.width0;

        /* collect current results if query buffer is full */
        if (new_results_end == query->results_start) {
                r600_query_result(ctx, query, TRUE);
        }

        switch (query->type) {
        case PIPE_QUERY_OCCLUSION_COUNTER:
        case PIPE_QUERY_OCCLUSION_PREDICATE:
                results = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
                if (results) {
                        results = (uint32_t*)((char*)results + query->results_end);
                        memset(results, 0, query->result_size);

                        /* Set top bits for unused backends */
                        for (i = 0; i < ctx->max_db; i++) {
                                if (!(ctx->backend_mask & (1<<i))) {
                                        results[(i * 4)+1] = 0x80000000;
                                        results[(i * 4)+3] = 0x80000000;
                                }
                        }
                        ctx->ws->buffer_unmap(query->buffer->cs_buf);
                }
                break;
        case PIPE_QUERY_TIME_ELAPSED:
                break;
        case PIPE_QUERY_PRIMITIVES_EMITTED:
        case PIPE_QUERY_PRIMITIVES_GENERATED:
        case PIPE_QUERY_SO_STATISTICS:
        case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
                results = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
                results = (uint32_t*)((char*)results + query->results_end);
                memset(results, 0, query->result_size);
                ctx->ws->buffer_unmap(query->buffer->cs_buf);
                break;
        default:
                assert(0);
        }

        /* emit begin query */
        va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);
        va += query->results_end;

        switch (query->type) {
        case PIPE_QUERY_OCCLUSION_COUNTER:
        case PIPE_QUERY_OCCLUSION_PREDICATE:
                cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
                cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
                cs->buf[cs->cdw++] = va;
                cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
                break;
        case PIPE_QUERY_PRIMITIVES_EMITTED:
        case PIPE_QUERY_PRIMITIVES_GENERATED:
        case PIPE_QUERY_SO_STATISTICS:
        case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
                cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
                cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3);
                cs->buf[cs->cdw++] = query->results_end;
                cs->buf[cs->cdw++] = 0;
                break;
        case PIPE_QUERY_TIME_ELAPSED:
                cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
                cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
                cs->buf[cs->cdw++] = va;
                cs->buf[cs->cdw++] = (3 << 29) | ((va >> 32UL) & 0xFF);
                cs->buf[cs->cdw++] = 0;
                cs->buf[cs->cdw++] = 0;
                break;
        default:
                assert(0);
        }
        cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
        cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer, RADEON_USAGE_WRITE);

        ctx->num_cs_dw_queries_suspend += query->num_cs_dw;
}

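/*
 * Emit the "end" half of a query: write the end counters for the current
 * result block and advance results_end to the next block.
 */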
void r600_query_end(struct r600_context *ctx, struct r600_query *query)
{
        struct radeon_winsys_cs *cs = ctx->cs;
        uint64_t va;

        va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);
        /* emit end query */
        switch (query->type) {
        case PIPE_QUERY_OCCLUSION_COUNTER:
        case PIPE_QUERY_OCCLUSION_PREDICATE:
                va += query->results_end + 8;
                cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
                cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
                cs->buf[cs->cdw++] = va;
                cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
                break;
        case PIPE_QUERY_PRIMITIVES_EMITTED:
        case PIPE_QUERY_PRIMITIVES_GENERATED:
        case PIPE_QUERY_SO_STATISTICS:
        case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
                cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
                cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3);
                cs->buf[cs->cdw++] = query->results_end + query->result_size/2;
                cs->buf[cs->cdw++] = 0;
                break;
        case PIPE_QUERY_TIME_ELAPSED:
                va += query->results_end + query->result_size/2;
                cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
                cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
                cs->buf[cs->cdw++] = va;
                cs->buf[cs->cdw++] = (3 << 29) | ((va >> 32UL) & 0xFF);
                cs->buf[cs->cdw++] = 0;
                cs->buf[cs->cdw++] = 0;
                break;
        default:
                assert(0);
        }
        cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
        cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer, RADEON_USAGE_WRITE);

        query->results_end = (query->results_end + query->result_size) % query->buffer->b.b.width0;
        ctx->num_cs_dw_queries_suspend -= query->num_cs_dw;
}

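/*
 * Program conditional rendering from query results. PREDICATION_OP_CLEAR
 * disables predication; otherwise one SET_PREDICATION packet is emitted per
 * result block, with the CONTINUE bit set on every packet after the first.
 */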
void r600_query_predication(struct r600_context *ctx, struct r600_query *query, int operation,
                            int flag_wait)
{
        struct radeon_winsys_cs *cs = ctx->cs;
        uint64_t va;

        if (operation == PREDICATION_OP_CLEAR) {
                si_need_cs_space(ctx, 3, FALSE);

                cs->buf[cs->cdw++] = PKT3(PKT3_SET_PREDICATION, 1, 0);
                cs->buf[cs->cdw++] = 0;
                cs->buf[cs->cdw++] = PRED_OP(PREDICATION_OP_CLEAR);
        } else {
                unsigned results_base = query->results_start;
                unsigned count;
                uint32_t op;

                /* find count of the query data blocks */
                count = (query->buffer->b.b.width0 + query->results_end - query->results_start) % query->buffer->b.b.width0;
                count /= query->result_size;

                si_need_cs_space(ctx, 5 * count, TRUE);

                op = PRED_OP(operation) | PREDICATION_DRAW_VISIBLE |
                                (flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW);
                va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);

                /* emit predicate packets for all data blocks */
                while (results_base != query->results_end) {
                        cs->buf[cs->cdw++] = PKT3(PKT3_SET_PREDICATION, 1, 0);
                        cs->buf[cs->cdw++] = (va + results_base) & 0xFFFFFFFFUL;
                        cs->buf[cs->cdw++] = op | (((va + results_base) >> 32UL) & 0xFF);
                        cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
                        cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer,
                                                                             RADEON_USAGE_READ);
                        results_base = (results_base + query->result_size) % query->buffer->b.b.width0;

                        /* set CONTINUE bit for all packets except the first */
                        op |= PREDICATION_CONTINUE;
                }
        }
}

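/*
 * Allocate a query object and its result buffer. result_size describes one
 * result block and num_cs_dw the CS space needed to emit the begin or end
 * packets for that query type.
 */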
struct r600_query *r600_context_query_create(struct r600_context *ctx, unsigned query_type)
{
        struct r600_query *query;
        unsigned buffer_size = 4096;

        query = CALLOC_STRUCT(r600_query);
        if (query == NULL)
                return NULL;

        query->type = query_type;

        switch (query_type) {
        case PIPE_QUERY_OCCLUSION_COUNTER:
        case PIPE_QUERY_OCCLUSION_PREDICATE:
                query->result_size = 16 * ctx->max_db;
                query->num_cs_dw = 6;
                break;
        case PIPE_QUERY_TIME_ELAPSED:
                query->result_size = 16;
                query->num_cs_dw = 8;
                break;
        case PIPE_QUERY_PRIMITIVES_EMITTED:
        case PIPE_QUERY_PRIMITIVES_GENERATED:
        case PIPE_QUERY_SO_STATISTICS:
        case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
                /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
                query->result_size = 32;
                query->num_cs_dw = 6;
                break;
        default:
                assert(0);
                FREE(query);
                return NULL;
        }

        /* adjust the buffer size to simplify offset wrapping math */
        buffer_size -= buffer_size % query->result_size;

        /* Queries are normally read by the CPU after
         * being written by the GPU, hence staging is probably a good
         * usage pattern.
         */
        query->buffer = si_resource_create_custom(&ctx->screen->screen,
                                                  PIPE_USAGE_STAGING,
                                                  buffer_size);
        if (!query->buffer) {
                FREE(query);
                return NULL;
        }
        return query;
}

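/* Release the query's result buffer and free the query object. */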
void r600_context_query_destroy(struct r600_context *ctx, struct r600_query *query)
{
        si_resource_reference(&query->buffer, NULL);
        free(query);
}

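/*
 * Fetch the accumulated result of a query into vresult in the form expected
 * for the query type, scaling TIME_ELAPSED ticks by the clock crystal
 * frequency.
 */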
boolean r600_context_query_result(struct r600_context *ctx,
                                struct r600_query *query,
                                boolean wait, void *vresult)
{
        boolean *result_b = (boolean*)vresult;
        uint64_t *result_u64 = (uint64_t*)vresult;
        struct pipe_query_data_so_statistics *result_so =
                (struct pipe_query_data_so_statistics*)vresult;

        if (!r600_query_result(ctx, query, wait))
                return FALSE;

        switch (query->type) {
        case PIPE_QUERY_OCCLUSION_COUNTER:
        case PIPE_QUERY_PRIMITIVES_EMITTED:
        case PIPE_QUERY_PRIMITIVES_GENERATED:
                *result_u64 = query->result.u64;
                break;
        case PIPE_QUERY_OCCLUSION_PREDICATE:
        case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
                *result_b = query->result.b;
                break;
        case PIPE_QUERY_TIME_ELAPSED:
                *result_u64 = (1000000 * query->result.u64) / ctx->screen->info.r600_clock_crystal_freq;
                break;
        case PIPE_QUERY_SO_STATISTICS:
                *result_so = query->result.so;
                break;
        default:
                assert(0);
        }
        return TRUE;
}

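/* Emit "end" for every active query so the CS can be flushed safely. */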
void r600_context_queries_suspend(struct r600_context *ctx)
{
        struct r600_query *query;

        LIST_FOR_EACH_ENTRY(query, &ctx->active_query_list, list) {
                r600_query_end(ctx, query);
        }
        assert(ctx->num_cs_dw_queries_suspend == 0);
}

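/* Re-emit "begin" for every active query after a CS flush. */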
void r600_context_queries_resume(struct r600_context *ctx)
{
        struct r600_query *query;

        assert(ctx->num_cs_dw_queries_suspend == 0);

        LIST_FOR_EACH_ENTRY(query, &ctx->active_query_list, list) {
                r600_query_begin(ctx, query);
        }
}

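/*
 * Program the VGT_STRMOUT_DRAW_OPAQUE registers for a draw whose vertex
 * count comes from a streamout target's filled-size buffer; copying the
 * filled size into the register is currently disabled (#if 0).
 */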
void r600_context_draw_opaque_count(struct r600_context *ctx, struct r600_so_target *t)
{
        struct radeon_winsys_cs *cs = ctx->cs;
        si_need_cs_space(ctx, 14 + 21, TRUE);

        cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
        cs->buf[cs->cdw++] = (R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET - SI_CONTEXT_REG_OFFSET) >> 2;
        cs->buf[cs->cdw++] = 0;

        cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
        cs->buf[cs->cdw++] = (R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE - SI_CONTEXT_REG_OFFSET) >> 2;
        cs->buf[cs->cdw++] = t->stride >> 2;

#if 0
        cs->buf[cs->cdw++] = PKT3(PKT3_COPY_DW, 4, 0);
        cs->buf[cs->cdw++] = COPY_DW_SRC_IS_MEM | COPY_DW_DST_IS_REG;
        cs->buf[cs->cdw++] = 0; /* src address lo */
        cs->buf[cs->cdw++] = 0; /* src address hi */
        cs->buf[cs->cdw++] = R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2; /* dst register */
        cs->buf[cs->cdw++] = 0; /* unused */
#endif

        cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
        cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, t->filled_size, RADEON_USAGE_READ);

}

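/*
 * With CS tracing enabled, write the current CS offset and submission count
 * into the trace buffer so si_context_flush can read them back and narrow a
 * lockup down to a packet.
 */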
#if R600_TRACE_CS
void r600_trace_emit(struct r600_context *rctx)
{
        struct r600_screen *rscreen = rctx->screen;
        struct radeon_winsys_cs *cs = rctx->cs;
        uint64_t va;

        va = r600_resource_va(&rscreen->screen, (void*)rscreen->trace_bo);
        r600_context_bo_reloc(rctx, rscreen->trace_bo, RADEON_USAGE_READWRITE);
        cs->buf[cs->cdw++] = PKT3(PKT3_WRITE_DATA, 4, 0);
        cs->buf[cs->cdw++] = PKT3_WRITE_DATA_DST_SEL(PKT3_WRITE_DATA_DST_SEL_MEM_SYNC) |
                                PKT3_WRITE_DATA_WR_CONFIRM |
                                PKT3_WRITE_DATA_ENGINE_SEL(PKT3_WRITE_DATA_ENGINE_SEL_ME);
        cs->buf[cs->cdw++] = va & 0xFFFFFFFFUL;
        cs->buf[cs->cdw++] = (va >> 32UL) & 0xFFFFFFFFUL;
        cs->buf[cs->cdw++] = cs->cdw;
        cs->buf[cs->cdw++] = rscreen->cs_count;
}
#endif