/*
 * Copyright 2011 Nouveau Project
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Christoph Bumiller
 */

#define NVC0_PUSH_EXPLICIT_SPACE_CHECKING

#include "nvc0/nvc0_context.h"
#include "nv_object.xml.h"
#include "nvc0/nve4_compute.xml.h"
#include "nvc0/nvc0_compute.xml.h"

#define NVC0_QUERY_STATE_READY   0
#define NVC0_QUERY_STATE_ACTIVE  1
#define NVC0_QUERY_STATE_ENDED   2
#define NVC0_QUERY_STATE_FLUSHED 3

struct nvc0_query {
   uint32_t *data;
   uint16_t type;
   uint16_t index;
   int8_t ctr[4];
   uint32_t sequence;
   struct nouveau_bo *bo;
   uint32_t base;
   uint32_t offset; /* base + i * rotate */
   uint8_t state;
   boolean is64bit;
   uint8_t rotate;
   int nesting; /* only used for occlusion queries */
   union {
      struct nouveau_mm_allocation *mm;
      uint64_t value;
   } u;
   struct nouveau_fence *fence;
};
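
/* Notes: for queries with q->rotate set (occlusion), each begin advances
 * q->offset by 32 bytes inside the current buffer so that every begin/end
 * pair gets a fresh slot (see nvc0_query_rotate). Queries that are not
 * 64 bit keep a u32 sequence number in data[0], which nvc0_query_update
 * compares against q->sequence to tell whether the GPU has delivered the
 * result yet.
 */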

#define NVC0_QUERY_ALLOC_SPACE 256

static boolean nvc0_mp_pm_query_begin(struct nvc0_context *,
                                      struct nvc0_query *);
static void nvc0_mp_pm_query_end(struct nvc0_context *, struct nvc0_query *);
static boolean nvc0_mp_pm_query_result(struct nvc0_context *,
                                       struct nvc0_query *, void *, boolean);

static INLINE struct nvc0_query *
nvc0_query(struct pipe_query *pipe)
{
   return (struct nvc0_query *)pipe;
}

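/* (Re)allocate storage for a query. If the old buffer may still be
 * written by commands in flight (state != READY), freeing the
 * suballocation is deferred until the current fence signals instead of
 * happening immediately.
 */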
static boolean
nvc0_query_allocate(struct nvc0_context *nvc0, struct nvc0_query *q, int size)
{
   struct nvc0_screen *screen = nvc0->screen;
   int ret;

   if (q->bo) {
      nouveau_bo_ref(NULL, &q->bo);
      if (q->u.mm) {
         if (q->state == NVC0_QUERY_STATE_READY)
            nouveau_mm_free(q->u.mm);
         else
            nouveau_fence_work(screen->base.fence.current,
                               nouveau_mm_free_work, q->u.mm);
      }
   }
   if (size) {
      q->u.mm = nouveau_mm_allocate(screen->base.mm_GART, size, &q->bo, &q->base);
      if (!q->bo)
         return FALSE;
      q->offset = q->base;

      ret = nouveau_bo_map(q->bo, 0, screen->base.client);
      if (ret) {
         nvc0_query_allocate(nvc0, q, 0);
         return FALSE;
      }
      q->data = (uint32_t *)((uint8_t *)q->bo->map + q->base);
   }
   return TRUE;
}

static void
nvc0_query_destroy(struct pipe_context *pipe, struct pipe_query *pq)
{
   nvc0_query_allocate(nvc0_context(pipe), nvc0_query(pq), 0);
   nouveau_fence_ref(NULL, &nvc0_query(pq)->fence);
   FREE(nvc0_query(pq));
}

static struct pipe_query *
nvc0_query_create(struct pipe_context *pipe, unsigned type, unsigned index)
{
   struct nvc0_context *nvc0 = nvc0_context(pipe);
   struct nvc0_query *q;
   unsigned space = NVC0_QUERY_ALLOC_SPACE;

   q = CALLOC_STRUCT(nvc0_query);
   if (!q)
      return NULL;

   switch (type) {
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
      q->rotate = 32;
      space = NVC0_QUERY_ALLOC_SPACE;
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS:
      q->is64bit = TRUE;
      space = 512;
      break;
   case PIPE_QUERY_SO_STATISTICS:
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      q->is64bit = TRUE;
      space = 64;
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
   case PIPE_QUERY_PRIMITIVES_EMITTED:
      q->is64bit = TRUE;
      q->index = index;
      space = 32;
      break;
   case PIPE_QUERY_TIME_ELAPSED:
   case PIPE_QUERY_TIMESTAMP:
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
   case PIPE_QUERY_GPU_FINISHED:
      space = 32;
      break;
   case NVC0_QUERY_TFB_BUFFER_OFFSET:
      space = 16;
      break;
   default:
#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
      if (type >= NVC0_QUERY_DRV_STAT(0) && type <= NVC0_QUERY_DRV_STAT_LAST) {
         space = 0;
         q->is64bit = TRUE;
         q->index = type - NVC0_QUERY_DRV_STAT(0);
         break;
      } else
#endif
      if (nvc0->screen->base.device->drm_version >= 0x01000101) {
         if (type >= NVE4_PM_QUERY(0) && type <= NVE4_PM_QUERY_LAST) {
            /* for each MP:
             * [00] = WS0.C0
             * [04] = WS0.C1
             * [08] = WS0.C2
             * [0c] = WS0.C3
             * [10] = WS1.C0
             * [14] = WS1.C1
             * [18] = WS1.C2
             * [1c] = WS1.C3
             * [20] = WS2.C0
             * [24] = WS2.C1
             * [28] = WS2.C2
             * [2c] = WS2.C3
             * [30] = WS3.C0
             * [34] = WS3.C1
             * [38] = WS3.C2
             * [3c] = WS3.C3
             * [40] = MP.C4
             * [44] = MP.C5
             * [48] = MP.C6
             * [4c] = MP.C7
             * [50] = WS0.sequence
             * [54] = WS1.sequence
             * [58] = WS2.sequence
             * [5c] = WS3.sequence
             */
            space = (4 * 4 + 4 + 4) * nvc0->screen->mp_count * sizeof(uint32_t);
            break;
         } else
         if (type >= NVC0_PM_QUERY(0) && type <= NVC0_PM_QUERY_LAST) {
            /* for each MP:
             * [00] = MP.C0
             * [04] = MP.C1
             * [08] = MP.C2
             * [0c] = MP.C3
             * [10] = MP.C4
             * [14] = MP.C5
             * [18] = MP.C6
             * [1c] = MP.C7
             * [20] = MP.sequence
             */
            space = (8 + 1) * nvc0->screen->mp_count * sizeof(uint32_t);
            break;
         }
      }
      debug_printf("invalid query type: %u\n", type);
      FREE(q);
      return NULL;
   }
   if (!nvc0_query_allocate(nvc0, q, space)) {
      FREE(q);
      return NULL;
   }

   q->type = type;

   if (q->rotate) {
      /* we advance before query_begin ! */
      q->offset -= q->rotate;
      q->data -= q->rotate / sizeof(*q->data);
   } else
   if (!q->is64bit)
      q->data[0] = 0; /* initialize sequence */

   return (struct pipe_query *)q;
}

static void
nvc0_query_get(struct nouveau_pushbuf *push, struct nvc0_query *q,
               unsigned offset, uint32_t get)
{
   offset += q->offset;

   PUSH_SPACE(push, 5);
   PUSH_REFN (push, q->bo, NOUVEAU_BO_GART | NOUVEAU_BO_WR);
   BEGIN_NVC0(push, NVC0_3D(QUERY_ADDRESS_HIGH), 4);
   PUSH_DATAh(push, q->bo->offset + offset);
   PUSH_DATA (push, q->bo->offset + offset);
   PUSH_DATA (push, q->sequence);
   PUSH_DATA (push, get);
}
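
/* Judging by how results are parsed in nvc0_query_result, a 32-bit GET
 * writes { u32 sequence, u32 value, u64 timestamp } at the given offset,
 * while the 64-bit query GETs are read back as { u64 value, u64 timestamp }
 * pairs.
 */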

static void
nvc0_query_rotate(struct nvc0_context *nvc0, struct nvc0_query *q)
{
   q->offset += q->rotate;
   q->data += q->rotate / sizeof(*q->data);
   if (q->offset - q->base == NVC0_QUERY_ALLOC_SPACE)
      nvc0_query_allocate(nvc0, q, NVC0_QUERY_ALLOC_SPACE);
}

static boolean
nvc0_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
{
   struct nvc0_context *nvc0 = nvc0_context(pipe);
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   struct nvc0_query *q = nvc0_query(pq);
   boolean ret = TRUE;

   /* For occlusion queries we have to change the storage, because a previous
    * query might set the initial render condition to FALSE even *after* we
    * re-initialized it to TRUE.
    */
   if (q->rotate) {
      nvc0_query_rotate(nvc0, q);

      /* XXX: can we do this with the GPU, and sync with respect to a previous
       *  query ?
       */
      q->data[0] = q->sequence; /* initialize sequence */
      q->data[1] = 1; /* initial render condition = TRUE */
      q->data[4] = q->sequence + 1; /* for comparison COND_MODE */
      q->data[5] = 0;
   }
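
   /* Note on the slot layout (u32 indices): the end-of-query GET writes
    * { sequence, count } to data[0..1] and the begin GET (offset 0x10)
    * writes to data[4..5], so the values seeded above keep COND_MODE
    * comparisons meaningful until the GPU overwrites them; the final
    * result is data[1] - data[5] (cf. nvc0_query_result).
    */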
   q->sequence++;

   switch (q->type) {
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
      q->nesting = nvc0->screen->num_occlusion_queries_active++;
      if (q->nesting) {
         nvc0_query_get(push, q, 0x10, 0x0100f002);
      } else {
         PUSH_SPACE(push, 3);
         BEGIN_NVC0(push, NVC0_3D(COUNTER_RESET), 1);
         PUSH_DATA (push, NVC0_3D_COUNTER_RESET_SAMPLECNT);
         IMMED_NVC0(push, NVC0_3D(SAMPLECNT_ENABLE), 1);
      }
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
      nvc0_query_get(push, q, 0x10, 0x09005002 | (q->index << 5));
      break;
   case PIPE_QUERY_PRIMITIVES_EMITTED:
      nvc0_query_get(push, q, 0x10, 0x05805002 | (q->index << 5));
      break;
   case PIPE_QUERY_SO_STATISTICS:
      nvc0_query_get(push, q, 0x20, 0x05805002 | (q->index << 5));
      nvc0_query_get(push, q, 0x30, 0x06805002 | (q->index << 5));
      break;
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      nvc0_query_get(push, q, 0x10, 0x03005002 | (q->index << 5));
      break;
   case PIPE_QUERY_TIME_ELAPSED:
      nvc0_query_get(push, q, 0x10, 0x00005002);
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS:
      nvc0_query_get(push, q, 0xc0 + 0x00, 0x00801002); /* VFETCH, VERTICES */
      nvc0_query_get(push, q, 0xc0 + 0x10, 0x01801002); /* VFETCH, PRIMS */
      nvc0_query_get(push, q, 0xc0 + 0x20, 0x02802002); /* VP, LAUNCHES */
      nvc0_query_get(push, q, 0xc0 + 0x30, 0x03806002); /* GP, LAUNCHES */
      nvc0_query_get(push, q, 0xc0 + 0x40, 0x04806002); /* GP, PRIMS_OUT */
      nvc0_query_get(push, q, 0xc0 + 0x50, 0x07804002); /* RAST, PRIMS_IN */
      nvc0_query_get(push, q, 0xc0 + 0x60, 0x08804002); /* RAST, PRIMS_OUT */
      nvc0_query_get(push, q, 0xc0 + 0x70, 0x0980a002); /* ROP, PIXELS */
      nvc0_query_get(push, q, 0xc0 + 0x80, 0x0d808002); /* TCP, LAUNCHES */
      nvc0_query_get(push, q, 0xc0 + 0x90, 0x0e809002); /* TEP, LAUNCHES */
      break;
   default:
#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
      if (q->type >= NVC0_QUERY_DRV_STAT(0) &&
          q->type <= NVC0_QUERY_DRV_STAT_LAST) {
         if (q->index >= 5)
            q->u.value = nvc0->screen->base.stats.v[q->index];
         else
            q->u.value = 0;
      } else
#endif
      if ((q->type >= NVE4_PM_QUERY(0) && q->type <= NVE4_PM_QUERY_LAST) ||
          (q->type >= NVC0_PM_QUERY(0) && q->type <= NVC0_PM_QUERY_LAST)) {
         ret = nvc0_mp_pm_query_begin(nvc0, q);
      }
      break;
   }
   q->state = NVC0_QUERY_STATE_ACTIVE;
   return ret;
}

static void
nvc0_query_end(struct pipe_context *pipe, struct pipe_query *pq)
{
   struct nvc0_context *nvc0 = nvc0_context(pipe);
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   struct nvc0_query *q = nvc0_query(pq);

   if (q->state != NVC0_QUERY_STATE_ACTIVE) {
      /* some queries don't require 'begin' to be called (e.g. GPU_FINISHED) */
      if (q->rotate)
         nvc0_query_rotate(nvc0, q);
      q->sequence++;
   }
   q->state = NVC0_QUERY_STATE_ENDED;

   switch (q->type) {
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
      nvc0_query_get(push, q, 0, 0x0100f002);
      if (--nvc0->screen->num_occlusion_queries_active == 0) {
         PUSH_SPACE(push, 1);
         IMMED_NVC0(push, NVC0_3D(SAMPLECNT_ENABLE), 0);
      }
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
      nvc0_query_get(push, q, 0, 0x09005002 | (q->index << 5));
      break;
   case PIPE_QUERY_PRIMITIVES_EMITTED:
      nvc0_query_get(push, q, 0, 0x05805002 | (q->index << 5));
      break;
   case PIPE_QUERY_SO_STATISTICS:
      nvc0_query_get(push, q, 0x00, 0x05805002 | (q->index << 5));
      nvc0_query_get(push, q, 0x10, 0x06805002 | (q->index << 5));
      break;
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      /* TODO: How do we sum over all streams for render condition ? */
      /* PRIMS_DROPPED doesn't write sequence, use a ZERO query to sync on */
      nvc0_query_get(push, q, 0x00, 0x03005002 | (q->index << 5));
      nvc0_query_get(push, q, 0x20, 0x00005002);
      break;
   case PIPE_QUERY_TIMESTAMP:
   case PIPE_QUERY_TIME_ELAPSED:
      nvc0_query_get(push, q, 0, 0x00005002);
      break;
   case PIPE_QUERY_GPU_FINISHED:
      nvc0_query_get(push, q, 0, 0x1000f010);
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS:
      nvc0_query_get(push, q, 0x00, 0x00801002); /* VFETCH, VERTICES */
      nvc0_query_get(push, q, 0x10, 0x01801002); /* VFETCH, PRIMS */
      nvc0_query_get(push, q, 0x20, 0x02802002); /* VP, LAUNCHES */
      nvc0_query_get(push, q, 0x30, 0x03806002); /* GP, LAUNCHES */
      nvc0_query_get(push, q, 0x40, 0x04806002); /* GP, PRIMS_OUT */
      nvc0_query_get(push, q, 0x50, 0x07804002); /* RAST, PRIMS_IN */
      nvc0_query_get(push, q, 0x60, 0x08804002); /* RAST, PRIMS_OUT */
      nvc0_query_get(push, q, 0x70, 0x0980a002); /* ROP, PIXELS */
      nvc0_query_get(push, q, 0x80, 0x0d808002); /* TCP, LAUNCHES */
      nvc0_query_get(push, q, 0x90, 0x0e809002); /* TEP, LAUNCHES */
      break;
   case NVC0_QUERY_TFB_BUFFER_OFFSET:
      /* indexed by TFB buffer instead of by vertex stream */
      nvc0_query_get(push, q, 0x00, 0x0d005002 | (q->index << 5));
      break;
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
      /* This query is not issued on GPU because disjoint is forced to FALSE */
      q->state = NVC0_QUERY_STATE_READY;
      break;
   default:
#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
      if (q->type >= NVC0_QUERY_DRV_STAT(0) &&
          q->type <= NVC0_QUERY_DRV_STAT_LAST) {
         q->u.value = nvc0->screen->base.stats.v[q->index] - q->u.value;
         return;
      } else
#endif
      if ((q->type >= NVE4_PM_QUERY(0) && q->type <= NVE4_PM_QUERY_LAST) ||
          (q->type >= NVC0_PM_QUERY(0) && q->type <= NVC0_PM_QUERY_LAST)) {
         nvc0_mp_pm_query_end(nvc0, q);
      }
      break;
   }
   if (q->is64bit)
      nouveau_fence_ref(nvc0->screen->base.fence.current, &q->fence);
}

static INLINE void
nvc0_query_update(struct nouveau_client *cli, struct nvc0_query *q)
{
   if (q->is64bit) {
      if (nouveau_fence_signalled(q->fence))
         q->state = NVC0_QUERY_STATE_READY;
   } else {
      if (q->data[0] == q->sequence)
         q->state = NVC0_QUERY_STATE_READY;
   }
}

static boolean
nvc0_query_result(struct pipe_context *pipe, struct pipe_query *pq,
                  boolean wait, union pipe_query_result *result)
{
   struct nvc0_context *nvc0 = nvc0_context(pipe);
   struct nvc0_query *q = nvc0_query(pq);
   uint64_t *res64 = (uint64_t*)result;
   uint32_t *res32 = (uint32_t*)result;
   boolean *res8 = (boolean*)result;
   uint64_t *data64 = (uint64_t *)q->data;
   unsigned i;

#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
   if (q->type >= NVC0_QUERY_DRV_STAT(0) &&
       q->type <= NVC0_QUERY_DRV_STAT_LAST) {
      res64[0] = q->u.value;
      return TRUE;
   } else
#endif
   if ((q->type >= NVE4_PM_QUERY(0) && q->type <= NVE4_PM_QUERY_LAST) ||
       (q->type >= NVC0_PM_QUERY(0) && q->type <= NVC0_PM_QUERY_LAST)) {
      return nvc0_mp_pm_query_result(nvc0, q, result, wait);
   }

   if (q->state != NVC0_QUERY_STATE_READY)
      nvc0_query_update(nvc0->screen->base.client, q);

   if (q->state != NVC0_QUERY_STATE_READY) {
      if (!wait) {
         if (q->state != NVC0_QUERY_STATE_FLUSHED) {
            q->state = NVC0_QUERY_STATE_FLUSHED;
            /* flush for silly apps that spin on GL_QUERY_RESULT_AVAILABLE */
            PUSH_KICK(nvc0->base.pushbuf);
         }
         return FALSE;
      }
      if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nvc0->screen->base.client))
         return FALSE;
      NOUVEAU_DRV_STAT(&nvc0->screen->base, query_sync_count, 1);
   }
   q->state = NVC0_QUERY_STATE_READY;

   switch (q->type) {
   case PIPE_QUERY_GPU_FINISHED:
      res8[0] = TRUE;
      break;
   case PIPE_QUERY_OCCLUSION_COUNTER: /* u32 sequence, u32 count, u64 time */
      res64[0] = q->data[1] - q->data[5];
      break;
   case PIPE_QUERY_OCCLUSION_PREDICATE:
      res8[0] = q->data[1] != q->data[5];
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED: /* u64 count, u64 time */
   case PIPE_QUERY_PRIMITIVES_EMITTED: /* u64 count, u64 time */
      res64[0] = data64[0] - data64[2];
      break;
   case PIPE_QUERY_SO_STATISTICS:
      res64[0] = data64[0] - data64[4];
      res64[1] = data64[2] - data64[6];
      break;
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      res8[0] = data64[0] != data64[2];
      break;
   case PIPE_QUERY_TIMESTAMP:
      res64[0] = data64[1];
      break;
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
      res64[0] = 1000000000;
      res8[8] = FALSE;
      break;
   case PIPE_QUERY_TIME_ELAPSED:
      res64[0] = data64[1] - data64[3];
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS:
      for (i = 0; i < 10; ++i)
         res64[i] = data64[i * 2] - data64[24 + i * 2];
      break;
   case NVC0_QUERY_TFB_BUFFER_OFFSET:
      res32[0] = q->data[1];
      break;
   default:
      assert(0); /* can't happen, we don't create queries with invalid type */
      return FALSE;
   }

   return TRUE;
}

void
nvc0_query_fifo_wait(struct nouveau_pushbuf *push, struct pipe_query *pq)
{
   struct nvc0_query *q = nvc0_query(pq);
   unsigned offset = q->offset;

   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) offset += 0x20;

   PUSH_SPACE(push, 5);
   PUSH_REFN (push, q->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
   BEGIN_NVC0(push, SUBC_3D(NV84_SUBCHAN_SEMAPHORE_ADDRESS_HIGH), 4);
   PUSH_DATAh(push, q->bo->offset + offset);
   PUSH_DATA (push, q->bo->offset + offset);
   PUSH_DATA (push, q->sequence);
   PUSH_DATA (push, (1 << 12) |
              NV84_SUBCHAN_SEMAPHORE_TRIGGER_ACQUIRE_EQUAL);
}

static void
nvc0_render_condition(struct pipe_context *pipe,
                      struct pipe_query *pq,
                      boolean condition, uint mode)
{
   struct nvc0_context *nvc0 = nvc0_context(pipe);
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   struct nvc0_query *q;
   uint32_t cond;
   boolean wait =
      mode != PIPE_RENDER_COND_NO_WAIT &&
      mode != PIPE_RENDER_COND_BY_REGION_NO_WAIT;

   if (!pq) {
      cond = NVC0_3D_COND_MODE_ALWAYS;
   }
   else {
      q = nvc0_query(pq);
      /* NOTE: comparison of 2 queries only works if both have completed */
      switch (q->type) {
      case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
         cond = condition ? NVC0_3D_COND_MODE_EQUAL :
                            NVC0_3D_COND_MODE_NOT_EQUAL;
         wait = TRUE;
         break;
      case PIPE_QUERY_OCCLUSION_COUNTER:
      case PIPE_QUERY_OCCLUSION_PREDICATE:
         if (likely(!condition)) {
            if (unlikely(q->nesting))
               cond = wait ? NVC0_3D_COND_MODE_NOT_EQUAL :
                             NVC0_3D_COND_MODE_ALWAYS;
            else
               cond = NVC0_3D_COND_MODE_RES_NON_ZERO;
         } else {
            cond = wait ? NVC0_3D_COND_MODE_EQUAL : NVC0_3D_COND_MODE_ALWAYS;
         }
         break;
      default:
         assert(!"render condition query not a predicate");
         cond = NVC0_3D_COND_MODE_ALWAYS;
         break;
      }
   }

   nvc0->cond_query = pq;
   nvc0->cond_cond = condition;
   nvc0->cond_condmode = cond;
   nvc0->cond_mode = mode;

   if (!pq) {
      PUSH_SPACE(push, 1);
      IMMED_NVC0(push, NVC0_3D(COND_MODE), cond);
      return;
   }

   if (wait)
      nvc0_query_fifo_wait(push, pq);

   PUSH_SPACE(push, 7);
   PUSH_REFN (push, q->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
   BEGIN_NVC0(push, NVC0_3D(COND_ADDRESS_HIGH), 3);
   PUSH_DATAh(push, q->bo->offset + q->offset);
   PUSH_DATA (push, q->bo->offset + q->offset);
   PUSH_DATA (push, cond);
   BEGIN_NVC0(push, NVC0_2D(COND_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, q->bo->offset + q->offset);
   PUSH_DATA (push, q->bo->offset + q->offset);
}

void
nvc0_query_pushbuf_submit(struct nouveau_pushbuf *push,
                          struct pipe_query *pq, unsigned result_offset)
{
   struct nvc0_query *q = nvc0_query(pq);

#define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8))
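
   /* The flag above sets bit 23 of the IB entry's second word; presumably
    * it keeps PFIFO from prefetching the pushbuf data, which may only just
    * have been written by the GPU via QUERY_GET.
    */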

   nouveau_pushbuf_space(push, 0, 0, 1);
   nouveau_pushbuf_data(push, q->bo, q->offset + result_offset, 4 |
                        NVC0_IB_ENTRY_1_NO_PREFETCH);
}

void
nvc0_so_target_save_offset(struct pipe_context *pipe,
                           struct pipe_stream_output_target *ptarg,
                           unsigned index, boolean *serialize)
{
   struct nvc0_so_target *targ = nvc0_so_target(ptarg);

   if (*serialize) {
      *serialize = FALSE;
      PUSH_SPACE(nvc0_context(pipe)->base.pushbuf, 1);
      IMMED_NVC0(nvc0_context(pipe)->base.pushbuf, NVC0_3D(SERIALIZE), 0);

      NOUVEAU_DRV_STAT(nouveau_screen(pipe->screen), gpu_serialize_count, 1);
   }

   nvc0_query(targ->pq)->index = index;

   nvc0_query_end(pipe, targ->pq);
}


/* === DRIVER STATISTICS === */

#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS

static const char *nvc0_drv_stat_names[] =
{
   "drv-tex_obj_current_count",
   "drv-tex_obj_current_bytes",
   "drv-buf_obj_current_count",
   "drv-buf_obj_current_bytes_vid",
   "drv-buf_obj_current_bytes_sys",
   "drv-tex_transfers_rd",
   "drv-tex_transfers_wr",
   "drv-tex_copy_count",
   "drv-tex_blit_count",
   "drv-tex_cache_flush_count",
   "drv-buf_transfers_rd",
   "drv-buf_transfers_wr",
   "drv-buf_read_bytes_staging_vid",
   "drv-buf_write_bytes_direct",
   "drv-buf_write_bytes_staging_vid",
   "drv-buf_write_bytes_staging_sys",
   "drv-buf_copy_bytes",
   "drv-buf_non_kernel_fence_sync_count",
   "drv-any_non_kernel_fence_sync_count",
   "drv-query_sync_count",
   "drv-gpu_serialize_count",
   "drv-draw_calls_array",
   "drv-draw_calls_indexed",
   "drv-draw_calls_fallback_count",
   "drv-user_buffer_upload_bytes",
   "drv-constbuf_upload_count",
   "drv-constbuf_upload_bytes",
   "drv-pushbuf_count",
   "drv-resource_validate_count"
};

#endif /* NOUVEAU_ENABLE_DRIVER_STATISTICS */


/* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */

/* Code to read out MP counters: They are accessible via mmio, too, but let's
 * just avoid mapping registers in userspace. We'd have to know which MPs are
 * enabled/present, too, and that information is not presently exposed.
 * We could add a kernel interface for it, but reading the counters like this
 * has the advantage of being async (if get_result isn't called immediately).
 */
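/* Roughly: in the kernel below, lane 0 of each warp reads the MP counters,
 * extracts the MP (and warp scheduler) index from $physid, computes this
 * MP's slot in the output buffer whose address is passed in c0[0x0]/c0[0x4],
 * and stores the counter values followed by the sequence number from
 * c0[0x8] (cf. the input[] setup in nvc0_mp_pm_query_end).
 */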
static const uint64_t nve4_read_mp_pm_counters_code[] =
{
   /* sched 0x20 0x20 0x20 0x20 0x20 0x20 0x20
    * mov b32 $r8 $tidx
    * mov b32 $r12 $physid
    * mov b32 $r0 $pm0
    * mov b32 $r1 $pm1
    * mov b32 $r2 $pm2
    * mov b32 $r3 $pm3
    * mov b32 $r4 $pm4
    * sched 0x20 0x20 0x23 0x04 0x20 0x04 0x2b
    * mov b32 $r5 $pm5
    * mov b32 $r6 $pm6
    * mov b32 $r7 $pm7
    * set $p0 0x1 eq u32 $r8 0x0
    * mov b32 $r10 c0[0x0]
    * ext u32 $r8 $r12 0x414
    * mov b32 $r11 c0[0x4]
    * sched 0x04 0x2e 0x04 0x20 0x20 0x28 0x04
    * ext u32 $r9 $r12 0x208
    * (not $p0) exit
    * set $p1 0x1 eq u32 $r9 0x0
    * mul $r8 u32 $r8 u32 96
    * mul $r12 u32 $r9 u32 16
    * mul $r13 u32 $r9 u32 4
    * add b32 $r9 $r8 $r13
    * sched 0x28 0x04 0x2c 0x04 0x2c 0x04 0x2c
    * add b32 $r8 $r8 $r12
    * mov b32 $r12 $r10
    * add b32 $r10 $c $r10 $r8
    * mov b32 $r13 $r11
    * add b32 $r11 $r11 0x0 $c
    * add b32 $r12 $c $r12 $r9
    * st b128 wt g[$r10d] $r0q
    * sched 0x4 0x2c 0x20 0x04 0x2e 0x00 0x00
    * mov b32 $r0 c0[0x8]
    * add b32 $r13 $r13 0x0 $c
    * $p1 st b128 wt g[$r12d+0x40] $r4q
    * st b32 wt g[$r12d+0x50] $r0
    * exit */
   0x2202020202020207ULL,
   0x2c00000084021c04ULL,
   0x2c0000000c031c04ULL,
   0x2c00000010001c04ULL,
   0x2c00000014005c04ULL,
   0x2c00000018009c04ULL,
   0x2c0000001c00dc04ULL,
   0x2c00000020011c04ULL,
   0x22b0420042320207ULL,
   0x2c00000024015c04ULL,
   0x2c00000028019c04ULL,
   0x2c0000002c01dc04ULL,
   0x190e0000fc81dc03ULL,
   0x2800400000029de4ULL,
   0x7000c01050c21c03ULL,
   0x280040001002dde4ULL,
   0x204282020042e047ULL,
   0x7000c00820c25c03ULL,
   0x80000000000021e7ULL,
   0x190e0000fc93dc03ULL,
   0x1000000180821c02ULL,
   0x1000000040931c02ULL,
   0x1000000010935c02ULL,
   0x4800000034825c03ULL,
   0x22c042c042c04287ULL,
   0x4800000030821c03ULL,
   0x2800000028031de4ULL,
   0x4801000020a29c03ULL,
   0x280000002c035de4ULL,
   0x0800000000b2dc42ULL,
   0x4801000024c31c03ULL,
   0x9400000000a01fc5ULL,
   0x200002e04202c047ULL,
   0x2800400020001de4ULL,
   0x0800000000d35c42ULL,
   0x9400000100c107c5ULL,
   0x9400000140c01f85ULL,
   0x8000000000001de7ULL
};

/* NOTE: intentionally using the same names as NV */
static const char *nve4_pm_query_names[] =
{
   /* MP counters */
   "prof_trigger_00",
   "prof_trigger_01",
   "prof_trigger_02",
   "prof_trigger_03",
   "prof_trigger_04",
   "prof_trigger_05",
   "prof_trigger_06",
   "prof_trigger_07",
   "warps_launched",
   "threads_launched",
   "sm_cta_launched",
   "inst_issued1",
   "inst_issued2",
   "inst_executed",
   "local_load",
   "local_store",
   "shared_load",
   "shared_store",
   "l1_local_load_hit",
   "l1_local_load_miss",
   "l1_local_store_hit",
   "l1_local_store_miss",
   "gld_request",
   "gst_request",
   "l1_global_load_hit",
   "l1_global_load_miss",
   "uncached_global_load_transaction",
   "global_store_transaction",
   "branch",
   "divergent_branch",
   "active_warps",
   "active_cycles",
   "inst_issued",
   "atom_count",
   "gred_count",
   "shared_load_replay",
   "shared_store_replay",
   "local_load_transactions",
   "local_store_transactions",
   "l1_shared_load_transactions",
   "l1_shared_store_transactions",
   "global_ld_mem_divergence_replays",
   "global_st_mem_divergence_replays",
   /* metrics, i.e. functions of the MP counters */
   "metric-ipc",                   /* inst_executed, clock */
   "metric-ipac",                  /* inst_executed, active_cycles */
   "metric-ipec",                  /* inst_executed, (bool)inst_executed */
   "metric-achieved_occupancy",    /* active_warps, active_cycles */
   "metric-sm_efficiency",         /* active_cycles, clock */
   "metric-inst_replay_overhead"   /* inst_issued, inst_executed */
};

/* For simplicity, we will allocate as many group slots as we allocate counter
 * slots. This means that a single counter which wants to source from 2 groups
 * will have to be declared as using 2 counter slots. This shouldn't really be
 * a problem because such queries don't make much sense ... (unless someone is
 * really creative).
 */
struct nvc0_mp_counter_cfg
{
   uint32_t func    : 16; /* mask or 4-bit logic op (depending on mode) */
   uint32_t mode    : 4;  /* LOGOP,B6,LOGOP_B6(_PULSE) */
   uint32_t num_src : 3;  /* number of sources (1 - 6, only for NVC0:NVE4) */
   uint32_t sig_dom : 1;  /* if 0, MP_PM_A (per warp-sched), if 1, MP_PM_B */
   uint32_t sig_sel : 8;  /* signal group */
   uint64_t src_sel;      /* signal selection for up to 6 sources (48 bit) */
};

#define NVC0_COUNTER_OPn_SUM            0
#define NVC0_COUNTER_OPn_OR             1
#define NVC0_COUNTER_OPn_AND            2
#define NVC0_COUNTER_OP2_REL_SUM_MM     3 /* (sum(ctr0) - sum(ctr1)) / sum(ctr0) */
#define NVC0_COUNTER_OP2_DIV_SUM_M0     4 /* sum(ctr0) / ctr1 of MP[0] */
#define NVC0_COUNTER_OP2_AVG_DIV_MM     5 /* avg(ctr0 / ctr1) */
#define NVC0_COUNTER_OP2_AVG_DIV_M0     6 /* avg(ctr0) / ctr1 of MP[0] */

struct nvc0_mp_pm_query_cfg
{
   struct nvc0_mp_counter_cfg ctr[4];
   uint8_t num_counters;
   uint8_t op;
   uint8_t norm[2]; /* normalization num,denom */
};

#define _Q1A(n, f, m, g, s, nu, dn) [NVE4_PM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } }
#define _Q1B(n, f, m, g, s, nu, dn) [NVE4_PM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } }
#define _M2A(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_PM_QUERY_METRIC_##n] = { { \
   { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, s0 }, \
   { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g1, s1 }, \
   {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
#define _M2B(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_PM_QUERY_METRIC_##n] = { { \
   { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g0, s0 }, \
   { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, s1 }, \
   {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
#define _M2AB(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_PM_QUERY_METRIC_##n] = { { \
   { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, s0 }, \
   { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, s1 }, \
   {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }

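/* Example (hand-expanded, for illustration):
 *   _Q1A(PROF_TRIGGER_0, 0x0001, B6, USER, 0x00000000, 1, 1)
 * is equivalent to
 *   [NVE4_PM_QUERY_PROF_TRIGGER_0] = { { { 0x0001,
 *      NVE4_COMPUTE_MP_PM_FUNC_MODE_B6, 0, 0,
 *      NVE4_COMPUTE_MP_PM_A_SIGSEL_USER, 0x00000000 }, {}, {}, {} },
 *      1, NVC0_COUNTER_OPn_SUM, { 1, 1 } },
 * i.e. a single counter slot in signal domain A, summed over all MPs with
 * normalization 1/1.
 */
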
/* NOTES:
 * active_warps: bit 0 alternates between 0 and 1 for an odd number of warps
 * inst_executed etc.: we only count a single warp scheduler
 * metric-ipXc: we simply multiply by 4 to account for the 4 warp schedulers;
 *  this is inaccurate !
 */
static const struct nvc0_mp_pm_query_cfg nve4_mp_pm_queries[] =
{
   _Q1A(PROF_TRIGGER_0, 0x0001, B6, USER, 0x00000000, 1, 1),
   _Q1A(PROF_TRIGGER_1, 0x0001, B6, USER, 0x00000004, 1, 1),
   _Q1A(PROF_TRIGGER_2, 0x0001, B6, USER, 0x00000008, 1, 1),
   _Q1A(PROF_TRIGGER_3, 0x0001, B6, USER, 0x0000000c, 1, 1),
   _Q1A(PROF_TRIGGER_4, 0x0001, B6, USER, 0x00000010, 1, 1),
   _Q1A(PROF_TRIGGER_5, 0x0001, B6, USER, 0x00000014, 1, 1),
   _Q1A(PROF_TRIGGER_6, 0x0001, B6, USER, 0x00000018, 1, 1),
   _Q1A(PROF_TRIGGER_7, 0x0001, B6, USER, 0x0000001c, 1, 1),
   _Q1A(LAUNCHED_WARPS,    0x0001, B6, LAUNCH, 0x00000004, 1, 1),
   _Q1A(LAUNCHED_THREADS,  0x003f, B6, LAUNCH, 0x398a4188, 1, 1),
   _Q1B(LAUNCHED_CTA,      0x0001, B6, WARP, 0x0000001c, 1, 1),
   _Q1A(INST_ISSUED1,  0x0001, B6, ISSUE, 0x00000004, 1, 1),
   _Q1A(INST_ISSUED2,  0x0001, B6, ISSUE, 0x00000008, 1, 1),
   _Q1A(INST_ISSUED,   0x0003, B6, ISSUE, 0x00000104, 1, 1),
   _Q1A(INST_EXECUTED, 0x0003, B6, EXEC,  0x00000398, 1, 1),
   _Q1A(LD_SHARED,   0x0001, B6, LDST, 0x00000000, 1, 1),
   _Q1A(ST_SHARED,   0x0001, B6, LDST, 0x00000004, 1, 1),
   _Q1A(LD_LOCAL,    0x0001, B6, LDST, 0x00000008, 1, 1),
   _Q1A(ST_LOCAL,    0x0001, B6, LDST, 0x0000000c, 1, 1),
   _Q1A(GLD_REQUEST, 0x0001, B6, LDST, 0x00000010, 1, 1),
   _Q1A(GST_REQUEST, 0x0001, B6, LDST, 0x00000014, 1, 1),
   _Q1B(L1_LOCAL_LOAD_HIT,   0x0001, B6, L1, 0x00000000, 1, 1),
   _Q1B(L1_LOCAL_LOAD_MISS,  0x0001, B6, L1, 0x00000004, 1, 1),
   _Q1B(L1_LOCAL_STORE_HIT,  0x0001, B6, L1, 0x00000008, 1, 1),
   _Q1B(L1_LOCAL_STORE_MISS, 0x0001, B6, L1, 0x0000000c, 1, 1),
   _Q1B(L1_GLOBAL_LOAD_HIT,  0x0001, B6, L1, 0x00000010, 1, 1),
   _Q1B(L1_GLOBAL_LOAD_MISS, 0x0001, B6, L1, 0x00000014, 1, 1),
   _Q1B(GLD_TRANSACTIONS_UNCACHED, 0x0001, B6, MEM, 0x00000000, 1, 1),
   _Q1B(GST_TRANSACTIONS,          0x0001, B6, MEM, 0x00000004, 1, 1),
   _Q1A(BRANCH,           0x0001, B6, BRANCH, 0x0000000c, 1, 1),
   _Q1A(BRANCH_DIVERGENT, 0x0001, B6, BRANCH, 0x00000010, 1, 1),
   _Q1B(ACTIVE_WARPS,  0x003f, B6, WARP, 0x31483104, 2, 1),
   _Q1B(ACTIVE_CYCLES, 0x0001, B6, WARP, 0x00000000, 1, 1),
   _Q1A(ATOM_COUNT, 0x0001, B6, BRANCH, 0x00000000, 1, 1),
   _Q1A(GRED_COUNT, 0x0001, B6, BRANCH, 0x00000008, 1, 1),
   _Q1B(LD_SHARED_REPLAY, 0x0001, B6, REPLAY, 0x00000008, 1, 1),
   _Q1B(ST_SHARED_REPLAY, 0x0001, B6, REPLAY, 0x0000000c, 1, 1),
   _Q1B(LD_LOCAL_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000000, 1, 1),
   _Q1B(ST_LOCAL_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000004, 1, 1),
   _Q1B(L1_LD_SHARED_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000008, 1, 1),
   _Q1B(L1_ST_SHARED_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x0000000c, 1, 1),
   _Q1B(GLD_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000010, 1, 1),
   _Q1B(GST_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000014, 1, 1),
   _M2AB(IPC, 0x3, B6, EXEC, 0x398, 0xffff, LOGOP, WARP, 0x0, DIV_SUM_M0, 10, 1),
   _M2AB(IPAC, 0x3, B6, EXEC, 0x398, 0x1, B6, WARP, 0x0, AVG_DIV_MM, 10, 1),
   _M2A(IPEC, 0x3, B6, EXEC, 0x398, 0xe, LOGOP, EXEC, 0x398, AVG_DIV_MM, 10, 1),
   _M2A(INST_REPLAY_OHEAD, 0x3, B6, ISSUE, 0x104, 0x3, B6, EXEC, 0x398, REL_SUM_MM, 100, 1),
   _M2B(MP_OCCUPANCY, 0x3f, B6, WARP, 0x31483104, 0x01, B6, WARP, 0x0, AVG_DIV_MM, 200, 64),
   _M2B(MP_EFFICIENCY, 0x01, B6, WARP, 0x0, 0xffff, LOGOP, WARP, 0x0, AVG_DIV_M0, 100, 1),
};

#undef _Q1A
#undef _Q1B
#undef _M2A
#undef _M2B
#undef _M2AB

/* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */
static const uint64_t nvc0_read_mp_pm_counters_code[] =
{
   /* mov b32 $r8 $tidx
    * mov b32 $r9 $physid
    * mov b32 $r0 $pm0
    * mov b32 $r1 $pm1
    * mov b32 $r2 $pm2
    * mov b32 $r3 $pm3
    * mov b32 $r4 $pm4
    * mov b32 $r5 $pm5
    * mov b32 $r6 $pm6
    * mov b32 $r7 $pm7
    * set $p0 0x1 eq u32 $r8 0x0
    * mov b32 $r10 c0[0x0]
    * mov b32 $r11 c0[0x4]
    * ext u32 $r8 $r9 0x414
    * (not $p0) exit
    * mul $r8 u32 $r8 u32 36
    * add b32 $r10 $c $r10 $r8
    * add b32 $r11 $r11 0x0 $c
    * mov b32 $r8 c0[0x8]
    * st b128 wt g[$r10d+0x00] $r0q
    * st b128 wt g[$r10d+0x10] $r4q
    * st b32 wt g[$r10d+0x20] $r8
    * exit */
   0x2c00000084021c04ULL,
   0x2c0000000c025c04ULL,
   0x2c00000010001c04ULL,
   0x2c00000014005c04ULL,
   0x2c00000018009c04ULL,
   0x2c0000001c00dc04ULL,
   0x2c00000020011c04ULL,
   0x2c00000024015c04ULL,
   0x2c00000028019c04ULL,
   0x2c0000002c01dc04ULL,
   0x190e0000fc81dc03ULL,
   0x2800400000029de4ULL,
   0x280040001002dde4ULL,
   0x7000c01050921c03ULL,
   0x80000000000021e7ULL,
   0x1000000090821c02ULL,
   0x4801000020a29c03ULL,
   0x0800000000b2dc42ULL,
   0x2800400020021de4ULL,
   0x9400000000a01fc5ULL,
   0x9400000040a11fc5ULL,
   0x9400000080a21f85ULL,
   0x8000000000001de7ULL
};

static const char *nvc0_pm_query_names[] =
{
   /* MP counters */
   "inst_executed",
   "branch",
   "divergent_branch",
   "active_warps",
   "active_cycles",
   "warps_launched",
   "threads_launched",
   "shared_load",
   "shared_store",
   "local_load",
   "local_store",
   "gred_count",
   "atom_count",
   "gld_request",
   "gst_request",
   "inst_issued1_0",
   "inst_issued1_1",
   "inst_issued2_0",
   "inst_issued2_1",
   "thread_inst_executed_0",
   "thread_inst_executed_1",
   "thread_inst_executed_2",
   "thread_inst_executed_3",
   "prof_trigger_00",
   "prof_trigger_01",
   "prof_trigger_02",
   "prof_trigger_03",
   "prof_trigger_04",
   "prof_trigger_05",
   "prof_trigger_06",
   "prof_trigger_07",
};

#define _Q(n, f, m, g, c, s0, s1, s2, s3, s4, s5) [NVC0_PM_QUERY_##n] = { { { f, NVC0_COMPUTE_MP_PM_OP_MODE_##m, c, 0, g, s0|(s1 << 8)|(s2 << 16)|(s3 << 24)|(s4##ULL << 32)|(s5##ULL << 40) }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { 1, 1 } }

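/* Example (hand-expanded): in the INST_EXECUTED entry below, the six 8-bit
 * source selects s0..s5 = 0x00, 0x11, 0x22, 0, 0, 0 are packed into the
 * 48-bit src_sel field as 0x0000000000221100.
 */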
static const struct nvc0_mp_pm_query_cfg nvc0_mp_pm_queries[] =
{
   _Q(INST_EXECUTED,       0xaaaa, LOGOP, 0x2d, 3, 0x00, 0x11, 0x22, 0x00, 0x00, 0x00),
   _Q(BRANCH,              0xaaaa, LOGOP, 0x1a, 2, 0x00, 0x11, 0x00, 0x00, 0x00, 0x00),
   _Q(BRANCH_DIVERGENT,    0xaaaa, LOGOP, 0x19, 2, 0x20, 0x31, 0x00, 0x00, 0x00, 0x00),
   _Q(ACTIVE_WARPS,        0xaaaa, LOGOP, 0x24, 6, 0x10, 0x21, 0x32, 0x43, 0x54, 0x65),
   _Q(ACTIVE_CYCLES,       0xaaaa, LOGOP, 0x11, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(LAUNCHED_WARPS,      0xaaaa, LOGOP, 0x26, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(LAUNCHED_THREADS,    0xaaaa, LOGOP, 0x26, 6, 0x10, 0x21, 0x32, 0x43, 0x54, 0x65),
   _Q(LD_SHARED,           0xaaaa, LOGOP, 0x64, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(ST_SHARED,           0xaaaa, LOGOP, 0x64, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(LD_LOCAL,            0xaaaa, LOGOP, 0x64, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(ST_LOCAL,            0xaaaa, LOGOP, 0x64, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(GRED_COUNT,          0xaaaa, LOGOP, 0x63, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(ATOM_COUNT,          0xaaaa, LOGOP, 0x63, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(GLD_REQUEST,         0xaaaa, LOGOP, 0x64, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(GST_REQUEST,         0xaaaa, LOGOP, 0x64, 1, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(INST_ISSUED1_0,      0xaaaa, LOGOP, 0x7e, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(INST_ISSUED1_1,      0xaaaa, LOGOP, 0x7e, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(INST_ISSUED2_0,      0xaaaa, LOGOP, 0x7e, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(INST_ISSUED2_1,      0xaaaa, LOGOP, 0x7e, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(TH_INST_EXECUTED_0,  0xaaaa, LOGOP, 0xa3, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
   _Q(TH_INST_EXECUTED_1,  0xaaaa, LOGOP, 0xa5, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
   _Q(TH_INST_EXECUTED_2,  0xaaaa, LOGOP, 0xa4, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
   _Q(TH_INST_EXECUTED_3,  0xaaaa, LOGOP, 0xa6, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
   _Q(PROF_TRIGGER_0,      0xaaaa, LOGOP, 0x01, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(PROF_TRIGGER_1,      0xaaaa, LOGOP, 0x01, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(PROF_TRIGGER_2,      0xaaaa, LOGOP, 0x01, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(PROF_TRIGGER_3,      0xaaaa, LOGOP, 0x01, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(PROF_TRIGGER_4,      0xaaaa, LOGOP, 0x01, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(PROF_TRIGGER_5,      0xaaaa, LOGOP, 0x01, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(PROF_TRIGGER_6,      0xaaaa, LOGOP, 0x01, 1, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00),
   _Q(PROF_TRIGGER_7,      0xaaaa, LOGOP, 0x01, 1, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00),
};

#undef _Q

static const struct nvc0_mp_pm_query_cfg *
nvc0_mp_pm_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_query *q)
{
   struct nvc0_screen *screen = nvc0->screen;

   if (screen->base.class_3d >= NVE4_3D_CLASS)
      return &nve4_mp_pm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC];
   return &nvc0_mp_pm_queries[q->type - NVC0_PM_QUERY(0)];
}

boolean
nvc0_mp_pm_query_begin(struct nvc0_context *nvc0, struct nvc0_query *q)
{
   struct nvc0_screen *screen = nvc0->screen;
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   const boolean is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS;
   const struct nvc0_mp_pm_query_cfg *cfg;
   unsigned i, c;
   unsigned num_ab[2] = { 0, 0 };

   cfg = nvc0_mp_pm_query_get_cfg(nvc0, q);

   /* check if we have enough free counter slots */
   for (i = 0; i < cfg->num_counters; ++i)
      num_ab[cfg->ctr[i].sig_dom]++;

   if (screen->pm.num_mp_pm_active[0] + num_ab[0] > 4 ||
       screen->pm.num_mp_pm_active[1] + num_ab[1] > 4) {
      NOUVEAU_ERR("Not enough free MP counter slots !\n");
      return FALSE;
   }

   assert(cfg->num_counters <= 4);
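   /* Rough worst-case estimate for the methods emitted below: up to 4
    * counters, 4 methods each (2 words per method), times up to 6 source
    * slots on NVC0:NVE4, plus a few words for the SUBC_SW setup.
    */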
   PUSH_SPACE(push, 4 * 8 * (is_nve4 ? 1 : 6) + 6);

   if (!screen->pm.mp_counters_enabled) {
      screen->pm.mp_counters_enabled = TRUE;
      BEGIN_NVC0(push, SUBC_SW(0x06ac), 1);
      PUSH_DATA (push, 0x1fcb);
   }

   /* set sequence fields to 0 (used to check if result is available);
    * the per-MP stride and sequence offsets must match what the
    * *_mp_pm_query_read_data functions below expect.
    */
   for (i = 0; i < screen->mp_count; ++i) {
      const unsigned b = (is_nve4 ? 0x60 : 0x24) / 4 * i;
      if (is_nve4) {
         for (c = 0; c < 4; ++c)
            q->data[b + 20 + c] = 0;
      } else {
         q->data[b + 8] = 0;
      }
   }

   for (i = 0; i < cfg->num_counters; ++i) {
      const unsigned d = cfg->ctr[i].sig_dom;

      if (!screen->pm.num_mp_pm_active[d]) {
         uint32_t m = (1 << 22) | (1 << (7 + (8 * !d)));
         if (screen->pm.num_mp_pm_active[!d])
            m |= 1 << (7 + (8 * d));
         BEGIN_NVC0(push, SUBC_SW(0x0600), 1);
         PUSH_DATA (push, m);
      }
      screen->pm.num_mp_pm_active[d]++;

      for (c = d * 4; c < (d * 4 + 4); ++c) {
         if (!screen->pm.mp_counter[c]) {
            q->ctr[i] = c;
            screen->pm.mp_counter[c] = (struct pipe_query *)q;
            break;
         }
      }
      assert(c <= (d * 4 + 3)); /* must succeed, already checked for space */

      /* configure and reset the counter(s) */
      if (is_nve4) {
         if (d == 0)
            BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_A_SIGSEL(c & 3)), 1);
         else
            BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_B_SIGSEL(c & 3)), 1);
         PUSH_DATA (push, cfg->ctr[i].sig_sel);
         BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SRCSEL(c)), 1);
         PUSH_DATA (push, cfg->ctr[i].src_sel + 0x2108421 * (c & 3));
         BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 1);
         PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
         BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SET(c)), 1);
         PUSH_DATA (push, 0);
      } else {
         unsigned s;

         for (s = 0; s < cfg->ctr[i].num_src; s++) {
            BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SIGSEL(s)), 1);
            PUSH_DATA (push, cfg->ctr[i].sig_sel);
            BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SRCSEL(s)), 1);
            PUSH_DATA (push, (cfg->ctr[i].src_sel >> (s * 8)) & 0xff);
            BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(s)), 1);
            PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
            BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SET(s)), 1);
            PUSH_DATA (push, 0);
         }
      }
   }
   return TRUE;
}

static void
nvc0_mp_pm_query_end(struct nvc0_context *nvc0, struct nvc0_query *q)
{
   struct nvc0_screen *screen = nvc0->screen;
   struct pipe_context *pipe = &nvc0->base.pipe;
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   const boolean is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS;
   uint32_t mask;
   uint32_t input[3];
   const uint block[3] = { 32, is_nve4 ? 4 : 1, 1 };
   const uint grid[3] = { screen->mp_count, 1, 1 };
   unsigned c;
   const struct nvc0_mp_pm_query_cfg *cfg;

   cfg = nvc0_mp_pm_query_get_cfg(nvc0, q);

   if (unlikely(!screen->pm.prog)) {
      struct nvc0_program *prog = CALLOC_STRUCT(nvc0_program);
      prog->type = PIPE_SHADER_COMPUTE;
      prog->translated = TRUE;
      prog->num_gprs = 14;
      prog->parm_size = 12;
      if (is_nve4) {
         prog->code = (uint32_t *)nve4_read_mp_pm_counters_code;
         prog->code_size = sizeof(nve4_read_mp_pm_counters_code);
      } else {
         prog->code = (uint32_t *)nvc0_read_mp_pm_counters_code;
         prog->code_size = sizeof(nvc0_read_mp_pm_counters_code);
      }
      screen->pm.prog = prog;
   }

   /* disable all counting */
   PUSH_SPACE(push, 8);
   for (c = 0; c < 8; ++c)
      if (screen->pm.mp_counter[c]) {
         if (is_nve4) {
            IMMED_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 0);
         } else {
            IMMED_NVC0(push, NVC0_COMPUTE(MP_PM_OP(c)), 0);
         }
      }
   /* release counters for this query */
   for (c = 0; c < 8; ++c) {
      if (nvc0_query(screen->pm.mp_counter[c]) == q) {
         screen->pm.num_mp_pm_active[c / 4]--;
         screen->pm.mp_counter[c] = NULL;
      }
   }

   BCTX_REFN_bo(nvc0->bufctx_cp, CP_QUERY, NOUVEAU_BO_GART | NOUVEAU_BO_WR,
                q->bo);

   PUSH_SPACE(push, 1);
   IMMED_NVC0(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 0);

   pipe->bind_compute_state(pipe, screen->pm.prog);
   input[0] = (q->bo->offset + q->base);
   input[1] = (q->bo->offset + q->base) >> 32;
   input[2] = q->sequence;
   pipe->launch_grid(pipe, block, grid, 0, input);

   nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_QUERY);

   /* re-activate other counters */
   PUSH_SPACE(push, 16);
   mask = 0;
   for (c = 0; c < 8; ++c) {
      unsigned i;
      q = nvc0_query(screen->pm.mp_counter[c]);
      if (!q)
         continue;
      cfg = nvc0_mp_pm_query_get_cfg(nvc0, q);
      for (i = 0; i < cfg->num_counters; ++i) {
         if (mask & (1 << q->ctr[i]))
            break;
         mask |= 1 << q->ctr[i];
         if (is_nve4) {
            BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(q->ctr[i])), 1);
         } else {
            BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(q->ctr[i])), 1);
         }
         PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
      }
   }
}

static INLINE boolean
nvc0_mp_pm_query_read_data(uint32_t count[32][4],
                           struct nvc0_context *nvc0, boolean wait,
                           struct nvc0_query *q,
                           const struct nvc0_mp_pm_query_cfg *cfg,
                           unsigned mp_count)
{
   unsigned p, c;

   for (p = 0; p < mp_count; ++p) {
      const unsigned b = (0x24 / 4) * p;

      for (c = 0; c < cfg->num_counters; ++c) {
         if (q->data[b + 8] != q->sequence) {
            if (!wait)
               return FALSE;
            if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nvc0->base.client))
               return FALSE;
         }
         count[p][c] = q->data[b + q->ctr[c]];
      }
   }
   return TRUE;
}

static INLINE boolean
nve4_mp_pm_query_read_data(uint32_t count[32][4],
                           struct nvc0_context *nvc0, boolean wait,
                           struct nvc0_query *q,
                           const struct nvc0_mp_pm_query_cfg *cfg,
                           unsigned mp_count)
{
   unsigned p, c, d;

   for (p = 0; p < mp_count; ++p) {
      const unsigned b = (0x60 / 4) * p;

      for (c = 0; c < cfg->num_counters; ++c) {
         count[p][c] = 0;
         for (d = 0; d < ((q->ctr[c] & ~3) ? 1 : 4); ++d) {
            if (q->data[b + 20 + d] != q->sequence) {
               if (!wait)
                  return FALSE;
               if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nvc0->base.client))
                  return FALSE;
            }
            if (q->ctr[c] & ~0x3)
               count[p][c] = q->data[b + 16 + (q->ctr[c] & 3)];
            else
               count[p][c] += q->data[b + d * 4 + q->ctr[c]];
         }
      }
   }
   return TRUE;
}

/* Metric calculations:
 * sum(x) ... sum of x over all MPs
 * avg(x) ... average of x over all MPs
 *
 * IPC              : sum(inst_executed) / clock
 * INST_REPLAY_OHEAD: (sum(inst_issued) - sum(inst_executed)) / sum(inst_issued)
 * MP_OCCUPANCY     : avg((active_warps / 64) / active_cycles)
 * MP_EFFICIENCY    : avg(active_cycles / clock)
 *
 * NOTE: Interpretation of IPC requires knowledge of MP count.
 */
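
/* Worked example with invented numbers: MP_OCCUPANCY uses op AVG_DIV_MM
 * with norm = { 200, 64 }, so a single MP reporting active_warps = 32 and
 * active_cycles = 100 yields (32 * 200) / 100 / (1 * 64) = 1, i.e. 1%.
 * This matches avg((active_warps / 64) / active_cycles) as a percentage,
 * given that the raw counter appears to count in half-warp units (cf. the
 * NOTES above and the 2/1 normalization of the plain ACTIVE_WARPS query).
 */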
  1318. static boolean
  1319. nvc0_mp_pm_query_result(struct nvc0_context *nvc0, struct nvc0_query *q,
  1320.                         void *result, boolean wait)
  1321. {
  1322.    uint32_t count[32][4];
  1323.    uint64_t value = 0;
  1324.    unsigned mp_count = MIN2(nvc0->screen->mp_count_compute, 32);
  1325.    unsigned p, c;
  1326.    const struct nvc0_mp_pm_query_cfg *cfg;
  1327.    boolean ret;
  1328.  
  1329.    cfg = nvc0_mp_pm_query_get_cfg(nvc0, q);
  1330.  
  1331.    if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS)
  1332.       ret = nve4_mp_pm_query_read_data(count, nvc0, wait, q, cfg, mp_count);
  1333.    else
  1334.       ret = nvc0_mp_pm_query_read_data(count, nvc0, wait, q, cfg, mp_count);
  1335.    if (!ret)
  1336.       return FALSE;
  1337.  
  1338.    if (cfg->op == NVC0_COUNTER_OPn_SUM) {
  1339.       for (c = 0; c < cfg->num_counters; ++c)
  1340.          for (p = 0; p < mp_count; ++p)
  1341.             value += count[p][c];
  1342.       value = (value * cfg->norm[0]) / cfg->norm[1];
  1343.    } else
  1344.    if (cfg->op == NVC0_COUNTER_OPn_OR) {
  1345.       uint32_t v = 0;
  1346.       for (c = 0; c < cfg->num_counters; ++c)
  1347.          for (p = 0; p < mp_count; ++p)
  1348.             v |= count[p][c];
  1349.       value = ((uint64_t)v * cfg->norm[0]) / cfg->norm[1];
  1350.    } else
  1351.    if (cfg->op == NVC0_COUNTER_OPn_AND) {
  1352.       uint32_t v = ~0;
  1353.       for (c = 0; c < cfg->num_counters; ++c)
  1354.          for (p = 0; p < mp_count; ++p)
  1355.             v &= count[p][c];
  1356.       value = ((uint64_t)v * cfg->norm[0]) / cfg->norm[1];
  1357.    } else
  1358.    if (cfg->op == NVC0_COUNTER_OP2_REL_SUM_MM) {
  1359.       uint64_t v[2] = { 0, 0 };
  1360.       for (p = 0; p < mp_count; ++p) {
  1361.          v[0] += count[p][0];
  1362.          v[1] += count[p][1];
  1363.       }
  1364.       if (v[0])
  1365.          value = ((v[0] - v[1]) * cfg->norm[0]) / (v[0] * cfg->norm[1]);
  1366.    } else
  1367.    if (cfg->op == NVC0_COUNTER_OP2_DIV_SUM_M0) {
  1368.       for (p = 0; p < mp_count; ++p)
  1369.          value += count[p][0];
  1370.       if (count[0][1])
  1371.          value = (value * cfg->norm[0]) / (count[0][1] * cfg->norm[1]);
  1372.       else
  1373.          value = 0;
  1374.    } else
  1375.    if (cfg->op == NVC0_COUNTER_OP2_AVG_DIV_MM) {
  1376.       unsigned mp_used = 0;
  1377.       for (p = 0; p < mp_count; ++p, mp_used += !!count[p][0])
  1378.          if (count[p][1])
  1379.             value += (count[p][0] * cfg->norm[0]) / count[p][1];
  1380.       if (mp_used)
  1381.          value /= (uint64_t)mp_used * cfg->norm[1];
  1382.    } else
  1383.    if (cfg->op == NVC0_COUNTER_OP2_AVG_DIV_M0) {
  1384.       unsigned mp_used = 0;
  1385.       for (p = 0; p < mp_count; ++p, mp_used += !!count[p][0])
  1386.          value += count[p][0];
  1387.       if (count[0][1] && mp_used) {
  1388.          value *= cfg->norm[0];
  1389.          value /= (uint64_t)count[0][1] * mp_used * cfg->norm[1];
  1390.       } else {
  1391.          value = 0;
  1392.       }
  1393.    }
  1394.  
  1395.    *(uint64_t *)result = value;
  1396.    return TRUE;
  1397. }
  1398.  
  1399. int
  1400. nvc0_screen_get_driver_query_info(struct pipe_screen *pscreen,
  1401.                                   unsigned id,
  1402.                                   struct pipe_driver_query_info *info)
  1403. {
  1404.    struct nvc0_screen *screen = nvc0_screen(pscreen);
  1405.    int count = 0;
  1406.  
  1407.    count += NVC0_QUERY_DRV_STAT_COUNT;
  1408.  
  1409.    if (screen->base.device->drm_version >= 0x01000101) { /* nouveau 1.1.1 */
  1410.       if (screen->compute) {
  1411.          if (screen->base.class_3d == NVE4_3D_CLASS) {
  1412.             count += NVE4_PM_QUERY_COUNT;
  1413.          } else
  1414.          if (screen->base.class_3d < NVE4_3D_CLASS) {
  1415.             /* NVC0_COMPUTE is not always enabled */
  1416.             count += NVC0_PM_QUERY_COUNT;
  1417.          }
  1418.       }
  1419.    }
  1420.  
  1421.    if (!info)
  1422.       return count;
  1423.  
  1424.    /* Init default values. */
  1425.    info->name = "this_is_not_the_query_you_are_looking_for";
  1426.    info->query_type = 0xdeadd01d;
  1427.    info->max_value.u64 = 0;
  1428.    info->type = PIPE_DRIVER_QUERY_TYPE_UINT64;
  1429.    info->group_id = -1;
  1430.  
  1431. #ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
  1432.    if (id < NVC0_QUERY_DRV_STAT_COUNT) {
  1433.       info->name = nvc0_drv_stat_names[id];
  1434.       info->query_type = NVC0_QUERY_DRV_STAT(id);
  1435.       info->max_value.u64 = 0;
  1436.       if (strstr(info->name, "bytes"))
  1437.          info->type = PIPE_DRIVER_QUERY_TYPE_BYTES;
  1438.       info->group_id = NVC0_QUERY_DRV_STAT_GROUP;
  1439.       return 1;
  1440.    } else
  1441. #endif
  1442.    if (id < count) {
  1443.       if (screen->compute) {
  1444.          if (screen->base.class_3d == NVE4_3D_CLASS) {
  1445.             info->name = nve4_pm_query_names[id - NVC0_QUERY_DRV_STAT_COUNT];
  1446.             info->query_type = NVE4_PM_QUERY(id - NVC0_QUERY_DRV_STAT_COUNT);
  1447.             info->max_value.u64 =
  1448.                (id < NVE4_PM_QUERY_METRIC_MP_OCCUPANCY) ? 0 : 100;
  1449.             info->group_id = NVC0_QUERY_MP_COUNTER_GROUP;
  1450.             return 1;
  1451.          } else
  1452.          if (screen->base.class_3d < NVE4_3D_CLASS) {
  1453.             info->name = nvc0_pm_query_names[id - NVC0_QUERY_DRV_STAT_COUNT];
  1454.             info->query_type = NVC0_PM_QUERY(id - NVC0_QUERY_DRV_STAT_COUNT);
  1455.             info->group_id = NVC0_QUERY_MP_COUNTER_GROUP;
  1456.             return 1;
  1457.          }
  1458.       }
  1459.    }
  1460.    /* user asked for info about a non-existent query */
  1461.    return 0;
  1462. }
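
/* Usage sketch (illustrative, not part of this file): following the Gallium
 * convention implemented above, a NULL info pointer yields the query count,
 * so a state tracker can enumerate the exposed queries like this:
 *
 *    struct pipe_driver_query_info info;
 *    int i, n = pscreen->get_driver_query_info(pscreen, 0, NULL);
 *
 *    for (i = 0; i < n; ++i)
 *       if (pscreen->get_driver_query_info(pscreen, i, &info))
 *          printf("query %d: %s\n", i, info.name);
 */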
  1463.  
  1464. int
  1465. nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen,
  1466.                                         unsigned id,
  1467.                                         struct pipe_driver_query_group_info *info)
  1468. {
  1469.    struct nvc0_screen *screen = nvc0_screen(pscreen);
  1470.    int count = 0;
  1471.  
  1472. #ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
  1473.    count++;
  1474. #endif
  1475.  
  1476.    if (screen->base.device->drm_version >= 0x01000101) { /* nouveau 1.1.1 */
  1477.       if (screen->compute) {
  1478.          if (screen->base.class_3d == NVE4_3D_CLASS) {
  1479.             count++;
  1480.          } else
  1481.          if (screen->base.class_3d < NVE4_3D_CLASS) {
  1482.             count++; /* NVC0_COMPUTE is not always enabled */
  1483.          }
  1484.       }
  1485.    }
  1486.  
  1487.    if (!info)
  1488.       return count;
  1489.  
  1490.    if (id == NVC0_QUERY_MP_COUNTER_GROUP) {
  1491.       if (screen->compute) {
  1492.          info->name = "MP counters";
  1493.          info->type = PIPE_DRIVER_QUERY_GROUP_TYPE_GPU;
  1494.  
  1495.          if (screen->base.class_3d == NVE4_3D_CLASS) {
  1496.             info->num_queries = NVE4_PM_QUERY_COUNT;
  1497.  
  1498.             /* On NVE4+, each multiprocessor has 8 hardware counters split
  1499.              * across two distinct domains, but we allow only one active
  1500.              * query at a time because some queries use more than one
  1501.              * hardware counter, which would result in undefined behaviour. */
  1502.             info->max_active_queries = 1; /* TODO: handle multiple hw counters */
  1503.             return 1;
  1504.          } else
  1505.          if (screen->base.class_3d < NVE4_3D_CLASS) {
  1506.             info->num_queries = NVC0_PM_QUERY_COUNT;
  1507.  
  1508.             /* On NVC0:NVE4, each multiprocessor has 8 hardware counters
  1509.              * in a single domain. */
  1510.             info->max_active_queries = 8;
  1511.             return 1;
  1512.          }
  1513.       }
  1514.    }
  1515. #ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
  1516.    else if (id == NVC0_QUERY_DRV_STAT_GROUP) {
  1517.       info->name = "Driver statistics";
  1518.       info->type = PIPE_DRIVER_QUERY_GROUP_TYPE_CPU;
  1519.       info->max_active_queries = NVC0_QUERY_DRV_STAT_COUNT;
  1520.       info->num_queries = NVC0_QUERY_DRV_STAT_COUNT;
  1521.       return 1;
  1522.    }
  1523. #endif
  1524.  
  1525.    /* user asked for info about a non-existent query group */
  1526.    info->name = "this_is_not_the_query_group_you_are_looking_for";
  1527.    info->max_active_queries = 0;
  1528.    info->num_queries = 0;
  1529.    info->type = 0;
  1530.    return 0;
  1531. }
  1532.  
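/* Called once at context creation (in Mesa, from nvc0_create() in
 * nvc0_context.c) to plug the query implementation into the pipe_context
 * vtable. */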
  1533. void
  1534. nvc0_init_query_functions(struct nvc0_context *nvc0)
  1535. {
  1536.    struct pipe_context *pipe = &nvc0->base.pipe;
  1537.  
  1538.    pipe->create_query = nvc0_query_create;
  1539.    pipe->destroy_query = nvc0_query_destroy;
  1540.    pipe->begin_query = nvc0_query_begin;
  1541.    pipe->end_query = nvc0_query_end;
  1542.    pipe->get_query_result = nvc0_query_result;
  1543.    pipe->render_condition = nvc0_render_condition;
  1544. }
  1545.