/*
 * Copyright © 2013 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

/**
 * \file brw_vec4_gs.c
 *
 * State atom for client-programmable geometry shaders, and support code.
 */

#include "brw_gs.h"
#include "brw_context.h"
#include "brw_vec4_gs_visitor.h"
#include "brw_state.h"
#include "brw_ff_gs.h"


bool
brw_codegen_gs_prog(struct brw_context *brw,
                    struct gl_shader_program *prog,
                    struct brw_geometry_program *gp,
                    struct brw_gs_prog_key *key)
{
   struct brw_stage_state *stage_state = &brw->gs.base;
   struct brw_gs_compile c;
   memset(&c, 0, sizeof(c));
   c.key = *key;
   c.gp = gp;

   c.prog_data.include_primitive_id =
      (gp->program.Base.InputsRead & VARYING_BIT_PRIMITIVE_ID) != 0;

   c.prog_data.invocations = gp->program.Invocations;

   /* Allocate the references to the uniforms that will end up in the
    * prog_data associated with the compiled program, and which will be freed
    * by the state cache.
    *
    * Note: param_count needs to be num_uniform_components * 4, since we add
    * padding around uniform values below vec4 size, so the worst case is that
    * every uniform is a float which gets padded to the size of a vec4.
    */
   struct gl_shader *gs = prog->_LinkedShaders[MESA_SHADER_GEOMETRY];
   int param_count = gs->num_uniform_components * 4;

   /* We also upload clip plane data as uniforms */
   param_count += MAX_CLIP_PLANES * 4;
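   /* Worked example (illustrative): a shader with 10 uniform components
    * would get param_count = 10 * 4 + MAX_CLIP_PLANES * 4 = 40 + 32 = 72
    * slots, assuming Mesa's MAX_CLIP_PLANES of 8, even if far fewer of them
    * end up being used.
    */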
  67.  
  68.    c.prog_data.base.base.param =
  69.       rzalloc_array(NULL, const gl_constant_value *, param_count);
  70.    c.prog_data.base.base.pull_param =
  71.       rzalloc_array(NULL, const gl_constant_value *, param_count);
  72.    c.prog_data.base.base.nr_params = param_count;
  73.  
  74.    if (brw->gen >= 7) {
  75.       if (gp->program.OutputType == GL_POINTS) {
  76.          /* When the output type is points, the geometry shader may output data
  77.           * to multiple streams, and EndPrimitive() has no effect.  So we
  78.           * configure the hardware to interpret the control data as stream ID.
  79.           */
  80.          c.prog_data.control_data_format = GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID;
  81.  
  82.          /* We only have to emit control bits if we are using streams */
  83.          if (prog->Geom.UsesStreams)
  84.             c.control_data_bits_per_vertex = 2;
  85.          else
  86.             c.control_data_bits_per_vertex = 0;
  87.       } else {
  88.          /* When the output type is triangle_strip or line_strip, EndPrimitive()
  89.           * may be used to terminate the current strip and start a new one
  90.           * (similar to primitive restart), and outputting data to multiple
  91.           * streams is not supported.  So we configure the hardware to interpret
  92.           * the control data as EndPrimitive information (a.k.a. "cut bits").
  93.           */
  94.          c.prog_data.control_data_format = GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT;
  95.  
  96.          /* We only need to output control data if the shader actually calls
  97.           * EndPrimitive().
  98.           */
  99.          c.control_data_bits_per_vertex = gp->program.UsesEndPrimitive ? 1 : 0;
  100.       }
  101.    } else {
  102.       /* There are no control data bits in gen6. */
  103.       c.control_data_bits_per_vertex = 0;
  104.  
  105.       /* If it is using transform feedback, enable it */
  106.       if (prog->TransformFeedback.NumVarying)
  107.          c.prog_data.gen6_xfb_enabled = true;
  108.       else
  109.          c.prog_data.gen6_xfb_enabled = false;
  110.    }
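   /* In other words: the stream ID case needs 2 bits per vertex because at
    * most four vertex streams are supported, the cut bits case needs only
    * 1 bit per vertex to record an EndPrimitive() call, and gen6 emits no
    * control data at all.
    */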
   c.control_data_header_size_bits =
      gp->program.VerticesOut * c.control_data_bits_per_vertex;

   /* 1 HWORD = 32 bytes = 256 bits */
   c.prog_data.control_data_header_size_hwords =
      ALIGN(c.control_data_header_size_bits, 256) / 256;
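   /* For example (illustrative): a strip-output shader with max_vertices =
    * 128 that calls EndPrimitive() needs 128 * 1 = 128 control data bits,
    * which rounds up to ALIGN(128, 256) / 256 = 1 HWORD.
    */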

   GLbitfield64 outputs_written = gp->program.Base.OutputsWritten;

   /* In order for legacy clipping to work, we need to populate the clip
    * distance varying slots whenever clipping is enabled, even if the
    * geometry shader doesn't write to gl_ClipDistance.
    */
   if (c.key.base.userclip_active) {
      outputs_written |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0);
      outputs_written |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1);
   }

   brw_compute_vue_map(brw->intelScreen->devinfo,
                       &c.prog_data.base.vue_map, outputs_written);

   /* Compute the output vertex size.
    *
    * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 3DSTATE_GS - Output Vertex
    * Size (p168):
    *
    *     [0,62] indicating [1,63] 16B units
    *
    *     Specifies the size of each vertex stored in the GS output entry
    *     (following any Control Header data) as a number of 128-bit units
    *     (minus one).
    *
    *     Programming Restrictions: The vertex size must be programmed as a
    *     multiple of 32B units with the following exception: Rendering is
    *     disabled (as per SOL stage state) and the vertex size output by the
    *     GS thread is 16B.
    *
    *     If rendering is enabled (as per SOL state) the vertex size must be
    *     programmed as a multiple of 32B units. In other words, the only time
    *     software can program a vertex size with an odd number of 16B units
    *     is when rendering is disabled.
    *
    * Note: B=bytes in the above text.
    *
    * It doesn't seem worth the extra trouble to optimize the case where the
    * vertex size is 16B (especially since this would require special-casing
    * the GEN assembly that writes to the URB).  So we just set the vertex
    * size to a multiple of 32B (2 vec4's) in all cases.
    *
    * The maximum output vertex size is 62*16 = 992 bytes (31 hwords).  We
    * budget that as follows:
    *
    *   512 bytes for varyings (a varying component is 4 bytes and
    *             gl_MaxGeometryOutputComponents = 128)
    *    16 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
    *             bytes)
    *    16 bytes overhead for gl_Position (we allocate it a slot in the VUE
    *             even if it's not used)
    *    32 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
    *             whenever clip planes are enabled, even if the shader doesn't
    *             write to gl_ClipDistance)
    *    16 bytes overhead since the VUE size must be a multiple of 32 bytes
    *             (see below)--this causes up to 1 VUE slot to be wasted
    *   400 bytes available for varying packing overhead
    *
    * Worst-case varying packing overhead is 3/4 of a varying slot (12 bytes)
    * per interpolation type, so this is plenty.
    *
    */
   unsigned output_vertex_size_bytes = c.prog_data.base.vue_map.num_slots * 16;
   assert(brw->gen == 6 ||
          output_vertex_size_bytes <= GEN7_MAX_GS_OUTPUT_VERTEX_SIZE_BYTES);
   c.prog_data.output_vertex_size_hwords =
      ALIGN(output_vertex_size_bytes, 32) / 32;
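   /* Worked example (illustrative): a VUE map with 5 slots gives
    * output_vertex_size_bytes = 5 * 16 = 80, which rounds up to
    * ALIGN(80, 32) / 32 = 3 hwords, i.e. 96 bytes per output vertex.
    */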

   /* Compute URB entry size.  The maximum allowed URB entry size is 32k.
    * That divides up as follows:
    *
    *     64 bytes for the control data header (cut indices or StreamID bits)
    *   4096 bytes for varyings (a varying component is 4 bytes and
    *              gl_MaxGeometryTotalOutputComponents = 1024)
    *   4096 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
    *              bytes/vertex and gl_MaxGeometryOutputVertices is 256)
    *   4096 bytes overhead for gl_Position (we allocate it a slot in the VUE
    *              even if it's not used)
    *   8192 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
    *              whenever clip planes are enabled, even if the shader doesn't
    *              write to gl_ClipDistance)
    *   4096 bytes overhead since the VUE size must be a multiple of 32
    *              bytes (see above)--this causes up to 1 VUE slot to be wasted
    *   8128 bytes available for varying packing overhead
    *
    * Worst-case varying packing overhead is 3/4 of a varying slot per
    * interpolation type, which works out to 3072 bytes, so this would allow
    * us to accommodate 2 interpolation types without any danger of running
    * out of URB space.
    *
    * In practice, the risk of running out of URB space is very small, since
    * the above figures are all worst-case, and most of them scale with the
    * number of output vertices.  So we'll just calculate the amount of space
    * we need, and if it's too large, fail to compile.
    *
    * The above is for gen7+ where we have a single URB entry that will hold
    * all the output. In gen6, we will have to allocate URB entries for every
    * vertex we emit, so our URB entries only need to be large enough to hold
    * a single vertex. Also, gen6 does not have a control data header.
    */
   unsigned output_size_bytes;
   if (brw->gen >= 7) {
      output_size_bytes =
         c.prog_data.output_vertex_size_hwords * 32 * gp->program.VerticesOut;
      output_size_bytes += 32 * c.prog_data.control_data_header_size_hwords;
   } else {
      output_size_bytes = c.prog_data.output_vertex_size_hwords * 32;
   }

   /* Broadwell stores "Vertex Count" as a full 8 DWord (32 byte) URB output,
    * which comes before the control header.
    */
   if (brw->gen >= 8)
      output_size_bytes += 32;

   assert(output_size_bytes >= 1);
   int max_output_size_bytes = GEN7_MAX_GS_URB_ENTRY_SIZE_BYTES;
   if (brw->gen == 6)
      max_output_size_bytes = GEN6_MAX_GS_URB_ENTRY_SIZE_BYTES;
   if (output_size_bytes > max_output_size_bytes)
      return false;


   /* URB entry sizes are stored as a multiple of 64 bytes in gen7+ and
    * a multiple of 128 bytes in gen6.
    */
   if (brw->gen >= 7)
      c.prog_data.base.urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
   else
      c.prog_data.base.urb_entry_size = ALIGN(output_size_bytes, 128) / 128;
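   /* Worked example (illustrative): on gen7, 3 hwords (96 bytes) per vertex
    * with max_vertices = 4 and a 1-hword control data header gives
    * output_size_bytes = 3 * 32 * 4 + 32 = 416, well under the 32k limit,
    * and urb_entry_size = ALIGN(416, 64) / 64 = 7 (448 bytes).  On gen8+ the
    * extra 32-byte vertex count slot bumps that to 448 bytes, still giving
    * an entry size of 7.
    */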

   c.prog_data.output_topology =
      get_hw_prim_for_gl_prim(gp->program.OutputType);

   brw_compute_vue_map(brw->intelScreen->devinfo,
                       &c.input_vue_map, c.key.input_varyings);

   /* GS inputs are read from the VUE 256 bits (2 vec4's) at a time, so we
    * need to program a URB read length of ceiling(num_slots / 2).
    */
   c.prog_data.base.urb_read_length = (c.input_vue_map.num_slots + 1) / 2;
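   /* For example (illustrative): an input VUE map with 5 slots needs a URB
    * read length of (5 + 1) / 2 = 3, i.e. three 256-bit reads per vertex.
    */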

   void *mem_ctx = ralloc_context(NULL);
   unsigned program_size;
   const unsigned *program =
      brw_gs_emit(brw, prog, &c, mem_ctx, &program_size);
   if (program == NULL) {
      ralloc_free(mem_ctx);
      return false;
   }

   /* Scratch space is used for register spilling */
   if (c.base.last_scratch) {
      perf_debug("Geometry shader triggered register spilling.  "
                 "Try reducing the number of live vec4 values to "
                 "improve performance.\n");

      c.prog_data.base.base.total_scratch
         = brw_get_scratch_size(c.base.last_scratch*REG_SIZE);

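      /* The scratch buffer has to be large enough for every GS thread to
       * spill at once, hence the total_scratch * max_gs_threads sizing
       * below.
       */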
      brw_get_scratch_bo(brw, &stage_state->scratch_bo,
                         c.prog_data.base.base.total_scratch *
                         brw->max_gs_threads);
   }

   brw_upload_cache(&brw->cache, BRW_CACHE_GS_PROG,
                    &c.key, sizeof(c.key),
                    program, program_size,
                    &c.prog_data, sizeof(c.prog_data),
                    &stage_state->prog_offset, &brw->gs.prog_data);
   ralloc_free(mem_ctx);

   return true;
}

static bool
brw_gs_state_dirty(struct brw_context *brw)
{
   return brw_state_dirty(brw,
                          _NEW_TEXTURE,
                          BRW_NEW_GEOMETRY_PROGRAM |
                          BRW_NEW_TRANSFORM_FEEDBACK |
                          BRW_NEW_VUE_MAP_VS);
}

static void
brw_gs_populate_key(struct brw_context *brw,
                    struct brw_gs_prog_key *key)
{
   struct gl_context *ctx = &brw->ctx;
   struct brw_stage_state *stage_state = &brw->gs.base;
   struct brw_geometry_program *gp =
      (struct brw_geometry_program *) brw->geometry_program;
   struct gl_program *prog = &gp->program.Base;

   memset(key, 0, sizeof(*key));

   key->base.program_string_id = gp->id;
   brw_setup_vue_key_clip_info(brw, &key->base,
                               gp->program.Base.UsesClipDistanceOut);

   /* _NEW_TEXTURE */
   brw_populate_sampler_prog_key_data(ctx, prog, stage_state->sampler_count,
                                      &key->base.tex);

   /* BRW_NEW_VUE_MAP_VS */
   key->input_varyings = brw->vue_map_vs.slots_valid;
}

void
brw_upload_gs_prog(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;
   struct gl_shader_program **current = ctx->_Shader->CurrentProgram;
   struct brw_stage_state *stage_state = &brw->gs.base;
   struct brw_gs_prog_key key;
   /* BRW_NEW_GEOMETRY_PROGRAM */
   struct brw_geometry_program *gp =
      (struct brw_geometry_program *) brw->geometry_program;

   if (!brw_gs_state_dirty(brw))
      return;

   if (gp == NULL) {
      /* No geometry shader.  Vertex data just passes straight through. */
      if (brw->ctx.NewDriverState & BRW_NEW_VUE_MAP_VS) {
         brw->vue_map_geom_out = brw->vue_map_vs;
         brw->ctx.NewDriverState |= BRW_NEW_VUE_MAP_GEOM_OUT;
      }

      if (brw->gen == 6 &&
          (brw->ctx.NewDriverState & BRW_NEW_TRANSFORM_FEEDBACK)) {
         gen6_brw_upload_ff_gs_prog(brw);
         return;
      }

      /* Other state atoms had better not try to access prog_data, since
       * there's no GS program.
       */
      brw->gs.prog_data = NULL;
      brw->gs.base.prog_data = NULL;

      return;
   }

   brw_gs_populate_key(brw, &key);

   if (!brw_search_cache(&brw->cache, BRW_CACHE_GS_PROG,
                         &key, sizeof(key),
                         &stage_state->prog_offset, &brw->gs.prog_data)) {
      bool success = brw_codegen_gs_prog(brw, current[MESA_SHADER_GEOMETRY],
                                         gp, &key);
      assert(success);
      (void)success;
   }
   brw->gs.base.prog_data = &brw->gs.prog_data->base.base;

   if (memcmp(&brw->gs.prog_data->base.vue_map, &brw->vue_map_geom_out,
              sizeof(brw->vue_map_geom_out)) != 0) {
      brw->vue_map_geom_out = brw->gs.prog_data->base.vue_map;
      brw->ctx.NewDriverState |= BRW_NEW_VUE_MAP_GEOM_OUT;
   }
}

bool
brw_gs_precompile(struct gl_context *ctx,
                  struct gl_shader_program *shader_prog,
                  struct gl_program *prog)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_gs_prog_key key;
   uint32_t old_prog_offset = brw->gs.base.prog_offset;
   struct brw_gs_prog_data *old_prog_data = brw->gs.prog_data;
   bool success;

   struct gl_geometry_program *gp = (struct gl_geometry_program *) prog;
   struct brw_geometry_program *bgp = brw_geometry_program(gp);

   memset(&key, 0, sizeof(key));

   brw_vue_setup_prog_key_for_precompile(ctx, &key.base, bgp->id, &gp->Base);

   /* Assume that the set of varyings coming in from the vertex shader exactly
    * matches what the geometry shader requires.
    */
   key.input_varyings = gp->Base.InputsRead;

   success = brw_codegen_gs_prog(brw, shader_prog, bgp, &key);

   brw->gs.base.prog_offset = old_prog_offset;
   brw->gs.prog_data = old_prog_data;

   return success;
}


bool
brw_gs_prog_data_compare(const void *in_a, const void *in_b)
{
   const struct brw_gs_prog_data *a = in_a;
   const struct brw_gs_prog_data *b = in_b;

   /* Compare the base structure. */
   if (!brw_stage_prog_data_compare(&a->base.base, &b->base.base))
      return false;

   /* Compare the rest of the struct. */
   const unsigned offset = sizeof(struct brw_stage_prog_data);
   if (memcmp(((char *) a) + offset, ((char *) b) + offset,
              sizeof(struct brw_gs_prog_data) - offset)) {
      return false;
   }

   return true;
}