Subversion Repositories Kolibri OS

Rev

Blame | Last modification | View Log | RSS feed

  1. /*
  2.  * Copyright © 2013 Intel Corporation
  3.  *
  4.  * Permission is hereby granted, free of charge, to any person obtaining a
  5.  * copy of this software and associated documentation files (the "Software"),
  6.  * to deal in the Software without restriction, including without limitation
  7.  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  8.  * and/or sell copies of the Software, and to permit persons to whom the
  9.  * Software is furnished to do so, subject to the following conditions:
  10.  *
  11.  * The above copyright notice and this permission notice (including the next
  12.  * paragraph) shall be included in all copies or substantial portions of the
  13.  * Software.
  14.  *
  15.  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16.  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17.  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18.  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19.  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20.  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  21.  * DEALINGS IN THE SOFTWARE.
  22.  */
  23.  
  24. /**
  25.  * \file brw_vec4_gs_visitor.cpp
  26.  *
  27.  * Geometry-shader-specific code derived from the vec4_visitor class.
  28.  */
  29.  
  30. #include "brw_vec4_gs_visitor.h"
  31. #include "gen6_gs_visitor.h"
  32.  
/* Upper bound on the number of input vertices a geometry shader can receive
 * per primitive (6 corresponds to triangles-with-adjacency input).  Used to
 * size the per-vertex attribute map.
 */
const unsigned MAX_GS_INPUT_VERTICES = 6;
  34.  
  35. namespace brw {
  36.  
/* Construct a geometry-shader vec4 visitor.
 *
 * All GS-specific compile state lives in @p c (brw_gs_compile); the base
 * vec4_visitor constructor is handed the individual pieces it needs plus the
 * GS-specific shader-time indices (ST_GS / ST_GS_WRITTEN / ST_GS_RESET).
 * @p no_spills requests compilation that fails rather than spills registers
 * (used for the dual-object-mode attempt in brw_gs_emit()).
 */
vec4_gs_visitor::vec4_gs_visitor(struct brw_context *brw,
                                 struct brw_gs_compile *c,
                                 struct gl_shader_program *prog,
                                 void *mem_ctx,
                                 bool no_spills)
   : vec4_visitor(brw, &c->base, &c->gp->program.Base, &c->key.base,
                  &c->prog_data.base, prog, MESA_SHADER_GEOMETRY, mem_ctx,
                  no_spills,
                  ST_GS, ST_GS_WRITTEN, ST_GS_RESET),
     c(c)
{
}
  49.  
  50.  
  51. dst_reg *
  52. vec4_gs_visitor::make_reg_for_system_value(ir_variable *ir)
  53. {
  54.    dst_reg *reg = new(mem_ctx) dst_reg(this, ir->type);
  55.  
  56.    switch (ir->data.location) {
  57.    case SYSTEM_VALUE_INVOCATION_ID:
  58.       this->current_annotation = "initialize gl_InvocationID";
  59.       emit(GS_OPCODE_GET_INSTANCE_ID, *reg);
  60.       break;
  61.    default:
  62.       unreachable("not reached");
  63.    }
  64.  
  65.    return reg;
  66. }
  67.  
  68.  
/* Populate @p attribute_map with payload-register offsets for every varying
 * input of every input vertex, and return the index of the first payload
 * register past the varying inputs.
 *
 * @p attributes_per_reg is 1 in dual-object dispatch mode and 2 otherwise
 * (interleaved modes pack two attribute slots per register).
 */
int
vec4_gs_visitor::setup_varying_inputs(int payload_reg, int *attribute_map,
                                      int attributes_per_reg)
{
   /* For geometry shaders there are N copies of the input attributes, where N
    * is the number of input vertices.  attribute_map[BRW_VARYING_SLOT_COUNT *
    * i + j] represents attribute j for vertex i.
    *
    * Note that GS inputs are read from the VUE 256 bits (2 vec4's) at a time,
    * so the total number of input slots that will be delivered to the GS (and
    * thus the stride of the input arrays) is urb_read_length * 2.
    */
   const unsigned num_input_vertices = c->gp->program.VerticesIn;
   assert(num_input_vertices <= MAX_GS_INPUT_VERTICES);
   unsigned input_array_stride = c->prog_data.base.urb_read_length * 2;

   for (int slot = 0; slot < c->input_vue_map.num_slots; slot++) {
      int varying = c->input_vue_map.slot_to_varying[slot];
      for (unsigned vertex = 0; vertex < num_input_vertices; vertex++) {
         attribute_map[BRW_VARYING_SLOT_COUNT * vertex + varying] =
            attributes_per_reg * payload_reg + input_array_stride * vertex +
            slot;
      }
   }

   /* Round the total slot count up to whole registers before converting to a
    * register count.
    */
   int regs_used = ALIGN(input_array_stride * num_input_vertices,
                         attributes_per_reg) / attributes_per_reg;
   return payload_reg + regs_used;
}
  98.  
  99.  
/* Lay out the thread payload: r0 (URB handles), optionally gl_PrimitiveIDIn,
 * push constants, then the per-vertex varying inputs.  Rewrites ATTR
 * registers to the resulting hardware registers and records the first
 * non-payload GRF.
 */
void
vec4_gs_visitor::setup_payload()
{
   int attribute_map[BRW_VARYING_SLOT_COUNT * MAX_GS_INPUT_VERTICES];

   /* If we are in dual instanced or single mode, then attributes are going
    * to be interleaved, so one register contains two attribute slots.
    */
   int attributes_per_reg =
      c->prog_data.dispatch_mode == GEN7_GS_DISPATCH_MODE_DUAL_OBJECT ? 1 : 2;

   /* If a geometry shader tries to read from an input that wasn't written by
    * the vertex shader, that produces undefined results, but it shouldn't
    * crash anything.  So initialize attribute_map to zeros--that ensures that
    * these undefined results are read from r0.
    */
   memset(attribute_map, 0, sizeof(attribute_map));

   int reg = 0;

   /* The payload always contains important data in r0, which contains
    * the URB handles that are passed on to the URB write at the end
    * of the thread.
    */
   reg++;

   /* If the shader uses gl_PrimitiveIDIn, that goes in r1. */
   if (c->prog_data.include_primitive_id)
      attribute_map[VARYING_SLOT_PRIMITIVE_ID] = attributes_per_reg * reg++;

   reg = setup_uniforms(reg);

   reg = setup_varying_inputs(reg, attribute_map, attributes_per_reg);

   lower_attributes_to_hw_regs(attribute_map, attributes_per_reg > 1);

   this->first_non_payload_grf = reg;
}
  138.  
  139.  
/* Emit code that runs before the user shader: clear r0.2 for scratch
 * messages, zero the vertex counter (and control-data accumulator when
 * needed), and fix up the gl_PointSize input swizzle.
 */
void
vec4_gs_visitor::emit_prolog()
{
   /* In vertex shaders, r0.2 is guaranteed to be initialized to zero.  In
    * geometry shaders, it isn't (it contains a bunch of information we don't
    * need, like the input primitive type).  We need r0.2 to be zero in order
    * to build scratch read/write messages correctly (otherwise this value
    * will be interpreted as a global offset, causing us to do our scratch
    * reads/writes to garbage memory).  So just set it to zero at the top of
    * the shader.
    */
   this->current_annotation = "clear r0.2";
   dst_reg r0(retype(brw_vec4_grf(0, 0), BRW_REGISTER_TYPE_UD));
   vec4_instruction *inst = emit(GS_OPCODE_SET_DWORD_2, r0, 0u);
   inst->force_writemask_all = true;

   /* Create a virtual register to hold the vertex count */
   this->vertex_count = src_reg(this, glsl_type::uint_type);

   /* Initialize the vertex_count register to 0 */
   this->current_annotation = "initialize vertex_count";
   inst = emit(MOV(dst_reg(this->vertex_count), 0u));
   inst->force_writemask_all = true;

   if (c->control_data_header_size_bits > 0) {
      /* Create a virtual register to hold the current set of control data
       * bits.
       */
      this->control_data_bits = src_reg(this, glsl_type::uint_type);

      /* If we're outputting more than 32 control data bits, then EmitVertex()
       * will set control_data_bits to 0 after emitting the first vertex.
       * Otherwise, we need to initialize it to 0 here.
       */
      if (c->control_data_header_size_bits <= 32) {
         this->current_annotation = "initialize control data bits";
         inst = emit(MOV(dst_reg(this->control_data_bits), 0u));
         inst->force_writemask_all = true;
      }
   }

   /* If the geometry shader uses the gl_PointSize input, we need to fix it up
    * to account for the fact that the vertex shader stored it in the w
    * component of VARYING_SLOT_PSIZ.
    */
   if (c->gp->program.Base.InputsRead & VARYING_BIT_PSIZ) {
      this->current_annotation = "swizzle gl_PointSize input";
      for (int vertex = 0; vertex < c->gp->program.VerticesIn; vertex++) {
         dst_reg dst(ATTR,
                     BRW_VARYING_SLOT_COUNT * vertex + VARYING_SLOT_PSIZ);
         dst.type = BRW_REGISTER_TYPE_F;
         /* Copy attribute.w into attribute.x (same register read & written;
          * only the swizzle/writemask differ).
          */
         src_reg src(dst);
         dst.writemask = WRITEMASK_X;
         src.swizzle = BRW_SWIZZLE_WWWW;
         inst = emit(MOV(dst, src));

         /* In dual instanced dispatch mode, dst has a width of 4, so we need
          * to make sure the MOV happens regardless of which channels are
          * enabled.
          */
         inst->force_writemask_all = true;
      }
   }

   this->current_annotation = NULL;
}
  206.  
  207.  
  208. void
  209. vec4_gs_visitor::emit_program_code()
  210. {
  211.    /* We don't support NV_geometry_program4. */
  212.    unreachable("Unreached");
  213. }
  214.  
  215.  
/* Emit the end-of-thread sequence: flush any pending control data bits,
 * then send the final URB write that reports the vertex count and
 * terminates the thread.
 */
void
vec4_gs_visitor::emit_thread_end()
{
   if (c->control_data_header_size_bits > 0) {
      /* During shader execution, we only ever call emit_control_data_bits()
       * just prior to outputting a vertex.  Therefore, the control data bits
       * corresponding to the most recently output vertex still need to be
       * emitted.
       */
      current_annotation = "thread end: emit control data bits";
      emit_control_data_bits();
   }

   /* MRF 0 is reserved for the debugger, so start with message header
    * in MRF 1.
    */
   int base_mrf = 1;

   current_annotation = "thread end";
   dst_reg mrf_reg(MRF, base_mrf);
   /* The message header is a copy of r0 (which holds the URB handles),
    * with the vertex count patched in.
    */
   src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
   vec4_instruction *inst = emit(MOV(mrf_reg, r0));
   inst->force_writemask_all = true;
   emit(GS_OPCODE_SET_VERTEX_COUNT, mrf_reg, this->vertex_count);
   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      emit_shader_time_end();
   inst = emit(GS_OPCODE_THREAD_END);
   inst->base_mrf = base_mrf;
   inst->mlen = 1;
}
  246.  
  247.  
/* Build the message header for a per-vertex URB write starting at MRF
 * @p mrf: copy r0 and patch in the write offset derived from the current
 * vertex count.
 */
void
vec4_gs_visitor::emit_urb_write_header(int mrf)
{
   /* The SEND instruction that writes the vertex data to the VUE will use
    * per_slot_offset=true, which means that DWORDs 3 and 4 of the message
    * header specify an offset (in multiples of 256 bits) into the URB entry
    * at which the write should take place.
    *
    * So we have to prepare a message header with the appropriate offset
    * values.
    */
   dst_reg mrf_reg(MRF, mrf);
   src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
   this->current_annotation = "URB write header";
   vec4_instruction *inst = emit(MOV(mrf_reg, r0));
   inst->force_writemask_all = true;
   /* offset = vertex_count * output_vertex_size_hwords */
   emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, this->vertex_count,
        (uint32_t) c->prog_data.output_vertex_size_hwords);
}
  267.  
  268.  
  269. vec4_instruction *
  270. vec4_gs_visitor::emit_urb_write_opcode(bool complete)
  271. {
  272.    /* We don't care whether the vertex is complete, because in general
  273.     * geometry shaders output multiple vertices, and we don't terminate the
  274.     * thread until all vertices are complete.
  275.     */
  276.    (void) complete;
  277.  
  278.    vec4_instruction *inst = emit(GS_OPCODE_URB_WRITE);
  279.    inst->offset = c->prog_data.control_data_header_size_hwords;
  280.  
  281.    /* We need to increment Global Offset by 1 to make room for Broadwell's
  282.     * extra "Vertex Count" payload at the beginning of the URB entry.
  283.     */
  284.    if (devinfo->gen >= 8)
  285.       inst->offset++;
  286.  
  287.    inst->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;
  288.    return inst;
  289. }
  290.  
  291.  
  292. int
  293. vec4_gs_visitor::compute_array_stride(ir_dereference_array *ir)
  294. {
  295.    /* Geometry shader inputs are arrays, but they use an unusual array layout:
  296.     * instead of all array elements for a given geometry shader input being
  297.     * stored consecutively, all geometry shader inputs are interleaved into
  298.     * one giant array.  At this stage of compilation, we assume that the
  299.     * stride of the array is BRW_VARYING_SLOT_COUNT.  Later,
  300.     * setup_attributes() will remap our accesses to the actual input array.
  301.     */
  302.    ir_dereference_variable *deref_var = ir->array->as_dereference_variable();
  303.    if (deref_var && deref_var->var->data.mode == ir_var_shader_in)
  304.       return BRW_VARYING_SLOT_COUNT;
  305.    else
  306.       return vec4_visitor::compute_array_stride(ir);
  307. }
  308.  
  309.  
/**
 * Write out a batch of 32 control data bits from the control_data_bits
 * register to the URB.
 *
 * The current value of the vertex_count register determines which DWORD in
 * the URB receives the control data bits.  The control_data_bits register is
 * assumed to contain the correct data for the vertex that was most recently
 * output, and all previous vertices that share the same DWORD.
 *
 * This function takes care of ensuring that if no vertices have been output
 * yet, no control bits are emitted.
 */
void
vec4_gs_visitor::emit_control_data_bits()
{
   assert(c->control_data_bits_per_vertex != 0);

   /* Since the URB_WRITE_OWORD message operates with 128-bit (vec4 sized)
    * granularity, we need to use two tricks to ensure that the batch of 32
    * control data bits is written to the appropriate DWORD in the URB.  To
    * select which vec4 we are writing to, we use the "slot {0,1} offset"
    * fields of the message header.  To select which DWORD in the vec4 we are
    * writing to, we use the channel mask fields of the message header.  To
    * avoid penalizing geometry shaders that emit a small number of vertices
    * with extra bookkeeping, we only do each of these tricks when
    * c->prog_data.control_data_header_size_bits is large enough to make it
    * necessary.
    *
    * Note: this means that if we're outputting just a single DWORD of control
    * data bits, we'll actually replicate it four times since we won't do any
    * channel masking.  But that's not a problem since in this case the
    * hardware only pays attention to the first DWORD.
    */
   enum brw_urb_write_flags urb_write_flags = BRW_URB_WRITE_OWORD;
   if (c->control_data_header_size_bits > 32)
      urb_write_flags = urb_write_flags | BRW_URB_WRITE_USE_CHANNEL_MASKS;
   if (c->control_data_header_size_bits > 128)
      urb_write_flags = urb_write_flags | BRW_URB_WRITE_PER_SLOT_OFFSET;

   /* If vertex_count is 0, then no control data bits have been accumulated
    * yet, so we should do nothing.
    */
   emit(CMP(dst_null_d(), this->vertex_count, 0u, BRW_CONDITIONAL_NEQ));
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      /* If we are using either channel masks or a per-slot offset, then we
       * need to figure out which DWORD we are trying to write to, using the
       * formula:
       *
       *     dword_index = (vertex_count - 1) * bits_per_vertex / 32
       *
       * Since bits_per_vertex is a power of two, and is known at compile
       * time, this can be optimized to:
       *
       *     dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
       */
      src_reg dword_index(this, glsl_type::uint_type);
      if (urb_write_flags) {
         /* prev_count = vertex_count - 1 (adding 0xffffffff wraps around) */
         src_reg prev_count(this, glsl_type::uint_type);
         emit(ADD(dst_reg(prev_count), this->vertex_count, 0xffffffffu));
         unsigned log2_bits_per_vertex =
            _mesa_fls(c->control_data_bits_per_vertex);
         emit(SHR(dst_reg(dword_index), prev_count,
                  (uint32_t) (6 - log2_bits_per_vertex)));
      }

      /* Start building the URB write message.  The first MRF gets a copy of
       * R0.
       */
      int base_mrf = 1;
      dst_reg mrf_reg(MRF, base_mrf);
      src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
      vec4_instruction *inst = emit(MOV(mrf_reg, r0));
      inst->force_writemask_all = true;

      if (urb_write_flags & BRW_URB_WRITE_PER_SLOT_OFFSET) {
         /* Set the per-slot offset to dword_index / 4, so that we'll write to
          * the appropriate OWORD within the control data header.
          */
         src_reg per_slot_offset(this, glsl_type::uint_type);
         emit(SHR(dst_reg(per_slot_offset), dword_index, 2u));
         emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, per_slot_offset, 1u);
      }

      if (urb_write_flags & BRW_URB_WRITE_USE_CHANNEL_MASKS) {
         /* Set the channel masks to 1 << (dword_index % 4), so that we'll
          * write to the appropriate DWORD within the OWORD.  We need to do
          * this computation with force_writemask_all, otherwise garbage data
          * from invocation 0 might clobber the mask for invocation 1 when
          * GS_OPCODE_PREPARE_CHANNEL_MASKS tries to OR the two masks
          * together.
          */
         src_reg channel(this, glsl_type::uint_type);
         inst = emit(AND(dst_reg(channel), dword_index, 3u));
         inst->force_writemask_all = true;
         src_reg one(this, glsl_type::uint_type);
         inst = emit(MOV(dst_reg(one), 1u));
         inst->force_writemask_all = true;
         src_reg channel_mask(this, glsl_type::uint_type);
         inst = emit(SHL(dst_reg(channel_mask), one, channel));
         inst->force_writemask_all = true;
         emit(GS_OPCODE_PREPARE_CHANNEL_MASKS, dst_reg(channel_mask),
                                               channel_mask);
         emit(GS_OPCODE_SET_CHANNEL_MASKS, mrf_reg, channel_mask);
      }

      /* Store the control data bits in the message payload and send it. */
      dst_reg mrf_reg2(MRF, base_mrf + 1);
      inst = emit(MOV(mrf_reg2, this->control_data_bits));
      inst->force_writemask_all = true;
      inst = emit(GS_OPCODE_URB_WRITE);
      inst->urb_write_flags = urb_write_flags;
      /* We need to increment Global Offset by 256-bits to make room for
       * Broadwell's extra "Vertex Count" payload at the beginning of the
       * URB entry.  Since this is an OWord message, Global Offset is counted
       * in 128-bit units, so we must set it to 2.
       */
      if (devinfo->gen >= 8)
         inst->offset = 2;
      inst->base_mrf = base_mrf;
      inst->mlen = 2;
   }
   emit(BRW_OPCODE_ENDIF);
}
  434.  
  435. void
  436. vec4_gs_visitor::set_stream_control_data_bits(unsigned stream_id)
  437. {
  438.    /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
  439.  
  440.    /* Note: we are calling this *before* increasing vertex_count, so
  441.     * this->vertex_count == vertex_count - 1 in the formula above.
  442.     */
  443.  
  444.    /* Stream mode uses 2 bits per vertex */
  445.    assert(c->control_data_bits_per_vertex == 2);
  446.  
  447.    /* Must be a valid stream */
  448.    assert(stream_id >= 0 && stream_id < MAX_VERTEX_STREAMS);
  449.  
  450.    /* Control data bits are initialized to 0 so we don't have to set any
  451.     * bits when sending vertices to stream 0.
  452.     */
  453.    if (stream_id == 0)
  454.       return;
  455.  
  456.    /* reg::sid = stream_id */
  457.    src_reg sid(this, glsl_type::uint_type);
  458.    emit(MOV(dst_reg(sid), stream_id));
  459.  
  460.    /* reg:shift_count = 2 * (vertex_count - 1) */
  461.    src_reg shift_count(this, glsl_type::uint_type);
  462.    emit(SHL(dst_reg(shift_count), this->vertex_count, 1u));
  463.  
  464.    /* Note: we're relying on the fact that the GEN SHL instruction only pays
  465.     * attention to the lower 5 bits of its second source argument, so on this
  466.     * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
  467.     * stream_id << ((2 * (vertex_count - 1)) % 32).
  468.     */
  469.    src_reg mask(this, glsl_type::uint_type);
  470.    emit(SHL(dst_reg(mask), sid, shift_count));
  471.    emit(OR(dst_reg(this->control_data_bits), this->control_data_bits, mask));
  472. }
  473.  
/* Handle EmitVertex()/EmitStreamVertex(): flush control data bits when a
 * 32-bit batch fills up, write the vertex to the URB, record its stream ID,
 * and bump the vertex counter — all guarded by vertex_count < max_vertices.
 */
void
vec4_gs_visitor::visit(ir_emit_vertex *ir)
{
   this->current_annotation = "emit vertex: safety check";

   /* Haswell and later hardware ignores the "Render Stream Select" bits
    * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
    * and instead sends all primitives down the pipeline for rasterization.
    * If the SOL stage is enabled, "Render Stream Select" is honored and
    * primitives bound to non-zero streams are discarded after stream output.
    *
    * Since the only purpose of primitives sent to non-zero streams is to
    * be recorded by transform feedback, we can simply discard all geometry
    * bound to these streams when transform feedback is disabled.
    */
   if (ir->stream_id() > 0 && shader_prog->TransformFeedback.NumVarying == 0)
      return;

   /* To ensure that we don't output more vertices than the shader specified
    * using max_vertices, do the logic inside a conditional of the form "if
    * (vertex_count < MAX)"
    */
   unsigned num_output_vertices = c->gp->program.VerticesOut;
   emit(CMP(dst_null_d(), this->vertex_count,
            src_reg(num_output_vertices), BRW_CONDITIONAL_L));
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      /* If we're outputting 32 control data bits or less, then we can wait
       * until the shader is over to output them all.  Otherwise we need to
       * output them as we go.  Now is the time to do it, since we're about to
       * output the vertex_count'th vertex, so it's guaranteed that the
       * control data bits associated with the (vertex_count - 1)th vertex are
       * correct.
       */
      if (c->control_data_header_size_bits > 32) {
         this->current_annotation = "emit vertex: emit control data bits";
         /* Only emit control data bits if we've finished accumulating a batch
          * of 32 bits.  This is the case when:
          *
          *     (vertex_count * bits_per_vertex) % 32 == 0
          *
          * (in other words, when the last 5 bits of vertex_count *
          * bits_per_vertex are 0).  Assuming bits_per_vertex == 2^n for some
          * integer n (which is always the case, since bits_per_vertex is
          * always 1 or 2), this is equivalent to requiring that the last 5-n
          * bits of vertex_count are 0:
          *
          *     vertex_count & (2^(5-n) - 1) == 0
          *
          * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
          * equivalent to:
          *
          *     vertex_count & (32 / bits_per_vertex - 1) == 0
          */
         vec4_instruction *inst =
            emit(AND(dst_null_d(), this->vertex_count,
                     (uint32_t) (32 / c->control_data_bits_per_vertex - 1)));
         inst->conditional_mod = BRW_CONDITIONAL_Z;
         emit(IF(BRW_PREDICATE_NORMAL));
         {
            emit_control_data_bits();

            /* Reset control_data_bits to 0 so we can start accumulating a new
             * batch.
             *
             * Note: in the case where vertex_count == 0, this neutralizes the
             * effect of any call to EndPrimitive() that the shader may have
             * made before outputting its first vertex.
             */
            inst = emit(MOV(dst_reg(this->control_data_bits), 0u));
            inst->force_writemask_all = true;
         }
         emit(BRW_OPCODE_ENDIF);
      }

      this->current_annotation = "emit vertex: vertex data";
      emit_vertex();

      /* In stream mode we have to set control data bits for all vertices
       * unless we have disabled control data bits completely (which we do
       * do for GL_POINTS outputs that don't use streams).
       */
      if (c->control_data_header_size_bits > 0 &&
          c->prog_data.control_data_format ==
             GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
          this->current_annotation = "emit vertex: Stream control data bits";
          set_stream_control_data_bits(ir->stream_id());
      }

      this->current_annotation = "emit vertex: increment vertex count";
      emit(ADD(dst_reg(this->vertex_count), this->vertex_count,
               src_reg(1u)));
   }
   emit(BRW_OPCODE_ENDIF);

   this->current_annotation = NULL;
}
  571.  
/* Handle EndPrimitive(): set the cut bit for the most recently emitted
 * vertex.  A no-op when the output type is points (no cut bits).
 */
void
vec4_gs_visitor::visit(ir_end_primitive *)
{
   /* We can only do EndPrimitive() functionality when the control data
    * consists of cut bits.  Fortunately, the only time it isn't is when the
    * output type is points, in which case EndPrimitive() is a no-op.
    */
   if (c->prog_data.control_data_format !=
       GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
      return;
   }

   /* Cut bits use one bit per vertex. */
   assert(c->control_data_bits_per_vertex == 1);

   /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
    * vertex n, 0 otherwise.  So all we need to do here is mark bit
    * (vertex_count - 1) % 32 in the cut_bits register to indicate that
    * EndPrimitive() was called after emitting vertex (vertex_count - 1);
    * vec4_gs_visitor::emit_control_data_bits() will take care of the rest.
    *
    * Note that if EndPrimitive() is called before emitting any vertices, this
    * will cause us to set bit 31 of the control_data_bits register to 1.
    * That's fine because:
    *
    * - If max_vertices < 32, then vertex number 31 (zero-based) will never be
    *   output, so the hardware will ignore cut bit 31.
    *
    * - If max_vertices == 32, then vertex number 31 is guaranteed to be the
    *   last vertex, so setting cut bit 31 has no effect (since the primitive
    *   is automatically ended when the GS terminates).
    *
    * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the
    *   control_data_bits register to 0 when the first vertex is emitted.
    */

   /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
   src_reg one(this, glsl_type::uint_type);
   emit(MOV(dst_reg(one), 1u));
   /* prev_count = vertex_count - 1 (adding 0xffffffff wraps around) */
   src_reg prev_count(this, glsl_type::uint_type);
   emit(ADD(dst_reg(prev_count), this->vertex_count, 0xffffffffu));
   src_reg mask(this, glsl_type::uint_type);
   /* Note: we're relying on the fact that the GEN SHL instruction only pays
    * attention to the lower 5 bits of its second source argument, so on this
    * architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
    * ((vertex_count - 1) % 32).
    */
   emit(SHL(dst_reg(mask), one, prev_count));
   emit(OR(dst_reg(this->control_data_bits), this->control_data_bits, mask));
}
  622.  
  623. static const unsigned *
  624. generate_assembly(struct brw_context *brw,
  625.                   struct gl_shader_program *shader_prog,
  626.                   struct gl_program *prog,
  627.                   struct brw_vue_prog_data *prog_data,
  628.                   void *mem_ctx,
  629.                   const cfg_t *cfg,
  630.                   unsigned *final_assembly_size)
  631. {
  632.    vec4_generator g(brw, shader_prog, prog, prog_data, mem_ctx,
  633.                     INTEL_DEBUG & DEBUG_GS, "geometry", "GS");
  634.    return g.generate_assembly(cfg, final_assembly_size);
  635. }
  636.  
/* Compile a geometry shader and return the generated machine code.
 *
 * Strategy: on Gen7+, first try DUAL_OBJECT dispatch mode with spilling
 * forbidden (fastest mode, but only valid when invocations <= 1); if that
 * fails or is inapplicable, fall back to SINGLE (Gen6, or invocations <= 1)
 * or DUAL_INSTANCE mode with spilling allowed.  On failure, LinkStatus is
 * cleared and the failure message appended to the program's InfoLog; returns
 * NULL in that case.
 */
extern "C" const unsigned *
brw_gs_emit(struct brw_context *brw,
            struct gl_shader_program *prog,
            struct brw_gs_compile *c,
            void *mem_ctx,
            unsigned *final_assembly_size)
{
   if (unlikely(INTEL_DEBUG & DEBUG_GS)) {
      /* NOTE(review): shader is dereferenced without a NULL check; this
       * presumably relies on a linked GS always being present here — confirm
       * against callers.
       */
      struct brw_shader *shader =
         (brw_shader *) prog->_LinkedShaders[MESA_SHADER_GEOMETRY];

      brw_dump_ir("geometry", prog, &shader->base, NULL);
   }

   if (brw->gen >= 7) {
      /* Compile the geometry shader in DUAL_OBJECT dispatch mode, if we can do
       * so without spilling. If the GS invocations count > 1, then we can't use
       * dual object mode.
       */
      if (c->prog_data.invocations <= 1 &&
          likely(!(INTEL_DEBUG & DEBUG_NO_DUAL_OBJECT_GS))) {
         c->prog_data.dispatch_mode = GEN7_GS_DISPATCH_MODE_DUAL_OBJECT;

         vec4_gs_visitor v(brw, c, prog, mem_ctx, true /* no_spills */);
         if (v.run()) {
            return generate_assembly(brw, prog, &c->gp->program.Base,
                                     &c->prog_data.base, mem_ctx, v.cfg,
                                     final_assembly_size);
         }
      }
   }

   /* Either we failed to compile in DUAL_OBJECT mode (probably because it
    * would have required spilling) or DUAL_OBJECT mode is disabled.  So fall
    * back to DUAL_INSTANCED or SINGLE mode, which consumes fewer registers.
    *
    * FIXME: Single dispatch mode requires that the driver can handle
    * interleaving of input registers, but this is already supported (dual
    * instance mode has the same requirement). However, to take full advantage
    * of single dispatch mode to reduce register pressure we would also need to
    * do interleaved outputs, but currently, the vec4 visitor and generator
    * classes do not support this, so at the moment register pressure in
    * single and dual instance modes is the same.
    *
    * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 "3DSTATE_GS"
    * "If InstanceCount>1, DUAL_OBJECT mode is invalid. Software will likely
    * want to use DUAL_INSTANCE mode for higher performance, but SINGLE mode
    * is also supported. When InstanceCount=1 (one instance per object) software
    * can decide which dispatch mode to use. DUAL_OBJECT mode would likely be
    * the best choice for performance, followed by SINGLE mode."
    *
    * So SINGLE mode is more performant when invocations == 1 and DUAL_INSTANCE
    * mode is more performant when invocations > 1. Gen6 only supports
    * SINGLE mode.
    */
   if (c->prog_data.invocations <= 1 || brw->gen < 7)
      c->prog_data.dispatch_mode = GEN7_GS_DISPATCH_MODE_SINGLE;
   else
      c->prog_data.dispatch_mode = GEN7_GS_DISPATCH_MODE_DUAL_INSTANCE;

   vec4_gs_visitor *gs = NULL;
   const unsigned *ret = NULL;

   /* Gen6 uses its own visitor subclass; both are owned here and freed below. */
   if (brw->gen >= 7)
      gs = new vec4_gs_visitor(brw, c, prog, mem_ctx, false /* no_spills */);
   else
      gs = new gen6_gs_visitor(brw, c, prog, mem_ctx, false /* no_spills */);

   if (!gs->run()) {
      prog->LinkStatus = false;
      ralloc_strcat(&prog->InfoLog, gs->fail_msg);
   } else {
      ret = generate_assembly(brw, prog, &c->gp->program.Base,
                              &c->prog_data.base, mem_ctx, gs->cfg,
                              final_assembly_size);
   }

   delete gs;
   return ret;
}
  717.  
  718.  
  719. } /* namespace brw */
  720.