  1. /*
  2.  * Copyright © 2011 Intel Corporation
  3.  *
  4.  * Permission is hereby granted, free of charge, to any person obtaining a
  5.  * copy of this software and associated documentation files (the "Software"),
  6.  * to deal in the Software without restriction, including without limitation
  7.  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  8.  * and/or sell copies of the Software, and to permit persons to whom the
  9.  * Software is furnished to do so, subject to the following conditions:
  10.  *
  11.  * The above copyright notice and this permission notice (including the next
  12.  * paragraph) shall be included in all copies or substantial portions of the
  13.  * Software.
  14.  *
  15.  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16.  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17.  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18.  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19.  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20.  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21.  * IN THE SOFTWARE.
  22.  */
  23.  
  24. #include "brw_vec4.h"
  25. #include "glsl/ir_uniform.h"
  26. extern "C" {
  27. #include "main/context.h"
  28. #include "main/macros.h"
  29. #include "program/prog_parameter.h"
  30. #include "program/sampler.h"
  31. }
  32.  
  33. namespace brw {
  34.  
  35. vec4_instruction::vec4_instruction(vec4_visitor *v,
  36.                                    enum opcode opcode, dst_reg dst,
  37.                                    src_reg src0, src_reg src1, src_reg src2)
  38. {
  39.    this->opcode = opcode;
  40.    this->dst = dst;
  41.    this->src[0] = src0;
  42.    this->src[1] = src1;
  43.    this->src[2] = src2;
  44.    this->ir = v->base_ir;
  45.    this->annotation = v->current_annotation;
  46. }
  47.  
  48. vec4_instruction *
  49. vec4_visitor::emit(vec4_instruction *inst)
  50. {
  51.    this->instructions.push_tail(inst);
  52.  
  53.    return inst;
  54. }
  55.  
  56. vec4_instruction *
  57. vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
  58. {
  59.    new_inst->ir = inst->ir;
  60.    new_inst->annotation = inst->annotation;
  61.  
  62.    inst->insert_before(new_inst);
  63.  
  64.    return inst;
  65. }
  66.  
  67. vec4_instruction *
  68. vec4_visitor::emit(enum opcode opcode, dst_reg dst,
  69.                    src_reg src0, src_reg src1, src_reg src2)
  70. {
  71.    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
  72.                                              src0, src1, src2));
  73. }
  74.  
  75.  
  76. vec4_instruction *
  77. vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
  78. {
  79.    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
  80. }
  81.  
  82. vec4_instruction *
  83. vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
  84. {
  85.    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
  86. }
  87.  
  88. vec4_instruction *
  89. vec4_visitor::emit(enum opcode opcode)
  90. {
  91.    return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
  92. }
  93.  
  94. #define ALU1(op)                                                        \
  95.    vec4_instruction *                                                   \
  96.    vec4_visitor::op(dst_reg dst, src_reg src0)                          \
  97.    {                                                                    \
  98.       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
  99.                                            src0);                       \
  100.    }
  101.  
  102. #define ALU2(op)                                                        \
  103.    vec4_instruction *                                                   \
  104.    vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1)            \
  105.    {                                                                    \
  106.       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
  107.                                            src0, src1);                 \
  108.    }
  109.  
  110. #define ALU3(op)                                                        \
  111.    vec4_instruction *                                                   \
  112.    vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
  113.    {                                                                    \
  114.       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
  115.                                            src0, src1, src2);           \
  116.    }
  117.  
  118. ALU1(NOT)
  119. ALU1(MOV)
  120. ALU1(FRC)
  121. ALU1(RNDD)
  122. ALU1(RNDE)
  123. ALU1(RNDZ)
  124. ALU1(F32TO16)
  125. ALU1(F16TO32)
  126. ALU2(ADD)
  127. ALU2(MUL)
  128. ALU2(MACH)
  129. ALU2(AND)
  130. ALU2(OR)
  131. ALU2(XOR)
  132. ALU2(DP3)
  133. ALU2(DP4)
  134. ALU2(DPH)
  135. ALU2(SHL)
  136. ALU2(SHR)
  137. ALU2(ASR)
  138. ALU3(LRP)
  139. ALU1(BFREV)
  140. ALU3(BFE)
  141. ALU2(BFI1)
  142. ALU3(BFI2)
  143. ALU1(FBH)
  144. ALU1(FBL)
  145. ALU1(CBIT)
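/* For illustration, ALU2(ADD) above expands to roughly the following helper,
 * with BRW_OPCODE_##op becoming BRW_OPCODE_ADD:
 *
 *    vec4_instruction *
 *    vec4_visitor::ADD(dst_reg dst, src_reg src0, src_reg src1)
 *    {
 *       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_ADD, dst,
 *                                            src0, src1);
 *    }
 *
 * Note that these helpers only construct the instruction; the caller still
 * passes the result to emit() (or emit_before()) to append it to the
 * instruction stream, as in emit(ADD(dst, a, b)).
 */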
  146.  
  147. /** Gen4 predicated IF. */
  148. vec4_instruction *
  149. vec4_visitor::IF(uint32_t predicate)
  150. {
  151.    vec4_instruction *inst;
  152.  
  153.    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
  154.    inst->predicate = predicate;
  155.  
  156.    return inst;
  157. }
  158.  
  159. /** Gen6+ IF with embedded comparison. */
  160. vec4_instruction *
  161. vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
  162. {
  163.    assert(brw->gen >= 6);
  164.  
  165.    vec4_instruction *inst;
  166.  
  167.    resolve_ud_negate(&src0);
  168.    resolve_ud_negate(&src1);
  169.  
  170.    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
  171.                                         src0, src1);
  172.    inst->conditional_mod = condition;
  173.  
  174.    return inst;
  175. }
  176.  
  177. /**
  178.  * CMP: Sets the low bit of the destination channels with the result
  179.  * of the comparison, while the upper bits are undefined, and updates
  180.  * the flag register with the packed 16 bits of the result.
  181.  */
  182. vec4_instruction *
  183. vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
  184. {
  185.    vec4_instruction *inst;
  186.  
  187.    /* original gen4 does type conversion to the destination type
  188.  * before comparison, producing garbage results for floating
  189.     * point comparisons.
  190.     */
  191.    if (brw->gen == 4) {
  192.       dst.type = src0.type;
  193.       if (dst.file == HW_REG)
  194.          dst.fixed_hw_reg.type = dst.type;
  195.    }
  196.  
  197.    resolve_ud_negate(&src0);
  198.    resolve_ud_negate(&src1);
  199.  
  200.    inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
  201.    inst->conditional_mod = condition;
  202.  
  203.    return inst;
  204. }
  205.  
  206. vec4_instruction *
  207. vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
  208. {
  209.    vec4_instruction *inst;
  210.  
  211.    inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
  212.                                         dst, index);
  213.    inst->base_mrf = 14;
  214.    inst->mlen = 2;
  215.  
  216.    return inst;
  217. }
  218.  
  219. vec4_instruction *
  220. vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
  221. {
  222.    vec4_instruction *inst;
  223.  
  224.    inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
  225.                                         dst, src, index);
  226.    inst->base_mrf = 13;
  227.    inst->mlen = 3;
  228.  
  229.    return inst;
  230. }
  231.  
  232. void
  233. vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
  234. {
  235.    static enum opcode dot_opcodes[] = {
  236.       BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
  237.    };
  238.  
  239.    emit(dot_opcodes[elements - 2], dst, src0, src1);
  240. }
  241.  
  242. src_reg
  243. vec4_visitor::fix_3src_operand(src_reg src)
  244. {
  245.    /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
  246.     * able to use vertical stride of zero to replicate the vec4 uniform, like
  247.     *
  248.     *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
  249.     *
  250.     * But you can't, since vertical stride is always four in three-source
  251.     * instructions. Instead, insert a MOV instruction to do the replication so
  252.     * that the three-source instruction can consume it.
  253.     */
  254.  
  255.    /* The MOV is only needed if the source is a uniform or immediate. */
  256.    if (src.file != UNIFORM && src.file != IMM)
  257.       return src;
  258.  
  259.    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
  260.    expanded.type = src.type;
  261.    emit(MOV(expanded, src));
  262.    return src_reg(expanded);
  263. }
  264.  
  265. src_reg
  266. vec4_visitor::fix_math_operand(src_reg src)
  267. {
  268.    /* The gen6 math instruction ignores the source modifiers --
  269.     * swizzle, abs, negate, and at least some parts of the register
  270.     * region description.
  271.     *
  272.     * Rather than trying to enumerate all these cases, *always* expand the
  273.     * operand to a temp GRF for gen6.
  274.     *
  275.     * For gen7, keep the operand as-is, except if immediate, which gen7 still
  276.     * can't use.
  277.     */
  278.  
  279.    if (brw->gen == 7 && src.file != IMM)
  280.       return src;
  281.  
  282.    dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
  283.    expanded.type = src.type;
  284.    emit(MOV(expanded, src));
  285.    return src_reg(expanded);
  286. }
  287.  
  288. void
  289. vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
  290. {
  291.    src = fix_math_operand(src);
  292.  
  293.    if (dst.writemask != WRITEMASK_XYZW) {
  294.       /* The gen6 math instruction must be align1, so we can't do
  295.        * writemasks.
  296.        */
  297.       dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
  298.  
  299.       emit(opcode, temp_dst, src);
  300.  
  301.       emit(MOV(dst, src_reg(temp_dst)));
  302.    } else {
  303.       emit(opcode, dst, src);
  304.    }
  305. }
  306.  
  307. void
  308. vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
  309. {
  310.    vec4_instruction *inst = emit(opcode, dst, src);
  311.    inst->base_mrf = 1;
  312.    inst->mlen = 1;
  313. }
  314.  
  315. void
  316. vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
  317. {
  318.    switch (opcode) {
  319.    case SHADER_OPCODE_RCP:
  320.    case SHADER_OPCODE_RSQ:
  321.    case SHADER_OPCODE_SQRT:
  322.    case SHADER_OPCODE_EXP2:
  323.    case SHADER_OPCODE_LOG2:
  324.    case SHADER_OPCODE_SIN:
  325.    case SHADER_OPCODE_COS:
  326.       break;
  327.    default:
  328.       assert(!"not reached: bad math opcode");
  329.       return;
  330.    }
  331.  
  332.    if (brw->gen >= 6) {
  333.       return emit_math1_gen6(opcode, dst, src);
  334.    } else {
  335.       return emit_math1_gen4(opcode, dst, src);
  336.    }
  337. }
  338.  
  339. void
  340. vec4_visitor::emit_math2_gen6(enum opcode opcode,
  341.                               dst_reg dst, src_reg src0, src_reg src1)
  342. {
  343.    src0 = fix_math_operand(src0);
  344.    src1 = fix_math_operand(src1);
  345.  
  346.    if (dst.writemask != WRITEMASK_XYZW) {
  347.       /* The gen6 math instruction must be align1, so we can't do
  348.        * writemasks.
  349.        */
  350.       dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
  351.       temp_dst.type = dst.type;
  352.  
  353.       emit(opcode, temp_dst, src0, src1);
  354.  
  355.       emit(MOV(dst, src_reg(temp_dst)));
  356.    } else {
  357.       emit(opcode, dst, src0, src1);
  358.    }
  359. }
  360.  
  361. void
  362. vec4_visitor::emit_math2_gen4(enum opcode opcode,
  363.                               dst_reg dst, src_reg src0, src_reg src1)
  364. {
  365.    vec4_instruction *inst = emit(opcode, dst, src0, src1);
  366.    inst->base_mrf = 1;
  367.    inst->mlen = 2;
  368. }
  369.  
  370. void
  371. vec4_visitor::emit_math(enum opcode opcode,
  372.                         dst_reg dst, src_reg src0, src_reg src1)
  373. {
  374.    switch (opcode) {
  375.    case SHADER_OPCODE_POW:
  376.    case SHADER_OPCODE_INT_QUOTIENT:
  377.    case SHADER_OPCODE_INT_REMAINDER:
  378.       break;
  379.    default:
  380.       assert(!"not reached: unsupported binary math opcode");
  381.       return;
  382.    }
  383.  
  384.    if (brw->gen >= 6) {
  385.       return emit_math2_gen6(opcode, dst, src0, src1);
  386.    } else {
  387.       return emit_math2_gen4(opcode, dst, src0, src1);
  388.    }
  389. }
  390.  
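/* Worked example for the packing below: packHalf2x16(vec2(1.0, -2.0)) should
 * yield 0xc0003c00, since f32to16 produces 0x3c00 for 1.0 (tmp.x, which ends
 * up in the low word of the result) and 0xc000 for -2.0 (tmp.y, moved into
 * the high word by the SHL/OR sequence).
 */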
  391. void
  392. vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
  393. {
  394.    if (brw->gen < 7)
  395.       assert(!"ir_unop_pack_half_2x16 should be lowered");
  396.  
  397.    assert(dst.type == BRW_REGISTER_TYPE_UD);
  398.    assert(src0.type == BRW_REGISTER_TYPE_F);
  399.  
  400.    /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
  401.     *
  402.     *   Because this instruction does not have a 16-bit floating-point type,
  403.     *   the destination data type must be Word (W).
  404.     *
  405.     *   The destination must be DWord-aligned and specify a horizontal stride
  406.     *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
  407.     *   each destination channel and the upper word is not modified.
  408.     *
  409.     * The above restriction implies that the f32to16 instruction must use
  410.     * align1 mode, because only in align1 mode is it possible to specify
  411.     * horizontal stride.  We choose here to defy the hardware docs and emit
  412.     * align16 instructions.
  413.     *
  414.     * (I [chadv] did attempt to emit align1 instructions for VS f32to16
  415.     * instructions. I was partially successful in that the code passed all
  416.     * tests.  However, the code was dubiously correct and fragile, and the
  417.     * tests were not harsh enough to probe that frailty. Not trusting the
  418.     * code, I chose instead to remain in align16 mode in defiance of the hw
  419.     * docs).
  420.     *
  421.     * I've [chadv] experimentally confirmed that, on gen7 hardware and the
  422.     * simulator, emitting a f32to16 in align16 mode with UD as destination
  423.     * data type is safe. The behavior differs from that specified in the PRM
  424.     * in that the upper word of each destination channel is cleared to 0.
  425.     */
  426.  
  427.    dst_reg tmp_dst(this, glsl_type::uvec2_type);
  428.    src_reg tmp_src(tmp_dst);
  429.  
  430. #if 0
  431.    /* Verify the undocumented behavior on which the following instructions
  432.     * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
  433.     * then the result of the bit-or instruction below will be incorrect.
  434.     *
  435.     * You should inspect the disasm output in order to verify that the MOV is
  436.     * not optimized away.
  437.     */
  438.    emit(MOV(tmp_dst, src_reg(0x12345678u)));
  439. #endif
  440.  
  441.    /* Give tmp the form below, where "." means untouched.
  442.     *
  443.     *     w z          y          x w z          y          x
  444.     *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
  445.     *
  446.     * That the upper word of each write-channel be 0 is required for the
  447.     * following bit-shift and bit-or instructions to work. Note that this
  448.     * relies on the undocumented hardware behavior mentioned above.
  449.     */
  450.    tmp_dst.writemask = WRITEMASK_XY;
  451.    emit(F32TO16(tmp_dst, src0));
  452.  
  453.    /* Give the write-channels of dst the form:
  454.     *   0xhhhh0000
  455.     */
  456.    tmp_src.swizzle = BRW_SWIZZLE_YYYY;
  457.    emit(SHL(dst, tmp_src, src_reg(16u)));
  458.  
  459.    /* Finally, give the write-channels of dst the form of packHalf2x16's
  460.     * output:
  461.     *   0xhhhhllll
  462.     */
  463.    tmp_src.swizzle = BRW_SWIZZLE_XXXX;
  464.    emit(OR(dst, src_reg(dst), tmp_src));
  465. }
  466.  
  467. void
  468. vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
  469. {
  470.    if (brw->gen < 7)
  471.       assert(!"ir_unop_unpack_half_2x16 should be lowered");
  472.  
  473.    assert(dst.type == BRW_REGISTER_TYPE_F);
  474.    assert(src0.type == BRW_REGISTER_TYPE_UD);
  475.  
  476.    /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
  477.     *
  478.     *   Because this instruction does not have a 16-bit floating-point type,
  479.     *   the source data type must be Word (W). The destination type must be
  480.     *   F (Float).
  481.     *
  482.     * To use W as the source data type, we must adjust horizontal strides,
  483.     * which is only possible in align1 mode. All my [chadv] attempts at
  484.     * emitting align1 instructions for unpackHalf2x16 failed to pass the
  485.     * Piglit tests, so I gave up.
  486.     *
  487.     * I've verified that, on gen7 hardware and the simulator, it is safe to
  488.     * emit f16to32 in align16 mode with UD as source data type.
  489.     */
  490.  
  491.    dst_reg tmp_dst(this, glsl_type::uvec2_type);
  492.    src_reg tmp_src(tmp_dst);
  493.  
  494.    tmp_dst.writemask = WRITEMASK_X;
  495.    emit(AND(tmp_dst, src0, src_reg(0xffffu)));
  496.  
  497.    tmp_dst.writemask = WRITEMASK_Y;
  498.    emit(SHR(tmp_dst, src0, src_reg(16u)));
  499.  
  500.    dst.writemask = WRITEMASK_XY;
  501.    emit(F16TO32(dst, tmp_src));
  502. }
  503.  
  504. void
  505. vec4_visitor::visit_instructions(const exec_list *list)
  506. {
  507.    foreach_list(node, list) {
  508.       ir_instruction *ir = (ir_instruction *)node;
  509.  
  510.       base_ir = ir;
  511.       ir->accept(this);
  512.    }
  513. }
  514.  
  515.  
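/* Returns the number of vec4 slots a value of the given GLSL type occupies
 * in this backend.  For example, following the logic below: float, vec2,
 * vec3 and vec4 each take 1 slot, a mat4 takes 4 (one per column), float[8]
 * takes 8, and struct { vec3 a; mat3 b; } takes 1 + 3 = 4.
 */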
  516. static int
  517. type_size(const struct glsl_type *type)
  518. {
  519.    unsigned int i;
  520.    int size;
  521.  
  522.    switch (type->base_type) {
  523.    case GLSL_TYPE_UINT:
  524.    case GLSL_TYPE_INT:
  525.    case GLSL_TYPE_FLOAT:
  526.    case GLSL_TYPE_BOOL:
  527.       if (type->is_matrix()) {
  528.          return type->matrix_columns;
  529.       } else {
  530.          /* Regardless of size of vector, it gets a vec4. This is bad
  531.           * packing for things like floats, but otherwise arrays become a
  532.           * mess.  Hopefully a later pass over the code can pack scalars
  533.           * down if appropriate.
  534.           */
  535.          return 1;
  536.       }
  537.    case GLSL_TYPE_ARRAY:
  538.       assert(type->length > 0);
  539.       return type_size(type->fields.array) * type->length;
  540.    case GLSL_TYPE_STRUCT:
  541.       size = 0;
  542.       for (i = 0; i < type->length; i++) {
  543.          size += type_size(type->fields.structure[i].type);
  544.       }
  545.       return size;
  546.    case GLSL_TYPE_SAMPLER:
  547.       /* Samplers take up one slot in UNIFORMS[], but they're baked in
  548.        * at link time.
  549.        */
  550.       return 1;
  551.    case GLSL_TYPE_VOID:
  552.    case GLSL_TYPE_ERROR:
  553.    case GLSL_TYPE_INTERFACE:
  554.       assert(0);
  555.       break;
  556.    }
  557.  
  558.    return 0;
  559. }
  560.  
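/* Allocates a new virtual GRF of 'size' vec4 registers and returns its
 * index.  The bookkeeping arrays grow by doubling (starting at 16 entries);
 * virtual_grf_sizes[] records the size of each virtual GRF, and
 * virtual_grf_reg_map[] records the offset of its first register within the
 * flat virtual register space.
 */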
  561. int
  562. vec4_visitor::virtual_grf_alloc(int size)
  563. {
  564.    if (virtual_grf_array_size <= virtual_grf_count) {
  565.       if (virtual_grf_array_size == 0)
  566.          virtual_grf_array_size = 16;
  567.       else
  568.          virtual_grf_array_size *= 2;
  569.       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
  570.                                    virtual_grf_array_size);
  571.       virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
  572.                                      virtual_grf_array_size);
  573.    }
  574.    virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
  575.    virtual_grf_reg_count += size;
  576.    virtual_grf_sizes[virtual_grf_count] = size;
  577.    return virtual_grf_count++;
  578. }
  579.  
  580. src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
  581. {
  582.    init();
  583.  
  584.    this->file = GRF;
  585.    this->reg = v->virtual_grf_alloc(type_size(type));
  586.  
  587.    if (type->is_array() || type->is_record()) {
  588.       this->swizzle = BRW_SWIZZLE_NOOP;
  589.    } else {
  590.       this->swizzle = swizzle_for_size(type->vector_elements);
  591.    }
  592.  
  593.    this->type = brw_type_for_base_type(type);
  594. }
  595.  
  596. dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
  597. {
  598.    init();
  599.  
  600.    this->file = GRF;
  601.    this->reg = v->virtual_grf_alloc(type_size(type));
  602.  
  603.    if (type->is_array() || type->is_record()) {
  604.       this->writemask = WRITEMASK_XYZW;
  605.    } else {
  606.       this->writemask = (1 << type->vector_elements) - 1;
  607.    }
  608.  
  609.    this->type = brw_type_for_base_type(type);
  610. }
  611.  
  612. /* Our support for uniforms is piggy-backed on the struct
  613.  * gl_fragment_program, because that's where the values actually
  614.  * get stored, rather than in some global gl_shader_program uniform
  615.  * store.
  616.  */
  617. void
  618. vec4_visitor::setup_uniform_values(ir_variable *ir)
  619. {
  620.    int namelen = strlen(ir->name);
  621.  
  622.    /* The data for our (non-builtin) uniforms is stored in a series of
  623.     * gl_uniform_driver_storage structs for each subcomponent that
  624.     * glGetUniformLocation() could name.  We know it's been set up in the same
  625.     * order we'd walk the type, so walk the list of storage and find anything
  626.     * with our name, or the prefix of a component that starts with our name.
  627.     */
  628.    for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
  629.       struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
  630.  
  631.       if (strncmp(ir->name, storage->name, namelen) != 0 ||
  632.           (storage->name[namelen] != 0 &&
  633.            storage->name[namelen] != '.' &&
  634.            storage->name[namelen] != '[')) {
  635.          continue;
  636.       }
  637.  
  638.       gl_constant_value *components = storage->storage;
  639.       unsigned vector_count = (MAX2(storage->array_elements, 1) *
  640.                                storage->type->matrix_columns);
  641.  
  642.       for (unsigned s = 0; s < vector_count; s++) {
  643.          uniform_vector_size[uniforms] = storage->type->vector_elements;
  644.  
  645.          int i;
  646.          for (i = 0; i < uniform_vector_size[uniforms]; i++) {
  647.             prog_data->param[uniforms * 4 + i] = &components->f;
  648.             components++;
  649.          }
  650.          for (; i < 4; i++) {
  651.             static float zero = 0;
  652.             prog_data->param[uniforms * 4 + i] = &zero;
  653.          }
  654.  
  655.          uniforms++;
  656.       }
  657.    }
  658. }
  659.  
  660. void
  661. vec4_visitor::setup_uniform_clipplane_values()
  662. {
  663.    gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
  664.  
  665.    if (brw->gen < 6) {
  666.       /* Pre-Gen6, we compact clip planes.  For example, if the user
  667.        * enables just clip planes 0, 1, and 3, we will enable clip planes
  668.        * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
  669.        * plane 2.  This simplifies the implementation of the Gen6 clip
  670.        * thread.
  671.        */
  672.       int compacted_clipplane_index = 0;
  673.       for (int i = 0; i < MAX_CLIP_PLANES; ++i) {
  674.          if (!(key->userclip_planes_enabled_gen_4_5 & (1 << i)))
  675.             continue;
  676.  
  677.          this->uniform_vector_size[this->uniforms] = 4;
  678.          this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
  679.          this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
  680.          for (int j = 0; j < 4; ++j) {
  681.             prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
  682.          }
  683.          ++compacted_clipplane_index;
  684.          ++this->uniforms;
  685.       }
  686.    } else {
  687.       /* In Gen6 and later, we don't compact clip planes, because this
  688.        * simplifies the implementation of gl_ClipDistance.
  689.        */
  690.       for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
  691.          this->uniform_vector_size[this->uniforms] = 4;
  692.          this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
  693.          this->userplane[i].type = BRW_REGISTER_TYPE_F;
  694.          for (int j = 0; j < 4; ++j) {
  695.             prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
  696.          }
  697.          ++this->uniforms;
  698.       }
  699.    }
  700. }
  701.  
  702. /* Our support for builtin uniforms is even scarier than non-builtin.
  703.  * It sits on top of the PROG_STATE_VAR parameters that are
  704.  * automatically updated from GL context state.
  705.  */
  706. void
  707. vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
  708. {
  709.    const ir_state_slot *const slots = ir->state_slots;
  710.    assert(ir->state_slots != NULL);
  711.  
  712.    for (unsigned int i = 0; i < ir->num_state_slots; i++) {
  713.       /* This state reference has already been setup by ir_to_mesa,
  714.        * but we'll get the same index back here.  We can reference
  715.        * ParameterValues directly, since unlike brw_fs.cpp, we never
  716.        * add new state references during compile.
  717.        */
  718.       int index = _mesa_add_state_reference(this->prog->Parameters,
  719.                                             (gl_state_index *)slots[i].tokens);
  720.       float *values = &this->prog->Parameters->ParameterValues[index][0].f;
  721.  
  722.       this->uniform_vector_size[this->uniforms] = 0;
  723.       /* Add each of the unique swizzled channels of the element.
  724.        * This will end up matching the size of the glsl_type of this field.
  725.        */
  726.       int last_swiz = -1;
  727.       for (unsigned int j = 0; j < 4; j++) {
  728.          int swiz = GET_SWZ(slots[i].swizzle, j);
  729.          last_swiz = swiz;
  730.  
  731.          prog_data->param[this->uniforms * 4 + j] = &values[swiz];
  732.          if (swiz <= last_swiz)
  733.             this->uniform_vector_size[this->uniforms]++;
  734.       }
  735.       this->uniforms++;
  736.    }
  737. }
  738.  
  739. dst_reg *
  740. vec4_visitor::variable_storage(ir_variable *var)
  741. {
  742.    return (dst_reg *)hash_table_find(this->variable_ht, var);
  743. }
  744.  
  745. void
  746. vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
  747. {
  748.    ir_expression *expr = ir->as_expression();
  749.  
  750.    *predicate = BRW_PREDICATE_NORMAL;
  751.  
  752.    if (expr) {
  753.       src_reg op[2];
  754.       vec4_instruction *inst;
  755.  
  756.       assert(expr->get_num_operands() <= 2);
  757.       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
  758.          expr->operands[i]->accept(this);
  759.          op[i] = this->result;
  760.  
  761.          resolve_ud_negate(&op[i]);
  762.       }
  763.  
  764.       switch (expr->operation) {
  765.       case ir_unop_logic_not:
  766.          inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
  767.          inst->conditional_mod = BRW_CONDITIONAL_Z;
  768.          break;
  769.  
  770.       case ir_binop_logic_xor:
  771.          inst = emit(XOR(dst_null_d(), op[0], op[1]));
  772.          inst->conditional_mod = BRW_CONDITIONAL_NZ;
  773.          break;
  774.  
  775.       case ir_binop_logic_or:
  776.          inst = emit(OR(dst_null_d(), op[0], op[1]));
  777.          inst->conditional_mod = BRW_CONDITIONAL_NZ;
  778.          break;
  779.  
  780.       case ir_binop_logic_and:
  781.          inst = emit(AND(dst_null_d(), op[0], op[1]));
  782.          inst->conditional_mod = BRW_CONDITIONAL_NZ;
  783.          break;
  784.  
  785.       case ir_unop_f2b:
  786.          if (brw->gen >= 6) {
  787.             emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
  788.          } else {
  789.             inst = emit(MOV(dst_null_f(), op[0]));
  790.             inst->conditional_mod = BRW_CONDITIONAL_NZ;
  791.          }
  792.          break;
  793.  
  794.       case ir_unop_i2b:
  795.          if (brw->gen >= 6) {
  796.             emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
  797.          } else {
  798.             inst = emit(MOV(dst_null_d(), op[0]));
  799.             inst->conditional_mod = BRW_CONDITIONAL_NZ;
  800.          }
  801.          break;
  802.  
  803.       case ir_binop_all_equal:
  804.          inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
  805.          *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
  806.          break;
  807.  
  808.       case ir_binop_any_nequal:
  809.          inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
  810.          *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
  811.          break;
  812.  
  813.       case ir_unop_any:
  814.          inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
  815.          *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
  816.          break;
  817.  
  818.       case ir_binop_greater:
  819.       case ir_binop_gequal:
  820.       case ir_binop_less:
  821.       case ir_binop_lequal:
  822.       case ir_binop_equal:
  823.       case ir_binop_nequal:
  824.          emit(CMP(dst_null_d(), op[0], op[1],
  825.                   brw_conditional_for_comparison(expr->operation)));
  826.          break;
  827.  
  828.       default:
  829.          assert(!"not reached");
  830.          break;
  831.       }
  832.       return;
  833.    }
  834.  
  835.    ir->accept(this);
  836.  
  837.    resolve_ud_negate(&this->result);
  838.  
  839.    if (brw->gen >= 6) {
  840.       vec4_instruction *inst = emit(AND(dst_null_d(),
  841.                                         this->result, src_reg(1)));
  842.       inst->conditional_mod = BRW_CONDITIONAL_NZ;
  843.    } else {
  844.       vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
  845.       inst->conditional_mod = BRW_CONDITIONAL_NZ;
  846.    }
  847. }
  848.  
  849. /**
  850.  * Emit a gen6 IF statement with the comparison folded into the IF
  851.  * instruction.
  852.  */
  853. void
  854. vec4_visitor::emit_if_gen6(ir_if *ir)
  855. {
  856.    ir_expression *expr = ir->condition->as_expression();
  857.  
  858.    if (expr) {
  859.       src_reg op[2];
  860.       dst_reg temp;
  861.  
  862.       assert(expr->get_num_operands() <= 2);
  863.       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
  864.          expr->operands[i]->accept(this);
  865.          op[i] = this->result;
  866.       }
  867.  
  868.       switch (expr->operation) {
  869.       case ir_unop_logic_not:
  870.          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
  871.          return;
  872.  
  873.       case ir_binop_logic_xor:
  874.          emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
  875.          return;
  876.  
  877.       case ir_binop_logic_or:
  878.          temp = dst_reg(this, glsl_type::bool_type);
  879.          emit(OR(temp, op[0], op[1]));
  880.          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
  881.          return;
  882.  
  883.       case ir_binop_logic_and:
  884.          temp = dst_reg(this, glsl_type::bool_type);
  885.          emit(AND(temp, op[0], op[1]));
  886.          emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
  887.          return;
  888.  
  889.       case ir_unop_f2b:
  890.          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
  891.          return;
  892.  
  893.       case ir_unop_i2b:
  894.          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
  895.          return;
  896.  
  897.       case ir_binop_greater:
  898.       case ir_binop_gequal:
  899.       case ir_binop_less:
  900.       case ir_binop_lequal:
  901.       case ir_binop_equal:
  902.       case ir_binop_nequal:
  903.          emit(IF(op[0], op[1],
  904.                  brw_conditional_for_comparison(expr->operation)));
  905.          return;
  906.  
  907.       case ir_binop_all_equal:
  908.          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
  909.          emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
  910.          return;
  911.  
  912.       case ir_binop_any_nequal:
  913.          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
  914.          emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
  915.          return;
  916.  
  917.       case ir_unop_any:
  918.          emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
  919.          emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
  920.          return;
  921.  
  922.       default:
  923.          assert(!"not reached");
  924.          emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
  925.          return;
  926.       }
  927.       return;
  928.    }
  929.  
  930.    ir->condition->accept(this);
  931.  
  932.    emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
  933. }
  934.  
  935. static dst_reg
  936. with_writemask(dst_reg const & r, int mask)
  937. {
  938.    dst_reg result = r;
  939.    result.writemask = mask;
  940.    return result;
  941. }
  942.  
  943. void
  944. vec4_vs_visitor::emit_prolog()
  945. {
  946.    dst_reg sign_recovery_shift;
  947.    dst_reg normalize_factor;
  948.    dst_reg es3_normalize_factor;
  949.  
  950.    for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
  951.       if (vs_prog_data->inputs_read & BITFIELD64_BIT(i)) {
  952.          uint8_t wa_flags = vs_compile->key.gl_attrib_wa_flags[i];
  953.          dst_reg reg(ATTR, i);
  954.          dst_reg reg_d = reg;
  955.          reg_d.type = BRW_REGISTER_TYPE_D;
  956.          dst_reg reg_ud = reg;
  957.          reg_ud.type = BRW_REGISTER_TYPE_UD;
  958.  
  959.          /* Do GL_FIXED rescaling for GLES2.0.  Our GL_FIXED attributes
  960.           * come in as floating point conversions of the integer values.
  961.           */
  962.          if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
  963.             dst_reg dst = reg;
  964.             dst.type = brw_type_for_base_type(glsl_type::vec4_type);
  965.             dst.writemask = (1 << (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK)) - 1;
  966.             emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
  967.          }
  968.  
  969.          /* Do sign recovery for 2101010 formats if required. */
  970.          if (wa_flags & BRW_ATTRIB_WA_SIGN) {
  971.             if (sign_recovery_shift.file == BAD_FILE) {
  972.                /* shift constant: <22,22,22,30> */
  973.                sign_recovery_shift = dst_reg(this, glsl_type::uvec4_type);
  974.                emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_XYZ), src_reg(22u)));
  975.                emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_W), src_reg(30u)));
  976.             }
  977.  
  978.             emit(SHL(reg_ud, src_reg(reg_ud), src_reg(sign_recovery_shift)));
  979.             emit(ASR(reg_d, src_reg(reg_d), src_reg(sign_recovery_shift)));
  980.          }
  981.  
  982.          /* Apply BGRA swizzle if required. */
  983.          if (wa_flags & BRW_ATTRIB_WA_BGRA) {
  984.             src_reg temp = src_reg(reg);
  985.             temp.swizzle = BRW_SWIZZLE4(2,1,0,3);
  986.             emit(MOV(reg, temp));
  987.          }
  988.  
  989.          if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
  990.             /* ES 3.0 has different rules for converting signed normalized
  991.              * fixed-point numbers than desktop GL.
  992.              */
  993.             if (_mesa_is_gles3(ctx) && (wa_flags & BRW_ATTRIB_WA_SIGN)) {
  994.                /* According to equation 2.2 of the ES 3.0 specification,
  995.                 * signed normalization conversion is done by:
  996.                 *
  997.                 * f = c / (2^(b-1)-1)
  998.                 */
  999.                if (es3_normalize_factor.file == BAD_FILE) {
  1000.                   /* mul constant: 1 / (2^(b-1) - 1) */
  1001.                   es3_normalize_factor = dst_reg(this, glsl_type::vec4_type);
  1002.                   emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_XYZ),
  1003.                            src_reg(1.0f / ((1<<9) - 1))));
  1004.                   emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_W),
  1005.                            src_reg(1.0f / ((1<<1) - 1))));
  1006.                }
  1007.  
  1008.                dst_reg dst = reg;
  1009.                dst.type = brw_type_for_base_type(glsl_type::vec4_type);
  1010.                emit(MOV(dst, src_reg(reg_d)));
  1011.                emit(MUL(dst, src_reg(dst), src_reg(es3_normalize_factor)));
  1012.                emit_minmax(BRW_CONDITIONAL_G, dst, src_reg(dst), src_reg(-1.0f));
  1013.             } else {
  1014.                /* The following equations are from the OpenGL 3.2 specification:
  1015.                 *
  1016.                 * 2.1 unsigned normalization
  1017.                 * f = c/(2^n-1)
  1018.                 *
  1019.                 * 2.2 signed normalization
  1020.                 * f = (2c+1)/(2^n-1)
  1021.                 *
  1022.                 * Both of these share a common divisor, which is represented by
  1023.                 * "normalize_factor" in the code below.
  1024.                 */
  1025.                if (normalize_factor.file == BAD_FILE) {
  1026.                   /* 1 / (2^b - 1) for b=<10,10,10,2> */
  1027.                   normalize_factor = dst_reg(this, glsl_type::vec4_type);
  1028.                   emit(MOV(with_writemask(normalize_factor, WRITEMASK_XYZ),
  1029.                            src_reg(1.0f / ((1<<10) - 1))));
  1030.                   emit(MOV(with_writemask(normalize_factor, WRITEMASK_W),
  1031.                            src_reg(1.0f / ((1<<2) - 1))));
  1032.                }
  1033.  
  1034.                dst_reg dst = reg;
  1035.                dst.type = brw_type_for_base_type(glsl_type::vec4_type);
  1036.                emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
  1037.  
  1038.                /* For signed normalization, we want the numerator to be 2c+1. */
  1039.                if (wa_flags & BRW_ATTRIB_WA_SIGN) {
  1040.                   emit(MUL(dst, src_reg(dst), src_reg(2.0f)));
  1041.                   emit(ADD(dst, src_reg(dst), src_reg(1.0f)));
  1042.                }
  1043.  
  1044.                emit(MUL(dst, src_reg(dst), src_reg(normalize_factor)));
  1045.             }
  1046.          }
  1047.  
  1048.          if (wa_flags & BRW_ATTRIB_WA_SCALE) {
  1049.             dst_reg dst = reg;
  1050.             dst.type = brw_type_for_base_type(glsl_type::vec4_type);
  1051.             emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
  1052.          }
  1053.       }
  1054.    }
  1055. }
  1056.  
  1057.  
  1058. dst_reg *
  1059. vec4_vs_visitor::make_reg_for_system_value(ir_variable *ir)
  1060. {
  1061.    /* VertexID is stored by the VF as the last vertex element, but
  1062.     * we don't represent it with a flag in inputs_read, so we call
  1063.     * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
  1064.     */
  1065.    dst_reg *reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
  1066.    vs_prog_data->uses_vertexid = true;
  1067.  
  1068.    switch (ir->location) {
  1069.    case SYSTEM_VALUE_VERTEX_ID:
  1070.       reg->writemask = WRITEMASK_X;
  1071.       break;
  1072.    case SYSTEM_VALUE_INSTANCE_ID:
  1073.       reg->writemask = WRITEMASK_Y;
  1074.       break;
  1075.    default:
  1076.       assert(!"not reached");
  1077.       break;
  1078.    }
  1079.  
  1080.    return reg;
  1081. }
  1082.  
  1083.  
  1084. void
  1085. vec4_visitor::visit(ir_variable *ir)
  1086. {
  1087.    dst_reg *reg = NULL;
  1088.  
  1089.    if (variable_storage(ir))
  1090.       return;
  1091.  
  1092.    switch (ir->mode) {
  1093.    case ir_var_shader_in:
  1094.       reg = new(mem_ctx) dst_reg(ATTR, ir->location);
  1095.       break;
  1096.  
  1097.    case ir_var_shader_out:
  1098.       reg = new(mem_ctx) dst_reg(this, ir->type);
  1099.  
  1100.       for (int i = 0; i < type_size(ir->type); i++) {
  1101.          output_reg[ir->location + i] = *reg;
  1102.          output_reg[ir->location + i].reg_offset = i;
  1103.          output_reg[ir->location + i].type =
  1104.             brw_type_for_base_type(ir->type->get_scalar_type());
  1105.          output_reg_annotation[ir->location + i] = ir->name;
  1106.       }
  1107.       break;
  1108.  
  1109.    case ir_var_auto:
  1110.    case ir_var_temporary:
  1111.       reg = new(mem_ctx) dst_reg(this, ir->type);
  1112.       break;
  1113.  
  1114.    case ir_var_uniform:
  1115.       reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
  1116.  
  1117.       /* Thanks to the lower_ubo_reference pass, we will see only
  1118.        * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
  1119.        * variables, so no need for them to be in variable_ht.
  1120.        */
  1121.       if (ir->is_in_uniform_block())
  1122.          return;
  1123.  
  1124.       /* Track how big the whole uniform variable is, in case we need to put a
  1125.        * copy of its data into pull constants for array access.
  1126.        */
  1127.       this->uniform_size[this->uniforms] = type_size(ir->type);
  1128.  
  1129.       if (!strncmp(ir->name, "gl_", 3)) {
  1130.          setup_builtin_uniform_values(ir);
  1131.       } else {
  1132.          setup_uniform_values(ir);
  1133.       }
  1134.       break;
  1135.  
  1136.    case ir_var_system_value:
  1137.       reg = make_reg_for_system_value(ir);
  1138.       break;
  1139.  
  1140.    default:
  1141.       assert(!"not reached");
  1142.    }
  1143.  
  1144.    reg->type = brw_type_for_base_type(ir->type);
  1145.    hash_table_insert(this->variable_ht, reg, ir);
  1146. }
  1147.  
  1148. void
  1149. vec4_visitor::visit(ir_loop *ir)
  1150. {
  1151.    dst_reg counter;
  1152.  
  1153.    /* We don't want debugging output to print the whole body of the
  1154.     * loop as the annotation.
  1155.     */
  1156.    this->base_ir = NULL;
  1157.  
  1158.    if (ir->counter != NULL) {
  1159.       this->base_ir = ir->counter;
  1160.       ir->counter->accept(this);
  1161.       counter = *(variable_storage(ir->counter));
  1162.  
  1163.       if (ir->from != NULL) {
  1164.          this->base_ir = ir->from;
  1165.          ir->from->accept(this);
  1166.  
  1167.          emit(MOV(counter, this->result));
  1168.       }
  1169.    }
  1170.  
  1171.    emit(BRW_OPCODE_DO);
  1172.  
  1173.    if (ir->to) {
  1174.       this->base_ir = ir->to;
  1175.       ir->to->accept(this);
  1176.  
  1177.       emit(CMP(dst_null_d(), src_reg(counter), this->result,
  1178.                brw_conditional_for_comparison(ir->cmp)));
  1179.  
  1180.       vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
  1181.       inst->predicate = BRW_PREDICATE_NORMAL;
  1182.    }
  1183.  
  1184.    visit_instructions(&ir->body_instructions);
  1185.  
  1186.  
  1187.    if (ir->increment) {
  1188.       this->base_ir = ir->increment;
  1189.       ir->increment->accept(this);
  1190.       emit(ADD(counter, src_reg(counter), this->result));
  1191.    }
  1192.  
  1193.    emit(BRW_OPCODE_WHILE);
  1194. }
  1195.  
  1196. void
  1197. vec4_visitor::visit(ir_loop_jump *ir)
  1198. {
  1199.    switch (ir->mode) {
  1200.    case ir_loop_jump::jump_break:
  1201.       emit(BRW_OPCODE_BREAK);
  1202.       break;
  1203.    case ir_loop_jump::jump_continue:
  1204.       emit(BRW_OPCODE_CONTINUE);
  1205.       break;
  1206.    }
  1207. }
  1208.  
  1209.  
  1210. void
  1211. vec4_visitor::visit(ir_function_signature *ir)
  1212. {
  1213.    assert(0);
  1214.    (void)ir;
  1215. }
  1216.  
  1217. void
  1218. vec4_visitor::visit(ir_function *ir)
  1219. {
  1220.    /* Ignore function bodies other than main() -- we shouldn't see calls to
  1221.     * them since they should all be inlined.
  1222.     */
  1223.    if (strcmp(ir->name, "main") == 0) {
  1224.       const ir_function_signature *sig;
  1225.       exec_list empty;
  1226.  
  1227.       sig = ir->matching_signature(&empty);
  1228.  
  1229.       assert(sig);
  1230.  
  1231.       visit_instructions(&sig->body);
  1232.    }
  1233. }
  1234.  
  1235. bool
  1236. vec4_visitor::try_emit_sat(ir_expression *ir)
  1237. {
  1238.    ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
  1239.    if (!sat_src)
  1240.       return false;
  1241.  
  1242.    sat_src->accept(this);
  1243.    src_reg src = this->result;
  1244.  
  1245.    this->result = src_reg(this, ir->type);
  1246.    vec4_instruction *inst;
  1247.    inst = emit(MOV(dst_reg(this->result), src));
  1248.    inst->saturate = true;
  1249.  
  1250.    return true;
  1251. }
  1252.  
  1253. bool
  1254. vec4_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
  1255. {
  1256.    /* 3-src instructions were introduced in gen6. */
  1257.    if (brw->gen < 6)
  1258.       return false;
  1259.  
  1260.    /* MAD can only handle floating-point data. */
  1261.    if (ir->type->base_type != GLSL_TYPE_FLOAT)
  1262.       return false;
  1263.  
  1264.    ir_rvalue *nonmul = ir->operands[1 - mul_arg];
  1265.    ir_expression *mul = ir->operands[mul_arg]->as_expression();
  1266.  
  1267.    if (!mul || mul->operation != ir_binop_mul)
  1268.       return false;
  1269.  
  1270.    nonmul->accept(this);
  1271.    src_reg src0 = fix_3src_operand(this->result);
  1272.  
  1273.    mul->operands[0]->accept(this);
  1274.    src_reg src1 = fix_3src_operand(this->result);
  1275.  
  1276.    mul->operands[1]->accept(this);
  1277.    src_reg src2 = fix_3src_operand(this->result);
  1278.  
  1279.    this->result = src_reg(this, ir->type);
  1280.    emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
  1281.  
  1282.    return true;
  1283. }
  1284.  
  1285. void
  1286. vec4_visitor::emit_bool_comparison(unsigned int op,
  1287.                                  dst_reg dst, src_reg src0, src_reg src1)
  1288. {
  1289.    /* original gen4 does destination conversion before comparison. */
  1290.    if (brw->gen < 5)
  1291.       dst.type = src0.type;
  1292.  
  1293.    emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
  1294.  
  1295.    dst.type = BRW_REGISTER_TYPE_D;
  1296.    emit(AND(dst, src_reg(dst), src_reg(0x1)));
  1297. }
  1298.  
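/* Emits a per-component MIN or MAX: with conditionalmod BRW_CONDITIONAL_G
 * the SEL keeps the larger of src0 and src1 (a MAX), and with
 * BRW_CONDITIONAL_L the smaller (a MIN).  For example, the
 * emit_minmax(BRW_CONDITIONAL_G, dst, src_reg(dst), src_reg(-1.0f)) call in
 * emit_prolog() above clamps each component of dst to at least -1.0.
 */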
  1299. void
  1300. vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
  1301.                           src_reg src0, src_reg src1)
  1302. {
  1303.    vec4_instruction *inst;
  1304.  
  1305.    if (brw->gen >= 6) {
  1306.       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
  1307.       inst->conditional_mod = conditionalmod;
  1308.    } else {
  1309.       emit(CMP(dst, src0, src1, conditionalmod));
  1310.  
  1311.       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
  1312.       inst->predicate = BRW_PREDICATE_NORMAL;
  1313.    }
  1314. }
  1315.  
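/* Helper for the integer-multiply case in visit(ir_expression) below: if one
 * operand is a constant that fits in 16 bits, a single MUL suffices and the
 * full MUL/MACH/MOV sequence can be skipped.
 */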
  1316. static bool
  1317. is_16bit_constant(ir_rvalue *rvalue)
  1318. {
  1319.    ir_constant *constant = rvalue->as_constant();
  1320.    if (!constant)
  1321.       return false;
  1322.  
  1323.    if (constant->type != glsl_type::int_type &&
  1324.        constant->type != glsl_type::uint_type)
  1325.       return false;
  1326.  
  1327.    return constant->value.u[0] < (1 << 16);
  1328. }
  1329.  
  1330. void
  1331. vec4_visitor::visit(ir_expression *ir)
  1332. {
  1333.    unsigned int operand;
  1334.    src_reg op[Elements(ir->operands)];
  1335.    src_reg result_src;
  1336.    dst_reg result_dst;
  1337.    vec4_instruction *inst;
  1338.  
  1339.    if (try_emit_sat(ir))
  1340.       return;
  1341.  
  1342.    if (ir->operation == ir_binop_add) {
  1343.       if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
  1344.          return;
  1345.    }
  1346.  
  1347.    for (operand = 0; operand < ir->get_num_operands(); operand++) {
  1348.       this->result.file = BAD_FILE;
  1349.       ir->operands[operand]->accept(this);
  1350.       if (this->result.file == BAD_FILE) {
  1351.          printf("Failed to get tree for expression operand:\n");
  1352.          ir->operands[operand]->print();
  1353.          exit(1);
  1354.       }
  1355.       op[operand] = this->result;
  1356.  
  1357.       /* Matrix expression operands should have been broken down to vector
  1358.        * operations already.
  1359.        */
  1360.       assert(!ir->operands[operand]->type->is_matrix());
  1361.    }
  1362.  
  1363.    int vector_elements = ir->operands[0]->type->vector_elements;
  1364.    if (ir->operands[1]) {
  1365.       vector_elements = MAX2(vector_elements,
  1366.                              ir->operands[1]->type->vector_elements);
  1367.    }
  1368.  
  1369.    this->result.file = BAD_FILE;
  1370.  
  1371.    /* Storage for our result.  Ideally for an assignment we'd be using
  1372.     * the actual storage for the result here, instead.
  1373.     */
  1374.    result_src = src_reg(this, ir->type);
  1375.    /* convenience for the emit functions below. */
  1376.    result_dst = dst_reg(result_src);
  1377.    /* If nothing special happens, this is the result. */
  1378.    this->result = result_src;
  1379.    /* Limit writes to the channels that will be used by result_src later.
  1380.     * This does limit this temp's use as a temporary for multi-instruction
  1381.     * sequences.
  1382.     */
  1383.    result_dst.writemask = (1 << ir->type->vector_elements) - 1;
  1384.  
  1385.    switch (ir->operation) {
  1386.    case ir_unop_logic_not:
  1387.       /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
  1388.        * ones complement of the whole register, not just bit 0.
  1389.        */
  1390.       emit(XOR(result_dst, op[0], src_reg(1)));
  1391.       break;
  1392.    case ir_unop_neg:
  1393.       op[0].negate = !op[0].negate;
  1394.       emit(MOV(result_dst, op[0]));
  1395.       break;
  1396.    case ir_unop_abs:
  1397.       op[0].abs = true;
  1398.       op[0].negate = false;
  1399.       emit(MOV(result_dst, op[0]));
  1400.       break;
  1401.  
  1402.    case ir_unop_sign:
  1403.       emit(MOV(result_dst, src_reg(0.0f)));
  1404.  
  1405.       emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
  1406.       inst = emit(MOV(result_dst, src_reg(1.0f)));
  1407.       inst->predicate = BRW_PREDICATE_NORMAL;
  1408.  
  1409.       emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
  1410.       inst = emit(MOV(result_dst, src_reg(-1.0f)));
  1411.       inst->predicate = BRW_PREDICATE_NORMAL;
  1412.  
  1413.       break;
  1414.  
  1415.    case ir_unop_rcp:
  1416.       emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
  1417.       break;
  1418.  
  1419.    case ir_unop_exp2:
  1420.       emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
  1421.       break;
  1422.    case ir_unop_log2:
  1423.       emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
  1424.       break;
  1425.    case ir_unop_exp:
  1426.    case ir_unop_log:
  1427.       assert(!"not reached: should be handled by ir_explog_to_explog2");
  1428.       break;
  1429.    case ir_unop_sin:
  1430.    case ir_unop_sin_reduced:
  1431.       emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
  1432.       break;
  1433.    case ir_unop_cos:
  1434.    case ir_unop_cos_reduced:
  1435.       emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
  1436.       break;
  1437.  
  1438.    case ir_unop_dFdx:
  1439.    case ir_unop_dFdy:
  1440.       assert(!"derivatives not valid in vertex shader");
  1441.       break;
  1442.  
  1443.    case ir_unop_bitfield_reverse:
  1444.       emit(BFREV(result_dst, op[0]));
  1445.       break;
  1446.    case ir_unop_bit_count:
  1447.       emit(CBIT(result_dst, op[0]));
  1448.       break;
  1449.    case ir_unop_find_msb: {
  1450.       src_reg temp = src_reg(this, glsl_type::uint_type);
  1451.  
  1452.       inst = emit(FBH(dst_reg(temp), op[0]));
  1453.       inst->dst.writemask = WRITEMASK_XYZW;
  1454.  
  1455.       /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
  1456.        * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
  1457.        * subtract the result from 31 to convert the MSB count into an LSB count.
  1458.        */
  1459.  
  1460.       /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
  1461.       temp.swizzle = BRW_SWIZZLE_NOOP;
  1462.       emit(MOV(result_dst, temp));
  1463.  
  1464.       src_reg src_tmp = src_reg(result_dst);
  1465.       emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
  1466.  
  1467.       src_tmp.negate = true;
  1468.       inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
  1469.       inst->predicate = BRW_PREDICATE_NORMAL;
  1470.       break;
  1471.    }
  1472.    case ir_unop_find_lsb:
  1473.       emit(FBL(result_dst, op[0]));
  1474.       break;
  1475.  
  1476.    case ir_unop_noise:
  1477.       assert(!"not reached: should be handled by lower_noise");
  1478.       break;
  1479.  
  1480.    case ir_binop_add:
  1481.       emit(ADD(result_dst, op[0], op[1]));
  1482.       break;
  1483.    case ir_binop_sub:
  1484.       assert(!"not reached: should be handled by ir_sub_to_add_neg");
  1485.       break;
  1486.  
  1487.    case ir_binop_mul:
  1488.       if (ir->type->is_integer()) {
  1489.          /* For integer multiplication, the MUL uses the low 16 bits of one of
  1490.           * the operands (src0 through SNB, src1 on IVB and later).  The MACH
  1491.           * accumulates in the contribution of the upper 16 bits of that
  1492.           * operand.  If we can determine that one of the args is in the low
  1493.           * 16 bits, though, we can just emit a single MUL.
  1494.           */
  1495.          if (is_16bit_constant(ir->operands[0])) {
  1496.             if (brw->gen < 7)
  1497.                emit(MUL(result_dst, op[0], op[1]));
  1498.             else
  1499.                emit(MUL(result_dst, op[1], op[0]));
  1500.          } else if (is_16bit_constant(ir->operands[1])) {
  1501.             if (brw->gen < 7)
  1502.                emit(MUL(result_dst, op[1], op[0]));
  1503.             else
  1504.                emit(MUL(result_dst, op[0], op[1]));
  1505.          } else {
  1506.             struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
  1507.  
  1508.             emit(MUL(acc, op[0], op[1]));
  1509.             emit(MACH(dst_null_d(), op[0], op[1]));
  1510.             emit(MOV(result_dst, src_reg(acc)));
  1511.          }
  1512.       } else {
  1513.          emit(MUL(result_dst, op[0], op[1]));
  1514.       }
  1515.       break;
  1516.    case ir_binop_div:
  1517.       /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
  1518.       assert(ir->type->is_integer());
  1519.       emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
  1520.       break;
  1521.    case ir_binop_mod:
  1522.       /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
  1523.       assert(ir->type->is_integer());
  1524.       emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
  1525.       break;
  1526.  
  1527.    case ir_binop_less:
  1528.    case ir_binop_greater:
  1529.    case ir_binop_lequal:
  1530.    case ir_binop_gequal:
  1531.    case ir_binop_equal:
  1532.    case ir_binop_nequal: {
  1533.       emit(CMP(result_dst, op[0], op[1],
  1534.                brw_conditional_for_comparison(ir->operation)));
  1535.       emit(AND(result_dst, result_src, src_reg(0x1)));
  1536.       break;
  1537.    }
  1538.  
  1539.    case ir_binop_all_equal:
  1540.       /* "==" operator producing a scalar boolean. */
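               /* For vector operands the CMP sets a per-channel flag; the
                * result is seeded with 0 and then overwritten with 1 by a MOV
                * predicated on all four channels having matched (ALL4H).
                */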
  1541.       if (ir->operands[0]->type->is_vector() ||
  1542.           ir->operands[1]->type->is_vector()) {
  1543.          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
  1544.          emit(MOV(result_dst, src_reg(0)));
  1545.          inst = emit(MOV(result_dst, src_reg(1)));
  1546.          inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
  1547.       } else {
  1548.          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
  1549.          emit(AND(result_dst, result_src, src_reg(0x1)));
  1550.       }
  1551.       break;
  1552.    case ir_binop_any_nequal:
  1553.       /* "!=" operator producing a scalar boolean. */
  1554.       if (ir->operands[0]->type->is_vector() ||
  1555.           ir->operands[1]->type->is_vector()) {
  1556.          emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
  1557.  
  1558.          emit(MOV(result_dst, src_reg(0)));
  1559.          inst = emit(MOV(result_dst, src_reg(1)));
  1560.          inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
  1561.       } else {
  1562.          emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
  1563.          emit(AND(result_dst, result_src, src_reg(0x1)));
  1564.       }
  1565.       break;
  1566.  
  1567.    case ir_unop_any:
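               /* any(bvec): flag every nonzero channel, seed the result with
                * 0, then predicate the MOV of 1 on ANY4H so the result is 1
                * if at least one channel was set.
                */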
  1568.       emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
  1569.       emit(MOV(result_dst, src_reg(0)));
  1570.  
  1571.       inst = emit(MOV(result_dst, src_reg(1)));
  1572.       inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
  1573.       break;
  1574.  
  1575.    case ir_binop_logic_xor:
  1576.       emit(XOR(result_dst, op[0], op[1]));
  1577.       break;
  1578.  
  1579.    case ir_binop_logic_or:
  1580.       emit(OR(result_dst, op[0], op[1]));
  1581.       break;
  1582.  
  1583.    case ir_binop_logic_and:
  1584.       emit(AND(result_dst, op[0], op[1]));
  1585.       break;
  1586.  
  1587.    case ir_binop_dot:
  1588.       assert(ir->operands[0]->type->is_vector());
  1589.       assert(ir->operands[0]->type == ir->operands[1]->type);
  1590.       emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
  1591.       break;
  1592.  
  1593.    case ir_unop_sqrt:
  1594.       emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
  1595.       break;
  1596.    case ir_unop_rsq:
  1597.       emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
  1598.       break;
  1599.  
  1600.    case ir_unop_bitcast_i2f:
  1601.    case ir_unop_bitcast_u2f:
  1602.       this->result = op[0];
  1603.       this->result.type = BRW_REGISTER_TYPE_F;
  1604.       break;
  1605.  
  1606.    case ir_unop_bitcast_f2i:
  1607.       this->result = op[0];
  1608.       this->result.type = BRW_REGISTER_TYPE_D;
  1609.       break;
  1610.  
  1611.    case ir_unop_bitcast_f2u:
  1612.       this->result = op[0];
  1613.       this->result.type = BRW_REGISTER_TYPE_UD;
  1614.       break;
  1615.  
  1616.    case ir_unop_i2f:
  1617.    case ir_unop_i2u:
  1618.    case ir_unop_u2i:
  1619.    case ir_unop_u2f:
  1620.    case ir_unop_b2f:
  1621.    case ir_unop_b2i:
  1622.    case ir_unop_f2i:
  1623.    case ir_unop_f2u:
  1624.       emit(MOV(result_dst, op[0]));
  1625.       break;
  1626.    case ir_unop_f2b:
  1627.    case ir_unop_i2b: {
  1628.       emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
  1629.       emit(AND(result_dst, result_src, src_reg(1)));
  1630.       break;
  1631.    }
  1632.  
  1633.    case ir_unop_trunc:
  1634.       emit(RNDZ(result_dst, op[0]));
  1635.       break;
  1636.    case ir_unop_ceil:
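               /* ceil(x) is implemented as -floor(-x): negate the operand,
                * round down with RNDD, then negate the result.
                */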
  1637.       op[0].negate = !op[0].negate;
  1638.       inst = emit(RNDD(result_dst, op[0]));
  1639.       this->result.negate = true;
  1640.       break;
  1641.    case ir_unop_floor:
  1642.       inst = emit(RNDD(result_dst, op[0]));
  1643.       break;
  1644.    case ir_unop_fract:
  1645.       inst = emit(FRC(result_dst, op[0]));
  1646.       break;
  1647.    case ir_unop_round_even:
  1648.       emit(RNDE(result_dst, op[0]));
  1649.       break;
  1650.  
  1651.    case ir_binop_min:
  1652.       emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
  1653.       break;
  1654.    case ir_binop_max:
  1655.       emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
  1656.       break;
  1657.  
  1658.    case ir_binop_pow:
  1659.       emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
  1660.       break;
  1661.  
  1662.    case ir_unop_bit_not:
  1663.       inst = emit(NOT(result_dst, op[0]));
  1664.       break;
  1665.    case ir_binop_bit_and:
  1666.       inst = emit(AND(result_dst, op[0], op[1]));
  1667.       break;
  1668.    case ir_binop_bit_xor:
  1669.       inst = emit(XOR(result_dst, op[0], op[1]));
  1670.       break;
  1671.    case ir_binop_bit_or:
  1672.       inst = emit(OR(result_dst, op[0], op[1]));
  1673.       break;
  1674.  
  1675.    case ir_binop_lshift:
  1676.       inst = emit(SHL(result_dst, op[0], op[1]));
  1677.       break;
  1678.  
  1679.    case ir_binop_rshift:
  1680.       if (ir->type->base_type == GLSL_TYPE_INT)
  1681.          inst = emit(ASR(result_dst, op[0], op[1]));
  1682.       else
  1683.          inst = emit(SHR(result_dst, op[0], op[1]));
  1684.       break;
  1685.  
  1686.    case ir_binop_bfm:
  1687.       emit(BFI1(result_dst, op[0], op[1]));
  1688.       break;
  1689.  
  1690.    case ir_binop_ubo_load: {
  1691.       ir_constant *uniform_block = ir->operands[0]->as_constant();
  1692.       ir_constant *const_offset_ir = ir->operands[1]->as_constant();
  1693.       unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
  1694.       src_reg offset = op[1];
  1695.  
  1696.       /* Now, load the vector from that offset. */
  1697.       assert(ir->type->is_vector() || ir->type->is_scalar());
  1698.  
  1699.       src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
  1700.       packed_consts.type = result.type;
  1701.       src_reg surf_index =
  1702.          src_reg(SURF_INDEX_VS_UBO(uniform_block->value.u[0]));
  1703.       if (const_offset_ir) {
  1704.          offset = src_reg(const_offset / 16);
  1705.       } else {
  1706.          emit(SHR(dst_reg(offset), offset, src_reg(4)));
  1707.       }
  1708.  
  1709.       vec4_instruction *pull =
  1710.          emit(new(mem_ctx) vec4_instruction(this,
  1711.                                             VS_OPCODE_PULL_CONSTANT_LOAD,
  1712.                                             dst_reg(packed_consts),
  1713.                                             surf_index,
  1714.                                             offset));
  1715.       pull->base_mrf = 14;
  1716.       pull->mlen = 1;
  1717.  
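               /* const_offset % 16 / 4 is the component within the fetched
                * vec4 where the value starts; adding that same amount to every
                * channel of the size-based swizzle shifts the read over to
                * that component.
                */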
  1718.       packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
  1719.       packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
  1720.                                             const_offset % 16 / 4,
  1721.                                             const_offset % 16 / 4,
  1722.                                             const_offset % 16 / 4);
  1723.  
  1724.       /* UBO bools are any nonzero int.  We store bools as either 0 or 1. */
  1725.       if (ir->type->base_type == GLSL_TYPE_BOOL) {
  1726.          emit(CMP(result_dst, packed_consts, src_reg(0u),
  1727.                   BRW_CONDITIONAL_NZ));
  1728.          emit(AND(result_dst, result, src_reg(0x1)));
  1729.       } else {
  1730.          emit(MOV(result_dst, packed_consts));
  1731.       }
  1732.       break;
  1733.    }
  1734.  
  1735.    case ir_binop_vector_extract:
  1736.       assert(!"should have been lowered by vec_index_to_cond_assign");
  1737.       break;
  1738.  
  1739.    case ir_triop_lrp:
  1740.       op[0] = fix_3src_operand(op[0]);
  1741.       op[1] = fix_3src_operand(op[1]);
  1742.       op[2] = fix_3src_operand(op[2]);
  1743.       /* Note that the instruction's argument order is reversed from GLSL
  1744.        * and the IR.
  1745.        */
  1746.       emit(LRP(result_dst, op[2], op[1], op[0]));
  1747.       break;
  1748.  
  1749.    case ir_triop_bfi:
  1750.       op[0] = fix_3src_operand(op[0]);
  1751.       op[1] = fix_3src_operand(op[1]);
  1752.       op[2] = fix_3src_operand(op[2]);
  1753.       emit(BFI2(result_dst, op[0], op[1], op[2]));
  1754.       break;
  1755.  
  1756.    case ir_triop_bitfield_extract:
  1757.       op[0] = fix_3src_operand(op[0]);
  1758.       op[1] = fix_3src_operand(op[1]);
  1759.       op[2] = fix_3src_operand(op[2]);
  1760.       /* Note that the instruction's argument order is reversed from GLSL
  1761.        * and the IR.
  1762.        */
  1763.       emit(BFE(result_dst, op[2], op[1], op[0]));
  1764.       break;
  1765.  
  1766.    case ir_triop_vector_insert:
  1767.       assert(!"should have been lowered by lower_vector_insert");
  1768.       break;
  1769.  
  1770.    case ir_quadop_bitfield_insert:
  1771.       assert(!"not reached: should be handled by "
  1772.               "bitfield_insert_to_bfm_bfi\n");
  1773.       break;
  1774.  
  1775.    case ir_quadop_vector:
  1776.       assert(!"not reached: should be handled by lower_quadop_vector");
  1777.       break;
  1778.  
  1779.    case ir_unop_pack_half_2x16:
  1780.       emit_pack_half_2x16(result_dst, op[0]);
  1781.       break;
  1782.    case ir_unop_unpack_half_2x16:
  1783.       emit_unpack_half_2x16(result_dst, op[0]);
  1784.       break;
  1785.    case ir_unop_pack_snorm_2x16:
  1786.    case ir_unop_pack_snorm_4x8:
  1787.    case ir_unop_pack_unorm_2x16:
  1788.    case ir_unop_pack_unorm_4x8:
  1789.    case ir_unop_unpack_snorm_2x16:
  1790.    case ir_unop_unpack_snorm_4x8:
  1791.    case ir_unop_unpack_unorm_2x16:
  1792.    case ir_unop_unpack_unorm_4x8:
  1793.       assert(!"not reached: should be handled by lower_packing_builtins");
  1794.       break;
  1795.    case ir_unop_unpack_half_2x16_split_x:
  1796.    case ir_unop_unpack_half_2x16_split_y:
  1797.    case ir_binop_pack_half_2x16_split:
  1798.       assert(!"not reached: should not occur in vertex shader");
  1799.       break;
  1800.    }
  1801. }
  1802.  
  1803.  
  1804. void
  1805. vec4_visitor::visit(ir_swizzle *ir)
  1806. {
  1807.    src_reg src;
  1808.    int i = 0;
  1809.    int swizzle[4];
  1810.  
  1811.    /* Note that this is only swizzles in expressions, not those on the left
  1812.     * hand side of an assignment, which do write masking.  See ir_assignment
  1813.     * for that.
  1814.     */
  1815.  
  1816.    ir->val->accept(this);
  1817.    src = this->result;
  1818.    assert(src.file != BAD_FILE);
  1819.  
  1820.    for (i = 0; i < ir->type->vector_elements; i++) {
  1821.       switch (i) {
  1822.       case 0:
  1823.          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
  1824.          break;
  1825.       case 1:
  1826.          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
  1827.          break;
  1828.       case 2:
  1829.          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
  1830.          break;
  1831.       case 3:
  1832.          swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
   1833.          break;
  1834.       }
  1835.    }
  1836.    for (; i < 4; i++) {
  1837.       /* Replicate the last channel out. */
  1838.       swizzle[i] = swizzle[ir->type->vector_elements - 1];
  1839.    }
  1840.  
  1841.    src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
  1842.  
  1843.    this->result = src;
  1844. }
  1845.  
  1846. void
  1847. vec4_visitor::visit(ir_dereference_variable *ir)
  1848. {
  1849.    const struct glsl_type *type = ir->type;
  1850.    dst_reg *reg = variable_storage(ir->var);
  1851.  
  1852.    if (!reg) {
  1853.       fail("Failed to find variable storage for %s\n", ir->var->name);
  1854.       this->result = src_reg(brw_null_reg());
  1855.       return;
  1856.    }
  1857.  
  1858.    this->result = src_reg(*reg);
  1859.  
  1860.    /* System values get their swizzle from the dst_reg writemask */
  1861.    if (ir->var->mode == ir_var_system_value)
  1862.       return;
  1863.  
  1864.    if (type->is_scalar() || type->is_vector() || type->is_matrix())
  1865.       this->result.swizzle = swizzle_for_size(type->vector_elements);
  1866. }
  1867.  
  1868.  
  1869. int
  1870. vec4_visitor::compute_array_stride(ir_dereference_array *ir)
  1871. {
  1872.    /* Under normal circumstances array elements are stored consecutively, so
  1873.     * the stride is equal to the size of the array element.
  1874.     */
  1875.    return type_size(ir->type);
  1876. }
  1877.  
  1878.  
  1879. void
  1880. vec4_visitor::visit(ir_dereference_array *ir)
  1881. {
  1882.    ir_constant *constant_index;
  1883.    src_reg src;
  1884.    int array_stride = compute_array_stride(ir);
  1885.  
  1886.    constant_index = ir->array_index->constant_expression_value();
  1887.  
  1888.    ir->array->accept(this);
  1889.    src = this->result;
  1890.  
  1891.    if (constant_index) {
  1892.       src.reg_offset += constant_index->value.i[0] * array_stride;
  1893.    } else {
   1894.       /* Variable-indexed array dereference.  The source keeps the vec4
   1895.        * register of the array base, and the computed index (scaled by the
   1896.        * array stride) is applied as a relative register offset (reladdr).
   1897.        */
  1898.       ir->array_index->accept(this);
  1899.  
  1900.       src_reg index_reg;
  1901.  
  1902.       if (array_stride == 1) {
  1903.          index_reg = this->result;
  1904.       } else {
  1905.          index_reg = src_reg(this, glsl_type::int_type);
  1906.  
  1907.          emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
  1908.       }
  1909.  
  1910.       if (src.reladdr) {
  1911.          src_reg temp = src_reg(this, glsl_type::int_type);
  1912.  
  1913.          emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
  1914.  
  1915.          index_reg = temp;
  1916.       }
  1917.  
  1918.       src.reladdr = ralloc(mem_ctx, src_reg);
  1919.       memcpy(src.reladdr, &index_reg, sizeof(index_reg));
  1920.    }
  1921.  
  1922.    /* If the type is smaller than a vec4, replicate the last channel out. */
  1923.    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
  1924.       src.swizzle = swizzle_for_size(ir->type->vector_elements);
  1925.    else
  1926.       src.swizzle = BRW_SWIZZLE_NOOP;
  1927.    src.type = brw_type_for_base_type(ir->type);
  1928.  
  1929.    this->result = src;
  1930. }
  1931.  
  1932. void
  1933. vec4_visitor::visit(ir_dereference_record *ir)
  1934. {
  1935.    unsigned int i;
  1936.    const glsl_type *struct_type = ir->record->type;
  1937.    int offset = 0;
  1938.  
  1939.    ir->record->accept(this);
  1940.  
  1941.    for (i = 0; i < struct_type->length; i++) {
  1942.       if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
  1943.          break;
  1944.       offset += type_size(struct_type->fields.structure[i].type);
  1945.    }
  1946.  
  1947.    /* If the type is smaller than a vec4, replicate the last channel out. */
  1948.    if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
  1949.       this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
  1950.    else
  1951.       this->result.swizzle = BRW_SWIZZLE_NOOP;
  1952.    this->result.type = brw_type_for_base_type(ir->type);
  1953.  
  1954.    this->result.reg_offset += offset;
  1955. }
  1956.  
  1957. /**
  1958.  * We want to be careful in assignment setup to hit the actual storage
  1959.  * instead of potentially using a temporary like we might with the
  1960.  * ir_dereference handler.
  1961.  */
  1962. static dst_reg
  1963. get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
  1964. {
  1965.    /* The LHS must be a dereference.  If the LHS is a variable indexed array
   1966.     * access of a vector, it must be separated into a series of conditional moves
  1967.     * before reaching this point (see ir_vec_index_to_cond_assign).
  1968.     */
  1969.    assert(ir->as_dereference());
  1970.    ir_dereference_array *deref_array = ir->as_dereference_array();
  1971.    if (deref_array) {
  1972.       assert(!deref_array->array->type->is_vector());
  1973.    }
  1974.  
  1975.    /* Use the rvalue deref handler for the most part.  We'll ignore
  1976.     * swizzles in it and write swizzles using writemask, though.
  1977.     */
  1978.    ir->accept(v);
  1979.    return dst_reg(v->result);
  1980. }
  1981.  
  1982. void
  1983. vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
  1984.                               const struct glsl_type *type, uint32_t predicate)
  1985. {
  1986.    if (type->base_type == GLSL_TYPE_STRUCT) {
  1987.       for (unsigned int i = 0; i < type->length; i++) {
  1988.          emit_block_move(dst, src, type->fields.structure[i].type, predicate);
  1989.       }
  1990.       return;
  1991.    }
  1992.  
  1993.    if (type->is_array()) {
  1994.       for (unsigned int i = 0; i < type->length; i++) {
  1995.          emit_block_move(dst, src, type->fields.array, predicate);
  1996.       }
  1997.       return;
  1998.    }
  1999.  
  2000.    if (type->is_matrix()) {
  2001.       const struct glsl_type *vec_type;
  2002.  
  2003.       vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
  2004.                                          type->vector_elements, 1);
  2005.  
  2006.       for (int i = 0; i < type->matrix_columns; i++) {
  2007.          emit_block_move(dst, src, vec_type, predicate);
  2008.       }
  2009.       return;
  2010.    }
  2011.  
  2012.    assert(type->is_scalar() || type->is_vector());
  2013.  
  2014.    dst->type = brw_type_for_base_type(type);
  2015.    src->type = dst->type;
  2016.  
  2017.    dst->writemask = (1 << type->vector_elements) - 1;
  2018.  
  2019.    src->swizzle = swizzle_for_size(type->vector_elements);
  2020.  
  2021.    vec4_instruction *inst = emit(MOV(*dst, *src));
  2022.    inst->predicate = predicate;
  2023.  
  2024.    dst->reg_offset++;
  2025.    src->reg_offset++;
  2026. }
  2027.  
  2028.  
  2029. /* If the RHS processing resulted in an instruction generating a
  2030.  * temporary value, and it would be easy to rewrite the instruction to
  2031.  * generate its result right into the LHS instead, do so.  This ends
  2032.  * up reliably removing instructions where it can be tricky to do so
  2033.  * later without real UD chain information.
  2034.  */
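         /* For example, when "v = a + b;" produces an ADD into a fresh
          * temporary, this rewrites that ADD to store directly into v's
          * register, so the copy MOV in visit(ir_assignment) is never emitted.
          */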
  2035. bool
  2036. vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
  2037.                                      dst_reg dst,
  2038.                                      src_reg src,
  2039.                                      vec4_instruction *pre_rhs_inst,
  2040.                                      vec4_instruction *last_rhs_inst)
  2041. {
  2042.    /* This could be supported, but it would take more smarts. */
  2043.    if (ir->condition)
  2044.       return false;
  2045.  
  2046.    if (pre_rhs_inst == last_rhs_inst)
  2047.       return false; /* No instructions generated to work with. */
  2048.  
  2049.    /* Make sure the last instruction generated our source reg. */
  2050.    if (src.file != GRF ||
  2051.        src.file != last_rhs_inst->dst.file ||
  2052.        src.reg != last_rhs_inst->dst.reg ||
  2053.        src.reg_offset != last_rhs_inst->dst.reg_offset ||
  2054.        src.reladdr ||
  2055.        src.abs ||
  2056.        src.negate ||
  2057.        last_rhs_inst->predicate != BRW_PREDICATE_NONE)
  2058.       return false;
  2059.  
   2060.    /* Check that the last instruction fully initialized the channels
  2061.     * we want to use, in the order we want to use them.  We could
  2062.     * potentially reswizzle the operands of many instructions so that
  2063.     * we could handle out of order channels, but don't yet.
  2064.     */
  2065.  
  2066.    for (unsigned i = 0; i < 4; i++) {
  2067.       if (dst.writemask & (1 << i)) {
  2068.          if (!(last_rhs_inst->dst.writemask & (1 << i)))
  2069.             return false;
  2070.  
  2071.          if (BRW_GET_SWZ(src.swizzle, i) != i)
  2072.             return false;
  2073.       }
  2074.    }
  2075.  
  2076.    /* Success!  Rewrite the instruction. */
  2077.    last_rhs_inst->dst.file = dst.file;
  2078.    last_rhs_inst->dst.reg = dst.reg;
  2079.    last_rhs_inst->dst.reg_offset = dst.reg_offset;
  2080.    last_rhs_inst->dst.reladdr = dst.reladdr;
  2081.    last_rhs_inst->dst.writemask &= dst.writemask;
  2082.  
  2083.    return true;
  2084. }
  2085.  
  2086. void
  2087. vec4_visitor::visit(ir_assignment *ir)
  2088. {
  2089.    dst_reg dst = get_assignment_lhs(ir->lhs, this);
  2090.    uint32_t predicate = BRW_PREDICATE_NONE;
  2091.  
  2092.    if (!ir->lhs->type->is_scalar() &&
  2093.        !ir->lhs->type->is_vector()) {
  2094.       ir->rhs->accept(this);
  2095.       src_reg src = this->result;
  2096.  
  2097.       if (ir->condition) {
  2098.          emit_bool_to_cond_code(ir->condition, &predicate);
  2099.       }
  2100.  
  2101.       /* emit_block_move doesn't account for swizzles in the source register.
  2102.        * This should be ok, since the source register is a structure or an
  2103.        * array, and those can't be swizzled.  But double-check to be sure.
  2104.        */
  2105.       assert(src.swizzle ==
  2106.              (ir->rhs->type->is_matrix()
  2107.               ? swizzle_for_size(ir->rhs->type->vector_elements)
  2108.               : BRW_SWIZZLE_NOOP));
  2109.  
  2110.       emit_block_move(&dst, &src, ir->rhs->type, predicate);
  2111.       return;
  2112.    }
  2113.  
  2114.    /* Now we're down to just a scalar/vector with writemasks. */
  2115.    int i;
  2116.  
  2117.    vec4_instruction *pre_rhs_inst, *last_rhs_inst;
  2118.    pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
  2119.  
  2120.    ir->rhs->accept(this);
  2121.  
  2122.    last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
  2123.  
  2124.    src_reg src = this->result;
  2125.  
  2126.    int swizzles[4];
  2127.    int first_enabled_chan = 0;
  2128.    int src_chan = 0;
  2129.  
  2130.    assert(ir->lhs->type->is_vector() ||
  2131.           ir->lhs->type->is_scalar());
  2132.    dst.writemask = ir->write_mask;
  2133.  
  2134.    for (int i = 0; i < 4; i++) {
  2135.       if (dst.writemask & (1 << i)) {
  2136.          first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
  2137.          break;
  2138.       }
  2139.    }
  2140.  
  2141.    /* Swizzle a small RHS vector into the channels being written.
  2142.     *
  2143.     * glsl ir treats write_mask as dictating how many channels are
  2144.     * present on the RHS while in our instructions we need to make
  2145.     * those channels appear in the slots of the vec4 they're written to.
  2146.     */
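            /* For instance, with "v.zw = u.xy" only the Z and W channels are
             * written, so the RHS's first two swizzle channels are routed
             * into those slots (something like .yyxy); the unwritten slots
             * just replicate an enabled channel, since they are masked off.
             */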
  2147.    for (int i = 0; i < 4; i++) {
  2148.       if (dst.writemask & (1 << i))
  2149.          swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
  2150.       else
  2151.          swizzles[i] = first_enabled_chan;
  2152.    }
  2153.    src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
  2154.                               swizzles[2], swizzles[3]);
  2155.  
  2156.    if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
  2157.       return;
  2158.    }
  2159.  
  2160.    if (ir->condition) {
  2161.       emit_bool_to_cond_code(ir->condition, &predicate);
  2162.    }
  2163.  
  2164.    for (i = 0; i < type_size(ir->lhs->type); i++) {
  2165.       vec4_instruction *inst = emit(MOV(dst, src));
  2166.       inst->predicate = predicate;
  2167.  
  2168.       dst.reg_offset++;
  2169.       src.reg_offset++;
  2170.    }
  2171. }
  2172.  
  2173. void
  2174. vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
  2175. {
  2176.    if (ir->type->base_type == GLSL_TYPE_STRUCT) {
  2177.       foreach_list(node, &ir->components) {
  2178.          ir_constant *field_value = (ir_constant *)node;
  2179.  
  2180.          emit_constant_values(dst, field_value);
  2181.       }
  2182.       return;
  2183.    }
  2184.  
  2185.    if (ir->type->is_array()) {
  2186.       for (unsigned int i = 0; i < ir->type->length; i++) {
  2187.          emit_constant_values(dst, ir->array_elements[i]);
  2188.       }
  2189.       return;
  2190.    }
  2191.  
  2192.    if (ir->type->is_matrix()) {
  2193.       for (int i = 0; i < ir->type->matrix_columns; i++) {
  2194.          float *vec = &ir->value.f[i * ir->type->vector_elements];
  2195.  
  2196.          for (int j = 0; j < ir->type->vector_elements; j++) {
  2197.             dst->writemask = 1 << j;
  2198.             dst->type = BRW_REGISTER_TYPE_F;
  2199.  
  2200.             emit(MOV(*dst, src_reg(vec[j])));
  2201.          }
  2202.          dst->reg_offset++;
  2203.       }
  2204.       return;
  2205.    }
  2206.  
  2207.    int remaining_writemask = (1 << ir->type->vector_elements) - 1;
  2208.  
  2209.    for (int i = 0; i < ir->type->vector_elements; i++) {
  2210.       if (!(remaining_writemask & (1 << i)))
  2211.          continue;
  2212.  
  2213.       dst->writemask = 1 << i;
  2214.       dst->type = brw_type_for_base_type(ir->type);
  2215.  
  2216.       /* Find other components that match the one we're about to
  2217.        * write.  Emits fewer instructions for things like vec4(0.5,
  2218.        * 1.5, 1.5, 1.5).
  2219.        */
  2220.       for (int j = i + 1; j < ir->type->vector_elements; j++) {
  2221.          if (ir->type->base_type == GLSL_TYPE_BOOL) {
  2222.             if (ir->value.b[i] == ir->value.b[j])
  2223.                dst->writemask |= (1 << j);
  2224.          } else {
  2225.             /* u, i, and f storage all line up, so no need for a
  2226.              * switch case for comparing each type.
  2227.              */
  2228.             if (ir->value.u[i] == ir->value.u[j])
  2229.                dst->writemask |= (1 << j);
  2230.          }
  2231.       }
  2232.  
  2233.       switch (ir->type->base_type) {
  2234.       case GLSL_TYPE_FLOAT:
  2235.          emit(MOV(*dst, src_reg(ir->value.f[i])));
  2236.          break;
  2237.       case GLSL_TYPE_INT:
  2238.          emit(MOV(*dst, src_reg(ir->value.i[i])));
  2239.          break;
  2240.       case GLSL_TYPE_UINT:
  2241.          emit(MOV(*dst, src_reg(ir->value.u[i])));
  2242.          break;
  2243.       case GLSL_TYPE_BOOL:
  2244.          emit(MOV(*dst, src_reg(ir->value.b[i])));
  2245.          break;
  2246.       default:
  2247.          assert(!"Non-float/uint/int/bool constant");
  2248.          break;
  2249.       }
  2250.  
  2251.       remaining_writemask &= ~dst->writemask;
  2252.    }
  2253.    dst->reg_offset++;
  2254. }
  2255.  
  2256. void
  2257. vec4_visitor::visit(ir_constant *ir)
  2258. {
  2259.    dst_reg dst = dst_reg(this, ir->type);
  2260.    this->result = src_reg(dst);
  2261.  
  2262.    emit_constant_values(&dst, ir);
  2263. }
  2264.  
  2265. void
  2266. vec4_visitor::visit(ir_call *ir)
  2267. {
  2268.    assert(!"not reached");
  2269. }
  2270.  
  2271. void
  2272. vec4_visitor::visit(ir_texture *ir)
  2273. {
  2274.    int sampler =
  2275.       _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
  2276.  
  2277.    /* Should be lowered by do_lower_texture_projection */
  2278.    assert(!ir->projector);
  2279.  
  2280.    /* Generate code to compute all the subexpression trees.  This has to be
  2281.     * done before loading any values into MRFs for the sampler message since
  2282.     * generating these values may involve SEND messages that need the MRFs.
  2283.     */
  2284.    src_reg coordinate;
  2285.    if (ir->coordinate) {
  2286.       ir->coordinate->accept(this);
  2287.       coordinate = this->result;
  2288.    }
  2289.  
  2290.    src_reg shadow_comparitor;
  2291.    if (ir->shadow_comparitor) {
  2292.       ir->shadow_comparitor->accept(this);
  2293.       shadow_comparitor = this->result;
  2294.    }
  2295.  
  2296.    const glsl_type *lod_type = NULL, *sample_index_type = NULL;
  2297.    src_reg lod, dPdx, dPdy, sample_index;
  2298.    switch (ir->op) {
  2299.    case ir_tex:
  2300.       lod = src_reg(0.0f);
  2301.       lod_type = glsl_type::float_type;
  2302.       break;
  2303.    case ir_txf:
  2304.    case ir_txl:
  2305.    case ir_txs:
  2306.       ir->lod_info.lod->accept(this);
  2307.       lod = this->result;
  2308.       lod_type = ir->lod_info.lod->type;
  2309.       break;
  2310.    case ir_txf_ms:
  2311.       ir->lod_info.sample_index->accept(this);
  2312.       sample_index = this->result;
  2313.       sample_index_type = ir->lod_info.sample_index->type;
  2314.       break;
  2315.    case ir_txd:
  2316.       ir->lod_info.grad.dPdx->accept(this);
  2317.       dPdx = this->result;
  2318.  
  2319.       ir->lod_info.grad.dPdy->accept(this);
  2320.       dPdy = this->result;
  2321.  
  2322.       lod_type = ir->lod_info.grad.dPdx->type;
  2323.       break;
  2324.    case ir_txb:
  2325.    case ir_lod:
  2326.       break;
  2327.    }
  2328.  
  2329.    vec4_instruction *inst = NULL;
  2330.    switch (ir->op) {
  2331.    case ir_tex:
  2332.    case ir_txl:
  2333.       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
  2334.       break;
  2335.    case ir_txd:
  2336.       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
  2337.       break;
  2338.    case ir_txf:
  2339.       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
  2340.       break;
  2341.    case ir_txf_ms:
  2342.       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MS);
  2343.       break;
  2344.    case ir_txs:
  2345.       inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
  2346.       break;
  2347.    case ir_txb:
  2348.       assert(!"TXB is not valid for vertex shaders.");
  2349.       break;
  2350.    case ir_lod:
  2351.       assert(!"LOD is not valid for vertex shaders.");
  2352.       break;
  2353.    }
  2354.  
  2355.    bool use_texture_offset = ir->offset != NULL && ir->op != ir_txf;
  2356.  
  2357.    /* Texel offsets go in the message header; Gen4 also requires headers. */
  2358.    inst->header_present = use_texture_offset || brw->gen < 5;
  2359.    inst->base_mrf = 2;
  2360.    inst->mlen = inst->header_present + 1; /* always at least one */
  2361.    inst->sampler = sampler;
  2362.    inst->dst = dst_reg(this, ir->type);
  2363.    inst->dst.writemask = WRITEMASK_XYZW;
  2364.    inst->shadow_compare = ir->shadow_comparitor != NULL;
  2365.  
  2366.    if (use_texture_offset)
  2367.       inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
  2368.  
  2369.    /* MRF for the first parameter */
  2370.    int param_base = inst->base_mrf + inst->header_present;
  2371.  
  2372.    if (ir->op == ir_txs) {
  2373.       int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
  2374.       emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
  2375.    } else {
  2376.       int i, coord_mask = 0, zero_mask = 0;
  2377.       /* Load the coordinate */
  2378.       /* FINISHME: gl_clamp_mask and saturate */
  2379.       for (i = 0; i < ir->coordinate->type->vector_elements; i++)
  2380.          coord_mask |= (1 << i);
  2381.       for (; i < 4; i++)
  2382.          zero_mask |= (1 << i);
  2383.  
  2384.       if (ir->offset && ir->op == ir_txf) {
  2385.          /* It appears that the ld instruction used for txf does its
  2386.           * address bounds check before adding in the offset.  To work
  2387.           * around this, just add the integer offset to the integer
  2388.           * texel coordinate, and don't put the offset in the header.
  2389.           */
  2390.          ir_constant *offset = ir->offset->as_constant();
  2391.          assert(offset);
  2392.  
  2393.          for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
  2394.             src_reg src = coordinate;
  2395.             src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
  2396.                                        BRW_GET_SWZ(src.swizzle, j),
  2397.                                        BRW_GET_SWZ(src.swizzle, j),
  2398.                                        BRW_GET_SWZ(src.swizzle, j));
  2399.             emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
  2400.                      src, offset->value.i[j]));
  2401.          }
  2402.       } else {
  2403.          emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
  2404.                   coordinate));
  2405.       }
  2406.       if (zero_mask != 0) {
  2407.          emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
  2408.                   src_reg(0)));
  2409.       }
  2410.       /* Load the shadow comparitor */
  2411.       if (ir->shadow_comparitor && ir->op != ir_txd) {
  2412.          emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
  2413.                           WRITEMASK_X),
  2414.                   shadow_comparitor));
  2415.          inst->mlen++;
  2416.       }
  2417.  
  2418.       /* Load the LOD info */
  2419.       if (ir->op == ir_tex || ir->op == ir_txl) {
  2420.          int mrf, writemask;
  2421.          if (brw->gen >= 5) {
  2422.             mrf = param_base + 1;
  2423.             if (ir->shadow_comparitor) {
  2424.                writemask = WRITEMASK_Y;
  2425.                /* mlen already incremented */
  2426.             } else {
  2427.                writemask = WRITEMASK_X;
  2428.                inst->mlen++;
  2429.             }
  2430.          } else /* brw->gen == 4 */ {
  2431.             mrf = param_base;
  2432.             writemask = WRITEMASK_W;
  2433.          }
  2434.          emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
  2435.       } else if (ir->op == ir_txf) {
  2436.          emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
  2437.       } else if (ir->op == ir_txf_ms) {
  2438.          emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
  2439.                   sample_index));
  2440.          inst->mlen++;
  2441.  
   2442.          /* On Gen7 there is an additional MCS parameter here, after the
   2443.           * sample index, but we don't bother to emit it since it's always
   2444.           * zero.  If we start supporting texturing from CMS surfaces, this
   2445.           * will have to change.
   2446.           */
  2447.       } else if (ir->op == ir_txd) {
  2448.          const glsl_type *type = lod_type;
  2449.  
  2450.          if (brw->gen >= 5) {
  2451.             dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
  2452.             dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
  2453.             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
  2454.             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
  2455.             inst->mlen++;
  2456.  
  2457.             if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
  2458.                dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
  2459.                dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
  2460.                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
  2461.                emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
  2462.                inst->mlen++;
  2463.  
  2464.                if (ir->shadow_comparitor) {
  2465.                   emit(MOV(dst_reg(MRF, param_base + 2,
  2466.                                    ir->shadow_comparitor->type, WRITEMASK_Z),
  2467.                            shadow_comparitor));
  2468.                }
  2469.             }
  2470.          } else /* brw->gen == 4 */ {
  2471.             emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
  2472.             emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
  2473.             inst->mlen += 2;
  2474.          }
  2475.       }
  2476.    }
  2477.  
  2478.    emit(inst);
  2479.  
  2480.    /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
  2481.     * spec requires layers.
  2482.     */
  2483.    if (ir->op == ir_txs) {
  2484.       glsl_type const *type = ir->sampler->type;
  2485.       if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
  2486.           type->sampler_array) {
  2487.          emit_math(SHADER_OPCODE_INT_QUOTIENT,
  2488.                    with_writemask(inst->dst, WRITEMASK_Z),
  2489.                    src_reg(inst->dst), src_reg(6));
  2490.       }
  2491.    }
  2492.  
  2493.    swizzle_result(ir, src_reg(inst->dst), sampler);
  2494. }
  2495.  
  2496. void
  2497. vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
  2498. {
  2499.    int s = key->tex.swizzles[sampler];
  2500.  
  2501.    this->result = src_reg(this, ir->type);
  2502.    dst_reg swizzled_result(this->result);
  2503.  
  2504.    if (ir->op == ir_txs || ir->type == glsl_type::float_type
  2505.                         || s == SWIZZLE_NOOP) {
  2506.       emit(MOV(swizzled_result, orig_val));
  2507.       return;
  2508.    }
  2509.  
  2510.    int zero_mask = 0, one_mask = 0, copy_mask = 0;
  2511.    int swizzle[4] = {0};
  2512.  
  2513.    for (int i = 0; i < 4; i++) {
  2514.       switch (GET_SWZ(s, i)) {
  2515.       case SWIZZLE_ZERO:
  2516.          zero_mask |= (1 << i);
  2517.          break;
  2518.       case SWIZZLE_ONE:
  2519.          one_mask |= (1 << i);
  2520.          break;
  2521.       default:
  2522.          copy_mask |= (1 << i);
  2523.          swizzle[i] = GET_SWZ(s, i);
  2524.          break;
  2525.       }
  2526.    }
  2527.  
  2528.    if (copy_mask) {
  2529.       orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
  2530.       swizzled_result.writemask = copy_mask;
  2531.       emit(MOV(swizzled_result, orig_val));
  2532.    }
  2533.  
  2534.    if (zero_mask) {
  2535.       swizzled_result.writemask = zero_mask;
  2536.       emit(MOV(swizzled_result, src_reg(0.0f)));
  2537.    }
  2538.  
  2539.    if (one_mask) {
  2540.       swizzled_result.writemask = one_mask;
  2541.       emit(MOV(swizzled_result, src_reg(1.0f)));
  2542.    }
  2543. }
  2544.  
  2545. void
  2546. vec4_visitor::visit(ir_return *ir)
  2547. {
  2548.    assert(!"not reached");
  2549. }
  2550.  
  2551. void
  2552. vec4_visitor::visit(ir_discard *ir)
  2553. {
  2554.    assert(!"not reached");
  2555. }
  2556.  
  2557. void
  2558. vec4_visitor::visit(ir_if *ir)
  2559. {
  2560.    /* Don't point the annotation at the if statement, because then it plus
  2561.     * the then and else blocks get printed.
  2562.     */
  2563.    this->base_ir = ir->condition;
  2564.  
  2565.    if (brw->gen == 6) {
  2566.       emit_if_gen6(ir);
  2567.    } else {
  2568.       uint32_t predicate;
  2569.       emit_bool_to_cond_code(ir->condition, &predicate);
  2570.       emit(IF(predicate));
  2571.    }
  2572.  
  2573.    visit_instructions(&ir->then_instructions);
  2574.  
  2575.    if (!ir->else_instructions.is_empty()) {
  2576.       this->base_ir = ir->condition;
  2577.       emit(BRW_OPCODE_ELSE);
  2578.  
  2579.       visit_instructions(&ir->else_instructions);
  2580.    }
  2581.  
  2582.    this->base_ir = ir->condition;
  2583.    emit(BRW_OPCODE_ENDIF);
  2584. }
  2585.  
  2586. void
  2587. vec4_visitor::emit_ndc_computation()
  2588. {
  2589.    /* Get the position */
  2590.    src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
  2591.  
  2592.    /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
  2593.    dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
  2594.    output_reg[BRW_VARYING_SLOT_NDC] = ndc;
  2595.  
  2596.    current_annotation = "NDC";
  2597.    dst_reg ndc_w = ndc;
  2598.    ndc_w.writemask = WRITEMASK_W;
  2599.    src_reg pos_w = pos;
  2600.    pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
  2601.    emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
  2602.  
  2603.    dst_reg ndc_xyz = ndc;
  2604.    ndc_xyz.writemask = WRITEMASK_XYZ;
  2605.  
  2606.    emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
  2607. }
  2608.  
  2609. void
  2610. vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
  2611. {
  2612.    if (brw->gen < 6 &&
  2613.        ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
  2614.         key->userclip_active || brw->has_negative_rhw_bug)) {
  2615.       dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
  2616.       dst_reg header1_w = header1;
  2617.       header1_w.writemask = WRITEMASK_W;
  2618.       GLuint i;
  2619.  
  2620.       emit(MOV(header1, 0u));
  2621.  
  2622.       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
  2623.          src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
  2624.  
  2625.          current_annotation = "Point size";
  2626.          emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
  2627.          emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
  2628.       }
  2629.  
  2630.       current_annotation = "Clipping flags";
  2631.       for (i = 0; i < key->nr_userclip_plane_consts; i++) {
  2632.          vec4_instruction *inst;
  2633.          gl_varying_slot slot = (prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)
  2634.             ? VARYING_SLOT_CLIP_VERTEX : VARYING_SLOT_POS;
  2635.  
  2636.          inst = emit(DP4(dst_null_f(), src_reg(output_reg[slot]),
  2637.                          src_reg(this->userplane[i])));
  2638.          inst->conditional_mod = BRW_CONDITIONAL_L;
  2639.  
  2640.          inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
  2641.          inst->predicate = BRW_PREDICATE_NORMAL;
  2642.       }
  2643.  
  2644.       /* i965 clipping workaround:
   2645.        * 1) Test for negative RHW (NDC W, i.e. 1/w, less than zero)
  2646.        * 2) If set,
  2647.        *      set ndc = (0,0,0,0)
  2648.        *      set ucp[6] = 1
  2649.        *
  2650.        * Later, clipping will detect ucp[6] and ensure the primitive is
  2651.        * clipped against all fixed planes.
  2652.        */
  2653.       if (brw->has_negative_rhw_bug) {
  2654.          src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
  2655.          ndc_w.swizzle = BRW_SWIZZLE_WWWW;
  2656.          emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
  2657.          vec4_instruction *inst;
  2658.          inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
  2659.          inst->predicate = BRW_PREDICATE_NORMAL;
  2660.          inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
  2661.          inst->predicate = BRW_PREDICATE_NORMAL;
  2662.       }
  2663.  
  2664.       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
  2665.    } else if (brw->gen < 6) {
  2666.       emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
  2667.    } else {
  2668.       emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
  2669.       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
  2670.          emit(MOV(brw_writemask(reg, WRITEMASK_W),
  2671.                   src_reg(output_reg[VARYING_SLOT_PSIZ])));
  2672.       }
  2673.       if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
  2674.          emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
  2675.                   src_reg(output_reg[VARYING_SLOT_LAYER])));
  2676.       }
  2677.    }
  2678. }
  2679.  
  2680. void
  2681. vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
  2682. {
  2683.    if (brw->gen < 6) {
  2684.       /* Clip distance slots are set aside in gen5, but they are not used.  It
  2685.        * is not clear whether we actually need to set aside space for them,
  2686.        * but the performance cost is negligible.
  2687.        */
  2688.       return;
  2689.    }
  2690.  
  2691.    /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
  2692.     *
  2693.     *     "If a linked set of shaders forming the vertex stage contains no
  2694.     *     static write to gl_ClipVertex or gl_ClipDistance, but the
  2695.     *     application has requested clipping against user clip planes through
  2696.     *     the API, then the coordinate written to gl_Position is used for
  2697.     *     comparison against the user clip planes."
  2698.     *
  2699.     * This function is only called if the shader didn't write to
  2700.     * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
  2701.     * if the user wrote to it; otherwise we use gl_Position.
  2702.     */
  2703.    gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
  2704.    if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
  2705.       clip_vertex = VARYING_SLOT_POS;
  2706.    }
  2707.  
  2708.    for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
  2709.         ++i) {
  2710.       emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
  2711.                src_reg(output_reg[clip_vertex]),
  2712.                src_reg(this->userplane[i + offset])));
  2713.    }
  2714. }
  2715.  
  2716. void
  2717. vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
  2718. {
  2719.    assert (varying < VARYING_SLOT_MAX);
  2720.    reg.type = output_reg[varying].type;
  2721.    current_annotation = output_reg_annotation[varying];
  2722.    /* Copy the register, saturating if necessary */
  2723.    vec4_instruction *inst = emit(MOV(reg,
  2724.                                      src_reg(output_reg[varying])));
  2725.    if ((varying == VARYING_SLOT_COL0 ||
  2726.         varying == VARYING_SLOT_COL1 ||
  2727.         varying == VARYING_SLOT_BFC0 ||
  2728.         varying == VARYING_SLOT_BFC1) &&
  2729.        key->clamp_vertex_color) {
  2730.       inst->saturate = true;
  2731.    }
  2732. }
  2733.  
  2734. void
  2735. vec4_visitor::emit_urb_slot(int mrf, int varying)
  2736. {
  2737.    struct brw_reg hw_reg = brw_message_reg(mrf);
  2738.    dst_reg reg = dst_reg(MRF, mrf);
  2739.    reg.type = BRW_REGISTER_TYPE_F;
  2740.  
  2741.    switch (varying) {
  2742.    case VARYING_SLOT_PSIZ:
  2743.       /* PSIZ is always in slot 0, and is coupled with other flags. */
  2744.       current_annotation = "indices, point width, clip flags";
  2745.       emit_psiz_and_flags(hw_reg);
  2746.       break;
  2747.    case BRW_VARYING_SLOT_NDC:
  2748.       current_annotation = "NDC";
  2749.       emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
  2750.       break;
  2751.    case VARYING_SLOT_POS:
  2752.       current_annotation = "gl_Position";
  2753.       emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
  2754.       break;
  2755.    case VARYING_SLOT_CLIP_DIST0:
  2756.    case VARYING_SLOT_CLIP_DIST1:
  2757.       if (this->key->uses_clip_distance) {
  2758.          emit_generic_urb_slot(reg, varying);
  2759.       } else {
  2760.          current_annotation = "user clip distances";
  2761.          emit_clip_distances(hw_reg, (varying - VARYING_SLOT_CLIP_DIST0) * 4);
  2762.       }
  2763.       break;
  2764.    case VARYING_SLOT_EDGE:
  2765.       /* This is present when doing unfilled polygons.  We're supposed to copy
  2766.        * the edge flag from the user-provided vertex array
  2767.        * (glEdgeFlagPointer), or otherwise we'll copy from the current value
  2768.        * of that attribute (starts as 1.0f).  This is then used in clipping to
  2769.        * determine which edges should be drawn as wireframe.
  2770.        */
  2771.       current_annotation = "edge flag";
  2772.       emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
  2773.                                     glsl_type::float_type, WRITEMASK_XYZW))));
  2774.       break;
  2775.    case BRW_VARYING_SLOT_PAD:
  2776.       /* No need to write to this slot */
  2777.       break;
  2778.    default:
  2779.       emit_generic_urb_slot(reg, varying);
  2780.       break;
  2781.    }
  2782. }
  2783.  
  2784. static int
  2785. align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
  2786. {
  2787.    if (brw->gen >= 6) {
  2788.       /* URB data written (does not include the message header reg) must
  2789.        * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
  2790.        * section 5.4.3.2.2: URB_INTERLEAVED.
  2791.        *
  2792.        * URB entries are allocated on a multiple of 1024 bits, so an
  2793.        * extra 128 bits written here to make the end align to 256 is
  2794.        * no problem.
  2795.        */
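               /* mlen here still counts the header register, so it must end
                * up odd: e.g. an mlen of 6 (header plus 5 data registers) is
                * bumped to 7 so that 6 data registers are written.
                */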
  2796.       if ((mlen % 2) != 1)
  2797.          mlen++;
  2798.    }
  2799.  
  2800.    return mlen;
  2801. }
  2802.  
  2803. void
  2804. vec4_vs_visitor::emit_urb_write_header(int mrf)
  2805. {
  2806.    /* No need to do anything for VS; an implied write to this MRF will be
  2807.     * performed by VS_OPCODE_URB_WRITE.
  2808.     */
  2809.    (void) mrf;
  2810. }
  2811.  
  2812. vec4_instruction *
  2813. vec4_vs_visitor::emit_urb_write_opcode(bool complete)
  2814. {
  2815.    /* For VS, the URB writes end the thread. */
  2816.    if (complete) {
  2817.       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
  2818.          emit_shader_time_end();
  2819.    }
  2820.  
  2821.    vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
  2822.    inst->eot = complete;
  2823.  
  2824.    return inst;
  2825. }
  2826.  
  2827. /**
  2828.  * Generates the VUE payload plus the necessary URB write instructions to
  2829.  * output it.
  2830.  *
  2831.  * The VUE layout is documented in Volume 2a.
  2832.  */
  2833. void
  2834. vec4_visitor::emit_vertex()
  2835. {
  2836.    /* MRF 0 is reserved for the debugger, so start with message header
  2837.     * in MRF 1.
  2838.     */
  2839.    int base_mrf = 1;
  2840.    int mrf = base_mrf;
  2841.    /* In the process of generating our URB write message contents, we
  2842.     * may need to unspill a register or load from an array.  Those
  2843.     * reads would use MRFs 14-15.
  2844.     */
  2845.    int max_usable_mrf = 13;
  2846.  
  2847.    /* The following assertion verifies that max_usable_mrf causes an
  2848.     * even-numbered amount of URB write data, which will meet gen6's
  2849.     * requirements for length alignment.
  2850.     */
  2851.    assert ((max_usable_mrf - base_mrf) % 2 == 0);
  2852.  
  2853.    /* First mrf is the g0-based message header containing URB handles and
  2854.     * such.
  2855.     */
  2856.    emit_urb_write_header(mrf++);
  2857.  
  2858.    if (brw->gen < 6) {
  2859.       emit_ndc_computation();
  2860.    }
  2861.  
  2862.    /* Set up the VUE data for the first URB write */
  2863.    int slot;
  2864.    for (slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
  2865.       emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
  2866.  
  2867.       /* If this was max_usable_mrf, we can't fit anything more into this URB
  2868.        * WRITE.
  2869.        */
  2870.       if (mrf > max_usable_mrf) {
  2871.          slot++;
  2872.          break;
  2873.       }
  2874.    }
  2875.  
  2876.    bool complete = slot >= prog_data->vue_map.num_slots;
  2877.    current_annotation = "URB write";
  2878.    vec4_instruction *inst = emit_urb_write_opcode(complete);
  2879.    inst->base_mrf = base_mrf;
  2880.    inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
  2881.  
  2882.    /* Optional second URB write */
  2883.    if (!complete) {
  2884.       mrf = base_mrf + 1;
  2885.  
  2886.       for (; slot < prog_data->vue_map.num_slots; ++slot) {
  2887.          assert(mrf < max_usable_mrf);
  2888.  
  2889.          emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
  2890.       }
  2891.  
  2892.       current_annotation = "URB write";
  2893.       inst = emit_urb_write_opcode(true /* complete */);
  2894.       inst->base_mrf = base_mrf;
  2895.       inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
  2896.       /* URB destination offset.  In the previous write, we got MRFs
  2897.        * 2-13 minus the one header MRF, so 12 regs.  URB offset is in
  2898.        * URB row increments, and each of our MRFs is half of one of
  2899.        * those, since we're doing interleaved writes.
  2900.        */
  2901.       inst->offset = (max_usable_mrf - base_mrf) / 2;
  2902.    }
  2903. }
  2904.  
  2905. void
  2906. vec4_vs_visitor::emit_thread_end()
  2907. {
  2908.    /* For VS, we always end the thread by emitting a single vertex.
  2909.     * emit_urb_write_opcode() will take care of setting the eot flag on the
  2910.     * SEND instruction.
  2911.     */
  2912.    emit_vertex();
  2913. }
  2914.  
  2915. src_reg
  2916. vec4_visitor::get_scratch_offset(vec4_instruction *inst,
  2917.                                  src_reg *reladdr, int reg_offset)
  2918. {
  2919.    /* Because we store the values to scratch interleaved like our
  2920.     * vertex data, we need to scale the vec4 index by 2.
  2921.     */
  2922.    int message_header_scale = 2;
  2923.  
  2924.    /* Pre-gen6, the message header uses byte offsets instead of vec4
  2925.     * (16-byte) offset units.
  2926.     */
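            /* So a reg_offset of 3, for instance, becomes an offset of 6
             * vec4s on Gen6+ and 96 bytes on earlier generations.
             */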
  2927.    if (brw->gen < 6)
  2928.       message_header_scale *= 16;
  2929.  
  2930.    if (reladdr) {
  2931.       src_reg index = src_reg(this, glsl_type::int_type);
  2932.  
  2933.       emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
  2934.       emit_before(inst, MUL(dst_reg(index),
  2935.                             index, src_reg(message_header_scale)));
  2936.  
  2937.       return index;
  2938.    } else {
  2939.       return src_reg(reg_offset * message_header_scale);
  2940.    }
  2941. }
  2942.  
  2943. src_reg
  2944. vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
  2945.                                        src_reg *reladdr, int reg_offset)
  2946. {
  2947.    if (reladdr) {
  2948.       src_reg index = src_reg(this, glsl_type::int_type);
  2949.  
  2950.       emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
  2951.  
  2952.       /* Pre-gen6, the message header uses byte offsets instead of vec4
  2953.        * (16-byte) offset units.
  2954.        */
  2955.       if (brw->gen < 6) {
  2956.          emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
  2957.       }
  2958.  
  2959.       return index;
  2960.    } else {
  2961.       int message_header_scale = brw->gen < 6 ? 16 : 1;
  2962.       return src_reg(reg_offset * message_header_scale);
  2963.    }
  2964. }
  2965.  
  2966. /**
  2967.  * Emits an instruction before @inst to load the value named by @orig_src
  2968.  * from scratch space at @base_offset to @temp.
  2969.  *
  2970.  * @base_offset is measured in 32-byte units (the size of a register).
  2971.  */
  2972. void
  2973. vec4_visitor::emit_scratch_read(vec4_instruction *inst,
  2974.                                 dst_reg temp, src_reg orig_src,
  2975.                                 int base_offset)
  2976. {
  2977.    int reg_offset = base_offset + orig_src.reg_offset;
  2978.    src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
  2979.  
  2980.    emit_before(inst, SCRATCH_READ(temp, index));
  2981. }
  2982.  
  2983. /**
  2984.  * Emits an instruction after @inst to store @inst's result to scratch
  2985.  * space at @base_offset, copying it through a freshly allocated temporary.
  2986.  *
  2987.  * @base_offset is measured in 32-byte units (the size of a register).
  2988.  */
  2989. void
  2990. vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
  2991. {
  2992.    int reg_offset = base_offset + inst->dst.reg_offset;
  2993.    src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
  2994.  
  2995.    /* Create a temporary register to store *inst's result in.
  2996.     *
  2997.     * We have to be careful in MOVing from our temporary result register in
  2998.     * the scratch write.  If we swizzle from channels of the temporary that
  2999.     * weren't initialized, it will confuse live interval analysis, which will
  3000.     * make spilling fail to make progress.
  3001.     */
  3002.    src_reg temp = src_reg(this, glsl_type::vec4_type);
  3003.    temp.type = inst->dst.type;
  3004.    int first_writemask_chan = ffs(inst->dst.writemask) - 1;
  3005.    int swizzles[4];
  3006.    for (int i = 0; i < 4; i++)
  3007.       if (inst->dst.writemask & (1 << i))
  3008.          swizzles[i] = i;
  3009.       else
  3010.          swizzles[i] = first_writemask_chan;
  3011.    temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
  3012.                                swizzles[2], swizzles[3]);
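           /* For example, a writemask of XZ gives first_writemask_chan = 0 and
            * swizzles = {0, 0, 2, 0}, so reads of the uninitialized Y and W
            * channels of the temporary are redirected to the initialized X
            * channel.
            */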
  3013.  
  3014.    dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
  3015.                                        inst->dst.writemask));
  3016.    vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
  3017.    write->predicate = inst->predicate;
  3018.    write->ir = inst->ir;
  3019.    write->annotation = inst->annotation;
  3020.    inst->insert_after(write);
  3021.  
  3022.    inst->dst.file = temp.file;
  3023.    inst->dst.reg = temp.reg;
  3024.    inst->dst.reg_offset = temp.reg_offset;
  3025.    inst->dst.reladdr = NULL;
  3026. }
  3027.  
  3028. /**
  3029.  * We can't generally support array access in GRF space, because a
  3030.  * single instruction's destination can only span 2 contiguous
  3031.  * registers.  So, we send all GRF arrays that get variable index
  3032.  * access to scratch space.
  3033.  */
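        /* Illustrative example (hypothetical shader code): a local array indexed
         * with a non-constant expression, e.g.
         *
         *    vec4 verts[4];
         *    ...
         *    gl_Position = verts[i];
         *
         * gives the GRF access a reladdr, which the pass below relocates to
         * scratch space.
         */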
  3034. void
  3035. vec4_visitor::move_grf_array_access_to_scratch()
  3036. {
  3037.    int scratch_loc[this->virtual_grf_count];
  3038.  
  3039.    for (int i = 0; i < this->virtual_grf_count; i++) {
  3040.       scratch_loc[i] = -1;
  3041.    }
  3042.  
  3043.    /* First, calculate the set of virtual GRFs that need to be punted
  3044.     * to scratch due to having any array access on them, and record
  3045.     * where in scratch each of them will live.
  3046.     */
  3047.    foreach_list(node, &this->instructions) {
  3048.       vec4_instruction *inst = (vec4_instruction *)node;
  3049.  
  3050.       if (inst->dst.file == GRF && inst->dst.reladdr &&
  3051.           scratch_loc[inst->dst.reg] == -1) {
  3052.          scratch_loc[inst->dst.reg] = c->last_scratch;
  3053.          c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
  3054.       }
  3055.  
  3056.       for (int i = 0 ; i < 3; i++) {
  3057.          src_reg *src = &inst->src[i];
  3058.  
  3059.          if (src->file == GRF && src->reladdr &&
  3060.              scratch_loc[src->reg] == -1) {
  3061.             scratch_loc[src->reg] = c->last_scratch;
  3062.             c->last_scratch += this->virtual_grf_sizes[src->reg];
  3063.          }
  3064.       }
  3065.    }
  3066.  
  3067.    /* Now, for anything that will be accessed through scratch, rewrite
  3068.     * it to load/store.  Note that this is a _safe list walk, because
  3069.     * we may generate a new scratch_write instruction after the one
  3070.     * we're processing.
  3071.     */
  3072.    foreach_list_safe(node, &this->instructions) {
  3073.       vec4_instruction *inst = (vec4_instruction *)node;
  3074.  
  3075.       /* Set up the annotation tracking for newly generated instructions. */
  3076.       base_ir = inst->ir;
  3077.       current_annotation = inst->annotation;
  3078.  
  3079.       if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
  3080.          emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
  3081.       }
  3082.  
  3083.       for (int i = 0 ; i < 3; i++) {
  3084.          if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
  3085.             continue;
  3086.  
  3087.          dst_reg temp = dst_reg(this, glsl_type::vec4_type);
  3088.  
  3089.          emit_scratch_read(inst, temp, inst->src[i],
  3090.                            scratch_loc[inst->src[i].reg]);
  3091.  
  3092.          inst->src[i].file = temp.file;
  3093.          inst->src[i].reg = temp.reg;
  3094.          inst->src[i].reg_offset = temp.reg_offset;
  3095.          inst->src[i].reladdr = NULL;
  3096.       }
  3097.    }
  3098. }
  3099.  
  3100. /**
  3101.  * Emits an instruction before @inst to load the value named by @orig_src
  3102.  * from the pull constant buffer (surface) at @base_offset to @temp.
  3103.  */
  3104. void
  3105. vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
  3106.                                       dst_reg temp, src_reg orig_src,
  3107.                                       int base_offset)
  3108. {
  3109.    int reg_offset = base_offset + orig_src.reg_offset;
  3110.    src_reg index = src_reg((unsigned)SURF_INDEX_VERT_CONST_BUFFER);
  3111.    src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
  3112.    vec4_instruction *load;
  3113.  
  3114.    if (brw->gen >= 7) {
  3115.       dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
  3116.       grf_offset.type = offset.type;
  3117.       emit_before(inst, MOV(grf_offset, offset));
  3118.  
  3119.       load = new(mem_ctx) vec4_instruction(this,
  3120.                                            VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
  3121.                                            temp, index, src_reg(grf_offset));
  3122.    } else {
  3123.       load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
  3124.                                            temp, index, offset);
  3125.       load->base_mrf = 14;
  3126.       load->mlen = 1;
  3127.    }
  3128.    emit_before(inst, load);
  3129. }
  3130.  
  3131. /**
  3132.  * Implements array access of uniforms by inserting a
  3133.  * PULL_CONSTANT_LOAD instruction.
  3134.  *
  3135.  * Unlike temporary GRF array access (where we don't support it due to
  3136.  * the difficulty of doing relative addressing on instruction
  3137.  * destinations), we could potentially do array access of uniforms
  3138.  * that were loaded in GRF space as push constants.  In real-world
  3139.  * usage we've seen, though, the arrays being used are always larger
  3140.  * than we could load as push constants, so just always move all
  3141.  * uniform array access out to a pull constant buffer.
  3142.  */
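        /* Illustrative example (hypothetical shader code): indexing a uniform
         * array with a non-constant value, e.g.
         *
         *    uniform vec4 colors[64];
         *    ...
         *    color = colors[i];
         *
         * leaves a reladdr on the UNIFORM source, which the pass below turns
         * into a pull-constant load.
         */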
  3143. void
  3144. vec4_visitor::move_uniform_array_access_to_pull_constants()
  3145. {
  3146.    int pull_constant_loc[this->uniforms];
  3147.  
  3148.    for (int i = 0; i < this->uniforms; i++) {
  3149.       pull_constant_loc[i] = -1;
  3150.    }
  3151.  
  3152.    /* Walk through and find array access of uniforms.  Put a copy of that
  3153.     * uniform in the pull constant buffer.
  3154.     *
  3155.     * Note that we don't move constant-indexed accesses to arrays.  No
  3156.     * testing has been done of the performance impact of this choice.
  3157.     */
  3158.    foreach_list_safe(node, &this->instructions) {
  3159.       vec4_instruction *inst = (vec4_instruction *)node;
  3160.  
  3161.       for (int i = 0 ; i < 3; i++) {
  3162.          if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
  3163.             continue;
  3164.  
  3165.          int uniform = inst->src[i].reg;
  3166.  
  3167.          /* If this array isn't already present in the pull constant buffer,
  3168.           * add it.
  3169.           */
  3170.          if (pull_constant_loc[uniform] == -1) {
  3171.             const float **values = &prog_data->param[uniform * 4];
  3172.  
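                    /* nr_pull_params counts individual floats (four per vec4
                     * slot), so dividing by four gives this array's starting
                     * vec4 slot in the pull constant buffer.
                     */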
  3173.             pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
  3174.  
  3175.             for (int j = 0; j < uniform_size[uniform] * 4; j++) {
  3176.                prog_data->pull_param[prog_data->nr_pull_params++]
  3177.                   = values[j];
  3178.             }
  3179.          }
  3180.  
  3181.          /* Set up the annotation tracking for newly generated instructions. */
  3182.          base_ir = inst->ir;
  3183.          current_annotation = inst->annotation;
  3184.  
  3185.          dst_reg temp = dst_reg(this, glsl_type::vec4_type);
  3186.  
  3187.          emit_pull_constant_load(inst, temp, inst->src[i],
  3188.                                  pull_constant_loc[uniform]);
  3189.  
  3190.          inst->src[i].file = temp.file;
  3191.          inst->src[i].reg = temp.reg;
  3192.          inst->src[i].reg_offset = temp.reg_offset;
  3193.          inst->src[i].reladdr = NULL;
  3194.       }
  3195.    }
  3196.  
  3197.    /* Now there are no accesses of the UNIFORM file with a reladdr, so
  3198.     * no need to track them as larger-than-vec4 objects.  This will be
  3199.     * relied on in cutting out unused uniform vectors from push
  3200.     * constants.
  3201.     */
  3202.    split_uniform_registers();
  3203. }
  3204.  
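        /**
         * Resolves a negate modifier on an unsigned (UD) source by materializing
         * the negated value with an explicit MOV into an unsigned temporary and
         * then reading the temporary instead of a negated source.
         */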
  3205. void
  3206. vec4_visitor::resolve_ud_negate(src_reg *reg)
  3207. {
  3208.    if (reg->type != BRW_REGISTER_TYPE_UD ||
  3209.        !reg->negate)
  3210.       return;
  3211.  
  3212.    src_reg temp = src_reg(this, glsl_type::uvec4_type);
  3213.    emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
  3214.    *reg = temp;
  3215. }
  3216.  
  3217. vec4_visitor::vec4_visitor(struct brw_context *brw,
  3218.                            struct brw_vec4_compile *c,
  3219.                            struct gl_program *prog,
  3220.                            const struct brw_vec4_prog_key *key,
  3221.                            struct brw_vec4_prog_data *prog_data,
  3222.                            struct gl_shader_program *shader_prog,
  3223.                            struct brw_shader *shader,
  3224.                            void *mem_ctx,
  3225.                            bool debug_flag)
  3226.    : debug_flag(debug_flag)
  3227. {
  3228.    this->brw = brw;
  3229.    this->ctx = &brw->ctx;
  3230.    this->shader_prog = shader_prog;
  3231.    this->shader = shader;
  3232.  
  3233.    this->mem_ctx = mem_ctx;
  3234.    this->failed = false;
  3235.  
  3236.    this->base_ir = NULL;
  3237.    this->current_annotation = NULL;
  3238.    memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
  3239.  
  3240.    this->c = c;
  3241.    this->prog = prog;
  3242.    this->key = key;
  3243.    this->prog_data = prog_data;
  3244.  
  3245.    this->variable_ht = hash_table_ctor(0,
  3246.                                        hash_table_pointer_hash,
  3247.                                        hash_table_pointer_compare);
  3248.  
  3249.    this->virtual_grf_start = NULL;
  3250.    this->virtual_grf_end = NULL;
  3251.    this->virtual_grf_sizes = NULL;
  3252.    this->virtual_grf_count = 0;
  3253.    this->virtual_grf_reg_map = NULL;
  3254.    this->virtual_grf_reg_count = 0;
  3255.    this->virtual_grf_array_size = 0;
  3256.    this->live_intervals_valid = false;
  3257.  
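           /* Gen7 has no architectural MRF file; message payloads are built in
            * GRFs reserved at the top of the register space, so max_grf is
            * capped at GEN7_MRF_HACK_START to keep allocation out of that range.
            */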
  3258.    this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
  3259.  
  3260.    this->uniforms = 0;
  3261. }
  3262.  
  3263. vec4_visitor::~vec4_visitor()
  3264. {
  3265.    hash_table_dtor(this->variable_ht);
  3266. }
  3267.  
  3268.  
  3269. vec4_vs_visitor::vec4_vs_visitor(struct brw_context *brw,
  3270.                                  struct brw_vs_compile *vs_compile,
  3271.                                  struct brw_vs_prog_data *vs_prog_data,
  3272.                                  struct gl_shader_program *prog,
  3273.                                  struct brw_shader *shader,
  3274.                                  void *mem_ctx)
  3275.    : vec4_visitor(brw, &vs_compile->base, &vs_compile->vp->program.Base,
  3276.                   &vs_compile->key.base, &vs_prog_data->base, prog, shader,
  3277.                   mem_ctx, INTEL_DEBUG & DEBUG_VS),
  3278.      vs_compile(vs_compile),
  3279.      vs_prog_data(vs_prog_data)
  3280. {
  3281. }
  3282.  
  3283.  
  3284. void
  3285. vec4_visitor::fail(const char *format, ...)
  3286. {
  3287.    va_list va;
  3288.    char *msg;
  3289.  
  3290.    if (failed)
  3291.       return;
  3292.  
  3293.    failed = true;
  3294.  
  3295.    va_start(va, format);
  3296.    msg = ralloc_vasprintf(mem_ctx, format, va);
  3297.    va_end(va);
  3298.    msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
  3299.  
  3300.    this->fail_msg = msg;
  3301.  
  3302.    if (debug_flag) {
  3303.       fprintf(stderr, "%s",  msg);
  3304.    }
  3305. }
  3306.  
  3307. } /* namespace brw */
  3308.