/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs_visitor.cpp
 *
 * This file supports generating the FS LIR from the GLSL IR.  The LIR
 * makes it easier to do backend-specific optimizations than doing so
 * in the GLSL IR or in the native code.
 */
#include <sys/types.h>

#include "main/macros.h"
#include "main/shaderobj.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/prog_optimize.h"
#include "util/register_allocate.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
#include "brw_cs.h"
#include "brw_vec4.h"
#include "brw_fs.h"
#include "main/uniforms.h"
#include "glsl/glsl_types.h"
#include "glsl/ir_optimization.h"
#include "program/sampler.h"


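/* Returns storage for a vertex-shader system value as an ATTR register at
 * VERT_ATTRIB_MAX.  The reg_offset values below seem to pick out fixed slots
 * in the extra vertex-attribute space that the VS setup code reserves once
 * uses_vertexid/uses_instanceid is set.
 */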
fs_reg *
fs_visitor::emit_vs_system_value(int location)
{
   fs_reg *reg = new(this->mem_ctx)
      fs_reg(ATTR, VERT_ATTRIB_MAX, BRW_REGISTER_TYPE_D);
   brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;

   switch (location) {
   case SYSTEM_VALUE_BASE_VERTEX:
      reg->reg_offset = 0;
      vs_prog_data->uses_vertexid = true;
      break;
   case SYSTEM_VALUE_VERTEX_ID:
   case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
      reg->reg_offset = 2;
      vs_prog_data->uses_vertexid = true;
      break;
   case SYSTEM_VALUE_INSTANCE_ID:
      reg->reg_offset = 3;
      vs_prog_data->uses_instanceid = true;
      break;
   default:
      unreachable("not reached");
   }

   return reg;
}

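/* Allocates (or finds) backing storage for a GLSL IR variable and records it
 * in variable_ht.  Shader inputs, outputs, uniforms and system values each
 * get mode-specific handling below; anything left over falls through to a
 * plain VGRF at the end.
 */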
void
fs_visitor::visit(ir_variable *ir)
{
   fs_reg *reg = NULL;

   if (variable_storage(ir))
      return;

   if (ir->data.mode == ir_var_shader_in) {
      assert(ir->data.location != -1);
      if (stage == MESA_SHADER_VERTEX) {
         reg = new(this->mem_ctx)
            fs_reg(ATTR, ir->data.location,
                   brw_type_for_base_type(ir->type->get_scalar_type()));
      } else if (ir->data.location == VARYING_SLOT_POS) {
         reg = emit_fragcoord_interpolation(ir->data.pixel_center_integer,
                                            ir->data.origin_upper_left);
      } else if (ir->data.location == VARYING_SLOT_FACE) {
         reg = emit_frontfacing_interpolation();
      } else {
         reg = new(this->mem_ctx) fs_reg(vgrf(ir->type));
         emit_general_interpolation(*reg, ir->name, ir->type,
                                    (glsl_interp_qualifier) ir->data.interpolation,
                                    ir->data.location, ir->data.centroid,
                                    ir->data.sample);
      }
      assert(reg);
      hash_table_insert(this->variable_ht, reg, ir);
      return;
   } else if (ir->data.mode == ir_var_shader_out) {
      reg = new(this->mem_ctx) fs_reg(vgrf(ir->type));

      if (stage == MESA_SHADER_VERTEX) {
         int vector_elements =
            ir->type->is_array() ? ir->type->fields.array->vector_elements
                                 : ir->type->vector_elements;

         for (int i = 0; i < (type_size(ir->type) + 3) / 4; i++) {
            int output = ir->data.location + i;
            this->outputs[output] = *reg;
            this->outputs[output].reg_offset = i * 4;
            this->output_components[output] = vector_elements;
         }

      } else if (ir->data.index > 0) {
         assert(ir->data.location == FRAG_RESULT_DATA0);
         assert(ir->data.index == 1);
         this->dual_src_output = *reg;
         this->do_dual_src = true;
      } else if (ir->data.location == FRAG_RESULT_COLOR) {
         /* Writing gl_FragColor outputs to all color regions. */
         assert(stage == MESA_SHADER_FRAGMENT);
         brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
         for (unsigned int i = 0; i < MAX2(key->nr_color_regions, 1); i++) {
            this->outputs[i] = *reg;
            this->output_components[i] = 4;
         }
      } else if (ir->data.location == FRAG_RESULT_DEPTH) {
         this->frag_depth = *reg;
      } else if (ir->data.location == FRAG_RESULT_SAMPLE_MASK) {
         this->sample_mask = *reg;
      } else {
         /* gl_FragData or a user-defined FS output */
         assert(ir->data.location >= FRAG_RESULT_DATA0 &&
                ir->data.location < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS);

         int vector_elements =
            ir->type->is_array() ? ir->type->fields.array->vector_elements
                                 : ir->type->vector_elements;

         /* General color output. */
         for (unsigned int i = 0; i < MAX2(1, ir->type->length); i++) {
            int output = ir->data.location - FRAG_RESULT_DATA0 + i;
            this->outputs[output] = offset(*reg, vector_elements * i);
            this->output_components[output] = vector_elements;
         }
      }
   } else if (ir->data.mode == ir_var_uniform) {
      int param_index = uniforms;

      /* Thanks to the lower_ubo_reference pass, we will see only
       * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
       * variables, so no need for them to be in variable_ht.
       *
       * Some uniforms, such as samplers and atomic counters, have no actual
       * storage, so we should ignore them.
       */
      if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
         return;

      if (dispatch_width == 16) {
         if (!variable_storage(ir)) {
            fail("Failed to find uniform '%s' in SIMD16\n", ir->name);
         }
         return;
      }

      param_size[param_index] = type_size(ir->type);
      if (!strncmp(ir->name, "gl_", 3)) {
         setup_builtin_uniform_values(ir);
      } else {
         setup_uniform_values(ir);
      }

      reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
      reg->type = brw_type_for_base_type(ir->type);

   } else if (ir->data.mode == ir_var_system_value) {
      switch (ir->data.location) {
      case SYSTEM_VALUE_BASE_VERTEX:
      case SYSTEM_VALUE_VERTEX_ID:
      case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
      case SYSTEM_VALUE_INSTANCE_ID:
         reg = emit_vs_system_value(ir->data.location);
         break;
      case SYSTEM_VALUE_SAMPLE_POS:
         reg = emit_samplepos_setup();
         break;
      case SYSTEM_VALUE_SAMPLE_ID:
         reg = emit_sampleid_setup();
         break;
      case SYSTEM_VALUE_SAMPLE_MASK_IN:
         assert(devinfo->gen >= 7);
         reg = new(mem_ctx)
            fs_reg(retype(brw_vec8_grf(payload.sample_mask_in_reg, 0),
                          BRW_REGISTER_TYPE_D));
         break;
      }
   }

   if (!reg)
      reg = new(this->mem_ctx) fs_reg(vgrf(ir->type));

   hash_table_insert(this->variable_ht, reg, ir);
}

void
fs_visitor::visit(ir_dereference_variable *ir)
{
   fs_reg *reg = variable_storage(ir->var);

   if (!reg) {
      fail("Failed to find variable storage for %s\n", ir->var->name);
      this->result = fs_reg(reg_null_d);
      return;
   }
   this->result = *reg;
}

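/* Struct dereference: the field's register offset is the sum of the sizes of
 * every field declared before it, so walk the fields in order and stop at
 * the matching name.
 */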
void
fs_visitor::visit(ir_dereference_record *ir)
{
   const glsl_type *struct_type = ir->record->type;

   ir->record->accept(this);

   unsigned int off = 0;
   for (unsigned int i = 0; i < struct_type->length; i++) {
      if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
         break;
      off += type_size(struct_type->fields.structure[i].type);
   }
   this->result = offset(this->result, off);
   this->result.type = brw_type_for_base_type(ir->type);
}

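/* Array dereference: a constant index becomes a static register offset,
 * while a variable index is stashed in src.reladdr for a later lowering
 * pass (see the comment in the else branch below).
 */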
void
fs_visitor::visit(ir_dereference_array *ir)
{
   ir_constant *constant_index;
   fs_reg src;
   int element_size = type_size(ir->type);

   constant_index = ir->array_index->as_constant();

   ir->array->accept(this);
   src = this->result;
   src.type = brw_type_for_base_type(ir->type);

   if (constant_index) {
      if (src.file == ATTR) {
         /* Attribute arrays get loaded as one vec4 per element.  In that case
          * offset the source register.
          */
         src.reg += constant_index->value.i[0];
      } else {
         assert(src.file == UNIFORM || src.file == GRF || src.file == HW_REG);
         src = offset(src, constant_index->value.i[0] * element_size);
      }
   } else {
      /* Variable index array dereference.  We attach the variable index
       * component to the reg as a pointer to a register containing the
       * offset.  Currently only uniform arrays are supported in this patch,
       * and that reladdr pointer is resolved by
       * move_uniform_array_access_to_pull_constants().  All other array types
       * are lowered by lower_variable_index_to_cond_assign().
       */
      ir->array_index->accept(this);

      fs_reg index_reg;
      index_reg = vgrf(glsl_type::int_type);
      emit(BRW_OPCODE_MUL, index_reg, this->result, fs_reg(element_size));

      if (src.reladdr) {
         emit(BRW_OPCODE_ADD, index_reg, *src.reladdr, index_reg);
      }

      src.reladdr = ralloc(mem_ctx, fs_reg);
      memcpy(src.reladdr, &index_reg, sizeof(index_reg));
   }
   this->result = src;
}

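/* Linear interpolation, dst = x*(1-a) + y*a.  Gen6+ has a native LRP
 * instruction (with its own operand order, noted below); older hardware
 * gets the expansion MUL/ADD/MUL/ADD, using the source negate modifier to
 * form (1 - a) without an extra instruction.
 */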
fs_inst *
fs_visitor::emit_lrp(const fs_reg &dst, const fs_reg &x, const fs_reg &y,
                     const fs_reg &a)
{
   if (devinfo->gen < 6) {
      /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
      fs_reg y_times_a           = vgrf(glsl_type::float_type);
      fs_reg one_minus_a         = vgrf(glsl_type::float_type);
      fs_reg x_times_one_minus_a = vgrf(glsl_type::float_type);

      emit(MUL(y_times_a, y, a));

      fs_reg negative_a = a;
      negative_a.negate = !a.negate;
      emit(ADD(one_minus_a, negative_a, fs_reg(1.0f)));
      emit(MUL(x_times_one_minus_a, x, one_minus_a));

      return emit(ADD(dst, x_times_one_minus_a, y_times_a));
   } else {
      /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
       * we need to reorder the operands.
       */
      return emit(LRP(dst, a, y, x));
   }
}

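/* min()/max() lowering: BRW_CONDITIONAL_L selects the smaller source and
 * BRW_CONDITIONAL_GE the larger.  Gen6+ SEL accepts a conditional modifier
 * directly; older parts need an explicit CMP to set the flag, then a
 * predicated SEL.
 */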
void
fs_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, const fs_reg &dst,
                        const fs_reg &src0, const fs_reg &src1)
{
   assert(conditionalmod == BRW_CONDITIONAL_GE ||
          conditionalmod == BRW_CONDITIONAL_L);

   fs_inst *inst;

   if (devinfo->gen >= 6) {
      inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
      inst->conditional_mod = conditionalmod;
   } else {
      emit(CMP(reg_null_d, src0, src1, conditionalmod));

      inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
      inst->predicate = BRW_PREDICATE_NORMAL;
   }
}

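/* Copies a possibly non-uniform value into a register that is uniform
 * across all channels: FIND_LIVE_CHANNEL yields the index of some enabled
 * channel, and BROADCAST replicates src's value from that channel.
 * force_writemask_all keeps both writes unpredicated.
 */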
void
fs_visitor::emit_uniformize(const fs_reg &dst, const fs_reg &src)
{
   const fs_reg chan_index = vgrf(glsl_type::uint_type);

   emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, component(chan_index, 0))
      ->force_writemask_all = true;
   emit(SHADER_OPCODE_BROADCAST, component(dst, 0),
        src, component(chan_index, 0))
      ->force_writemask_all = true;
}

bool
fs_visitor::try_emit_saturate(ir_expression *ir)
{
   if (ir->operation != ir_unop_saturate)
      return false;

   ir_rvalue *sat_val = ir->operands[0];

   fs_inst *pre_inst = (fs_inst *) this->instructions.get_tail();

   sat_val->accept(this);
   fs_reg src = this->result;

   fs_inst *last_inst = (fs_inst *) this->instructions.get_tail();

   /* If the last instruction from our accept() generated our
    * src, just set the saturate flag instead of emitting a separate mov.
    */
   fs_inst *modify = get_instruction_generating_reg(pre_inst, last_inst, src);
   if (modify && modify->regs_written == modify->dst.width / 8 &&
       modify->can_do_saturate()) {
      modify->saturate = true;
      this->result = src;
      return true;
   }

   return false;
}

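/* Tries to emit a pre-Gen6 ir_binop_add of the form (x * const_a) + const_b
 * as a single LINE instruction.  LINE takes the multiplier and the addend
 * from two components of its vec4 src0 (built below as a packed VF
 * immediate), so both constants must be representable in the 8-bit
 * restricted-float (VF) encoding; brw_float_to_vf() returns -1 otherwise.
 */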
bool
fs_visitor::try_emit_line(ir_expression *ir)
{
   /* LINE's src0 must be of type float. */
   if (ir->type != glsl_type::float_type)
      return false;

   ir_rvalue *nonmul = ir->operands[1];
   ir_expression *mul = ir->operands[0]->as_expression();

   if (!mul || mul->operation != ir_binop_mul) {
      nonmul = ir->operands[0];
      mul = ir->operands[1]->as_expression();

      if (!mul || mul->operation != ir_binop_mul)
         return false;
   }

   ir_constant *const_add = nonmul->as_constant();
   if (!const_add)
      return false;

   int add_operand_vf = brw_float_to_vf(const_add->value.f[0]);
   if (add_operand_vf == -1)
      return false;

   ir_rvalue *non_const_mul = mul->operands[1];
   ir_constant *const_mul = mul->operands[0]->as_constant();
   if (!const_mul) {
      const_mul = mul->operands[1]->as_constant();

      if (!const_mul)
         return false;

      non_const_mul = mul->operands[0];
   }

   int mul_operand_vf = brw_float_to_vf(const_mul->value.f[0]);
   if (mul_operand_vf == -1)
      return false;

   non_const_mul->accept(this);
   fs_reg src1 = this->result;

   fs_reg src0 = vgrf(ir->type);
   emit(BRW_OPCODE_MOV, src0,
        fs_reg((uint8_t)mul_operand_vf, 0, 0, (uint8_t)add_operand_vf));

   this->result = vgrf(ir->type);
   emit(BRW_OPCODE_LINE, this->result, src0, src1);
   return true;
}

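/* Tries to fuse add(a, mul(b, c)) into a single MAD on Gen6+, also looking
 * through a negate or abs around the multiply.  For example, GLSL like
 *
 *    d = a + b * c;
 *    d = a - b * c;   // ir_sub_to_add_neg turns this into a + -(b*c)
 *
 * becomes one MAD instead of a MUL followed by an ADD.
 */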
bool
fs_visitor::try_emit_mad(ir_expression *ir)
{
   /* 3-src instructions were introduced in gen6. */
   if (devinfo->gen < 6)
      return false;

   /* MAD can only handle floating-point data. */
   if (ir->type != glsl_type::float_type)
      return false;

   ir_rvalue *nonmul;
   ir_expression *mul;
   bool mul_negate, mul_abs;

   for (int i = 0; i < 2; i++) {
      mul_negate = false;
      mul_abs = false;

      mul = ir->operands[i]->as_expression();
      nonmul = ir->operands[1 - i];

      if (mul && mul->operation == ir_unop_abs) {
         mul = mul->operands[0]->as_expression();
         mul_abs = true;
      } else if (mul && mul->operation == ir_unop_neg) {
         mul = mul->operands[0]->as_expression();
         mul_negate = true;
      }

      if (mul && mul->operation == ir_binop_mul)
         break;
   }

   if (!mul || mul->operation != ir_binop_mul)
      return false;

   nonmul->accept(this);
   fs_reg src0 = this->result;

   mul->operands[0]->accept(this);
   fs_reg src1 = this->result;
   src1.negate ^= mul_negate;
   src1.abs = mul_abs;
   if (mul_abs)
      src1.negate = false;

   mul->operands[1]->accept(this);
   fs_reg src2 = this->result;
   src2.abs = mul_abs;
   if (mul_abs)
      src2.negate = false;

   this->result = vgrf(ir->type);
   emit(BRW_OPCODE_MAD, this->result, src0, src1, src2);

   return true;
}

bool
fs_visitor::try_emit_b2f_of_comparison(ir_expression *ir)
{
   /* On platforms that do not natively generate 0u and ~0u for Boolean
    * results, b2f expressions that look like
    *
    *     f = b2f(expr cmp 0)
    *
    * will generate better code by pretending the expression is
    *
    *     f = ir_triop_csel(0.0, 1.0, expr cmp 0)
    *
    * This is because the last instruction of "expr" can generate the
    * condition code for the "cmp 0".  This avoids having to do the "-(b & 1)"
    * trick to generate 0u or ~0u for the Boolean result.  This means code like
    *
    *     mov(16)         g16<1>F         1F
    *     mul.ge.f0(16)   null            g6<8,8,1>F      g14<8,8,1>F
    *     (+f0) sel(16)   m6<1>F          g16<8,8,1>F     0F
    *
    * will be generated instead of
    *
    *     mul(16)         g2<1>F          g12<8,8,1>F     g4<8,8,1>F
    *     cmp.ge.f0(16)   g2<1>D          g4<8,8,1>F      0F
    *     and(16)         g4<1>D          g2<8,8,1>D      1D
    *     and(16)         m6<1>D          -g4<8,8,1>D     0x3f800000UD
    *
    * When the comparison is != 0.0, using the knowledge that the false case
    * already results in zero would allow better code generation by possibly
    * avoiding a load-immediate instruction.
    */
   ir_expression *cmp = ir->operands[0]->as_expression();
   if (cmp == NULL)
      return false;

   if (cmp->operation == ir_binop_nequal) {
      for (unsigned i = 0; i < 2; i++) {
         ir_constant *c = cmp->operands[i]->as_constant();
         if (c == NULL || !c->is_zero())
            continue;

         ir_expression *expr = cmp->operands[i ^ 1]->as_expression();
         if (expr != NULL) {
            fs_reg op[2];

            for (unsigned j = 0; j < 2; j++) {
               cmp->operands[j]->accept(this);
               op[j] = this->result;

               resolve_ud_negate(&op[j]);
            }

            emit_bool_to_cond_code_of_reg(cmp, op);

            /* In this case we know when the condition is true, op[i ^ 1]
             * contains zero.  Invert the predicate, use op[i ^ 1] as src0,
             * and immediate 1.0f as src1.
             */
            this->result = vgrf(ir->type);
            op[i ^ 1].type = BRW_REGISTER_TYPE_F;

            fs_inst *inst = emit(SEL(this->result, op[i ^ 1], fs_reg(1.0f)));
            inst->predicate = BRW_PREDICATE_NORMAL;
            inst->predicate_inverse = true;
            return true;
         }
      }
   }

   emit_bool_to_cond_code(cmp);

   fs_reg temp = vgrf(ir->type);
   emit(MOV(temp, fs_reg(1.0f)));

   this->result = vgrf(ir->type);
   fs_inst *inst = emit(SEL(this->result, temp, fs_reg(0.0f)));
   inst->predicate = BRW_PREDICATE_NORMAL;

   return true;
}

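/* Converts a pixel offset in floating point to the 4-bit signed fixed-point
 * (S0.4) field of the pixel interpolator message, i.e. sixteenths of a
 * pixel in [-8/16, +7/16], clamped at the top end as described below.
 */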
static int
pack_pixel_offset(float x)
{
   /* Clamp upper end of the range to +7/16. See explanation in non-constant
    * offset case below. */
   int n = MIN2((int)(x * 16), 7);
   return n & 0xf;
}

void
fs_visitor::emit_interpolate_expression(ir_expression *ir)
{
   /* in SIMD16 mode, the pixel interpolator returns coords interleaved
    * 8 channels at a time, same as the barycentric coords presented in
    * the FS payload. this requires a bit of extra work to support.
    */
   no16("interpolate_at_* not yet supported in SIMD16 mode.");

   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;

   ir_dereference * deref = ir->operands[0]->as_dereference();
   ir_swizzle * swiz = NULL;
   if (!deref) {
      /* the api does not allow a swizzle here, but the varying packing code
       * may have pushed one into here.
       */
      swiz = ir->operands[0]->as_swizzle();
      assert(swiz);
      deref = swiz->val->as_dereference();
   }
   assert(deref);
   ir_variable * var = deref->variable_referenced();
   assert(var);

   /* 1. collect interpolation factors */

   fs_reg dst_xy = vgrf(glsl_type::get_instance(ir->type->base_type, 2, 1));

   /* for most messages, we need one reg of ignored data; the hardware requires mlen==1
    * even when there is no payload. in the per-slot offset case, we'll replace this with
    * the proper source data. */
   fs_reg src = vgrf(glsl_type::float_type);
   int mlen = 1;     /* one reg unless overridden */
   int reg_width = dispatch_width / 8;
   fs_inst *inst;

   switch (ir->operation) {
   case ir_unop_interpolate_at_centroid:
      inst = emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, dst_xy, src, fs_reg(0u));
      break;

   case ir_binop_interpolate_at_sample: {
      ir_constant *sample_num = ir->operands[1]->as_constant();
      assert(sample_num || !"nonconstant sample number should have been lowered.");

      unsigned msg_data = sample_num->value.i[0] << 4;
      inst = emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_xy, src, fs_reg(msg_data));
      break;
   }

   case ir_binop_interpolate_at_offset: {
      ir_constant *const_offset = ir->operands[1]->as_constant();
      if (const_offset) {
         unsigned msg_data = pack_pixel_offset(const_offset->value.f[0]) |
                            (pack_pixel_offset(const_offset->value.f[1]) << 4);
         inst = emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_xy, src,
                     fs_reg(msg_data));
      } else {
         /* pack the operands: hw wants offsets as 4 bit signed ints */
         ir->operands[1]->accept(this);
         src = vgrf(glsl_type::ivec2_type);
         fs_reg src2 = src;
         for (int i = 0; i < 2; i++) {
            fs_reg temp = vgrf(glsl_type::float_type);
            emit(MUL(temp, this->result, fs_reg(16.0f)));
            emit(MOV(src2, temp));  /* float to int */

            /* Clamp the upper end of the range to +7/16. ARB_gpu_shader5 requires
             * that we support a maximum offset of +0.5, which isn't representable
             * in a S0.4 value -- if we didn't clamp it, we'd end up with -8/16,
             * which is the opposite of what the shader author wanted.
             *
             * This is legal due to ARB_gpu_shader5's quantization rules:
             *
             * "Not all values of <offset> may be supported; x and y offsets may
             * be rounded to fixed-point values with the number of fraction bits
             * given by the implementation-dependent constant
             * FRAGMENT_INTERPOLATION_OFFSET_BITS"
             */

            fs_inst *inst = emit(BRW_OPCODE_SEL, src2, src2, fs_reg(7));
            inst->conditional_mod = BRW_CONDITIONAL_L; /* min(src2, 7) */

            src2 = offset(src2, 1);
            this->result = offset(this->result, 1);
         }

         mlen = 2 * reg_width;
         inst = emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_xy, src,
                     fs_reg(0u));
      }
      break;
   }

   default:
      unreachable("not reached");
   }

   inst->mlen = mlen;
   inst->regs_written = 2 * reg_width; /* 2 floats per slot returned */
   inst->pi_noperspective = var->determine_interpolation_mode(key->flat_shade) ==
         INTERP_QUALIFIER_NOPERSPECTIVE;

   /* 2. emit linterp */

   fs_reg res = vgrf(ir->type);
   this->result = res;

   for (int i = 0; i < ir->type->vector_elements; i++) {
      int ch = swiz ? ((*(int *)&swiz->mask) >> 2*i) & 3 : i;
      emit(FS_OPCODE_LINTERP, res, dst_xy,
           fs_reg(interp_reg(var->data.location, ch)));
      res = offset(res, 1);
   }
}

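/* The main expression visitor: a few oddball cases (saturate, csel,
 * interpolate_at_*, and the pre-Gen6 LINE/MAD/b2f patterns) are peeled off
 * first; everything else evaluates its operands into op[] and then emits
 * the matching instruction(s) in the big switch below.
 */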
void
fs_visitor::visit(ir_expression *ir)
{
   unsigned int operand;
   fs_reg op[3], temp;
   fs_inst *inst;
   struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;

   assert(ir->get_num_operands() <= 3);

   if (try_emit_saturate(ir))
      return;

   /* Deal with the real oddball stuff first */
   switch (ir->operation) {
   case ir_binop_add:
      if (devinfo->gen <= 5 && try_emit_line(ir))
         return;
      if (try_emit_mad(ir))
         return;
      break;

   case ir_triop_csel:
      ir->operands[1]->accept(this);
      op[1] = this->result;
      ir->operands[2]->accept(this);
      op[2] = this->result;

      emit_bool_to_cond_code(ir->operands[0]);

      this->result = vgrf(ir->type);
      inst = emit(SEL(this->result, op[1], op[2]));
      inst->predicate = BRW_PREDICATE_NORMAL;
      return;

   case ir_unop_b2f:
      if (devinfo->gen <= 5 && try_emit_b2f_of_comparison(ir))
         return;
      break;

   case ir_unop_interpolate_at_centroid:
   case ir_binop_interpolate_at_offset:
   case ir_binop_interpolate_at_sample:
      emit_interpolate_expression(ir);
      return;

   default:
      break;
   }

   for (operand = 0; operand < ir->get_num_operands(); operand++) {
      ir->operands[operand]->accept(this);
      if (this->result.file == BAD_FILE) {
         fail("Failed to get tree for expression operand:\n");
         ir->operands[operand]->fprint(stderr);
         fprintf(stderr, "\n");
      }
      assert(this->result.file == GRF ||
             this->result.file == UNIFORM || this->result.file == ATTR);
      op[operand] = this->result;

      /* Matrix expression operands should have been broken down to vector
       * operations already.
       */
      assert(!ir->operands[operand]->type->is_matrix());
      /* And then those vector operands should have been broken down to scalar.
       */
      assert(!ir->operands[operand]->type->is_vector());
   }

   /* Storage for our result.  If our result goes into an assignment, it will
    * just get copy-propagated out, so no worries.
    */
   this->result = vgrf(ir->type);

   switch (ir->operation) {
   case ir_unop_logic_not:
      emit(NOT(this->result, op[0]));
      break;
   case ir_unop_neg:
      op[0].negate = !op[0].negate;
      emit(MOV(this->result, op[0]));
      break;
   case ir_unop_abs:
      op[0].abs = true;
      op[0].negate = false;
      emit(MOV(this->result, op[0]));
      break;
   case ir_unop_sign:
      if (ir->type->is_float()) {
         /* AND(val, 0x80000000) gives the sign bit.
          *
          * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
          * zero.
          */
         emit(CMP(reg_null_f, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));

         op[0].type = BRW_REGISTER_TYPE_UD;
         this->result.type = BRW_REGISTER_TYPE_UD;
         emit(AND(this->result, op[0], fs_reg(0x80000000u)));

         inst = emit(OR(this->result, this->result, fs_reg(0x3f800000u)));
         inst->predicate = BRW_PREDICATE_NORMAL;

         this->result.type = BRW_REGISTER_TYPE_F;
      } else {
         /*  ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
          *               -> non-negative val generates 0x00000000.
          *  Predicated OR sets 1 if val is positive.
          */
         emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_G));

         emit(ASR(this->result, op[0], fs_reg(31)));

         inst = emit(OR(this->result, this->result, fs_reg(1)));
         inst->predicate = BRW_PREDICATE_NORMAL;
      }
      break;
   case ir_unop_rcp:
      emit_math(SHADER_OPCODE_RCP, this->result, op[0]);
      break;

   case ir_unop_exp2:
      emit_math(SHADER_OPCODE_EXP2, this->result, op[0]);
      break;
   case ir_unop_log2:
      emit_math(SHADER_OPCODE_LOG2, this->result, op[0]);
      break;
   case ir_unop_exp:
   case ir_unop_log:
      unreachable("not reached: should be handled by ir_explog_to_explog2");
   case ir_unop_sin:
      emit_math(SHADER_OPCODE_SIN, this->result, op[0]);
      break;
   case ir_unop_cos:
      emit_math(SHADER_OPCODE_COS, this->result, op[0]);
      break;

   case ir_unop_dFdx:
      /* Select one of the two opcodes based on the glHint value. */
      if (fs_key->high_quality_derivatives)
         emit(FS_OPCODE_DDX_FINE, this->result, op[0]);
      else
         emit(FS_OPCODE_DDX_COARSE, this->result, op[0]);
      break;

   case ir_unop_dFdx_coarse:
      emit(FS_OPCODE_DDX_COARSE, this->result, op[0]);
      break;

   case ir_unop_dFdx_fine:
      emit(FS_OPCODE_DDX_FINE, this->result, op[0]);
      break;

   case ir_unop_dFdy:
      /* Select one of the two opcodes based on the glHint value. */
      if (fs_key->high_quality_derivatives)
         emit(FS_OPCODE_DDY_FINE, result, op[0], fs_reg(fs_key->render_to_fbo));
      else
         emit(FS_OPCODE_DDY_COARSE, result, op[0], fs_reg(fs_key->render_to_fbo));
      break;

   case ir_unop_dFdy_coarse:
      emit(FS_OPCODE_DDY_COARSE, result, op[0], fs_reg(fs_key->render_to_fbo));
      break;

   case ir_unop_dFdy_fine:
      emit(FS_OPCODE_DDY_FINE, result, op[0], fs_reg(fs_key->render_to_fbo));
      break;

   case ir_binop_add:
      emit(ADD(this->result, op[0], op[1]));
      break;
   case ir_binop_sub:
      unreachable("not reached: should be handled by ir_sub_to_add_neg");

   case ir_binop_mul:
      emit(MUL(this->result, op[0], op[1]));
      break;
   case ir_binop_imul_high: {
      if (devinfo->gen >= 7)
         no16("SIMD16 explicit accumulator operands unsupported\n");

      struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
                                  this->result.type);

      fs_inst *mul = emit(MUL(acc, op[0], op[1]));
      emit(MACH(this->result, op[0], op[1]));

      /* Until Gen8, integer multiplies read 32-bits from one source and
       * 16-bits from the other, relying on the MACH instruction to
       * generate the high bits of the result.
       *
       * On Gen8, the multiply instruction does a full 32x32-bit multiply,
       * but in order to do a 64x64-bit multiply we have to simulate the
       * previous behavior and then use a MACH instruction.
       *
       * FINISHME: Don't use source modifiers on src1.
       */
      if (devinfo->gen >= 8) {
         assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
                mul->src[1].type == BRW_REGISTER_TYPE_UD);
         if (mul->src[1].type == BRW_REGISTER_TYPE_D) {
            mul->src[1].type = BRW_REGISTER_TYPE_W;
            mul->src[1].stride = 2;
         } else {
            mul->src[1].type = BRW_REGISTER_TYPE_UW;
            mul->src[1].stride = 2;
         }
      }

      break;
   }
   case ir_binop_div:
      /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
      assert(ir->type->is_integer());
      emit_math(SHADER_OPCODE_INT_QUOTIENT, this->result, op[0], op[1]);
      break;
   case ir_binop_carry: {
      if (devinfo->gen >= 7)
         no16("SIMD16 explicit accumulator operands unsupported\n");

      struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
                                  BRW_REGISTER_TYPE_UD);

      emit(ADDC(reg_null_ud, op[0], op[1]));
      emit(MOV(this->result, fs_reg(acc)));
      break;
   }
   case ir_binop_borrow: {
      if (devinfo->gen >= 7)
         no16("SIMD16 explicit accumulator operands unsupported\n");

      struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
                                  BRW_REGISTER_TYPE_UD);

      emit(SUBB(reg_null_ud, op[0], op[1]));
      emit(MOV(this->result, fs_reg(acc)));
      break;
   }
   case ir_binop_mod:
      /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
      assert(ir->type->is_integer());
      emit_math(SHADER_OPCODE_INT_REMAINDER, this->result, op[0], op[1]);
      break;

   case ir_binop_less:
   case ir_binop_greater:
   case ir_binop_lequal:
   case ir_binop_gequal:
   case ir_binop_equal:
   case ir_binop_all_equal:
   case ir_binop_nequal:
   case ir_binop_any_nequal:
      if (devinfo->gen <= 5) {
         resolve_bool_comparison(ir->operands[0], &op[0]);
         resolve_bool_comparison(ir->operands[1], &op[1]);
      }

      emit(CMP(this->result, op[0], op[1],
               brw_conditional_for_comparison(ir->operation)));
      break;

   case ir_binop_logic_xor:
      emit(XOR(this->result, op[0], op[1]));
      break;

   case ir_binop_logic_or:
      emit(OR(this->result, op[0], op[1]));
      break;

   case ir_binop_logic_and:
      emit(AND(this->result, op[0], op[1]));
      break;

   case ir_binop_dot:
   case ir_unop_any:
      unreachable("not reached: should be handled by brw_fs_channel_expressions");

   case ir_unop_noise:
      unreachable("not reached: should be handled by lower_noise");

   case ir_quadop_vector:
      unreachable("not reached: should be handled by lower_quadop_vector");

   case ir_binop_vector_extract:
      unreachable("not reached: should be handled by lower_vec_index_to_cond_assign()");

   case ir_triop_vector_insert:
      unreachable("not reached: should be handled by lower_vector_insert()");

   case ir_binop_ldexp:
      unreachable("not reached: should be handled by ldexp_to_arith()");

   case ir_unop_sqrt:
      emit_math(SHADER_OPCODE_SQRT, this->result, op[0]);
      break;

   case ir_unop_rsq:
      emit_math(SHADER_OPCODE_RSQ, this->result, op[0]);
      break;

   case ir_unop_bitcast_i2f:
   case ir_unop_bitcast_u2f:
      op[0].type = BRW_REGISTER_TYPE_F;
      this->result = op[0];
      break;
   case ir_unop_i2u:
   case ir_unop_bitcast_f2u:
      op[0].type = BRW_REGISTER_TYPE_UD;
      this->result = op[0];
      break;
   case ir_unop_u2i:
   case ir_unop_bitcast_f2i:
      op[0].type = BRW_REGISTER_TYPE_D;
      this->result = op[0];
      break;
   case ir_unop_i2f:
   case ir_unop_u2f:
   case ir_unop_f2i:
   case ir_unop_f2u:
      emit(MOV(this->result, op[0]));
      break;

   case ir_unop_b2i:
      emit(AND(this->result, op[0], fs_reg(1)));
      break;
   case ir_unop_b2f:
      if (devinfo->gen <= 5) {
         resolve_bool_comparison(ir->operands[0], &op[0]);
      }
      op[0].type = BRW_REGISTER_TYPE_D;
      this->result.type = BRW_REGISTER_TYPE_D;
      emit(AND(this->result, op[0], fs_reg(0x3f800000u)));
      this->result.type = BRW_REGISTER_TYPE_F;
      break;

   case ir_unop_f2b:
      emit(CMP(this->result, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
      break;
   case ir_unop_i2b:
      emit(CMP(this->result, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
      break;

   case ir_unop_trunc:
      emit(RNDZ(this->result, op[0]));
      break;
   case ir_unop_ceil: {
         fs_reg tmp = vgrf(ir->type);
         op[0].negate = !op[0].negate;
         emit(RNDD(tmp, op[0]));
         tmp.negate = true;
         emit(MOV(this->result, tmp));
      }
      break;
   case ir_unop_floor:
      emit(RNDD(this->result, op[0]));
      break;
   case ir_unop_fract:
      emit(FRC(this->result, op[0]));
      break;
   case ir_unop_round_even:
      emit(RNDE(this->result, op[0]));
      break;

   case ir_binop_min:
   case ir_binop_max:
      resolve_ud_negate(&op[0]);
      resolve_ud_negate(&op[1]);
      emit_minmax(ir->operation == ir_binop_min ?
                  BRW_CONDITIONAL_L : BRW_CONDITIONAL_GE,
                  this->result, op[0], op[1]);
      break;
   case ir_unop_pack_snorm_2x16:
   case ir_unop_pack_snorm_4x8:
   case ir_unop_pack_unorm_2x16:
   case ir_unop_pack_unorm_4x8:
   case ir_unop_unpack_snorm_2x16:
   case ir_unop_unpack_snorm_4x8:
   case ir_unop_unpack_unorm_2x16:
   case ir_unop_unpack_unorm_4x8:
   case ir_unop_unpack_half_2x16:
   case ir_unop_pack_half_2x16:
      unreachable("not reached: should be handled by lower_packing_builtins");
   case ir_unop_unpack_half_2x16_split_x:
      emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, this->result, op[0]);
      break;
   case ir_unop_unpack_half_2x16_split_y:
      emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, this->result, op[0]);
      break;
   case ir_binop_pow:
      emit_math(SHADER_OPCODE_POW, this->result, op[0], op[1]);
      break;

   case ir_unop_bitfield_reverse:
      emit(BFREV(this->result, op[0]));
      break;
   case ir_unop_bit_count:
      emit(CBIT(this->result, op[0]));
      break;
   case ir_unop_find_msb:
      temp = vgrf(glsl_type::uint_type);
      emit(FBH(temp, op[0]));

      /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
       * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
       * subtract the result from 31 to convert the MSB count into an LSB count.
       */

      /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
      emit(MOV(this->result, temp));
      emit(CMP(reg_null_d, this->result, fs_reg(-1), BRW_CONDITIONAL_NZ));

      temp.negate = true;
      inst = emit(ADD(this->result, temp, fs_reg(31)));
      inst->predicate = BRW_PREDICATE_NORMAL;
      break;
   case ir_unop_find_lsb:
      emit(FBL(this->result, op[0]));
      break;
   case ir_unop_saturate:
      inst = emit(MOV(this->result, op[0]));
      inst->saturate = true;
      break;
   case ir_triop_bitfield_extract:
      /* Note that the instruction's argument order is reversed from GLSL
       * and the IR.
       */
      emit(BFE(this->result, op[2], op[1], op[0]));
      break;
   case ir_binop_bfm:
      emit(BFI1(this->result, op[0], op[1]));
      break;
   case ir_triop_bfi:
      emit(BFI2(this->result, op[0], op[1], op[2]));
      break;
   case ir_quadop_bitfield_insert:
      unreachable("not reached: should be handled by "
              "lower_instructions::bitfield_insert_to_bfm_bfi");

   case ir_unop_bit_not:
      emit(NOT(this->result, op[0]));
      break;
   case ir_binop_bit_and:
      emit(AND(this->result, op[0], op[1]));
      break;
   case ir_binop_bit_xor:
      emit(XOR(this->result, op[0], op[1]));
      break;
   case ir_binop_bit_or:
      emit(OR(this->result, op[0], op[1]));
      break;

   case ir_binop_lshift:
      emit(SHL(this->result, op[0], op[1]));
      break;

   case ir_binop_rshift:
      if (ir->type->base_type == GLSL_TYPE_INT)
         emit(ASR(this->result, op[0], op[1]));
      else
         emit(SHR(this->result, op[0], op[1]));
      break;
   case ir_binop_pack_half_2x16_split:
      emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, this->result, op[0], op[1]);
      break;
   case ir_binop_ubo_load: {
      /* This IR node takes a constant uniform block and a constant or
       * variable byte offset within the block and loads a vector from that.
       */
      ir_constant *const_uniform_block = ir->operands[0]->as_constant();
      ir_constant *const_offset = ir->operands[1]->as_constant();
      fs_reg surf_index;

      if (const_uniform_block) {
         /* The block index is a constant, so just emit the binding table entry
          * as an immediate.
          */
         surf_index = fs_reg(stage_prog_data->binding_table.ubo_start +
                                 const_uniform_block->value.u[0]);
      } else {
         /* The block index is not a constant. Evaluate the index expression
          * per-channel and add the base UBO index; we have to select a value
          * from any live channel.
          */
         surf_index = vgrf(glsl_type::uint_type);
         emit(ADD(surf_index, op[0],
                  fs_reg(stage_prog_data->binding_table.ubo_start)));
         emit_uniformize(surf_index, surf_index);

         /* Assume this may touch any UBO. It would be nice to provide
          * a tighter bound, but the array information is already lowered away.
          */
         brw_mark_surface_used(prog_data,
                               stage_prog_data->binding_table.ubo_start +
                               shader_prog->NumUniformBlocks - 1);
      }

      if (const_offset) {
         fs_reg packed_consts = vgrf(glsl_type::float_type);
         packed_consts.type = result.type;

         fs_reg const_offset_reg = fs_reg(const_offset->value.u[0] & ~15);
         emit(new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
                                   packed_consts, surf_index, const_offset_reg));

         for (int i = 0; i < ir->type->vector_elements; i++) {
            packed_consts.set_smear(const_offset->value.u[0] % 16 / 4 + i);

            /* The std140 packing rules don't allow vectors to cross 16-byte
             * boundaries, and a reg is 32 bytes.
             */
            assert(packed_consts.subreg_offset < 32);

            /* UBO bools are any nonzero value.  We consider bools to be
             * values with the low bit set to 1.  Convert them using CMP.
             */
            if (ir->type->base_type == GLSL_TYPE_BOOL) {
               emit(CMP(result, packed_consts, fs_reg(0u), BRW_CONDITIONAL_NZ));
            } else {
               emit(MOV(result, packed_consts));
            }

            result = offset(result, 1);
         }
      } else {
         /* Turn the byte offset into a dword offset. */
         fs_reg base_offset = vgrf(glsl_type::int_type);
         emit(SHR(base_offset, op[1], fs_reg(2)));

         for (int i = 0; i < ir->type->vector_elements; i++) {
            emit(VARYING_PULL_CONSTANT_LOAD(result, surf_index,
                                            base_offset, i));

            if (ir->type->base_type == GLSL_TYPE_BOOL)
               emit(CMP(result, result, fs_reg(0), BRW_CONDITIONAL_NZ));

            result = offset(result, 1);
         }
      }

      result.reg_offset = 0;
      break;
   }

   case ir_triop_fma:
      /* Note that the instruction's argument order is reversed from GLSL
       * and the IR.
       */
      emit(MAD(this->result, op[2], op[1], op[0]));
      break;

   case ir_triop_lrp:
      emit_lrp(this->result, op[0], op[1], op[2]);
      break;

   case ir_triop_csel:
   case ir_unop_interpolate_at_centroid:
   case ir_binop_interpolate_at_offset:
   case ir_binop_interpolate_at_sample:
      unreachable("already handled above");
      break;

   case ir_unop_d2f:
   case ir_unop_f2d:
   case ir_unop_d2i:
   case ir_unop_i2d:
   case ir_unop_d2u:
   case ir_unop_u2d:
   case ir_unop_d2b:
   case ir_unop_pack_double_2x32:
   case ir_unop_unpack_double_2x32:
   case ir_unop_frexp_sig:
   case ir_unop_frexp_exp:
      unreachable("fp64 todo");
      break;
   }
}

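/* Emits the MOVs for an aggregate assignment, recursing through array and
 * struct types and stepping l and r one scalar component at a time.  When
 * "predicated" is set, each MOV is predicated on the condition code the
 * caller already established.
 */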
void
fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
                                   const glsl_type *type, bool predicated)
{
   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      for (unsigned int i = 0; i < type->components(); i++) {
         l.type = brw_type_for_base_type(type);
         r.type = brw_type_for_base_type(type);

         if (predicated || !l.equals(r)) {
            fs_inst *inst = emit(MOV(l, r));
            inst->predicate = predicated ? BRW_PREDICATE_NORMAL : BRW_PREDICATE_NONE;
         }

         l = offset(l, 1);
         r = offset(r, 1);
      }
      break;
   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
         emit_assignment_writes(l, r, type->fields.array, predicated);
      }
      break;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
         emit_assignment_writes(l, r, type->fields.structure[i].type,
                                predicated);
      }
      break;

   case GLSL_TYPE_SAMPLER:
   case GLSL_TYPE_IMAGE:
   case GLSL_TYPE_ATOMIC_UINT:
      break;

   case GLSL_TYPE_DOUBLE:
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_INTERFACE:
      unreachable("not reached");
   }
}

/* If the RHS processing resulted in an instruction generating a
 * temporary value, and it would be easy to rewrite the instruction to
 * generate its result right into the LHS instead, do so.  This ends
 * up reliably removing instructions where it can be tricky to do so
 * later without real UD chain information.
 */
bool
fs_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
                                   fs_reg dst,
                                   fs_reg src,
                                   fs_inst *pre_rhs_inst,
                                   fs_inst *last_rhs_inst)
{
   /* Only attempt if we're doing a direct assignment. */
   if (ir->condition ||
       !(ir->lhs->type->is_scalar() ||
        (ir->lhs->type->is_vector() &&
         ir->write_mask == (1 << ir->lhs->type->vector_elements) - 1)))
      return false;

   /* Make sure the last instruction generated our source reg. */
   fs_inst *modify = get_instruction_generating_reg(pre_rhs_inst,
                                                    last_rhs_inst,
                                                    src);
   if (!modify)
      return false;

   /* If last_rhs_inst wrote a different number of components than our LHS,
    * we can't safely rewrite it.
    */
   if (alloc.sizes[dst.reg] != modify->regs_written)
      return false;

   /* Success!  Rewrite the instruction. */
   modify->dst = dst;

   return true;
}

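/* Assignment: evaluate both sides, try the rewrite-into-dst fast path
 * above, and otherwise emit per-component MOVs honoring the write mask and
 * any assignment condition.
 */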
void
fs_visitor::visit(ir_assignment *ir)
{
   fs_reg l, r;
   fs_inst *inst;

   /* FINISHME: arrays on the lhs */
   ir->lhs->accept(this);
   l = this->result;

   fs_inst *pre_rhs_inst = (fs_inst *) this->instructions.get_tail();

   ir->rhs->accept(this);
   r = this->result;

   fs_inst *last_rhs_inst = (fs_inst *) this->instructions.get_tail();

   assert(l.file != BAD_FILE);
   assert(r.file != BAD_FILE);

   if (try_rewrite_rhs_to_dst(ir, l, r, pre_rhs_inst, last_rhs_inst))
      return;

   if (ir->condition) {
      emit_bool_to_cond_code(ir->condition);
   }

   if (ir->lhs->type->is_scalar() ||
       ir->lhs->type->is_vector()) {
      for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
         if (ir->write_mask & (1 << i)) {
            inst = emit(MOV(l, r));
            if (ir->condition)
               inst->predicate = BRW_PREDICATE_NORMAL;
            r = offset(r, 1);
         }
         l = offset(l, 1);
      }
   } else {
      emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
   }
}

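/* Builds the MRF message payload for a Gen4 sampler request.  The payload
 * layout differs per opcode: shadow compares, plain ir_tex, ir_txd, ir_txs,
 * and the SIMD16 bias/lod fallback each fill the message registers
 * differently, as handled case by case below.
 */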
  1408. fs_inst *
  1409. fs_visitor::emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
  1410.                               fs_reg coordinate, int coord_components,
  1411.                               fs_reg shadow_c,
  1412.                               fs_reg lod, fs_reg dPdy, int grad_components,
  1413.                               uint32_t sampler)
  1414. {
  1415.    int mlen;
  1416.    int base_mrf = 1;
  1417.    bool simd16 = false;
  1418.    fs_reg orig_dst;
  1419.  
  1420.    /* g0 header. */
  1421.    mlen = 1;
  1422.  
  1423.    if (shadow_c.file != BAD_FILE) {
  1424.       for (int i = 0; i < coord_components; i++) {
  1425.          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
  1426.          coordinate = offset(coordinate, 1);
  1427.       }
  1428.  
  1429.       /* gen4's SIMD8 sampler always has the slots for u,v,r present.
  1430.        * the unused slots must be zeroed.
  1431.        */
  1432.       for (int i = coord_components; i < 3; i++) {
  1433.          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f)));
  1434.       }
  1435.       mlen += 3;
  1436.  
  1437.       if (op == ir_tex) {
  1438.          /* There's no plain shadow compare message, so we use shadow
  1439.           * compare with a bias of 0.0.
  1440.           */
  1441.          emit(MOV(fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f)));
  1442.          mlen++;
  1443.       } else if (op == ir_txb || op == ir_txl) {
  1444.          emit(MOV(fs_reg(MRF, base_mrf + mlen), lod));
  1445.          mlen++;
  1446.       } else {
  1447.          unreachable("Should not get here.");
  1448.       }
  1449.  
  1450.       emit(MOV(fs_reg(MRF, base_mrf + mlen), shadow_c));
  1451.       mlen++;
  1452.    } else if (op == ir_tex) {
  1453.       for (int i = 0; i < coord_components; i++) {
  1454.          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
  1455.          coordinate = offset(coordinate, 1);
  1456.       }
   1457.       /* Zero the others. */
   1458.       for (int i = coord_components; i < 3; i++) {
  1459.          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f)));
  1460.       }
  1461.       /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
  1462.       mlen += 3;
  1463.    } else if (op == ir_txd) {
  1464.       fs_reg &dPdx = lod;
  1465.  
  1466.       for (int i = 0; i < coord_components; i++) {
  1467.          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
  1468.          coordinate = offset(coordinate, 1);
  1469.       }
  1470.       /* the slots for u and v are always present, but r is optional */
  1471.       mlen += MAX2(coord_components, 2);
  1472.  
  1473.       /*  P   = u, v, r
  1474.        * dPdx = dudx, dvdx, drdx
  1475.        * dPdy = dudy, dvdy, drdy
  1476.        *
  1477.        * 1-arg: Does not exist.
  1478.        *
  1479.        * 2-arg: dudx   dvdx   dudy   dvdy
  1480.        *        dPdx.x dPdx.y dPdy.x dPdy.y
  1481.        *        m4     m5     m6     m7
  1482.        *
  1483.        * 3-arg: dudx   dvdx   drdx   dudy   dvdy   drdy
  1484.        *        dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
  1485.        *        m5     m6     m7     m8     m9     m10
  1486.        */
  1487.       for (int i = 0; i < grad_components; i++) {
  1488.          emit(MOV(fs_reg(MRF, base_mrf + mlen), dPdx));
  1489.          dPdx = offset(dPdx, 1);
  1490.       }
  1491.       mlen += MAX2(grad_components, 2);
  1492.  
  1493.       for (int i = 0; i < grad_components; i++) {
  1494.          emit(MOV(fs_reg(MRF, base_mrf + mlen), dPdy));
  1495.          dPdy = offset(dPdy, 1);
  1496.       }
  1497.       mlen += MAX2(grad_components, 2);
  1498.    } else if (op == ir_txs) {
  1499.       /* There's no SIMD8 resinfo message on Gen4.  Use SIMD16 instead. */
  1500.       simd16 = true;
  1501.       emit(MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod));
  1502.       mlen += 2;
  1503.    } else {
  1504.       /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
  1505.        * instructions.  We'll need to do SIMD16 here.
  1506.        */
  1507.       simd16 = true;
  1508.       assert(op == ir_txb || op == ir_txl || op == ir_txf);
  1509.  
  1510.       for (int i = 0; i < coord_components; i++) {
  1511.          emit(MOV(fs_reg(MRF, base_mrf + mlen + i * 2, coordinate.type),
  1512.                   coordinate));
  1513.          coordinate = offset(coordinate, 1);
  1514.       }
  1515.  
  1516.       /* Initialize the rest of u/v/r with 0.0.  Empirically, this seems to
  1517.        * be necessary for TXF (ld), but seems wise to do for all messages.
  1518.        */
  1519.       for (int i = coord_components; i < 3; i++) {
  1520.          emit(MOV(fs_reg(MRF, base_mrf + mlen + i * 2), fs_reg(0.0f)));
  1521.       }
  1522.  
  1523.       /* lod/bias appears after u/v/r. */
  1524.       mlen += 6;
  1525.  
  1526.       emit(MOV(fs_reg(MRF, base_mrf + mlen, lod.type), lod));
  1527.       mlen++;
  1528.  
  1529.       /* The unused upper half. */
  1530.       mlen++;
  1531.    }
  1532.  
  1533.    if (simd16) {
  1534.       /* Now, since we're doing simd16, the return is 2 interleaved
  1535.        * vec4s where the odd-indexed ones are junk. We'll need to move
  1536.        * this weirdness around to the expected layout.
  1537.        */
  1538.       orig_dst = dst;
  1539.       dst = fs_reg(GRF, alloc.allocate(8), orig_dst.type);
  1540.    }
  1541.  
  1542.    enum opcode opcode;
  1543.    switch (op) {
  1544.    case ir_tex: opcode = SHADER_OPCODE_TEX; break;
  1545.    case ir_txb: opcode = FS_OPCODE_TXB; break;
  1546.    case ir_txl: opcode = SHADER_OPCODE_TXL; break;
  1547.    case ir_txd: opcode = SHADER_OPCODE_TXD; break;
  1548.    case ir_txs: opcode = SHADER_OPCODE_TXS; break;
  1549.    case ir_txf: opcode = SHADER_OPCODE_TXF; break;
  1550.    default:
  1551.       unreachable("not reached");
  1552.    }
  1553.  
  1554.    fs_inst *inst = emit(opcode, dst, reg_undef, fs_reg(sampler));
  1555.    inst->base_mrf = base_mrf;
  1556.    inst->mlen = mlen;
  1557.    inst->header_size = 1;
  1558.    inst->regs_written = simd16 ? 8 : 4;
  1559.  
  1560.    if (simd16) {
  1561.       for (int i = 0; i < 4; i++) {
  1562.          emit(MOV(orig_dst, dst));
  1563.          orig_dst = offset(orig_dst, 1);
  1564.          dst = offset(dst, 2);
  1565.       }
  1566.    }
  1567.  
  1568.    return inst;
  1569. }
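/* Worked message-length example for the SIMD8 shadow-compare path above:
 * a 2D ir_tex with shadow comparison sends the g0 header (mlen = 1),
 * u, v and a zeroed r (mlen += 3), the implicit 0.0 bias (mlen++), and
 * the reference value (mlen++), for a final mlen of 6.  The SIMD16
 * fallback cases instead return two interleaved vec4s, which the MOV
 * loop at the end de-interleaves into the original destination.
 */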
  1570.  
  1571. fs_inst *
  1572. fs_visitor::emit_texture_gen4_simd16(ir_texture_opcode op, fs_reg dst,
  1573.                                      fs_reg coordinate, int vector_elements,
  1574.                                      fs_reg shadow_c, fs_reg lod,
  1575.                                      uint32_t sampler)
  1576. {
  1577.    fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F, dispatch_width);
  1578.    bool has_lod = op == ir_txl || op == ir_txb || op == ir_txf;
  1579.  
  1580.    if (has_lod && shadow_c.file != BAD_FILE)
  1581.       no16("TXB and TXL with shadow comparison unsupported in SIMD16.");
  1582.  
  1583.    if (op == ir_txd)
  1584.       no16("textureGrad unsupported in SIMD16.");
  1585.  
  1586.    /* Copy the coordinates. */
  1587.    for (int i = 0; i < vector_elements; i++) {
  1588.       emit(MOV(retype(offset(message, i), coordinate.type), coordinate));
  1589.       coordinate = offset(coordinate, 1);
  1590.    }
  1591.  
  1592.    fs_reg msg_end = offset(message, vector_elements);
  1593.  
  1594.    /* Messages other than sample and ld require all three components */
  1595.    if (has_lod || shadow_c.file != BAD_FILE) {
  1596.       for (int i = vector_elements; i < 3; i++) {
  1597.          emit(MOV(offset(message, i), fs_reg(0.0f)));
  1598.       }
  1599.    }
  1600.  
  1601.    if (has_lod) {
  1602.       fs_reg msg_lod = retype(offset(message, 3), op == ir_txf ?
  1603.                               BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F);
  1604.       emit(MOV(msg_lod, lod));
  1605.       msg_end = offset(msg_lod, 1);
  1606.    }
  1607.  
  1608.    if (shadow_c.file != BAD_FILE) {
  1609.       fs_reg msg_ref = offset(message, 3 + has_lod);
  1610.       emit(MOV(msg_ref, shadow_c));
  1611.       msg_end = offset(msg_ref, 1);
  1612.    }
  1613.  
  1614.    enum opcode opcode;
  1615.    switch (op) {
  1616.    case ir_tex: opcode = SHADER_OPCODE_TEX; break;
  1617.    case ir_txb: opcode = FS_OPCODE_TXB;     break;
  1618.    case ir_txd: opcode = SHADER_OPCODE_TXD; break;
  1619.    case ir_txl: opcode = SHADER_OPCODE_TXL; break;
  1620.    case ir_txs: opcode = SHADER_OPCODE_TXS; break;
  1621.    case ir_txf: opcode = SHADER_OPCODE_TXF; break;
  1622.    default: unreachable("not reached");
  1623.    }
  1624.  
  1625.    fs_inst *inst = emit(opcode, dst, reg_undef, fs_reg(sampler));
  1626.    inst->base_mrf = message.reg - 1;
  1627.    inst->mlen = msg_end.reg - inst->base_mrf;
  1628.    inst->header_size = 1;
  1629.    inst->regs_written = 8;
  1630.  
  1631.    return inst;
  1632. }
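/* Note on the message accounting above: the payload is built starting at
 * m2 while the g0-derived header goes in m1, which is why base_mrf is
 * message.reg - 1 and mlen is measured from base_mrf rather than from
 * the first coordinate register.  In SIMD16, each offset() step here
 * advances two hardware MRFs, since every parameter is 16 floats wide.
 */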
  1633.  
   1634. /* gen5's sampler has slots for u, v, r, array index, then optional
   1635.  * parameters like the shadow comparator or LOD bias.  If the optional
   1636.  * parameters aren't present, the trailing base slots don't need to be
   1637.  * included in the message.
   1638.  *
   1639.  * We don't write the unused slots even when later parameters force them
   1640.  * to be counted, which may look surprising in the disassembly.
   1641.  */
  1642. fs_inst *
  1643. fs_visitor::emit_texture_gen5(ir_texture_opcode op, fs_reg dst,
  1644.                               fs_reg coordinate, int vector_elements,
  1645.                               fs_reg shadow_c,
  1646.                               fs_reg lod, fs_reg lod2, int grad_components,
  1647.                               fs_reg sample_index, uint32_t sampler,
  1648.                               bool has_offset)
  1649. {
  1650.    int reg_width = dispatch_width / 8;
  1651.    unsigned header_size = 0;
  1652.  
  1653.    fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F, dispatch_width);
  1654.    fs_reg msg_coords = message;
  1655.  
  1656.    if (has_offset) {
  1657.       /* The offsets set up by the ir_texture visitor are in the
  1658.        * m1 header, so we can't go headerless.
  1659.        */
  1660.       header_size = 1;
  1661.       message.reg--;
  1662.    }
  1663.  
  1664.    for (int i = 0; i < vector_elements; i++) {
  1665.       emit(MOV(retype(offset(msg_coords, i), coordinate.type), coordinate));
  1666.       coordinate = offset(coordinate, 1);
  1667.    }
  1668.    fs_reg msg_end = offset(msg_coords, vector_elements);
  1669.    fs_reg msg_lod = offset(msg_coords, 4);
  1670.  
  1671.    if (shadow_c.file != BAD_FILE) {
  1672.       fs_reg msg_shadow = msg_lod;
  1673.       emit(MOV(msg_shadow, shadow_c));
  1674.       msg_lod = offset(msg_shadow, 1);
  1675.       msg_end = msg_lod;
  1676.    }
  1677.  
  1678.    enum opcode opcode;
  1679.    switch (op) {
  1680.    case ir_tex:
  1681.       opcode = SHADER_OPCODE_TEX;
  1682.       break;
  1683.    case ir_txb:
  1684.       emit(MOV(msg_lod, lod));
  1685.       msg_end = offset(msg_lod, 1);
  1686.  
  1687.       opcode = FS_OPCODE_TXB;
  1688.       break;
  1689.    case ir_txl:
  1690.       emit(MOV(msg_lod, lod));
  1691.       msg_end = offset(msg_lod, 1);
  1692.  
  1693.       opcode = SHADER_OPCODE_TXL;
  1694.       break;
  1695.    case ir_txd: {
  1696.       /**
  1697.        *  P   =  u,    v,    r
  1698.        * dPdx = dudx, dvdx, drdx
  1699.        * dPdy = dudy, dvdy, drdy
  1700.        *
  1701.        * Load up these values:
  1702.        * - dudx   dudy   dvdx   dvdy   drdx   drdy
  1703.        * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
  1704.        */
  1705.       msg_end = msg_lod;
  1706.       for (int i = 0; i < grad_components; i++) {
  1707.          emit(MOV(msg_end, lod));
  1708.          lod = offset(lod, 1);
  1709.          msg_end = offset(msg_end, 1);
  1710.  
  1711.          emit(MOV(msg_end, lod2));
  1712.          lod2 = offset(lod2, 1);
  1713.          msg_end = offset(msg_end, 1);
  1714.       }
  1715.  
  1716.       opcode = SHADER_OPCODE_TXD;
  1717.       break;
  1718.    }
  1719.    case ir_txs:
  1720.       msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
  1721.       emit(MOV(msg_lod, lod));
  1722.       msg_end = offset(msg_lod, 1);
  1723.  
  1724.       opcode = SHADER_OPCODE_TXS;
  1725.       break;
  1726.    case ir_query_levels:
  1727.       msg_lod = msg_end;
  1728.       emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u)));
  1729.       msg_end = offset(msg_lod, 1);
  1730.  
  1731.       opcode = SHADER_OPCODE_TXS;
  1732.       break;
  1733.    case ir_txf:
  1734.       msg_lod = offset(msg_coords, 3);
  1735.       emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod));
  1736.       msg_end = offset(msg_lod, 1);
  1737.  
  1738.       opcode = SHADER_OPCODE_TXF;
  1739.       break;
  1740.    case ir_txf_ms:
  1741.       msg_lod = offset(msg_coords, 3);
  1742.       /* lod */
  1743.       emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u)));
  1744.       /* sample index */
  1745.       emit(MOV(retype(offset(msg_lod, 1), BRW_REGISTER_TYPE_UD), sample_index));
  1746.       msg_end = offset(msg_lod, 2);
  1747.  
  1748.       opcode = SHADER_OPCODE_TXF_CMS;
  1749.       break;
  1750.    case ir_lod:
  1751.       opcode = SHADER_OPCODE_LOD;
  1752.       break;
  1753.    case ir_tg4:
  1754.       opcode = SHADER_OPCODE_TG4;
  1755.       break;
  1756.    default:
  1757.       unreachable("not reached");
  1758.    }
  1759.  
  1760.    fs_inst *inst = emit(opcode, dst, reg_undef, fs_reg(sampler));
  1761.    inst->base_mrf = message.reg;
  1762.    inst->mlen = msg_end.reg - message.reg;
  1763.    inst->header_size = header_size;
  1764.    inst->regs_written = 4 * reg_width;
  1765.  
  1766.    if (inst->mlen > MAX_SAMPLER_MESSAGE_SIZE) {
  1767.       fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE)
  1768.            " disallowed by hardware\n");
  1769.    }
  1770.  
  1771.    return inst;
  1772. }
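/* Worked example of the variable-length layout above: a plain 2D ir_tex
 * writes only the u and v slots, so msg_end stops right after the
 * coordinates.  Adding a shadow comparison places the reference value at
 * slot 4 (after u, v, r, array index), so the unwritten r/ai slots in
 * between still count toward mlen.
 */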
  1773.  
  1774. static bool
  1775. is_high_sampler(const struct brw_device_info *devinfo, fs_reg sampler)
  1776. {
  1777.    if (devinfo->gen < 8 && !devinfo->is_haswell)
  1778.       return false;
  1779.  
  1780.    return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
  1781. }
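/* In other words: before Haswell there is no way to address samplers
 * beyond the 4-bit field, so this always returns false.  On HSW/BDW+ a
 * sampler is "high" when its index is known to be >= 16, or when the
 * index isn't an immediate at all, since a dynamically computed index
 * must conservatively be assumed to reach the high range.
 */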
  1782.  
  1783. fs_inst *
  1784. fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
  1785.                               fs_reg coordinate, int coord_components,
  1786.                               fs_reg shadow_c,
  1787.                               fs_reg lod, fs_reg lod2, int grad_components,
  1788.                               fs_reg sample_index, fs_reg mcs, fs_reg sampler,
  1789.                               fs_reg offset_value)
  1790. {
  1791.    int reg_width = dispatch_width / 8;
  1792.    unsigned header_size = 0;
  1793.  
  1794.    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, MAX_SAMPLER_MESSAGE_SIZE);
  1795.    for (int i = 0; i < MAX_SAMPLER_MESSAGE_SIZE; i++) {
  1796.       sources[i] = vgrf(glsl_type::float_type);
  1797.    }
  1798.    int length = 0;
  1799.  
  1800.    if (op == ir_tg4 || offset_value.file != BAD_FILE ||
  1801.        is_high_sampler(devinfo, sampler)) {
  1802.       /* For general texture offsets (no txf workaround), we need a header to
  1803.        * put them in.  Note that for SIMD16 we're making space for two actual
  1804.        * hardware registers here, so the emit will have to fix up for this.
  1805.        *
   1806.        * - ir_tg4 needs to place its channel select in the header,
   1807.        *   for interaction with ARB_texture_swizzle.
   1808.        *
   1809.        * The sampler index is only 4 bits, so for larger sampler numbers we
   1810.        * need to offset the Sampler State Pointer in the header.
  1811.        */
  1812.       header_size = 1;
  1813.       sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
  1814.       length++;
  1815.    }
  1816.  
  1817.    if (shadow_c.file != BAD_FILE) {
  1818.       emit(MOV(sources[length], shadow_c));
  1819.       length++;
  1820.    }
  1821.  
  1822.    bool has_nonconstant_offset =
  1823.       offset_value.file != BAD_FILE && offset_value.file != IMM;
  1824.    bool coordinate_done = false;
  1825.  
  1826.    /* The sampler can only meaningfully compute LOD for fragment shader
  1827.     * messages. For all other stages, we change the opcode to ir_txl and
  1828.     * hardcode the LOD to 0.
  1829.     */
  1830.    if (stage != MESA_SHADER_FRAGMENT && op == ir_tex) {
  1831.       op = ir_txl;
  1832.       lod = fs_reg(0.0f);
  1833.    }
  1834.  
  1835.    /* Set up the LOD info */
  1836.    switch (op) {
  1837.    case ir_tex:
  1838.    case ir_lod:
  1839.       break;
  1840.    case ir_txb:
  1841.       emit(MOV(sources[length], lod));
  1842.       length++;
  1843.       break;
  1844.    case ir_txl:
  1845.       emit(MOV(sources[length], lod));
  1846.       length++;
  1847.       break;
  1848.    case ir_txd: {
  1849.       no16("Gen7 does not support sample_d/sample_d_c in SIMD16 mode.");
  1850.  
  1851.       /* Load dPdx and the coordinate together:
  1852.        * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
  1853.        */
  1854.       for (int i = 0; i < coord_components; i++) {
  1855.          emit(MOV(sources[length], coordinate));
  1856.          coordinate = offset(coordinate, 1);
  1857.          length++;
  1858.  
  1859.          /* For cube map array, the coordinate is (u,v,r,ai) but there are
  1860.           * only derivatives for (u, v, r).
  1861.           */
  1862.          if (i < grad_components) {
  1863.             emit(MOV(sources[length], lod));
  1864.             lod = offset(lod, 1);
  1865.             length++;
  1866.  
  1867.             emit(MOV(sources[length], lod2));
  1868.             lod2 = offset(lod2, 1);
  1869.             length++;
  1870.          }
  1871.       }
  1872.  
  1873.       coordinate_done = true;
  1874.       break;
  1875.    }
  1876.    case ir_txs:
  1877.       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod));
  1878.       length++;
  1879.       break;
  1880.    case ir_query_levels:
  1881.       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), fs_reg(0u)));
  1882.       length++;
  1883.       break;
  1884.    case ir_txf:
   1885.       /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r.
   1886.        * On Gen9 they are u, v, lod, r.
   1887.        */
  1888.  
  1889.       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
  1890.       coordinate = offset(coordinate, 1);
  1891.       length++;
  1892.  
  1893.       if (devinfo->gen >= 9) {
  1894.          if (coord_components >= 2) {
  1895.             emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
  1896.             coordinate = offset(coordinate, 1);
  1897.          }
  1898.          length++;
  1899.       }
  1900.  
  1901.       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod));
  1902.       length++;
  1903.  
  1904.       for (int i = devinfo->gen >= 9 ? 2 : 1; i < coord_components; i++) {
  1905.          emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
  1906.          coordinate = offset(coordinate, 1);
  1907.          length++;
  1908.       }
  1909.  
  1910.       coordinate_done = true;
  1911.       break;
  1912.    case ir_txf_ms:
  1913.       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index));
  1914.       length++;
  1915.  
  1916.       /* data from the multisample control surface */
  1917.       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs));
  1918.       length++;
  1919.  
  1920.       /* there is no offsetting for this message; just copy in the integer
  1921.        * texture coordinates
  1922.        */
  1923.       for (int i = 0; i < coord_components; i++) {
  1924.          emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
  1925.          coordinate = offset(coordinate, 1);
  1926.          length++;
  1927.       }
  1928.  
  1929.       coordinate_done = true;
  1930.       break;
  1931.    case ir_tg4:
  1932.       if (has_nonconstant_offset) {
  1933.          if (shadow_c.file != BAD_FILE)
  1934.             no16("Gen7 does not support gather4_po_c in SIMD16 mode.");
  1935.  
  1936.          /* More crazy intermixing */
  1937.          for (int i = 0; i < 2; i++) { /* u, v */
  1938.             emit(MOV(sources[length], coordinate));
  1939.             coordinate = offset(coordinate, 1);
  1940.             length++;
  1941.          }
  1942.  
  1943.          for (int i = 0; i < 2; i++) { /* offu, offv */
  1944.             emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), offset_value));
  1945.             offset_value = offset(offset_value, 1);
  1946.             length++;
  1947.          }
  1948.  
  1949.          if (coord_components == 3) { /* r if present */
  1950.             emit(MOV(sources[length], coordinate));
  1951.             coordinate = offset(coordinate, 1);
  1952.             length++;
  1953.          }
  1954.  
  1955.          coordinate_done = true;
  1956.       }
  1957.       break;
  1958.    }
  1959.  
  1960.    /* Set up the coordinate (except for cases where it was done above) */
  1961.    if (!coordinate_done) {
  1962.       for (int i = 0; i < coord_components; i++) {
  1963.          emit(MOV(sources[length], coordinate));
  1964.          coordinate = offset(coordinate, 1);
  1965.          length++;
  1966.       }
  1967.    }
  1968.  
  1969.    int mlen;
  1970.    if (reg_width == 2)
  1971.       mlen = length * reg_width - header_size;
  1972.    else
  1973.       mlen = length * reg_width;
  1974.  
  1975.    fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
  1976.                                BRW_REGISTER_TYPE_F, dispatch_width);
  1977.    emit(LOAD_PAYLOAD(src_payload, sources, length, header_size));
  1978.  
  1979.    /* Generate the SEND */
  1980.    enum opcode opcode;
  1981.    switch (op) {
  1982.    case ir_tex: opcode = SHADER_OPCODE_TEX; break;
  1983.    case ir_txb: opcode = FS_OPCODE_TXB; break;
  1984.    case ir_txl: opcode = SHADER_OPCODE_TXL; break;
  1985.    case ir_txd: opcode = SHADER_OPCODE_TXD; break;
  1986.    case ir_txf: opcode = SHADER_OPCODE_TXF; break;
  1987.    case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
  1988.    case ir_txs: opcode = SHADER_OPCODE_TXS; break;
  1989.    case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
  1990.    case ir_lod: opcode = SHADER_OPCODE_LOD; break;
  1991.    case ir_tg4:
  1992.       if (has_nonconstant_offset)
  1993.          opcode = SHADER_OPCODE_TG4_OFFSET;
  1994.       else
  1995.          opcode = SHADER_OPCODE_TG4;
  1996.       break;
  1997.    default:
  1998.       unreachable("not reached");
  1999.    }
  2000.    fs_inst *inst = emit(opcode, dst, src_payload, sampler);
  2001.    inst->base_mrf = -1;
  2002.    inst->mlen = mlen;
  2003.    inst->header_size = header_size;
  2004.    inst->regs_written = 4 * reg_width;
  2005.  
  2006.    if (inst->mlen > MAX_SAMPLER_MESSAGE_SIZE) {
  2007.       fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE)
  2008.            " disallowed by hardware\n");
  2009.    }
  2010.  
  2011.    return inst;
  2012. }
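/* Worked example of the mlen computation above: each LOAD_PAYLOAD source
 * occupies reg_width hardware registers, except the header, which stays
 * a single register even in SIMD16.  With a header plus two coordinate
 * components in SIMD16 (reg_width == 2, length == 3, header_size == 1),
 * mlen = 3 * 2 - 1 = 5; the same payload in SIMD8 gives mlen = 3.
 */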
  2013.  
  2014. fs_reg
  2015. fs_visitor::rescale_texcoord(fs_reg coordinate, int coord_components,
  2016.                              bool is_rect, uint32_t sampler, int texunit)
  2017. {
  2018.    fs_inst *inst = NULL;
  2019.    bool needs_gl_clamp = true;
  2020.    fs_reg scale_x, scale_y;
  2021.  
   2022.    /* The 965 requires the EU to normalize GL rectangle texture coordinates;
   2023.     * on gen6+ we still need the scale factors to implement GL_CLAMP.  We
   2024.     * use the program parameter state tracking to get the scaling factor.
   2025.     */
  2026.    if (is_rect &&
  2027.        (devinfo->gen < 6 ||
  2028.         (devinfo->gen >= 6 && (key_tex->gl_clamp_mask[0] & (1 << sampler) ||
  2029.                                key_tex->gl_clamp_mask[1] & (1 << sampler))))) {
  2030.       struct gl_program_parameter_list *params = prog->Parameters;
  2031.       int tokens[STATE_LENGTH] = {
  2032.          STATE_INTERNAL,
  2033.          STATE_TEXRECT_SCALE,
  2034.          texunit,
  2035.          0,
  2036.          0
  2037.       };
  2038.  
  2039.       no16("rectangle scale uniform setup not supported on SIMD16\n");
  2040.       if (dispatch_width == 16) {
  2041.          return coordinate;
  2042.       }
  2043.  
  2044.       GLuint index = _mesa_add_state_reference(params,
  2045.                                                (gl_state_index *)tokens);
  2046.       /* Try to find existing copies of the texrect scale uniforms. */
  2047.       for (unsigned i = 0; i < uniforms; i++) {
  2048.          if (stage_prog_data->param[i] ==
  2049.              &prog->Parameters->ParameterValues[index][0]) {
  2050.             scale_x = fs_reg(UNIFORM, i);
  2051.             scale_y = fs_reg(UNIFORM, i + 1);
  2052.             break;
  2053.          }
  2054.       }
  2055.  
  2056.       /* If we didn't already set them up, do so now. */
  2057.       if (scale_x.file == BAD_FILE) {
  2058.          scale_x = fs_reg(UNIFORM, uniforms);
  2059.          scale_y = fs_reg(UNIFORM, uniforms + 1);
  2060.  
  2061.          stage_prog_data->param[uniforms++] =
  2062.             &prog->Parameters->ParameterValues[index][0];
  2063.          stage_prog_data->param[uniforms++] =
  2064.             &prog->Parameters->ParameterValues[index][1];
  2065.       }
  2066.    }
  2067.  
  2068.    /* The 965 requires the EU to do the normalization of GL rectangle
  2069.     * texture coordinates.  We use the program parameter state
  2070.     * tracking to get the scaling factor.
  2071.     */
  2072.    if (devinfo->gen < 6 && is_rect) {
  2073.       fs_reg dst = fs_reg(GRF, alloc.allocate(coord_components));
  2074.       fs_reg src = coordinate;
  2075.       coordinate = dst;
  2076.  
  2077.       emit(MUL(dst, src, scale_x));
  2078.       dst = offset(dst, 1);
  2079.       src = offset(src, 1);
  2080.       emit(MUL(dst, src, scale_y));
  2081.    } else if (is_rect) {
  2082.       /* On gen6+, the sampler handles the rectangle coordinates
  2083.        * natively, without needing rescaling.  But that means we have
  2084.        * to do GL_CLAMP clamping at the [0, width], [0, height] scale,
  2085.        * not [0, 1] like the default case below.
  2086.        */
  2087.       needs_gl_clamp = false;
  2088.  
  2089.       for (int i = 0; i < 2; i++) {
  2090.          if (key_tex->gl_clamp_mask[i] & (1 << sampler)) {
  2091.             fs_reg chan = coordinate;
  2092.             chan = offset(chan, i);
  2093.  
  2094.             inst = emit(BRW_OPCODE_SEL, chan, chan, fs_reg(0.0f));
  2095.             inst->conditional_mod = BRW_CONDITIONAL_GE;
  2096.  
  2097.             /* Our parameter comes in as 1.0/width or 1.0/height,
  2098.              * because that's what people normally want for doing
  2099.              * texture rectangle handling.  We need width or height
  2100.              * for clamping, but we don't care enough to make a new
  2101.              * parameter type, so just invert back.
  2102.              */
  2103.             fs_reg limit = vgrf(glsl_type::float_type);
  2104.             emit(MOV(limit, i == 0 ? scale_x : scale_y));
  2105.             emit(SHADER_OPCODE_RCP, limit, limit);
  2106.  
  2107.             inst = emit(BRW_OPCODE_SEL, chan, chan, limit);
  2108.             inst->conditional_mod = BRW_CONDITIONAL_L;
  2109.          }
  2110.       }
  2111.    }
  2112.  
  2113.    if (coord_components > 0 && needs_gl_clamp) {
  2114.       for (int i = 0; i < MIN2(coord_components, 3); i++) {
  2115.          if (key_tex->gl_clamp_mask[i] & (1 << sampler)) {
  2116.             fs_reg chan = coordinate;
  2117.             chan = offset(chan, i);
  2118.  
  2119.             fs_inst *inst = emit(MOV(chan, chan));
  2120.             inst->saturate = true;
  2121.          }
  2122.       }
  2123.    }
  2124.    return coordinate;
  2125. }
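/* The two predicated SELs in the gen6+ rectangle path above implement
 * GL_CLAMP at the texture's real size.  A rough CPU equivalent, given
 * the 1.0/width (or 1.0/height) scale uniform the state tracker
 * provides:
 *
 *    chan = fmaxf(chan, 0.0f);          // SEL.ge against 0.0
 *    chan = fminf(chan, 1.0f / scale);  // RCP, then SEL.l
 *
 * i.e. the coordinate is clamped to [0, width] rather than the [0, 1]
 * saturate used for normalized targets.
 */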
  2126.  
  2127. /* Sample from the MCS surface attached to this multisample texture. */
  2128. fs_reg
  2129. fs_visitor::emit_mcs_fetch(fs_reg coordinate, int components, fs_reg sampler)
  2130. {
  2131.    int reg_width = dispatch_width / 8;
  2132.    fs_reg payload = fs_reg(GRF, alloc.allocate(components * reg_width),
  2133.                            BRW_REGISTER_TYPE_F, dispatch_width);
  2134.    fs_reg dest = vgrf(glsl_type::uvec4_type);
  2135.    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, components);
  2136.  
  2137.    /* parameters are: u, v, r; missing parameters are treated as zero */
  2138.    for (int i = 0; i < components; i++) {
  2139.       sources[i] = vgrf(glsl_type::float_type);
  2140.       emit(MOV(retype(sources[i], BRW_REGISTER_TYPE_D), coordinate));
  2141.       coordinate = offset(coordinate, 1);
  2142.    }
  2143.  
  2144.    emit(LOAD_PAYLOAD(payload, sources, components, 0));
  2145.  
  2146.    fs_inst *inst = emit(SHADER_OPCODE_TXF_MCS, dest, payload, sampler);
  2147.    inst->base_mrf = -1;
  2148.    inst->mlen = components * reg_width;
  2149.    inst->header_size = 0;
  2150.    inst->regs_written = 4 * reg_width; /* we only care about one reg of
  2151.                                         * response, but the sampler always
  2152.                                         * writes 4/8
  2153.                                         */
  2154.  
  2155.    return dest;
  2156. }
  2157.  
  2158. void
  2159. fs_visitor::emit_texture(ir_texture_opcode op,
  2160.                          const glsl_type *dest_type,
  2161.                          fs_reg coordinate, int coord_components,
  2162.                          fs_reg shadow_c,
  2163.                          fs_reg lod, fs_reg lod2, int grad_components,
  2164.                          fs_reg sample_index,
  2165.                          fs_reg offset_value,
  2166.                          fs_reg mcs,
  2167.                          int gather_component,
  2168.                          bool is_cube_array,
  2169.                          bool is_rect,
  2170.                          uint32_t sampler,
  2171.                          fs_reg sampler_reg, int texunit)
  2172. {
  2173.    fs_inst *inst = NULL;
  2174.  
  2175.    if (op == ir_tg4) {
  2176.       /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
  2177.        * emitting anything other than setting up the constant result.
  2178.        */
  2179.       int swiz = GET_SWZ(key_tex->swizzles[sampler], gather_component);
  2180.       if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
  2181.  
  2182.          fs_reg res = vgrf(glsl_type::vec4_type);
  2183.          this->result = res;
  2184.  
   2185.          for (int i = 0; i < 4; i++) {
  2186.             emit(MOV(res, fs_reg(swiz == SWIZZLE_ZERO ? 0.0f : 1.0f)));
  2187.             res = offset(res, 1);
  2188.          }
  2189.          return;
  2190.       }
  2191.    }
  2192.  
  2193.    if (coordinate.file != BAD_FILE) {
  2194.       /* FINISHME: Texture coordinate rescaling doesn't work with non-constant
  2195.        * samplers.  This should only be a problem with GL_CLAMP on Gen7.
  2196.        */
  2197.       coordinate = rescale_texcoord(coordinate, coord_components, is_rect,
  2198.                                     sampler, texunit);
  2199.    }
  2200.  
  2201.    /* Writemasking doesn't eliminate channels on SIMD8 texture
  2202.     * samples, so don't worry about them.
  2203.     */
  2204.    fs_reg dst = vgrf(glsl_type::get_instance(dest_type->base_type, 4, 1));
  2205.  
  2206.    if (devinfo->gen >= 7) {
  2207.       inst = emit_texture_gen7(op, dst, coordinate, coord_components,
  2208.                                shadow_c, lod, lod2, grad_components,
  2209.                                sample_index, mcs, sampler_reg,
  2210.                                offset_value);
  2211.    } else if (devinfo->gen >= 5) {
  2212.       inst = emit_texture_gen5(op, dst, coordinate, coord_components,
  2213.                                shadow_c, lod, lod2, grad_components,
  2214.                                sample_index, sampler,
  2215.                                offset_value.file != BAD_FILE);
  2216.    } else if (dispatch_width == 16) {
  2217.       inst = emit_texture_gen4_simd16(op, dst, coordinate, coord_components,
  2218.                                       shadow_c, lod, sampler);
  2219.    } else {
  2220.       inst = emit_texture_gen4(op, dst, coordinate, coord_components,
  2221.                                shadow_c, lod, lod2, grad_components,
  2222.                                sampler);
  2223.    }
  2224.  
  2225.    if (shadow_c.file != BAD_FILE)
  2226.       inst->shadow_compare = true;
  2227.  
  2228.    if (offset_value.file == IMM)
  2229.       inst->offset = offset_value.fixed_hw_reg.dw1.ud;
  2230.  
  2231.    if (op == ir_tg4) {
  2232.       inst->offset |=
  2233.          gather_channel(gather_component, sampler) << 16; /* M0.2:16-17 */
  2234.  
  2235.       if (devinfo->gen == 6)
  2236.          emit_gen6_gather_wa(key_tex->gen6_gather_wa[sampler], dst);
  2237.    }
  2238.  
  2239.    /* fixup #layers for cube map arrays */
  2240.    if (op == ir_txs && is_cube_array) {
  2241.       fs_reg depth = offset(dst, 2);
  2242.       fs_reg fixed_depth = vgrf(glsl_type::int_type);
  2243.       emit_math(SHADER_OPCODE_INT_QUOTIENT, fixed_depth, depth, fs_reg(6));
  2244.  
  2245.       fs_reg *fixed_payload = ralloc_array(mem_ctx, fs_reg, inst->regs_written);
  2246.       int components = inst->regs_written / (dst.width / 8);
  2247.       for (int i = 0; i < components; i++) {
  2248.          if (i == 2) {
  2249.             fixed_payload[i] = fixed_depth;
  2250.          } else {
  2251.             fixed_payload[i] = offset(dst, i);
  2252.          }
  2253.       }
  2254.       emit(LOAD_PAYLOAD(dst, fixed_payload, components, 0));
  2255.    }
  2256.  
  2257.    swizzle_result(op, dest_type->vector_elements, dst, sampler);
  2258. }
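/* Worked example of the cube-array fixup above: resinfo reports the
 * depth in faces, so a cube map array with 2 layers comes back as 12 in
 * .z.  The INT_QUOTIENT by 6 rewrites that component to 12 / 6 = 2
 * before LOAD_PAYLOAD reassembles the result vector.
 */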
  2259.  
  2260. void
  2261. fs_visitor::visit(ir_texture *ir)
  2262. {
  2263.    uint32_t sampler =
  2264.       _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
  2265.  
  2266.    ir_rvalue *nonconst_sampler_index =
  2267.       _mesa_get_sampler_array_nonconst_index(ir->sampler);
  2268.  
  2269.    /* Handle non-constant sampler array indexing */
  2270.    fs_reg sampler_reg;
  2271.    if (nonconst_sampler_index) {
  2272.       /* The highest sampler which may be used by this operation is
  2273.        * the last element of the array. Mark it here, because the generator
  2274.        * doesn't have enough information to determine the bound.
  2275.        */
  2276.       uint32_t array_size = ir->sampler->as_dereference_array()
  2277.          ->array->type->array_size();
  2278.  
  2279.       uint32_t max_used = sampler + array_size - 1;
  2280.       if (ir->op == ir_tg4 && devinfo->gen < 8) {
  2281.          max_used += stage_prog_data->binding_table.gather_texture_start;
  2282.       } else {
  2283.          max_used += stage_prog_data->binding_table.texture_start;
  2284.       }
  2285.  
  2286.       brw_mark_surface_used(prog_data, max_used);
  2287.  
  2288.       /* Emit code to evaluate the actual indexing expression */
  2289.       nonconst_sampler_index->accept(this);
  2290.       fs_reg temp = vgrf(glsl_type::uint_type);
  2291.       emit(ADD(temp, this->result, fs_reg(sampler)));
  2292.       emit_uniformize(temp, temp);
  2293.  
  2294.       sampler_reg = temp;
  2295.    } else {
  2296.       /* Single sampler, or constant array index; the indexing expression
  2297.        * is just an immediate.
  2298.        */
  2299.       sampler_reg = fs_reg(sampler);
  2300.    }
  2301.  
  2302.    /* FINISHME: We're failing to recompile our programs when the sampler is
  2303.     * updated.  This only matters for the texture rectangle scale parameters
  2304.     * (pre-gen6, or gen6+ with GL_CLAMP).
  2305.     */
  2306.    int texunit = prog->SamplerUnits[sampler];
  2307.  
  2308.    /* Should be lowered by do_lower_texture_projection */
  2309.    assert(!ir->projector);
  2310.  
  2311.    /* Should be lowered */
  2312.    assert(!ir->offset || !ir->offset->type->is_array());
  2313.  
  2314.    /* Generate code to compute all the subexpression trees.  This has to be
  2315.     * done before loading any values into MRFs for the sampler message since
  2316.     * generating these values may involve SEND messages that need the MRFs.
  2317.     */
  2318.    fs_reg coordinate;
  2319.    int coord_components = 0;
  2320.    if (ir->coordinate) {
  2321.       coord_components = ir->coordinate->type->vector_elements;
  2322.       ir->coordinate->accept(this);
  2323.       coordinate = this->result;
  2324.    }
  2325.  
  2326.    fs_reg shadow_comparitor;
  2327.    if (ir->shadow_comparitor) {
  2328.       ir->shadow_comparitor->accept(this);
  2329.       shadow_comparitor = this->result;
  2330.    }
  2331.  
  2332.    fs_reg offset_value;
  2333.    if (ir->offset) {
  2334.       ir_constant *const_offset = ir->offset->as_constant();
  2335.       if (const_offset) {
  2336.          /* Store the header bitfield in an IMM register.  This allows us to
  2337.           * use offset_value.file to distinguish between no offset, a constant
  2338.           * offset, and a non-constant offset.
  2339.           */
  2340.          offset_value =
  2341.             fs_reg(brw_texture_offset(const_offset->value.i,
  2342.                                       const_offset->type->vector_elements));
  2343.       } else {
  2344.          ir->offset->accept(this);
  2345.          offset_value = this->result;
  2346.       }
  2347.    }
  2348.  
  2349.    fs_reg lod, lod2, sample_index, mcs;
  2350.    int grad_components = 0;
  2351.    switch (ir->op) {
  2352.    case ir_tex:
  2353.    case ir_lod:
  2354.    case ir_tg4:
  2355.    case ir_query_levels:
  2356.       break;
  2357.    case ir_txb:
  2358.       ir->lod_info.bias->accept(this);
  2359.       lod = this->result;
  2360.       break;
  2361.    case ir_txd:
  2362.       ir->lod_info.grad.dPdx->accept(this);
  2363.       lod = this->result;
  2364.  
  2365.       ir->lod_info.grad.dPdy->accept(this);
  2366.       lod2 = this->result;
  2367.  
  2368.       grad_components = ir->lod_info.grad.dPdx->type->vector_elements;
  2369.       break;
  2370.    case ir_txf:
  2371.    case ir_txl:
  2372.    case ir_txs:
  2373.       ir->lod_info.lod->accept(this);
  2374.       lod = this->result;
  2375.       break;
  2376.    case ir_txf_ms:
  2377.       ir->lod_info.sample_index->accept(this);
  2378.       sample_index = this->result;
  2379.  
  2380.       if (devinfo->gen >= 7 &&
  2381.           key_tex->compressed_multisample_layout_mask & (1 << sampler)) {
  2382.          mcs = emit_mcs_fetch(coordinate, ir->coordinate->type->vector_elements,
  2383.                               sampler_reg);
  2384.       } else {
  2385.          mcs = fs_reg(0u);
  2386.       }
  2387.       break;
  2388.    default:
  2389.       unreachable("Unrecognized texture opcode");
   2390.    }
  2391.  
  2392.    int gather_component = 0;
  2393.    if (ir->op == ir_tg4)
  2394.       gather_component = ir->lod_info.component->as_constant()->value.i[0];
  2395.  
  2396.    bool is_rect =
  2397.       ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT;
  2398.  
  2399.    bool is_cube_array =
  2400.       ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
  2401.       ir->sampler->type->sampler_array;
  2402.  
  2403.    emit_texture(ir->op, ir->type, coordinate, coord_components,
  2404.                 shadow_comparitor, lod, lod2, grad_components,
  2405.                 sample_index, offset_value, mcs,
  2406.                 gather_component, is_cube_array, is_rect, sampler,
  2407.                 sampler_reg, texunit);
  2408. }
  2409.  
  2410. /**
  2411.  * Apply workarounds for Gen6 gather with UINT/SINT
  2412.  */
  2413. void
  2414. fs_visitor::emit_gen6_gather_wa(uint8_t wa, fs_reg dst)
  2415. {
  2416.    if (!wa)
  2417.       return;
  2418.  
  2419.    int width = (wa & WA_8BIT) ? 8 : 16;
  2420.  
  2421.    for (int i = 0; i < 4; i++) {
  2422.       fs_reg dst_f = retype(dst, BRW_REGISTER_TYPE_F);
  2423.       /* Convert from UNORM to UINT */
  2424.       emit(MUL(dst_f, dst_f, fs_reg((float)((1 << width) - 1))));
  2425.       emit(MOV(dst, dst_f));
  2426.  
  2427.       if (wa & WA_SIGN) {
  2428.          /* Reinterpret the UINT value as a signed INT value by
  2429.           * shifting the sign bit into place, then shifting back
  2430.           * preserving sign.
  2431.           */
  2432.          emit(SHL(dst, dst, fs_reg(32 - width)));
  2433.          emit(ASR(dst, dst, fs_reg(32 - width)));
  2434.       }
  2435.  
  2436.       dst = offset(dst, 1);
  2437.    }
  2438. }
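/* Illustrative CPU sketch of the WA_SIGN fixup above.  Not compiled in;
 * the helper name is hypothetical and it assumes arithmetic right shift
 * for signed values, which is what the GPU's ASR provides:
 */
#if 0
static inline int
gather_wa_sign_extend(unsigned x, int width)
{
   /* Move the width-bit sign up to bit 31, then shift back with sign. */
   return ((int) (x << (32 - width))) >> (32 - width);
}
#endif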
  2439.  
  2440. /**
  2441.  * Set up the gather channel based on the swizzle, for gather4.
  2442.  */
  2443. uint32_t
  2444. fs_visitor::gather_channel(int orig_chan, uint32_t sampler)
  2445. {
  2446.    int swiz = GET_SWZ(key_tex->swizzles[sampler], orig_chan);
  2447.    switch (swiz) {
  2448.       case SWIZZLE_X: return 0;
  2449.       case SWIZZLE_Y:
  2450.          /* gather4 sampler is broken for green channel on RG32F --
  2451.           * we must ask for blue instead.
  2452.           */
  2453.          if (key_tex->gather_channel_quirk_mask & (1 << sampler))
  2454.             return 2;
  2455.          return 1;
  2456.       case SWIZZLE_Z: return 2;
  2457.       case SWIZZLE_W: return 3;
  2458.       default:
  2459.          unreachable("Not reached"); /* zero, one swizzles handled already */
  2460.    }
  2461. }
  2462.  
  2463. /**
   2464.  * Swizzle the result of a texture lookup.  This is necessary for
  2465.  * EXT_texture_swizzle as well as DEPTH_TEXTURE_MODE for shadow comparisons.
  2466.  */
  2467. void
  2468. fs_visitor::swizzle_result(ir_texture_opcode op, int dest_components,
  2469.                            fs_reg orig_val, uint32_t sampler)
  2470. {
  2471.    if (op == ir_query_levels) {
  2472.       /* # levels is in .w */
  2473.       this->result = offset(orig_val, 3);
  2474.       return;
  2475.    }
  2476.  
  2477.    this->result = orig_val;
  2478.  
   2479.    /* txs, lod don't actually sample the texture, so swizzling the result
   2480.     * makes no sense; tg4's channel select is applied via the header instead.
   2481.     */
  2482.    if (op == ir_txs || op == ir_lod || op == ir_tg4)
  2483.       return;
  2484.  
  2485.    if (dest_components == 1) {
  2486.       /* Ignore DEPTH_TEXTURE_MODE swizzling. */
  2487.    } else if (key_tex->swizzles[sampler] != SWIZZLE_NOOP) {
  2488.       fs_reg swizzled_result = vgrf(glsl_type::vec4_type);
  2489.       swizzled_result.type = orig_val.type;
  2490.  
  2491.       for (int i = 0; i < 4; i++) {
  2492.          int swiz = GET_SWZ(key_tex->swizzles[sampler], i);
  2493.          fs_reg l = swizzled_result;
  2494.          l = offset(l, i);
  2495.  
  2496.          if (swiz == SWIZZLE_ZERO) {
  2497.             emit(MOV(l, fs_reg(0.0f)));
  2498.          } else if (swiz == SWIZZLE_ONE) {
  2499.             emit(MOV(l, fs_reg(1.0f)));
  2500.          } else {
  2501.             emit(MOV(l, offset(orig_val,
  2502.                                GET_SWZ(key_tex->swizzles[sampler], i))));
  2503.          }
  2504.       }
  2505.       this->result = swizzled_result;
  2506.    }
  2507. }
  2508.  
  2509. void
  2510. fs_visitor::visit(ir_swizzle *ir)
  2511. {
  2512.    ir->val->accept(this);
  2513.    fs_reg val = this->result;
  2514.  
  2515.    if (ir->type->vector_elements == 1) {
  2516.       this->result = offset(this->result, ir->mask.x);
  2517.       return;
  2518.    }
  2519.  
  2520.    fs_reg result = vgrf(ir->type);
  2521.    this->result = result;
  2522.  
  2523.    for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
  2524.       fs_reg channel = val;
  2525.       int swiz = 0;
  2526.  
  2527.       switch (i) {
  2528.       case 0:
  2529.          swiz = ir->mask.x;
  2530.          break;
  2531.       case 1:
  2532.          swiz = ir->mask.y;
  2533.          break;
  2534.       case 2:
  2535.          swiz = ir->mask.z;
  2536.          break;
  2537.       case 3:
  2538.          swiz = ir->mask.w;
  2539.          break;
  2540.       }
  2541.  
  2542.       emit(MOV(result, offset(channel, swiz)));
  2543.       result = offset(result, 1);
  2544.    }
  2545. }
  2546.  
  2547. void
  2548. fs_visitor::visit(ir_discard *ir)
  2549. {
  2550.    /* We track our discarded pixels in f0.1.  By predicating on it, we can
  2551.     * update just the flag bits that aren't yet discarded.  If there's no
  2552.     * condition, we emit a CMP of g0 != g0, so all currently executing
  2553.     * channels will get turned off.
  2554.     */
  2555.    fs_inst *cmp;
  2556.    if (ir->condition) {
  2557.       emit_bool_to_cond_code(ir->condition);
  2558.       cmp = (fs_inst *) this->instructions.get_tail();
  2559.       cmp->conditional_mod = brw_negate_cmod(cmp->conditional_mod);
  2560.    } else {
  2561.       fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
  2562.                                       BRW_REGISTER_TYPE_UW));
  2563.       cmp = emit(CMP(reg_null_f, some_reg, some_reg, BRW_CONDITIONAL_NZ));
  2564.    }
  2565.    cmp->predicate = BRW_PREDICATE_NORMAL;
  2566.    cmp->flag_subreg = 1;
  2567.  
  2568.    if (devinfo->gen >= 6) {
  2569.       emit_discard_jump();
  2570.    }
  2571. }
  2572.  
  2573. void
  2574. fs_visitor::visit(ir_constant *ir)
  2575. {
  2576.    /* Set this->result to reg at the bottom of the function because some code
  2577.     * paths will cause this visitor to be applied to other fields.  This will
  2578.     * cause the value stored in this->result to be modified.
  2579.     *
  2580.     * Make reg constant so that it doesn't get accidentally modified along the
  2581.     * way.  Yes, I actually had this problem. :(
  2582.     */
  2583.    const fs_reg reg = vgrf(ir->type);
  2584.    fs_reg dst_reg = reg;
  2585.  
  2586.    if (ir->type->is_array()) {
  2587.       const unsigned size = type_size(ir->type->fields.array);
  2588.  
  2589.       for (unsigned i = 0; i < ir->type->length; i++) {
  2590.          ir->array_elements[i]->accept(this);
  2591.          fs_reg src_reg = this->result;
  2592.  
  2593.          dst_reg.type = src_reg.type;
  2594.          for (unsigned j = 0; j < size; j++) {
  2595.             emit(MOV(dst_reg, src_reg));
  2596.             src_reg = offset(src_reg, 1);
  2597.             dst_reg = offset(dst_reg, 1);
  2598.          }
  2599.       }
  2600.    } else if (ir->type->is_record()) {
  2601.       foreach_in_list(ir_constant, field, &ir->components) {
  2602.          const unsigned size = type_size(field->type);
  2603.  
  2604.          field->accept(this);
  2605.          fs_reg src_reg = this->result;
  2606.  
  2607.          dst_reg.type = src_reg.type;
  2608.          for (unsigned j = 0; j < size; j++) {
  2609.             emit(MOV(dst_reg, src_reg));
  2610.             src_reg = offset(src_reg, 1);
  2611.             dst_reg = offset(dst_reg, 1);
  2612.          }
  2613.       }
  2614.    } else {
  2615.       const unsigned size = type_size(ir->type);
  2616.  
  2617.       for (unsigned i = 0; i < size; i++) {
  2618.          switch (ir->type->base_type) {
  2619.          case GLSL_TYPE_FLOAT:
  2620.             emit(MOV(dst_reg, fs_reg(ir->value.f[i])));
  2621.             break;
  2622.          case GLSL_TYPE_UINT:
  2623.             emit(MOV(dst_reg, fs_reg(ir->value.u[i])));
  2624.             break;
  2625.          case GLSL_TYPE_INT:
  2626.             emit(MOV(dst_reg, fs_reg(ir->value.i[i])));
  2627.             break;
  2628.          case GLSL_TYPE_BOOL:
  2629.             emit(MOV(dst_reg, fs_reg(ir->value.b[i] != 0 ? ~0 : 0)));
  2630.             break;
  2631.          default:
  2632.             unreachable("Non-float/uint/int/bool constant");
  2633.          }
  2634.          dst_reg = offset(dst_reg, 1);
  2635.       }
  2636.    }
  2637.  
  2638.    this->result = reg;
  2639. }
  2640.  
  2641. void
  2642. fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
  2643. {
  2644.    ir_expression *expr = ir->as_expression();
  2645.  
  2646.    if (!expr || expr->operation == ir_binop_ubo_load) {
  2647.       ir->accept(this);
  2648.  
  2649.       fs_inst *inst = emit(AND(reg_null_d, this->result, fs_reg(1)));
  2650.       inst->conditional_mod = BRW_CONDITIONAL_NZ;
  2651.       return;
  2652.    }
  2653.  
  2654.    fs_reg op[3];
  2655.  
  2656.    assert(expr->get_num_operands() <= 3);
  2657.    for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
  2658.       assert(expr->operands[i]->type->is_scalar());
  2659.  
  2660.       expr->operands[i]->accept(this);
  2661.       op[i] = this->result;
  2662.  
  2663.       resolve_ud_negate(&op[i]);
  2664.    }
  2665.  
  2666.    emit_bool_to_cond_code_of_reg(expr, op);
  2667. }
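/* Why the AND with 1 above suffices: this backend keeps booleans with
 * bit 0 set for true (constants are stored as 0 / ~0, see the
 * GLSL_TYPE_BOOL case in visit(ir_constant)), so ANDing with 1 under a
 * .nz conditional mod produces a per-channel flag regardless of the
 * upper bits.
 */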
  2668.  
  2669. void
  2670. fs_visitor::emit_bool_to_cond_code_of_reg(ir_expression *expr, fs_reg op[3])
  2671. {
  2672.    fs_inst *inst;
  2673.  
  2674.    switch (expr->operation) {
  2675.    case ir_unop_logic_not:
  2676.       inst = emit(AND(reg_null_d, op[0], fs_reg(1)));
  2677.       inst->conditional_mod = BRW_CONDITIONAL_Z;
  2678.       break;
  2679.  
  2680.    case ir_binop_logic_xor:
  2681.       if (devinfo->gen <= 5) {
  2682.          fs_reg temp = vgrf(expr->type);
  2683.          emit(XOR(temp, op[0], op[1]));
  2684.          inst = emit(AND(reg_null_d, temp, fs_reg(1)));
  2685.       } else {
  2686.          inst = emit(XOR(reg_null_d, op[0], op[1]));
  2687.       }
  2688.       inst->conditional_mod = BRW_CONDITIONAL_NZ;
  2689.       break;
  2690.  
  2691.    case ir_binop_logic_or:
  2692.       if (devinfo->gen <= 5) {
  2693.          fs_reg temp = vgrf(expr->type);
  2694.          emit(OR(temp, op[0], op[1]));
  2695.          inst = emit(AND(reg_null_d, temp, fs_reg(1)));
  2696.       } else {
  2697.          inst = emit(OR(reg_null_d, op[0], op[1]));
  2698.       }
  2699.       inst->conditional_mod = BRW_CONDITIONAL_NZ;
  2700.       break;
  2701.  
  2702.    case ir_binop_logic_and:
  2703.       if (devinfo->gen <= 5) {
  2704.          fs_reg temp = vgrf(expr->type);
  2705.          emit(AND(temp, op[0], op[1]));
  2706.          inst = emit(AND(reg_null_d, temp, fs_reg(1)));
  2707.       } else {
  2708.          inst = emit(AND(reg_null_d, op[0], op[1]));
  2709.       }
  2710.       inst->conditional_mod = BRW_CONDITIONAL_NZ;
  2711.       break;
  2712.  
  2713.    case ir_unop_f2b:
  2714.       if (devinfo->gen >= 6) {
  2715.          emit(CMP(reg_null_d, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
  2716.       } else {
  2717.          inst = emit(MOV(reg_null_f, op[0]));
  2718.          inst->conditional_mod = BRW_CONDITIONAL_NZ;
  2719.       }
  2720.       break;
  2721.  
  2722.    case ir_unop_i2b:
  2723.       if (devinfo->gen >= 6) {
  2724.          emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
  2725.       } else {
  2726.          inst = emit(MOV(reg_null_d, op[0]));
  2727.          inst->conditional_mod = BRW_CONDITIONAL_NZ;
  2728.       }
  2729.       break;
  2730.  
  2731.    case ir_binop_greater:
  2732.    case ir_binop_gequal:
  2733.    case ir_binop_less:
  2734.    case ir_binop_lequal:
  2735.    case ir_binop_equal:
  2736.    case ir_binop_all_equal:
  2737.    case ir_binop_nequal:
  2738.    case ir_binop_any_nequal:
  2739.       if (devinfo->gen <= 5) {
  2740.          resolve_bool_comparison(expr->operands[0], &op[0]);
  2741.          resolve_bool_comparison(expr->operands[1], &op[1]);
  2742.       }
  2743.  
  2744.       emit(CMP(reg_null_d, op[0], op[1],
  2745.                brw_conditional_for_comparison(expr->operation)));
  2746.       break;
  2747.  
  2748.    case ir_triop_csel: {
  2749.       /* Expand the boolean condition into the flag register. */
  2750.       inst = emit(MOV(reg_null_d, op[0]));
  2751.       inst->conditional_mod = BRW_CONDITIONAL_NZ;
  2752.  
  2753.       /* Select which boolean to return. */
  2754.       fs_reg temp = vgrf(expr->operands[1]->type);
  2755.       inst = emit(SEL(temp, op[1], op[2]));
  2756.       inst->predicate = BRW_PREDICATE_NORMAL;
  2757.  
  2758.       /* Expand the result to a condition code. */
  2759.       inst = emit(MOV(reg_null_d, temp));
  2760.       inst->conditional_mod = BRW_CONDITIONAL_NZ;
  2761.       break;
  2762.    }
  2763.  
  2764.    default:
  2765.       unreachable("not reached");
  2766.    }
  2767. }
  2768.  
  2769. /**
  2770.  * Emit a gen6 IF statement with the comparison folded into the IF
  2771.  * instruction.
  2772.  */
  2773. void
  2774. fs_visitor::emit_if_gen6(ir_if *ir)
  2775. {
  2776.    ir_expression *expr = ir->condition->as_expression();
  2777.  
  2778.    if (expr && expr->operation != ir_binop_ubo_load) {
  2779.       fs_reg op[3];
  2780.       fs_inst *inst;
  2781.       fs_reg temp;
  2782.  
  2783.       assert(expr->get_num_operands() <= 3);
  2784.       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
  2785.          assert(expr->operands[i]->type->is_scalar());
  2786.  
  2787.          expr->operands[i]->accept(this);
  2788.          op[i] = this->result;
  2789.       }
  2790.  
  2791.       switch (expr->operation) {
  2792.       case ir_unop_logic_not:
  2793.          emit(IF(op[0], fs_reg(0), BRW_CONDITIONAL_Z));
  2794.          return;
  2795.  
  2796.       case ir_binop_logic_xor:
  2797.          emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
  2798.          return;
  2799.  
  2800.       case ir_binop_logic_or:
  2801.          temp = vgrf(glsl_type::bool_type);
  2802.          emit(OR(temp, op[0], op[1]));
  2803.          emit(IF(temp, fs_reg(0), BRW_CONDITIONAL_NZ));
  2804.          return;
  2805.  
  2806.       case ir_binop_logic_and:
  2807.          temp = vgrf(glsl_type::bool_type);
  2808.          emit(AND(temp, op[0], op[1]));
  2809.          emit(IF(temp, fs_reg(0), BRW_CONDITIONAL_NZ));
  2810.          return;
  2811.  
  2812.       case ir_unop_f2b:
  2813.          inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0));
  2814.          inst->conditional_mod = BRW_CONDITIONAL_NZ;
  2815.          return;
  2816.  
  2817.       case ir_unop_i2b:
  2818.          emit(IF(op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
  2819.          return;
  2820.  
  2821.       case ir_binop_greater:
  2822.       case ir_binop_gequal:
  2823.       case ir_binop_less:
  2824.       case ir_binop_lequal:
  2825.       case ir_binop_equal:
  2826.       case ir_binop_all_equal:
  2827.       case ir_binop_nequal:
  2828.       case ir_binop_any_nequal:
  2829.          if (devinfo->gen <= 5) {
  2830.             resolve_bool_comparison(expr->operands[0], &op[0]);
  2831.             resolve_bool_comparison(expr->operands[1], &op[1]);
  2832.          }
  2833.  
  2834.          emit(IF(op[0], op[1],
  2835.                  brw_conditional_for_comparison(expr->operation)));
  2836.          return;
  2837.  
  2838.       case ir_triop_csel: {
  2839.          /* Expand the boolean condition into the flag register. */
  2840.          fs_inst *inst = emit(MOV(reg_null_d, op[0]));
  2841.          inst->conditional_mod = BRW_CONDITIONAL_NZ;
  2842.  
  2843.          /* Select which boolean to use as the result. */
  2844.          fs_reg temp = vgrf(expr->operands[1]->type);
  2845.          inst = emit(SEL(temp, op[1], op[2]));
  2846.          inst->predicate = BRW_PREDICATE_NORMAL;
  2847.  
  2848.          emit(IF(temp, fs_reg(0), BRW_CONDITIONAL_NZ));
  2849.          return;
  2850.       }
  2851.  
  2852.       default:
  2853.          unreachable("not reached");
  2854.       }
  2855.    }
  2856.  
  2857.    ir->condition->accept(this);
  2858.    emit(IF(this->result, fs_reg(0), BRW_CONDITIONAL_NZ));
  2859. }
  2860.  
  2861. bool
  2862. fs_visitor::try_opt_frontfacing_ternary(ir_if *ir)
  2863. {
  2864.    ir_dereference_variable *deref = ir->condition->as_dereference_variable();
  2865.    if (!deref || strcmp(deref->var->name, "gl_FrontFacing") != 0)
  2866.       return false;
  2867.  
  2868.    if (ir->then_instructions.length() != 1 ||
  2869.        ir->else_instructions.length() != 1)
  2870.       return false;
  2871.  
  2872.    ir_assignment *then_assign =
  2873.          ((ir_instruction *)ir->then_instructions.head)->as_assignment();
  2874.    ir_assignment *else_assign =
  2875.          ((ir_instruction *)ir->else_instructions.head)->as_assignment();
  2876.  
  2877.    if (!then_assign || then_assign->condition ||
  2878.        !else_assign || else_assign->condition ||
  2879.        then_assign->write_mask != else_assign->write_mask ||
  2880.        !then_assign->lhs->equals(else_assign->lhs))
  2881.       return false;
  2882.  
  2883.    ir_constant *then_rhs = then_assign->rhs->as_constant();
  2884.    ir_constant *else_rhs = else_assign->rhs->as_constant();
  2885.  
  2886.    if (!then_rhs || !else_rhs)
  2887.       return false;
  2888.  
  2889.    if (then_rhs->type->base_type != GLSL_TYPE_FLOAT)
  2890.       return false;
  2891.  
  2892.    if ((then_rhs->is_one() && else_rhs->is_negative_one()) ||
  2893.        (else_rhs->is_one() && then_rhs->is_negative_one())) {
  2894.       then_assign->lhs->accept(this);
  2895.       fs_reg dst = this->result;
  2896.       dst.type = BRW_REGISTER_TYPE_D;
  2897.       fs_reg tmp = vgrf(glsl_type::int_type);
  2898.  
  2899.       if (devinfo->gen >= 6) {
  2900.          /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
  2901.          fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
  2902.  
  2903.          /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
  2904.           *
  2905.           *    or(8)  tmp.1<2>W  g0.0<0,1,0>W  0x00003f80W
  2906.           *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
  2907.           *
  2908.           * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
  2909.           */
  2910.  
  2911.          if (then_rhs->is_negative_one()) {
  2912.             assert(else_rhs->is_one());
  2913.             g0.negate = true;
  2914.          }
  2915.  
  2916.          tmp.type = BRW_REGISTER_TYPE_W;
  2917.          tmp.subreg_offset = 2;
  2918.          tmp.stride = 2;
  2919.  
  2920.          fs_inst *or_inst = emit(OR(tmp, g0, fs_reg(0x3f80)));
  2921.          or_inst->src[1].type = BRW_REGISTER_TYPE_UW;
  2922.  
  2923.          tmp.type = BRW_REGISTER_TYPE_D;
  2924.          tmp.subreg_offset = 0;
  2925.          tmp.stride = 1;
  2926.       } else {
  2927.          /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
  2928.          fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
  2929.  
  2930.          /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
  2931.           *
  2932.           *    or(8)  tmp<1>D  g1.6<0,1,0>D  0x3f800000D
  2933.           *    and(8) dst<1>D  tmp<8,8,1>D   0xbf800000D
  2934.           *
  2935.           * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
  2936.           */
  2937.  
  2938.          if (then_rhs->is_negative_one()) {
  2939.             assert(else_rhs->is_one());
  2940.             g1_6.negate = true;
  2941.          }
  2942.  
  2943.          emit(OR(tmp, g1_6, fs_reg(0x3f800000)));
  2944.       }
  2945.       emit(AND(dst, tmp, fs_reg(0xbf800000)));
  2946.       return true;
  2947.    }
  2948.  
  2949.    return false;
  2950. }
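
/* Worked example of the bit trick above (a sketch; the float encodings are
 * standard IEEE-754, not spelled out in this file): 0x3f800000 is 1.0f and
 * 0xbf800000 is -1.0f, differing only in the sign bit.  The OR places
 * (g0.0 | 0x3f80) in the high word of each dword of tmp, and the AND masks
 * the result down to exactly +/-1.0f:
 *
 *    front facing: bit 15 of g0.0 = 0 -> tmp = 0x3f80xxxx -> dst = 0x3f800000
 *    back facing:  bit 15 of g0.0 = 1 -> tmp = 0xbf80xxxx -> dst = 0xbf800000
 *
 * so the whole ternary costs two ALU instructions and never touches the
 * flag register.
 */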
  2951.  
  2952. /**
  2953.  * Try to replace IF/MOV/ELSE/MOV/ENDIF with SEL.
  2954.  *
  2955.  * Many GLSL shaders contain the following pattern:
  2956.  *
  2957.  *    x = condition ? foo : bar
  2958.  *
  2959.  * The compiler emits an ir_if tree for this, since each subexpression might be
  2960.  * a complex tree that could have side-effects or short-circuit logic.
  2961.  *
  2962.  * However, the common case is to simply select one of two constants or
  2963.  * variable values---which is exactly what SEL is for.  In this case, the
  2964.  * assembly looks like:
  2965.  *
  2966.  *    (+f0) IF
  2967.  *    MOV dst src0
  2968.  *    ELSE
  2969.  *    MOV dst src1
  2970.  *    ENDIF
  2971.  *
  2972.  * which can be easily translated into:
  2973.  *
  2974.  *    (+f0) SEL dst src0 src1
  2975.  *
  2976.  * If src0 is an immediate value, we promote it to a temporary GRF.
  2977.  */
  2978. bool
  2979. fs_visitor::try_replace_with_sel()
  2980. {
  2981.    fs_inst *endif_inst = (fs_inst *) instructions.get_tail();
  2982.    assert(endif_inst->opcode == BRW_OPCODE_ENDIF);
  2983.  
  2984.    /* Pattern match in reverse: IF, MOV, ELSE, MOV, ENDIF. */
  2985.    int opcodes[] = {
  2986.       BRW_OPCODE_IF, BRW_OPCODE_MOV, BRW_OPCODE_ELSE, BRW_OPCODE_MOV,
  2987.    };
  2988.  
  2989.    fs_inst *match = (fs_inst *) endif_inst->prev;
  2990.    for (int i = 0; i < 4; i++) {
  2991.       if (match->is_head_sentinel() || match->opcode != opcodes[4-i-1])
  2992.          return false;
  2993.       match = (fs_inst *) match->prev;
  2994.    }
  2995.  
  2996.    /* The opcodes match; it looks like the right sequence of instructions. */
  2997.    fs_inst *else_mov = (fs_inst *) endif_inst->prev;
  2998.    fs_inst *then_mov = (fs_inst *) else_mov->prev->prev;
  2999.    fs_inst *if_inst = (fs_inst *) then_mov->prev;
  3000.  
  3001.    /* Check that the MOVs are the right form. */
  3002.    if (then_mov->dst.equals(else_mov->dst) &&
  3003.        !then_mov->is_partial_write() &&
  3004.        !else_mov->is_partial_write()) {
  3005.  
  3006.       /* Remove the matched instructions; we'll emit a SEL to replace them. */
  3007.       while (!if_inst->next->is_tail_sentinel())
  3008.          if_inst->next->exec_node::remove();
  3009.       if_inst->exec_node::remove();
  3010.  
  3011.       /* Only the last source register can be a constant, so if the MOV in
  3012.        * the "then" clause uses a constant, we need to put it in a temporary.
  3013.        */
  3014.       fs_reg src0(then_mov->src[0]);
  3015.       if (src0.file == IMM) {
  3016.          src0 = vgrf(glsl_type::float_type);
  3017.          src0.type = then_mov->src[0].type;
  3018.          emit(MOV(src0, then_mov->src[0]));
  3019.       }
  3020.  
  3021.       fs_inst *sel;
  3022.       if (if_inst->conditional_mod) {
  3023.          /* Sandybridge-specific IF with embedded comparison */
  3024.          emit(CMP(reg_null_d, if_inst->src[0], if_inst->src[1],
  3025.                   if_inst->conditional_mod));
  3026.          sel = emit(BRW_OPCODE_SEL, then_mov->dst, src0, else_mov->src[0]);
  3027.          sel->predicate = BRW_PREDICATE_NORMAL;
  3028.       } else {
  3029.          /* Separate CMP and IF instructions */
  3030.          sel = emit(BRW_OPCODE_SEL, then_mov->dst, src0, else_mov->src[0]);
  3031.          sel->predicate = if_inst->predicate;
  3032.          sel->predicate_inverse = if_inst->predicate_inverse;
  3033.       }
  3034.  
  3035.       return true;
  3036.    }
  3037.  
  3038.    return false;
  3039. }
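
/* For example (a sketch, not actual compiler output): for
 *
 *    x = cond ? 1.0 : y;
 *
 * the "then" MOV has an immediate source, so the replacement becomes
 *
 *          MOV  tmp  1.0f
 *    (+f0) SEL  dst  tmp  y
 *
 * since only the second source of a two-source instruction may be an
 * immediate.
 */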
  3040.  
  3041. void
  3042. fs_visitor::visit(ir_if *ir)
  3043. {
  3044.    if (try_opt_frontfacing_ternary(ir))
  3045.       return;
  3046.  
  3047.    /* Don't point the annotation at the if statement, because then it plus
  3048.     * the then and else blocks get printed.
  3049.     */
  3050.    this->base_ir = ir->condition;
  3051.  
  3052.    if (devinfo->gen == 6) {
  3053.       emit_if_gen6(ir);
  3054.    } else {
  3055.       emit_bool_to_cond_code(ir->condition);
  3056.  
  3057.       emit(IF(BRW_PREDICATE_NORMAL));
  3058.    }
  3059.  
  3060.    foreach_in_list(ir_instruction, ir_, &ir->then_instructions) {
  3061.       this->base_ir = ir_;
  3062.       ir_->accept(this);
  3063.    }
  3064.  
  3065.    if (!ir->else_instructions.is_empty()) {
  3066.       emit(BRW_OPCODE_ELSE);
  3067.  
  3068.       foreach_in_list(ir_instruction, ir_, &ir->else_instructions) {
  3069.          this->base_ir = ir_;
  3070.          ir_->accept(this);
  3071.       }
  3072.    }
  3073.  
  3074.    emit(BRW_OPCODE_ENDIF);
  3075.  
  3076.    if (!try_replace_with_sel() && devinfo->gen < 6) {
  3077.       no16("Can't support (non-uniform) control flow on SIMD16\n");
  3078.    }
  3079. }
  3080.  
  3081. void
  3082. fs_visitor::visit(ir_loop *ir)
  3083. {
  3084.    if (devinfo->gen < 6) {
  3085.       no16("Can't support (non-uniform) control flow on SIMD16\n");
  3086.    }
  3087.  
  3088.    this->base_ir = NULL;
  3089.    emit(BRW_OPCODE_DO);
  3090.  
  3091.    foreach_in_list(ir_instruction, ir_, &ir->body_instructions) {
  3092.       this->base_ir = ir_;
  3093.       ir_->accept(this);
  3094.    }
  3095.  
  3096.    this->base_ir = NULL;
  3097.    emit(BRW_OPCODE_WHILE);
  3098. }
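
/* By the time a shader loop reaches this visitor, GLSL IR has already
 * turned structured loops into an ir_loop whose body carries the exit test
 * as an explicit conditional break.  Roughly (a sketch), for
 *
 *    for (int i = 0; i < 4; i++) { ... }
 *
 * we emit:
 *
 *    DO
 *       CMP.GE f0  i  4
 *       (+f0) BREAK
 *       ...body, i = i + 1...
 *    WHILE
 */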
  3099.  
  3100. void
  3101. fs_visitor::visit(ir_loop_jump *ir)
  3102. {
  3103.    switch (ir->mode) {
  3104.    case ir_loop_jump::jump_break:
  3105.       emit(BRW_OPCODE_BREAK);
  3106.       break;
  3107.    case ir_loop_jump::jump_continue:
  3108.       emit(BRW_OPCODE_CONTINUE);
  3109.       break;
  3110.    }
  3111. }
  3112.  
  3113. void
  3114. fs_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
  3115. {
  3116.    ir_dereference *deref = static_cast<ir_dereference *>(
  3117.       ir->actual_parameters.get_head());
  3118.    ir_variable *location = deref->variable_referenced();
  3119.    unsigned surf_index = (stage_prog_data->binding_table.abo_start +
  3120.                           location->data.binding);
  3121.  
  3122.    /* Calculate the surface offset */
  3123.    fs_reg offset = vgrf(glsl_type::uint_type);
  3124.    ir_dereference_array *deref_array = deref->as_dereference_array();
  3125.  
  3126.    if (deref_array) {
  3127.       deref_array->array_index->accept(this);
  3128.  
  3129.       fs_reg tmp = vgrf(glsl_type::uint_type);
  3130.       emit(MUL(tmp, this->result, fs_reg(ATOMIC_COUNTER_SIZE)));
  3131.       emit(ADD(offset, tmp, fs_reg(location->data.atomic.offset)));
  3132.    } else {
  3133.       offset = fs_reg(location->data.atomic.offset);
  3134.    }
  3135.  
  3136.    /* Emit the appropriate machine instruction */
  3137.    const char *callee = ir->callee->function_name();
  3138.    ir->return_deref->accept(this);
  3139.    fs_reg dst = this->result;
  3140.  
  3141.    if (!strcmp("__intrinsic_atomic_read", callee)) {
  3142.       emit_untyped_surface_read(surf_index, dst, offset);
  3143.  
  3144.    } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
  3145.       emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
  3146.                           fs_reg(), fs_reg());
  3147.  
  3148.    } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
  3149.       emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
  3150.                           fs_reg(), fs_reg());
  3151.    }
  3152. }
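
/* Offset arithmetic example for the code above (assumed declaration):
 *
 *    layout(binding = 0, offset = 4) uniform atomic_uint counters[4];
 *
 * atomicCounterIncrement(counters[i]) arrives here as
 * __intrinsic_atomic_increment, and the surface offset computed above is
 * i * ATOMIC_COUNTER_SIZE + 4 within the ABO surface at binding-table
 * entry abo_start + 0.
 */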
  3153.  
  3154. void
  3155. fs_visitor::visit(ir_call *ir)
  3156. {
  3157.    const char *callee = ir->callee->function_name();
  3158.  
  3159.    if (!strcmp("__intrinsic_atomic_read", callee) ||
  3160.        !strcmp("__intrinsic_atomic_increment", callee) ||
  3161.        !strcmp("__intrinsic_atomic_predecrement", callee)) {
  3162.       visit_atomic_counter_intrinsic(ir);
  3163.    } else {
  3164.       unreachable("Unsupported intrinsic.");
  3165.    }
  3166. }
  3167.  
  3168. void
  3169. fs_visitor::visit(ir_return *)
  3170. {
  3171.    unreachable("FINISHME");
  3172. }
  3173.  
  3174. void
  3175. fs_visitor::visit(ir_function *ir)
  3176. {
  3177.    /* Ignore function bodies other than main() -- we shouldn't see calls to
  3178.     * them since they should all be inlined before we get here.
  3179.     */
  3180.    if (strcmp(ir->name, "main") == 0) {
  3181.       const ir_function_signature *sig;
  3182.       exec_list empty;
  3183.  
  3184.       sig = ir->matching_signature(NULL, &empty, false);
  3185.  
  3186.       assert(sig);
  3187.  
  3188.       foreach_in_list(ir_instruction, ir_, &sig->body) {
  3189.          this->base_ir = ir_;
  3190.          ir_->accept(this);
  3191.       }
  3192.    }
  3193. }
  3194.  
  3195. void
  3196. fs_visitor::visit(ir_function_signature *)
  3197. {
  3198.    unreachable("not reached");
  3199. }
  3200.  
  3201. void
  3202. fs_visitor::visit(ir_emit_vertex *)
  3203. {
  3204.    unreachable("not reached");
  3205. }
  3206.  
  3207. void
  3208. fs_visitor::visit(ir_end_primitive *)
  3209. {
  3210.    unreachable("not reached");
  3211. }
  3212.  
  3213. void
  3214. fs_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
  3215.                                 fs_reg dst, fs_reg offset, fs_reg src0,
  3216.                                 fs_reg src1)
  3217. {
  3218.    int reg_width = dispatch_width / 8;
  3219.    int length = 0;
  3220.  
  3221.    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 4);
  3222.  
  3223.    sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
  3224.    /* Initialize the sample mask in the message header. */
  3225.    emit(MOV(sources[0], fs_reg(0u)))
  3226.       ->force_writemask_all = true;
  3227.  
  3228.    if (stage == MESA_SHADER_FRAGMENT) {
  3229.       if (((brw_wm_prog_data*)this->prog_data)->uses_kill) {
  3230.          emit(MOV(component(sources[0], 7), brw_flag_reg(0, 1)))
  3231.             ->force_writemask_all = true;
  3232.       } else {
  3233.          emit(MOV(component(sources[0], 7),
  3234.                   retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD)))
  3235.             ->force_writemask_all = true;
  3236.       }
  3237.    } else {
  3238.       /* The execution mask is part of the side-band information sent together with
  3239.        * the message payload to the data port. It's implicitly ANDed with the sample
  3240.        * mask sent in the header to compute the actual set of channels that execute
  3241.        * the atomic operation.
  3242.        */
  3243.       assert(stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_COMPUTE);
  3244.       emit(MOV(component(sources[0], 7),
  3245.                fs_reg(0xffffu)))->force_writemask_all = true;
  3246.    }
  3247.    length++;
  3248.  
  3249.    /* Set the atomic operation offset. */
  3250.    sources[1] = vgrf(glsl_type::uint_type);
  3251.    emit(MOV(sources[1], offset));
  3252.    length++;
  3253.  
  3254.    /* Set the atomic operation arguments. */
  3255.    if (src0.file != BAD_FILE) {
  3256.       sources[length] = vgrf(glsl_type::uint_type);
  3257.       emit(MOV(sources[length], src0));
  3258.       length++;
  3259.    }
  3260.  
  3261.    if (src1.file != BAD_FILE) {
  3262.       sources[length] = vgrf(glsl_type::uint_type);
  3263.       emit(MOV(sources[length], src1));
  3264.       length++;
  3265.    }
  3266.  
  3267.    int mlen = 1 + (length - 1) * reg_width;
  3268.    fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
  3269.                                BRW_REGISTER_TYPE_UD, dispatch_width);
  3270.    emit(LOAD_PAYLOAD(src_payload, sources, length, 1));
  3271.  
  3272.    /* Emit the instruction. */
  3273.    fs_inst *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst, src_payload,
  3274.                         fs_reg(surf_index), fs_reg(atomic_op));
  3275.    inst->mlen = mlen;
  3276. }
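
/* Message layout sketch for the code above: a SIMD16 atomic increment has
 * reg_width == 2 and no data arguments, so length == 2 and the payload is
 *
 *    m0:       header (sample mask in component 7)
 *    m1..m2:   per-channel surface offsets
 *
 * giving mlen = 1 + (2 - 1) * 2 = 3.  When present, src0 and src1 each
 * append reg_width more registers after the offsets.
 */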
  3277.  
  3278. void
  3279. fs_visitor::emit_untyped_surface_read(unsigned surf_index, fs_reg dst,
  3280.                                       fs_reg offset)
  3281. {
  3282.    int reg_width = dispatch_width / 8;
  3283.  
  3284.    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2);
  3285.  
  3286.    sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
  3287.    /* Initialize the sample mask in the message header. */
  3288.    emit(MOV(sources[0], fs_reg(0u)))
  3289.       ->force_writemask_all = true;
  3290.  
  3291.    if (stage == MESA_SHADER_FRAGMENT) {
  3292.       if (((brw_wm_prog_data*)this->prog_data)->uses_kill) {
  3293.          emit(MOV(component(sources[0], 7), brw_flag_reg(0, 1)))
  3294.             ->force_writemask_all = true;
  3295.       } else {
  3296.          emit(MOV(component(sources[0], 7),
  3297.                   retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD)))
  3298.             ->force_writemask_all = true;
  3299.       }
  3300.    } else {
  3301.       /* The execution mask is part of the side-band information sent together with
  3302.        * the message payload to the data port. It's implicitly ANDed with the sample
  3303.        * mask sent in the header to compute the actual set of channels that execute
  3304.        * the operation.
  3305.        */
  3306.       assert(stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_COMPUTE);
  3307.       emit(MOV(component(sources[0], 7),
  3308.                fs_reg(0xffffu)))->force_writemask_all = true;
  3309.    }
  3310.  
  3311.    /* Set the surface read offset. */
  3312.    sources[1] = vgrf(glsl_type::uint_type);
  3313.    emit(MOV(sources[1], offset));
  3314.  
  3315.    int mlen = 1 + reg_width;
  3316.    fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
  3317.                                BRW_REGISTER_TYPE_UD, dispatch_width);
  3318.    fs_inst *inst = emit(LOAD_PAYLOAD(src_payload, sources, 2, 1));
  3319.  
  3320.    /* Emit the instruction. */
  3321.    inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst, src_payload,
  3322.                fs_reg(surf_index), fs_reg(1));
  3323.    inst->mlen = mlen;
  3324. }
  3325.  
  3326. fs_inst *
  3327. fs_visitor::emit(fs_inst *inst)
  3328. {
  3329.    if (dispatch_width == 16 && inst->exec_size == 8)
  3330.       inst->force_uncompressed = true;
  3331.  
  3332.    inst->annotation = this->current_annotation;
  3333.    inst->ir = this->base_ir;
  3334.  
  3335.    this->instructions.push_tail(inst);
  3336.  
  3337.    return inst;
  3338. }
  3339.  
  3340. void
  3341. fs_visitor::emit(exec_list list)
  3342. {
  3343.    foreach_in_list_safe(fs_inst, inst, &list) {
  3344.       inst->exec_node::remove();
  3345.       emit(inst);
  3346.    }
  3347. }
  3348.  
  3349. /** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
  3350. void
  3351. fs_visitor::emit_dummy_fs()
  3352. {
  3353.    int reg_width = dispatch_width / 8;
  3354.  
  3355.    /* Everyone's favorite color. */
  3356.    const float color[4] = { 1.0, 0.0, 1.0, 0.0 };
  3357.    for (int i = 0; i < 4; i++) {
  3358.       emit(MOV(fs_reg(MRF, 2 + i * reg_width, BRW_REGISTER_TYPE_F,
  3359.                       dispatch_width), fs_reg(color[i])));
  3360.    }
  3361.  
  3362.    fs_inst *write;
  3363.    write = emit(FS_OPCODE_FB_WRITE);
  3364.    write->eot = true;
  3365.    if (devinfo->gen >= 6) {
  3366.       write->base_mrf = 2;
  3367.       write->mlen = 4 * reg_width;
  3368.    } else {
  3369.       write->header_size = 2;
  3370.       write->base_mrf = 0;
  3371.       write->mlen = 2 + 4 * reg_width;
  3372.    }
  3373.  
  3374.    /* Tell the SF we don't have any inputs.  Gen4-5 require at least one
  3375.     * varying to avoid GPU hangs, so set that.
  3376.     */
  3377.    brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
  3378.    wm_prog_data->num_varying_inputs = devinfo->gen < 6 ? 1 : 0;
  3379.    memset(wm_prog_data->urb_setup, -1,
  3380.           sizeof(wm_prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
  3381.  
  3382.    /* We don't have any uniforms. */
  3383.    stage_prog_data->nr_params = 0;
  3384.    stage_prog_data->nr_pull_params = 0;
  3385.    stage_prog_data->curb_read_length = 0;
  3386.    stage_prog_data->dispatch_grf_start_reg = 2;
  3387.    wm_prog_data->dispatch_grf_start_reg_16 = 2;
  3388.    grf_used = 1; /* Gen4-5 don't allow zero GRF blocks */
  3389.  
  3390.    calculate_cfg();
  3391. }
  3392.  
  3393. /* The register location here is relative to the start of the URB
  3394.  * data.  It will get adjusted to be a real location before
  3395.  * generate_code() time.
  3396.  */
  3397. struct brw_reg
  3398. fs_visitor::interp_reg(int location, int channel)
  3399. {
  3400.    assert(stage == MESA_SHADER_FRAGMENT);
  3401.    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
  3402.    int regnr = prog_data->urb_setup[location] * 2 + channel / 2;
  3403.    int stride = (channel & 1) * 4;
  3404.  
  3405.    assert(prog_data->urb_setup[location] != -1);
  3406.  
  3407.    return brw_vec1_grf(regnr, stride);
  3408. }
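
/* Worked example (assumed urb_setup value): with urb_setup[location] == 1,
 * the four channels of that varying's setup data map to
 *
 *    channel 0 -> g2.0    channel 2 -> g3.0
 *    channel 1 -> g2.4    channel 3 -> g3.4
 *
 * i.e. each channel's plane-equation coefficients take four floats, two
 * channels per register.
 */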
  3409.  
  3410. /** Emits the interpolation for the varying inputs. */
  3411. void
  3412. fs_visitor::emit_interpolation_setup_gen4()
  3413. {
  3414.    struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
  3415.  
  3416.    this->current_annotation = "compute pixel centers";
  3417.    this->pixel_x = vgrf(glsl_type::uint_type);
  3418.    this->pixel_y = vgrf(glsl_type::uint_type);
  3419.    this->pixel_x.type = BRW_REGISTER_TYPE_UW;
  3420.    this->pixel_y.type = BRW_REGISTER_TYPE_UW;
  3421.    emit(ADD(this->pixel_x,
  3422.             fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
  3423.             fs_reg(brw_imm_v(0x10101010))));
  3424.    emit(ADD(this->pixel_y,
  3425.             fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
  3426.             fs_reg(brw_imm_v(0x11001100))));
  3427.  
  3428.    this->current_annotation = "compute pixel deltas from v0";
  3429.  
  3430.    this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
  3431.       vgrf(glsl_type::vec2_type);
  3432.    const fs_reg &delta_xy = this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC];
  3433.    const fs_reg xstart(negate(brw_vec1_grf(1, 0)));
  3434.    const fs_reg ystart(negate(brw_vec1_grf(1, 1)));
  3435.  
  3436.    if (devinfo->has_pln && dispatch_width == 16) {
  3437.       emit(ADD(half(offset(delta_xy, 0), 0), half(this->pixel_x, 0), xstart));
  3438.       emit(ADD(half(offset(delta_xy, 0), 1), half(this->pixel_y, 0), ystart));
  3439.       emit(ADD(half(offset(delta_xy, 1), 0), half(this->pixel_x, 1), xstart))
  3440.          ->force_sechalf = true;
  3441.       emit(ADD(half(offset(delta_xy, 1), 1), half(this->pixel_y, 1), ystart))
  3442.          ->force_sechalf = true;
  3443.    } else {
  3444.       emit(ADD(offset(delta_xy, 0), this->pixel_x, xstart));
  3445.       emit(ADD(offset(delta_xy, 1), this->pixel_y, ystart));
  3446.    }
  3447.  
  3448.    this->current_annotation = "compute pos.w and 1/pos.w";
  3449.    /* Compute wpos.w.  It's always in our setup, since it's needed to
  3450.     * interpolate the other attributes.
  3451.     */
  3452.    this->wpos_w = vgrf(glsl_type::float_type);
  3453.    emit(FS_OPCODE_LINTERP, wpos_w, delta_xy, interp_reg(VARYING_SLOT_POS, 3));
  3454.    /* Compute the pixel 1/W value from wpos.w. */
  3455.    this->pixel_w = vgrf(glsl_type::float_type);
  3456.    emit_math(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
  3457.    this->current_annotation = NULL;
  3458. }
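
/* A note on the immediates above: brw_imm_v packs eight 4-bit elements,
 * low nibble first, so 0x10101010 is the vector {0,1,0,1,0,1,0,1} and
 * 0x11001100 is {0,0,1,1,0,0,1,1}.  Added to the subspan origin X and Y
 * (each replicated four times by the <2,4,0> region), they produce the
 * per-pixel positions of each 2x2 subspan:
 *
 *    x: x0 x0+1 x0 x0+1 ...      y: y0 y0 y0+1 y0+1 ...
 */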
  3459.  
  3460. /** Emits the interpolation for the varying inputs. */
  3461. void
  3462. fs_visitor::emit_interpolation_setup_gen6()
  3463. {
  3464.    struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
  3465.  
  3466.    this->current_annotation = "compute pixel centers";
  3467.    if (brw->gen >= 8 || dispatch_width == 8) {
  3468.       /* The "Register Region Restrictions" page says for BDW (and newer,
  3469.        * presumably):
  3470.        *
  3471.        *     "When destination spans two registers, the source may be one or
  3472.        *      two registers. The destination elements must be evenly split
  3473.        *      between the two registers."
  3474.        *
  3475.        * Thus we can do a single add(16) in SIMD8 or an add(32) in SIMD16 to
  3476.        * compute our pixel centers.
  3477.        */
  3478.       fs_reg int_pixel_xy(GRF, alloc.allocate(dispatch_width / 8),
  3479.                           BRW_REGISTER_TYPE_UW, dispatch_width * 2);
  3480.       emit(ADD(int_pixel_xy,
  3481.                fs_reg(stride(suboffset(g1_uw, 4), 1, 4, 0)),
  3482.                fs_reg(brw_imm_v(0x11001010))))
  3483.          ->force_writemask_all = true;
  3484.  
  3485.       this->pixel_x = vgrf(glsl_type::float_type);
  3486.       this->pixel_y = vgrf(glsl_type::float_type);
  3487.       emit(FS_OPCODE_PIXEL_X, this->pixel_x, int_pixel_xy);
  3488.       emit(FS_OPCODE_PIXEL_Y, this->pixel_y, int_pixel_xy);
  3489.    } else {
  3490.       /* The "Register Region Restrictions" page says for SNB, IVB, HSW:
  3491.        *
  3492.        *     "When destination spans two registers, the source MUST span two
  3493.        *      registers."
  3494.        *
  3495.        * Since the GRF source of the ADD will only read a single register, we
  3496.        * must do two separate ADDs in SIMD16.
  3497.        */
  3498.       fs_reg int_pixel_x = vgrf(glsl_type::uint_type);
  3499.       fs_reg int_pixel_y = vgrf(glsl_type::uint_type);
  3500.       int_pixel_x.type = BRW_REGISTER_TYPE_UW;
  3501.       int_pixel_y.type = BRW_REGISTER_TYPE_UW;
  3502.       emit(ADD(int_pixel_x,
  3503.                fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
  3504.                fs_reg(brw_imm_v(0x10101010))));
  3505.       emit(ADD(int_pixel_y,
  3506.                fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
  3507.                fs_reg(brw_imm_v(0x11001100))));
  3508.  
  3509.       /* As of gen6, we can no longer mix float and int sources.  We have
  3510.        * to turn the integer pixel centers into floats for their actual
  3511.        * use.
  3512.        */
  3513.       this->pixel_x = vgrf(glsl_type::float_type);
  3514.       this->pixel_y = vgrf(glsl_type::float_type);
  3515.       emit(MOV(this->pixel_x, int_pixel_x));
  3516.       emit(MOV(this->pixel_y, int_pixel_y));
  3517.    }
  3518.  
  3519.    this->current_annotation = "compute pos.w";
  3520.    this->pixel_w = fs_reg(brw_vec8_grf(payload.source_w_reg, 0));
  3521.    this->wpos_w = vgrf(glsl_type::float_type);
  3522.    emit_math(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);
  3523.  
  3524.    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
  3525.       uint8_t reg = payload.barycentric_coord_reg[i];
  3526.       this->delta_xy[i] = fs_reg(brw_vec16_grf(reg, 0));
  3527.    }
  3528.  
  3529.    this->current_annotation = NULL;
  3530. }
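
/* A sketch of the single-add path above (the payload layout here is our
 * reading of the region math, not spelled out in this file): 0x11001010
 * unpacks to {0,1,0,1,0,0,1,1}, and the <1,4,0> region replicates each
 * subspan-origin word four times, so one add yields, per subspan, four X
 * centers followed by four Y centers:
 *
 *    x0 x0+1 x0 x0+1   y0 y0 y0+1 y0+1   x1 ...
 *
 * which FS_OPCODE_PIXEL_X/PIXEL_Y then deinterleave into pixel_x/pixel_y.
 */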
  3531.  
  3532. void
  3533. fs_visitor::setup_color_payload(fs_reg *dst, fs_reg color, unsigned components,
  3534.                                 unsigned exec_size, bool use_2nd_half)
  3535. {
  3536.    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
  3537.    fs_inst *inst;
  3538.  
  3539.    if (key->clamp_fragment_color) {
  3540.       fs_reg tmp = vgrf(glsl_type::vec4_type);
  3541.       assert(color.type == BRW_REGISTER_TYPE_F);
  3542.       for (unsigned i = 0; i < components; i++) {
  3543.          inst = emit(MOV(offset(tmp, i), offset(color, i)));
  3544.          inst->saturate = true;
  3545.       }
  3546.       color = tmp;
  3547.    }
  3548.  
  3549.    if (exec_size < dispatch_width) {
  3550.       unsigned half_idx = use_2nd_half ? 1 : 0;
  3551.       for (unsigned i = 0; i < components; i++)
  3552.          dst[i] = half(offset(color, i), half_idx);
  3553.    } else {
  3554.       for (unsigned i = 0; i < components; i++)
  3555.          dst[i] = offset(color, i);
  3556.    }
  3557. }
  3558.  
  3559. static enum brw_conditional_mod
  3560. cond_for_alpha_func(GLenum func)
  3561. {
  3562.    switch(func) {
  3563.       case GL_GREATER:
  3564.          return BRW_CONDITIONAL_G;
  3565.       case GL_GEQUAL:
  3566.          return BRW_CONDITIONAL_GE;
  3567.       case GL_LESS:
  3568.          return BRW_CONDITIONAL_L;
  3569.       case GL_LEQUAL:
  3570.          return BRW_CONDITIONAL_LE;
  3571.       case GL_EQUAL:
  3572.          return BRW_CONDITIONAL_EQ;
  3573.       case GL_NOTEQUAL:
  3574.          return BRW_CONDITIONAL_NEQ;
  3575.       default:
  3576.          unreachable("Not reached");
  3577.    }
  3578. }
  3579.  
  3580. /**
  3581.  * Alpha test support for when we compile it into the shader instead
  3582.  * of using the normal fixed-function alpha test.
  3583.  */
  3584. void
  3585. fs_visitor::emit_alpha_test()
  3586. {
  3587.    assert(stage == MESA_SHADER_FRAGMENT);
  3588.    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
  3589.    this->current_annotation = "Alpha test";
  3590.  
  3591.    fs_inst *cmp;
  3592.    if (key->alpha_test_func == GL_ALWAYS)
  3593.       return;
  3594.  
  3595.    if (key->alpha_test_func == GL_NEVER) {
  3596.       /* f0.1 = 0 */
  3597.       fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
  3598.                                       BRW_REGISTER_TYPE_UW));
  3599.       cmp = emit(CMP(reg_null_f, some_reg, some_reg,
  3600.                      BRW_CONDITIONAL_NEQ));
  3601.    } else {
  3602.       /* RT0 alpha */
  3603.       fs_reg color = offset(outputs[0], 3);
  3604.  
  3605.       /* f0.1 &= func(color, ref) */
  3606.       cmp = emit(CMP(reg_null_f, color, fs_reg(key->alpha_test_ref),
  3607.                      cond_for_alpha_func(key->alpha_test_func)));
  3608.    }
  3609.    cmp->predicate = BRW_PREDICATE_NORMAL;
  3610.    cmp->flag_subreg = 1;
  3611. }
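
/* A note on the predication above (a sketch of the intent): because the
 * CMP is itself predicated on f0.1, only channels whose f0.1 bit is still
 * set (i.e. not yet discarded) update their flag bit, which is what
 * implements the "f0.1 &= func(color, ref)" noted in the code; the
 * framebuffer write is later predicated on the same flag subregister.
 */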
  3612.  
  3613. fs_inst *
  3614. fs_visitor::emit_single_fb_write(fs_reg color0, fs_reg color1,
  3615.                                  fs_reg src0_alpha, unsigned components,
  3616.                                  unsigned exec_size, bool use_2nd_half)
  3617. {
  3618.    assert(stage == MESA_SHADER_FRAGMENT);
  3619.    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
  3620.    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
  3621.  
  3622.    this->current_annotation = "FB write header";
  3623.    int header_size = 2, payload_header_size;
  3624.  
  3625.    /* We can potentially have a message length of up to 15, so we have to set
  3626.     * base_mrf to either 0 or 1 in order to fit in m0..m15.
  3627.     */
  3628.    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 15);
  3629.    int length = 0;
  3630.  
  3631.    /* From the Sandy Bridge PRM, volume 4, page 198:
  3632.     *
  3633.     *     "Dispatched Pixel Enables. One bit per pixel indicating
  3634.     *      which pixels were originally enabled when the thread was
  3635.     *      dispatched. This field is only required for the end-of-
  3636.     *      thread message and on all dual-source messages."
  3637.     */
  3638.    if (devinfo->gen >= 6 &&
  3639.        (devinfo->is_haswell || devinfo->gen >= 8 || !prog_data->uses_kill) &&
  3640.        color1.file == BAD_FILE &&
  3641.        key->nr_color_regions == 1) {
  3642.       header_size = 0;
  3643.    }
  3644.  
  3645.    if (header_size != 0) {
  3646.       assert(header_size == 2);
  3647.       /* Allocate 2 registers for a header */
  3648.       length += 2;
  3649.    }
  3650.  
  3651.    if (payload.aa_dest_stencil_reg) {
  3652.       sources[length] = fs_reg(GRF, alloc.allocate(1));
  3653.       emit(MOV(sources[length],
  3654.                fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0))));
  3655.       length++;
  3656.    }
  3657.  
  3658.    prog_data->uses_omask =
  3659.       prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
  3660.    if (prog_data->uses_omask) {
  3661.       this->current_annotation = "FB write oMask";
  3662.       assert(this->sample_mask.file != BAD_FILE);
  3663.       /* Hand over gl_SampleMask. Only lower 16 bits are relevant.  Since
  3664.        * it's unsigned single words, one vgrf is always 16-wide.
  3665.        */
  3666.       sources[length] = fs_reg(GRF, alloc.allocate(1),
  3667.                                BRW_REGISTER_TYPE_UW, 16);
  3668.       emit(FS_OPCODE_SET_OMASK, sources[length], this->sample_mask);
  3669.       length++;
  3670.    }
  3671.  
  3672.    payload_header_size = length;
  3673.  
  3674.    if (color0.file == BAD_FILE) {
  3675.       /* Even if there are no color buffers enabled, we still need to send
  3676.        * alpha out the pipeline to our null renderbuffer to support
  3677.        * alpha-testing, alpha-to-coverage, and so on.
  3678.        */
  3679.       if (this->outputs[0].file != BAD_FILE)
  3680.          setup_color_payload(&sources[length + 3], offset(this->outputs[0], 3),
  3681.                              1, exec_size, false);
  3682.       length += 4;
  3683.    } else if (color1.file == BAD_FILE) {
  3684.       if (src0_alpha.file != BAD_FILE) {
  3685.          setup_color_payload(&sources[length], src0_alpha, 1, exec_size, false);
  3686.          length++;
  3687.       }
  3688.  
  3689.       setup_color_payload(&sources[length], color0, components,
  3690.                           exec_size, use_2nd_half);
  3691.       length += 4;
  3692.    } else {
  3693.       setup_color_payload(&sources[length], color0, components,
  3694.                           exec_size, use_2nd_half);
  3695.       length += 4;
  3696.       setup_color_payload(&sources[length], color1, components,
  3697.                           exec_size, use_2nd_half);
  3698.       length += 4;
  3699.    }
  3700.  
  3701.    if (source_depth_to_render_target) {
  3702.       if (devinfo->gen == 6) {
  3703.          /* For outputting oDepth on gen6, SIMD8 writes have to be
  3704.           * used.  This would require SIMD8 moves of each half to
  3705.           * message regs, kind of like pre-gen5 SIMD16 FB writes.
  3706.           * Just bail on doing so for now.
  3707.           */
  3708.          no16("Missing support for simd16 depth writes on gen6\n");
  3709.       }
  3710.  
  3711.       if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
  3712.          /* Hand over gl_FragDepth. */
  3713.          assert(this->frag_depth.file != BAD_FILE);
  3714.          sources[length] = this->frag_depth;
  3715.       } else {
  3716.          /* Pass through the payload depth. */
  3717.          sources[length] = fs_reg(brw_vec8_grf(payload.source_depth_reg, 0));
  3718.       }
  3719.       length++;
  3720.    }
  3721.  
  3722.    if (payload.dest_depth_reg)
  3723.       sources[length++] = fs_reg(brw_vec8_grf(payload.dest_depth_reg, 0));
  3724.  
  3725.    fs_inst *load;
  3726.    fs_inst *write;
  3727.    if (devinfo->gen >= 7) {
  3728.       /* Send from the GRF */
  3729.       fs_reg payload = fs_reg(GRF, -1, BRW_REGISTER_TYPE_F, exec_size);
  3730.       load = emit(LOAD_PAYLOAD(payload, sources, length, payload_header_size));
  3731.       payload.reg = alloc.allocate(load->regs_written);
  3732.       load->dst = payload;
  3733.       write = emit(FS_OPCODE_FB_WRITE, reg_undef, payload);
  3734.       write->base_mrf = -1;
  3735.    } else {
  3736.       /* Send from the MRF */
  3737.       load = emit(LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F, exec_size),
  3738.                                sources, length, payload_header_size));
  3739.  
  3740.       /* On pre-SNB, we have to interlace the color values.  LOAD_PAYLOAD
  3741.        * will do this for us if we just give it a COMPR4 destination.
  3742.        */
  3743.       if (brw->gen < 6 && exec_size == 16)
  3744.          load->dst.reg |= BRW_MRF_COMPR4;
  3745.  
  3746.       write = emit(FS_OPCODE_FB_WRITE);
  3747.       write->exec_size = exec_size;
  3748.       write->base_mrf = 1;
  3749.    }
  3750.  
  3751.    write->mlen = load->regs_written;
  3752.    write->header_size = header_size;
  3753.    if (prog_data->uses_kill) {
  3754.       write->predicate = BRW_PREDICATE_NORMAL;
  3755.       write->flag_subreg = 1;
  3756.    }
  3757.    return write;
  3758. }
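
/* Payload sketch for the simplest gen7+ case (SIMD8, one render target,
 * no header/stencil/oMask/depth): sources[] holds just the four color
 * components, so the send payload is
 *
 *    m0..m3:  R, G, B, A      (mlen == 4)
 *
 * and every optional feature handled above appends its registers in the
 * order the code builds sources[].
 */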
  3759.  
  3760. void
  3761. fs_visitor::emit_fb_writes()
  3762. {
  3763.    assert(stage == MESA_SHADER_FRAGMENT);
  3764.    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
  3765.    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
  3766.  
  3767.    fs_inst *inst = NULL;
  3768.    if (do_dual_src) {
  3769.       this->current_annotation = ralloc_asprintf(this->mem_ctx,
  3770.                                                  "FB dual-source write");
  3771.       inst = emit_single_fb_write(this->outputs[0], this->dual_src_output,
  3772.                                   reg_undef, 4, 8);
  3773.       inst->target = 0;
  3774.  
  3775.       /* SIMD16 dual source blending requires sending two SIMD8 dual source
  3776.        * messages, where each message contains color data for 8 pixels. Color
  3777.        * data for the first group of pixels is stored in the "lower" half of
  3778.        * the color registers, so in SIMD16, the previous message did:
  3779.        * m + 0: r0
  3780.        * m + 1: g0
  3781.        * m + 2: b0
  3782.        * m + 3: a0
  3783.        *
  3784.        * Here goes the second message, which packs color data for the
  3785.        * remaining 8 pixels. Color data for these pixels is stored in the
  3786.        * "upper" half of the color registers, so we need to do:
  3787.        * m + 0: r1
  3788.        * m + 1: g1
  3789.        * m + 2: b1
  3790.        * m + 3: a1
  3791.        */
  3792.       if (dispatch_width == 16) {
  3793.          inst = emit_single_fb_write(this->outputs[0], this->dual_src_output,
  3794.                                      reg_undef, 4, 8, true);
  3795.          inst->target = 0;
  3796.       }
  3797.  
  3798.       prog_data->dual_src_blend = true;
  3799.    } else {
  3800.       for (int target = 0; target < key->nr_color_regions; target++) {
  3801.          /* Skip over outputs that weren't written. */
  3802.          if (this->outputs[target].file == BAD_FILE)
  3803.             continue;
  3804.  
  3805.          this->current_annotation = ralloc_asprintf(this->mem_ctx,
  3806.                                                     "FB write target %d",
  3807.                                                     target);
  3808.          fs_reg src0_alpha;
  3809.          if (devinfo->gen >= 6 && key->replicate_alpha && target != 0)
  3810.             src0_alpha = offset(outputs[0], 3);
  3811.  
  3812.          inst = emit_single_fb_write(this->outputs[target], reg_undef,
  3813.                                      src0_alpha,
  3814.                                      this->output_components[target],
  3815.                                      dispatch_width);
  3816.          inst->target = target;
  3817.       }
  3818.    }
  3819.  
  3820.    if (inst == NULL) {
  3821.       /* Even if there are no color buffers enabled, we still need to send
  3822.        * alpha out the pipeline to our null renderbuffer to support
  3823.        * alpha-testing, alpha-to-coverage, and so on.
  3824.        */
  3825.       inst = emit_single_fb_write(reg_undef, reg_undef, reg_undef, 0,
  3826.                                   dispatch_width);
  3827.       inst->target = 0;
  3828.    }
  3829.  
  3830.    inst->eot = true;
  3831.    this->current_annotation = NULL;
  3832. }
  3833.  
  3834. void
  3835. fs_visitor::setup_uniform_clipplane_values()
  3836. {
  3837.    gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
  3838.    const struct brw_vue_prog_key *key =
  3839.       (const struct brw_vue_prog_key *) this->key;
  3840.  
  3841.    for (int i = 0; i < key->nr_userclip_plane_consts; i++) {
  3842.       this->userplane[i] = fs_reg(UNIFORM, uniforms);
  3843.       for (int j = 0; j < 4; ++j) {
  3844.          stage_prog_data->param[uniforms + j] =
  3845.             (gl_constant_value *) &clip_planes[i][j];
  3846.       }
  3847.       uniforms += 4;
  3848.    }
  3849. }
  3850.  
  3851. void fs_visitor::compute_clip_distance()
  3852. {
  3853.    struct brw_vue_prog_data *vue_prog_data =
  3854.       (struct brw_vue_prog_data *) prog_data;
  3855.    const struct brw_vue_prog_key *key =
  3856.       (const struct brw_vue_prog_key *) this->key;
  3857.  
  3858.    /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
  3859.     *
  3860.     *     "If a linked set of shaders forming the vertex stage contains no
  3861.     *     static write to gl_ClipVertex or gl_ClipDistance, but the
  3862.     *     application has requested clipping against user clip planes through
  3863.     *     the API, then the coordinate written to gl_Position is used for
  3864.     *     comparison against the user clip planes."
  3865.     *
  3866.     * This function is only called if the shader didn't write to
  3867.     * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
  3868.     * if the user wrote to it; otherwise we use gl_Position.
  3869.     */
  3870.  
  3871.    gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
  3872.    if (!(vue_prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX))
  3873.       clip_vertex = VARYING_SLOT_POS;
  3874.  
  3875.    /* If the clip vertex isn't written, skip this.  Typically this means
  3876.     * the GS will set up clipping. */
  3877.    if (outputs[clip_vertex].file == BAD_FILE)
  3878.       return;
  3879.  
  3880.    setup_uniform_clipplane_values();
  3881.  
  3882.    current_annotation = "user clip distances";
  3883.  
  3884.    this->outputs[VARYING_SLOT_CLIP_DIST0] = vgrf(glsl_type::vec4_type);
  3885.    this->outputs[VARYING_SLOT_CLIP_DIST1] = vgrf(glsl_type::vec4_type);
  3886.  
  3887.    for (int i = 0; i < key->nr_userclip_plane_consts; i++) {
  3888.       fs_reg u = userplane[i];
  3889.       fs_reg output = outputs[VARYING_SLOT_CLIP_DIST0 + i / 4];
  3890.       output.reg_offset = i & 3;
  3891.  
  3892.       emit(MUL(output, outputs[clip_vertex], u));
  3893.       for (int j = 1; j < 4; j++) {
  3894.          u.reg = userplane[i].reg + j;
  3895.          emit(MAD(output, output, offset(outputs[clip_vertex], j), u));
  3896.       }
  3897.    }
  3898. }
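
/* The loop above evaluates, for each enabled user clip plane i,
 *
 *    gl_ClipDistance[i] = dot(plane[i], clip_vertex)
 *                       = p.x * v.x + p.y * v.y + p.z * v.z + p.w * v.w
 *
 * as one MUL plus three MADs, writing component (i & 3) of the
 * CLIP_DIST0/CLIP_DIST1 output slots.
 */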
  3899.  
  3900. void
  3901. fs_visitor::emit_urb_writes()
  3902. {
  3903.    int slot, urb_offset, length;
  3904.    struct brw_vs_prog_data *vs_prog_data =
  3905.       (struct brw_vs_prog_data *) prog_data;
  3906.    const struct brw_vs_prog_key *key =
  3907.       (const struct brw_vs_prog_key *) this->key;
  3908.    const GLbitfield64 psiz_mask =
  3909.       VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PSIZ;
  3910.    const struct brw_vue_map *vue_map = &vs_prog_data->base.vue_map;
  3911.    bool flush;
  3912.    fs_reg sources[8];
  3913.  
  3914.    /* Lower legacy ff and ClipVertex clipping to clip distances */
  3915.    /* Lower legacy fixed-function and gl_ClipVertex clipping to clip distances */
  3916.       compute_clip_distance();
  3917.  
  3918.    /* If we don't have any valid slots to write, just do a minimal urb write
  3919.     * send to terminate the shader. */
  3920.    if (vue_map->slots_valid == 0) {
  3921.  
  3922.       fs_reg payload = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
  3923.       fs_inst *inst = emit(MOV(payload, fs_reg(retype(brw_vec8_grf(1, 0),
  3924.                                                       BRW_REGISTER_TYPE_UD))));
  3925.       inst->force_writemask_all = true;
  3926.  
  3927.       inst = emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
  3928.       inst->eot = true;
  3929.       inst->mlen = 1;
  3930.       inst->offset = 1;
  3931.       return;
  3932.    }
  3933.  
  3934.    length = 0;
  3935.    urb_offset = 0;
  3936.    flush = false;
  3937.    for (slot = 0; slot < vue_map->num_slots; slot++) {
  3938.       fs_reg reg, src, zero;
  3939.  
  3940.       int varying = vue_map->slot_to_varying[slot];
  3941.       switch (varying) {
  3942.       case VARYING_SLOT_PSIZ:
  3943.  
  3944.          /* The point size varying slot is the vue header and is always in the
  3945.           * vue map.  But often none of the special varyings that live there
  3946.           * are written and in that case we can skip writing to the vue
  3947.           * header, provided the corresponding state properly clamps the
  3948.           * values further down the pipeline. */
  3949.          if ((vue_map->slots_valid & psiz_mask) == 0) {
  3950.             assert(length == 0);
  3951.             urb_offset++;
  3952.             break;
  3953.          }
  3954.  
  3955.          zero = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
  3956.          emit(MOV(zero, fs_reg(0u)));
  3957.  
  3958.          sources[length++] = zero;
  3959.          if (vue_map->slots_valid & VARYING_BIT_LAYER)
  3960.             sources[length++] = this->outputs[VARYING_SLOT_LAYER];
  3961.          else
  3962.             sources[length++] = zero;
  3963.  
  3964.          if (vue_map->slots_valid & VARYING_BIT_VIEWPORT)
  3965.             sources[length++] = this->outputs[VARYING_SLOT_VIEWPORT];
  3966.          else
  3967.             sources[length++] = zero;
  3968.  
  3969.          if (vue_map->slots_valid & VARYING_BIT_PSIZ)
  3970.             sources[length++] = this->outputs[VARYING_SLOT_PSIZ];
  3971.          else
  3972.             sources[length++] = zero;
  3973.          break;
  3974.  
  3975.       case BRW_VARYING_SLOT_NDC:
  3976.       case VARYING_SLOT_EDGE:
  3977.          unreachable("unexpected scalar vs output");
  3978.          break;
  3979.  
  3980.       case BRW_VARYING_SLOT_PAD:
  3981.          break;
  3982.  
  3983.       default:
  3984.          /* gl_Position is always in the vue map, but isn't always written by
  3985.           * the shader.  Other varyings (clip distances) get added to the vue
  3986.           * map but don't always get written.  In those cases, the
  3987.           * corresponding this->outputs[] slot will be invalid and we can skip
  3988.           * the urb write for the varying.  If we've already queued up a vue
  3989.           * slot for writing, we flush a mlen 5 urb write; otherwise we just
  3990.           * advance the urb_offset.
  3991.           */
  3992.          if (this->outputs[varying].file == BAD_FILE) {
  3993.             if (length > 0)
  3994.                flush = true;
  3995.             else
  3996.                urb_offset++;
  3997.             break;
  3998.          }
  3999.  
  4000.          if ((varying == VARYING_SLOT_COL0 ||
  4001.               varying == VARYING_SLOT_COL1 ||
  4002.               varying == VARYING_SLOT_BFC0 ||
  4003.               varying == VARYING_SLOT_BFC1) &&
  4004.              key->clamp_vertex_color) {
  4005.             /* We need to clamp these guys, so do a saturating MOV into a
  4006.              * temp register and use that for the payload.
  4007.              */
  4008.             for (int i = 0; i < 4; i++) {
  4009.                reg = fs_reg(GRF, alloc.allocate(1), outputs[varying].type);
  4010.                src = offset(this->outputs[varying], i);
  4011.                fs_inst *inst = emit(MOV(reg, src));
  4012.                inst->saturate = true;
  4013.                sources[length++] = reg;
  4014.             }
  4015.          } else {
  4016.             for (int i = 0; i < 4; i++)
  4017.                sources[length++] = offset(this->outputs[varying], i);
  4018.          }
  4019.          break;
  4020.       }
  4021.  
  4022.       current_annotation = "URB write";
  4023.  
  4024.       /* If we've queued up 8 registers of payload (2 VUE slots), if this is
  4025.     * the last slot, or if we need to flush (see the BAD_FILE varying case
  4026.        * above), emit a URB write send now to flush out the data.
  4027.        */
  4028.       int last = slot == vue_map->num_slots - 1;
  4029.       if (length == 8 || last)
  4030.          flush = true;
  4031.       if (flush) {
  4032.          fs_reg *payload_sources = ralloc_array(mem_ctx, fs_reg, length + 1);
  4033.          fs_reg payload = fs_reg(GRF, alloc.allocate(length + 1),
  4034.                                  BRW_REGISTER_TYPE_F, dispatch_width);
  4035.  
  4036.          /* We need WE_all on the MOV for the message header (the URB handles)
  4037.           * so do a MOV to a dummy register and set force_writemask_all on the
  4038.           * MOV.  LOAD_PAYLOAD will preserve that.
  4039.           */
  4040.          fs_reg dummy = fs_reg(GRF, alloc.allocate(1),
  4041.                                BRW_REGISTER_TYPE_UD);
  4042.          fs_inst *inst = emit(MOV(dummy, fs_reg(retype(brw_vec8_grf(1, 0),
  4043.                                                        BRW_REGISTER_TYPE_UD))));
  4044.          inst->force_writemask_all = true;
  4045.          payload_sources[0] = dummy;
  4046.  
  4047.          memcpy(&payload_sources[1], sources, length * sizeof sources[0]);
  4048.          emit(LOAD_PAYLOAD(payload, payload_sources, length + 1, 1));
  4049.  
  4050.          inst = emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
  4051.          inst->eot = last;
  4052.          inst->mlen = length + 1;
  4053.          inst->offset = urb_offset;
  4054.          urb_offset = slot + 1;
  4055.          length = 0;
  4056.          flush = false;
  4057.       }
  4058.    }
  4059. }
  4060.  
  4061. void
  4062. fs_visitor::resolve_ud_negate(fs_reg *reg)
  4063. {
  4064.    if (reg->type != BRW_REGISTER_TYPE_UD ||
  4065.        !reg->negate)
  4066.       return;
  4067.  
  4068.    fs_reg temp = vgrf(glsl_type::uint_type);
  4069.    emit(MOV(temp, *reg));
  4070.    *reg = temp;
  4071. }
  4072.  
  4073. void
  4074. fs_visitor::emit_cs_terminate()
  4075. {
  4076.    assert(brw->gen >= 7);
  4077.  
  4078.    /* We are getting the thread ID from the compute shader header */
  4079.    assert(stage == MESA_SHADER_COMPUTE);
  4080.  
  4081.    /* We can't directly send from g0, since sends with EOT have to use
  4082.     * g112-127. So, copy it to a virtual register. The register allocator will
  4083.     * make sure it uses the appropriate register range.
  4084.     */
  4085.    struct brw_reg g0 = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD);
  4086.    fs_reg payload = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
  4087.    fs_inst *inst = emit(MOV(payload, g0));
  4088.    inst->force_writemask_all = true;
  4089.  
  4090.    /* Send a message to the thread spawner to terminate the thread. */
  4091.    inst = emit(CS_OPCODE_CS_TERMINATE, reg_undef, payload);
  4092.    inst->eot = true;
  4093. }
  4094.  
  4095. /**
  4096.  * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
  4097.  *
  4098.  * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
  4099.  * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
  4100.  */
  4101. void
  4102. fs_visitor::resolve_bool_comparison(ir_rvalue *rvalue, fs_reg *reg)
  4103. {
  4104.    assert(devinfo->gen <= 5);
  4105.  
  4106.    if (rvalue->type != glsl_type::bool_type)
  4107.       return;
  4108.  
  4109.    fs_reg and_result = vgrf(glsl_type::bool_type);
  4110.    fs_reg neg_result = vgrf(glsl_type::bool_type);
  4111.    emit(AND(and_result, *reg, fs_reg(1)));
  4112.    emit(MOV(neg_result, negate(and_result)));
  4113.    *reg = neg_result;
  4114. }
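
/* Worked example (a sketch): if a gen4-5 CMP leaves 0x7fffff01 in a "true"
 * channel, AND with 1 yields 0x00000001, and the MOV of its negation yields
 * -1 == 0xffffffff, the canonical true; a "false" channel's clear LSB
 * becomes 0 either way.
 */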
  4115.  
  4116. fs_visitor::fs_visitor(struct brw_context *brw,
  4117.                        void *mem_ctx,
  4118.                        gl_shader_stage stage,
  4119.                        const void *key,
  4120.                        struct brw_stage_prog_data *prog_data,
  4121.                        struct gl_shader_program *shader_prog,
  4122.                        struct gl_program *prog,
  4123.                        unsigned dispatch_width)
  4124.    : backend_visitor(brw, shader_prog, prog, prog_data, stage),
  4125.      reg_null_f(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_F)),
  4126.      reg_null_d(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_D)),
  4127.      reg_null_ud(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_UD)),
  4128.      key(key), prog_data(prog_data),
  4129.      dispatch_width(dispatch_width), promoted_constants(0)
  4130. {
  4131.    this->mem_ctx = mem_ctx;
  4132.  
  4133.    switch (stage) {
  4134.    case MESA_SHADER_FRAGMENT:
  4135.       key_tex = &((const brw_wm_prog_key *) key)->tex;
  4136.       break;
  4137.    case MESA_SHADER_VERTEX:
  4138.    case MESA_SHADER_GEOMETRY:
  4139.       key_tex = &((const brw_vue_prog_key *) key)->tex;
  4140.       break;
  4141.    case MESA_SHADER_COMPUTE:
  4142.       key_tex = &((const brw_cs_prog_key*) key)->tex;
  4143.       break;
  4144.    default:
  4145.       unreachable("unhandled shader stage");
  4146.    }
  4147.  
  4148.    this->failed = false;
  4149.    this->simd16_unsupported = false;
  4150.    this->no16_msg = NULL;
  4151.    this->variable_ht = hash_table_ctor(0,
  4152.                                        hash_table_pointer_hash,
  4153.                                        hash_table_pointer_compare);
  4154.  
  4155.    this->nir_locals = NULL;
  4156.    this->nir_globals = NULL;
  4157.  
  4158.    memset(&this->payload, 0, sizeof(this->payload));
  4159.    memset(this->outputs, 0, sizeof(this->outputs));
  4160.    memset(this->output_components, 0, sizeof(this->output_components));
  4161.    this->source_depth_to_render_target = false;
  4162.    this->runtime_check_aads_emit = false;
  4163.    this->first_non_payload_grf = 0;
  4164.    this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
  4165.  
  4166.    this->current_annotation = NULL;
  4167.    this->base_ir = NULL;
  4168.  
  4169.    this->virtual_grf_start = NULL;
  4170.    this->virtual_grf_end = NULL;
  4171.    this->live_intervals = NULL;
  4172.    this->regs_live_at_ip = NULL;
  4173.  
  4174.    this->uniforms = 0;
  4175.    this->last_scratch = 0;
  4176.    this->pull_constant_loc = NULL;
  4177.    this->push_constant_loc = NULL;
  4178.  
  4179.    this->spilled_any_registers = false;
  4180.    this->do_dual_src = false;
  4181.  
  4182.    if (dispatch_width == 8)
  4183.       this->param_size = rzalloc_array(mem_ctx, int, stage_prog_data->nr_params);
  4184. }
  4185.  
  4186. fs_visitor::~fs_visitor()
  4187. {
  4188.    hash_table_dtor(this->variable_ht);
  4189. }
  4190.