Subversion Repositories Kolibri OS

Rev

Blame | Last modification | View Log | RSS feed

  1. /*
  2.  * Copyright © 2011 Intel Corporation
  3.  *
  4.  * Permission is hereby granted, free of charge, to any person obtaining a
  5.  * copy of this software and associated documentation files (the "Software"),
  6.  * to deal in the Software without restriction, including without limitation
  7.  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  8.  * and/or sell copies of the Software, and to permit persons to whom the
  9.  * Software is furnished to do so, subject to the following conditions:
  10.  *
  11.  * The above copyright notice and this permission notice (including the next
  12.  * paragraph) shall be included in all copies or substantial portions of the
  13.  * Software.
  14.  *
  15.  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16.  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17.  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18.  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19.  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20.  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21.  * IN THE SOFTWARE.
  22.  */
  23.  
  24. #include "brw_vec4.h"
  25. #include "brw_fs.h"
  26. #include "brw_cfg.h"
  27. #include "brw_vs.h"
  28. #include "brw_nir.h"
  29. #include "brw_vec4_live_variables.h"
  30. #include "brw_dead_control_flow.h"
  31.  
  32. extern "C" {
  33. #include "main/macros.h"
  34. #include "main/shaderobj.h"
  35. #include "program/prog_print.h"
  36. #include "program/prog_parameter.h"
  37. }
  38.  
  39. #define MAX_INSTRUCTION (1 << 30)
  40.  
  41. using namespace brw;
  42.  
  43. namespace brw {
  44.  
/**
 * Reset this source register to a known-empty state.
 *
 * Zeroes every field, then marks the register as unallocated
 * (BAD_FILE) so accidental use of an unset register is detectable.
 */
void
src_reg::init()
{
   memset(this, 0, sizeof(*this));

   this->file = BAD_FILE;
}
  52.  
  53. src_reg::src_reg(register_file file, int reg, const glsl_type *type)
  54. {
  55.    init();
  56.  
  57.    this->file = file;
  58.    this->reg = reg;
  59.    if (type && (type->is_scalar() || type->is_vector() || type->is_matrix()))
  60.       this->swizzle = brw_swizzle_for_size(type->vector_elements);
  61.    else
  62.       this->swizzle = BRW_SWIZZLE_XYZW;
  63. }
  64.  
/** Generic unset register constructor: yields a BAD_FILE register. */
src_reg::src_reg()
{
   init();
}
  70.  
/** Constructs an immediate source holding the 32-bit float @p f. */
src_reg::src_reg(float f)
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->fixed_hw_reg.dw1.f = f;
}
  79.  
/** Constructs an immediate source holding the unsigned dword @p u. */
src_reg::src_reg(uint32_t u)
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->fixed_hw_reg.dw1.ud = u;
}
  88.  
/** Constructs an immediate source holding the signed dword @p i. */
src_reg::src_reg(int32_t i)
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->fixed_hw_reg.dw1.d = i;
}
  97.  
/**
 * Constructs a VF (vector-float) immediate from four pre-encoded
 * 8-bit VF bytes, copied verbatim into the immediate dword.
 */
src_reg::src_reg(uint8_t vf[4])
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_VF;
   /* The four VF bytes occupy exactly one dword of the immediate. */
   memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
}
  106.  
  107. src_reg::src_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
  108. {
  109.    init();
  110.  
  111.    this->file = IMM;
  112.    this->type = BRW_REGISTER_TYPE_VF;
  113.    this->fixed_hw_reg.dw1.ud = (vf0 <<  0) |
  114.                                (vf1 <<  8) |
  115.                                (vf2 << 16) |
  116.                                (vf3 << 24);
  117. }
  118.  
/** Wraps a fixed hardware register @p reg, adopting its type. */
src_reg::src_reg(struct brw_reg reg)
{
   init();

   this->file = HW_REG;
   this->fixed_hw_reg = reg;
   this->type = reg.type;
}
  127.  
/**
 * Converts a destination register into an equivalent source register.
 *
 * The destination's writemask becomes a source swizzle reading the
 * same channels (via brw_swizzle_for_mask).
 */
src_reg::src_reg(const dst_reg &reg)
{
   init();

   this->file = reg.file;
   this->reg = reg.reg;
   this->reg_offset = reg.reg_offset;
   this->type = reg.type;
   this->reladdr = reg.reladdr;
   this->fixed_hw_reg = reg.fixed_hw_reg;
   this->swizzle = brw_swizzle_for_mask(reg.writemask);
}
  140.  
/**
 * Reset this destination register to a known-empty state:
 * all fields zeroed, file unallocated, and a full XYZW writemask.
 */
void
dst_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->file = BAD_FILE;
   this->writemask = WRITEMASK_XYZW;
}
  148.  
/** Generic unset register constructor: yields a BAD_FILE destination. */
dst_reg::dst_reg()
{
   init();
}
  153.  
/** Constructs a destination in @p file at index @p reg (full writemask). */
dst_reg::dst_reg(register_file file, int reg)
{
   init();

   this->file = file;
   this->reg = reg;
}
  161.  
/**
 * Constructs a destination in @p file at index @p reg with an explicit
 * @p writemask, converting the GLSL @p type to its hardware register
 * type via brw_type_for_base_type().
 */
dst_reg::dst_reg(register_file file, int reg, const glsl_type *type,
                 unsigned writemask)
{
   init();

   this->file = file;
   this->reg = reg;
   this->type = brw_type_for_base_type(type);
   this->writemask = writemask;
}
  172.  
/** Wraps a fixed hardware register @p reg, adopting its type. */
dst_reg::dst_reg(struct brw_reg reg)
{
   init();

   this->file = HW_REG;
   this->fixed_hw_reg = reg;
   this->type = reg.type;
}
  181.  
/**
 * Converts a source register into an equivalent destination register.
 *
 * The source's swizzle becomes a writemask over the same channels
 * (via brw_mask_for_swizzle).
 */
dst_reg::dst_reg(const src_reg &reg)
{
   init();

   this->file = reg.file;
   this->reg = reg.reg;
   this->reg_offset = reg.reg_offset;
   this->type = reg.type;
   this->writemask = brw_mask_for_swizzle(reg.swizzle);
   this->reladdr = reg.reladdr;
   this->fixed_hw_reg = reg.fixed_hw_reg;
}
  194.  
  195. bool
  196. dst_reg::equals(const dst_reg &r) const
  197. {
  198.    return (file == r.file &&
  199.            reg == r.reg &&
  200.            reg_offset == r.reg_offset &&
  201.            type == r.type &&
  202.            negate == r.negate &&
  203.            abs == r.abs &&
  204.            writemask == r.writemask &&
  205.            (reladdr == r.reladdr ||
  206.             (reladdr && r.reladdr && reladdr->equals(*r.reladdr))) &&
  207.            memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
  208.                   sizeof(fixed_hw_reg)) == 0);
  209. }
  210.  
  211. bool
  212. vec4_instruction::is_send_from_grf()
  213. {
  214.    switch (opcode) {
  215.    case SHADER_OPCODE_SHADER_TIME_ADD:
  216.    case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
  217.    case SHADER_OPCODE_UNTYPED_ATOMIC:
  218.    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
  219.    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
  220.    case SHADER_OPCODE_TYPED_ATOMIC:
  221.    case SHADER_OPCODE_TYPED_SURFACE_READ:
  222.    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
  223.       return true;
  224.    default:
  225.       return false;
  226.    }
  227. }
  228.  
  229. unsigned
  230. vec4_instruction::regs_read(unsigned arg) const
  231. {
  232.    if (src[arg].file == BAD_FILE)
  233.       return 0;
  234.  
  235.    switch (opcode) {
  236.    case SHADER_OPCODE_SHADER_TIME_ADD:
  237.    case SHADER_OPCODE_UNTYPED_ATOMIC:
  238.    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
  239.    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
  240.    case SHADER_OPCODE_TYPED_ATOMIC:
  241.    case SHADER_OPCODE_TYPED_SURFACE_READ:
  242.    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
  243.       return arg == 0 ? mlen : 1;
  244.  
  245.    case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
  246.       return arg == 1 ? mlen : 1;
  247.  
  248.    default:
  249.       return 1;
  250.    }
  251. }
  252.  
  253. bool
  254. vec4_instruction::can_do_source_mods(const struct brw_device_info *devinfo)
  255. {
  256.    if (devinfo->gen == 6 && is_math())
  257.       return false;
  258.  
  259.    if (is_send_from_grf())
  260.       return false;
  261.  
  262.    if (!backend_instruction::can_do_source_mods())
  263.       return false;
  264.  
  265.    return true;
  266. }
  267.  
  268. /**
  269.  * Returns how many MRFs an opcode will write over.
  270.  *
  271.  * Note that this is not the 0 or 1 implied writes in an actual gen
  272.  * instruction -- the generate_* functions generate additional MOVs
  273.  * for setup.
  274.  */
int
vec4_visitor::implied_mrf_writes(vec4_instruction *inst)
{
   /* Zero-length messages and GRF-sourced sends need no MRF setup. */
   if (inst->mlen == 0 || inst->is_send_from_grf())
      return 0;

   switch (inst->opcode) {
   /* Single-operand math setup writes one MRF. */
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1;
   /* Two-operand math setup writes one MRF per operand. */
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
   case SHADER_OPCODE_POW:
      return 2;
   case VS_OPCODE_URB_WRITE:
      return 1;
   case VS_OPCODE_PULL_CONSTANT_LOAD:
      return 2;
   case SHADER_OPCODE_GEN4_SCRATCH_READ:
      return 2;
   /* Scratch writes carry an extra MRF for the data payload. */
   case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
      return 3;
   case GS_OPCODE_URB_WRITE:
   case GS_OPCODE_URB_WRITE_ALLOCATE:
   case GS_OPCODE_THREAD_END:
      return 0;
   case GS_OPCODE_FF_SYNC:
      return 1;
   case SHADER_OPCODE_SHADER_TIME_ADD:
      return 0;
   /* Texturing setup writes only the (optional) message header. */
   case SHADER_OPCODE_TEX:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_CMS:
   case SHADER_OPCODE_TXF_MCS:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_TG4:
   case SHADER_OPCODE_TG4_OFFSET:
      return inst->header_size;
   default:
      unreachable("not reached");
   }
}
  324.  
  325. bool
  326. src_reg::equals(const src_reg &r) const
  327. {
  328.    return (file == r.file &&
  329.            reg == r.reg &&
  330.            reg_offset == r.reg_offset &&
  331.            type == r.type &&
  332.            negate == r.negate &&
  333.            abs == r.abs &&
  334.            swizzle == r.swizzle &&
  335.            !reladdr && !r.reladdr &&
  336.            memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
  337.                   sizeof(fixed_hw_reg)) == 0);
  338. }
  339.  
/**
 * Coalesces a run of per-channel MOVs of float immediates into the
 * same destination into one MOV of a packed VF (vector-float)
 * immediate covering all four channels, then deletes the originals.
 */
bool
vec4_visitor::opt_vector_float()
{
   bool progress = false;

   /* Destination the current run of immediate MOVs is writing; -1 reg
    * means no run is in progress.
    */
   int last_reg = -1, last_reg_offset = -1;
   enum register_file last_reg_file = BAD_FILE;

   /* Writemask bits of the destination not yet covered by the run. */
   int remaining_channels = 0;
   /* Per-channel VF byte encodings collected so far. */
   uint8_t imm[4];
   int inst_count = 0;
   /* The partial MOVs that make up the current run. */
   vec4_instruction *imm_inst[4];

   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      /* Writing anywhere else restarts the run. */
      if (last_reg != inst->dst.reg ||
          last_reg_offset != inst->dst.reg_offset ||
          last_reg_file != inst->dst.file) {
         last_reg = inst->dst.reg;
         last_reg_offset = inst->dst.reg_offset;
         last_reg_file = inst->dst.file;
         remaining_channels = WRITEMASK_XYZW;

         inst_count = 0;
      }

      /* Only partial-writemask MOVs of immediates can join a run; a
       * full XYZW write needs no coalescing.
       */
      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->dst.writemask == WRITEMASK_XYZW ||
          inst->src[0].file != IMM)
         continue;

      /* brw_float_to_vf() returns -1 for floats with no 8-bit VF
       * encoding, which disqualifies this MOV.
       */
      int vf = brw_float_to_vf(inst->src[0].fixed_hw_reg.dw1.f);
      if (vf == -1)
         continue;

      /* Record the VF byte for every channel this MOV writes. */
      if ((inst->dst.writemask & WRITEMASK_X) != 0)
         imm[0] = vf;
      if ((inst->dst.writemask & WRITEMASK_Y) != 0)
         imm[1] = vf;
      if ((inst->dst.writemask & WRITEMASK_Z) != 0)
         imm[2] = vf;
      if ((inst->dst.writemask & WRITEMASK_W) != 0)
         imm[3] = vf;

      imm_inst[inst_count++] = inst;

      remaining_channels &= ~inst->dst.writemask;
      if (remaining_channels == 0) {
         /* All four channels covered: emit one full-width VF MOV after
          * the last partial MOV and remove the partials it replaces.
          */
         vec4_instruction *mov = MOV(inst->dst, imm);
         mov->dst.type = BRW_REGISTER_TYPE_F;
         mov->dst.writemask = WRITEMASK_XYZW;
         inst->insert_after(block, mov);
         last_reg = -1;

         for (int i = 0; i < inst_count; i++) {
            imm_inst[i]->remove(block);
         }
         progress = true;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}
  405.  
  406. /* Replaces unused channels of a swizzle with channels that are used.
  407.  *
  408.  * For instance, this pass transforms
  409.  *
  410.  *    mov vgrf4.yz, vgrf5.wxzy
  411.  *
  412.  * into
  413.  *
  414.  *    mov vgrf4.yz, vgrf5.xxzx
  415.  *
  416.  * This eliminates false uses of some channels, letting dead code elimination
  417.  * remove the instructions that wrote them.
  418.  */
  419. bool
  420. vec4_visitor::opt_reduce_swizzle()
  421. {
  422.    bool progress = false;
  423.  
  424.    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
  425.       if (inst->dst.file == BAD_FILE || inst->dst.file == HW_REG ||
  426.           inst->is_send_from_grf())
  427.          continue;
  428.  
  429.       unsigned swizzle;
  430.  
  431.       /* Determine which channels of the sources are read. */
  432.       switch (inst->opcode) {
  433.       case VEC4_OPCODE_PACK_BYTES:
  434.       case BRW_OPCODE_DP4:
  435.       case BRW_OPCODE_DPH: /* FINISHME: DPH reads only three channels of src0,
  436.                             *           but all four of src1.
  437.                             */
  438.          swizzle = brw_swizzle_for_size(4);
  439.          break;
  440.       case BRW_OPCODE_DP3:
  441.          swizzle = brw_swizzle_for_size(3);
  442.          break;
  443.       case BRW_OPCODE_DP2:
  444.          swizzle = brw_swizzle_for_size(2);
  445.          break;
  446.       default:
  447.          swizzle = brw_swizzle_for_mask(inst->dst.writemask);
  448.          break;
  449.       }
  450.  
  451.       /* Update sources' swizzles. */
  452.       for (int i = 0; i < 3; i++) {
  453.          if (inst->src[i].file != GRF &&
  454.              inst->src[i].file != ATTR &&
  455.              inst->src[i].file != UNIFORM)
  456.             continue;
  457.  
  458.          const unsigned new_swizzle =
  459.             brw_compose_swizzle(swizzle, inst->src[i].swizzle);
  460.          if (inst->src[i].swizzle != new_swizzle) {
  461.             inst->src[i].swizzle = new_swizzle;
  462.             progress = true;
  463.          }
  464.       }
  465.    }
  466.  
  467.    if (progress)
  468.       invalidate_live_intervals();
  469.  
  470.    return progress;
  471. }
  472.  
  473. void
  474. vec4_visitor::split_uniform_registers()
  475. {
  476.    /* Prior to this, uniforms have been in an array sized according to
  477.     * the number of vector uniforms present, sparsely filled (so an
  478.     * aggregate results in reg indices being skipped over).  Now we're
  479.     * going to cut those aggregates up so each .reg index is one
  480.     * vector.  The goal is to make elimination of unused uniform
  481.     * components easier later.
  482.     */
  483.    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
  484.       for (int i = 0 ; i < 3; i++) {
  485.          if (inst->src[i].file != UNIFORM)
  486.             continue;
  487.  
  488.          assert(!inst->src[i].reladdr);
  489.  
  490.          inst->src[i].reg += inst->src[i].reg_offset;
  491.          inst->src[i].reg_offset = 0;
  492.       }
  493.    }
  494.  
  495.    /* Update that everything is now vector-sized. */
  496.    for (int i = 0; i < this->uniforms; i++) {
  497.       this->uniform_size[i] = 1;
  498.    }
  499. }
  500.  
/**
 * Drops unused uniform vectors and packs short live ones together into
 * shared vec4 push-constant slots, updating both the param[] table and
 * every UNIFORM instruction source to the new locations.
 */
void
vec4_visitor::pack_uniform_registers()
{
   bool uniform_used[this->uniforms];
   int new_loc[this->uniforms];
   int new_chan[this->uniforms];

   memset(uniform_used, 0, sizeof(uniform_used));
   memset(new_loc, 0, sizeof(new_loc));
   memset(new_chan, 0, sizeof(new_chan));

   /* Find which uniform vectors are actually used by the program.  We
    * expect unused vector elements when we've moved array access out
    * to pull constants, and from some GLSL code generators like wine.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      for (int i = 0 ; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         uniform_used[inst->src[i].reg] = true;
      }
   }

   int new_uniform_count = 0;

   /* Now, figure out a packing of the live uniform vectors into our
    * push constants.
    */
   for (int src = 0; src < uniforms; src++) {
      assert(src < uniform_array_size);
      int size = this->uniform_vector_size[src];

      /* Dead uniforms just have their size zeroed out. */
      if (!uniform_used[src]) {
         this->uniform_vector_size[src] = 0;
         continue;
      }

      int dst;
      /* Find the lowest place we can slot this uniform in. */
      for (dst = 0; dst < src; dst++) {
         if (this->uniform_vector_size[dst] + size <= 4)
            break;
      }

      if (src == dst) {
         /* No earlier slot had room; the uniform keeps its slot, at
          * channel 0.
          */
         new_loc[src] = dst;
         new_chan[src] = 0;
      } else {
         /* Append this uniform's components after those already packed
          * into slot dst.
          */
         new_loc[src] = dst;
         new_chan[src] = this->uniform_vector_size[dst];

         /* Move the references to the data */
         for (int j = 0; j < size; j++) {
            stage_prog_data->param[dst * 4 + new_chan[src] + j] =
               stage_prog_data->param[src * 4 + j];
         }

         this->uniform_vector_size[dst] += size;
         this->uniform_vector_size[src] = 0;
      }

      new_uniform_count = MAX2(new_uniform_count, dst + 1);
   }

   this->uniforms = new_uniform_count;

   /* Now, update the instructions for our repacked uniforms. */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      for (int i = 0 ; i < 3; i++) {
         int src = inst->src[i].reg;

         if (inst->src[i].file != UNIFORM)
            continue;

         /* Adding new_chan to every swizzle component redirects each
          * read to the uniform's new start channel within the packed
          * slot -- presumably the swizzle fields can't carry past each
          * other here; TODO confirm channel offsets stay in range.
          */
         inst->src[i].reg = new_loc[src];
         inst->src[i].swizzle += BRW_SWIZZLE4(new_chan[src], new_chan[src],
                                              new_chan[src], new_chan[src]);
      }
   }
}
  582.  
  583. /**
  584.  * Does algebraic optimizations (0 * a = 0, 1 * a = a, a + 0 = a).
  585.  *
  586.  * While GLSL IR also performs this optimization, we end up with it in
  587.  * our instruction stream for a couple of reasons.  One is that we
  588.  * sometimes generate silly instructions, for example in array access
  589.  * where we'll generate "ADD offset, index, base" even if base is 0.
  590.  * The other is that GLSL IR's constant propagation doesn't track the
  591.  * components of aggregates, so some VS patterns (initialize matrix to
  592.  * 0, accumulate in vertex blending factors) end up breaking down to
  593.  * instructions involving 0.
  594.  */
bool
vec4_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         if (inst->src[0].file != IMM)
            break;

         /* Fold a saturate on an immediate MOV into the immediate
          * value itself, when brw_saturate_immediate() can do so.
          */
         if (inst->saturate) {
            if (inst->dst.type != inst->src[0].type)
               assert(!"unimplemented: saturate mixed types");

            if (brw_saturate_immediate(inst->dst.type,
                                       &inst->src[0].fixed_hw_reg)) {
               inst->saturate = false;
               progress = true;
            }
         }
         break;

      case VEC4_OPCODE_UNPACK_UNIFORM:
         /* Unpacking only applies to UNIFORM sources; anything else
          * degenerates to a plain MOV.
          */
         if (inst->src[0].file != UNIFORM) {
            inst->opcode = BRW_OPCODE_MOV;
            progress = true;
         }
         break;

      case BRW_OPCODE_ADD:
         /* a + 0 = a */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = src_reg();
            progress = true;
         }
         break;

      case BRW_OPCODE_MUL:
         /* a * 0 = 0: becomes a MOV of zero in the source's type. */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            switch (inst->src[0].type) {
            case BRW_REGISTER_TYPE_F:
               inst->src[0] = src_reg(0.0f);
               break;
            case BRW_REGISTER_TYPE_D:
               inst->src[0] = src_reg(0);
               break;
            case BRW_REGISTER_TYPE_UD:
               inst->src[0] = src_reg(0u);
               break;
            default:
               unreachable("not reached");
            }
            inst->src[1] = src_reg();
            progress = true;
         } else if (inst->src[1].is_one()) {
            /* a * 1 = a */
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = src_reg();
            progress = true;
         } else if (inst->src[1].is_negative_one()) {
            /* a * -1 = -a */
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0].negate = !inst->src[0].negate;
            inst->src[1] = src_reg();
            progress = true;
         }
         break;
      case BRW_OPCODE_CMP:
         /* cmp.ge -|a|, 0 holds exactly when a == 0; rewrite as
          * cmp.z a, 0 without the source modifiers.
          */
         if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
             inst->src[0].abs &&
             inst->src[0].negate &&
             inst->src[1].is_zero()) {
            inst->src[0].abs = false;
            inst->src[0].negate = false;
            inst->conditional_mod = BRW_CONDITIONAL_Z;
            progress = true;
            break;
         }
         break;
      case SHADER_OPCODE_RCP: {
         /* rcp(sqrt(a)) = rsq(a) when the rcp directly consumes the
          * preceding sqrt's result.
          * NOTE(review): inst->prev is dereferenced unconditionally; at
          * the head of the instruction list this reads the sentinel
          * rather than a real instruction -- confirm this cannot match
          * spuriously.
          */
         vec4_instruction *prev = (vec4_instruction *)inst->prev;
         if (prev->opcode == SHADER_OPCODE_SQRT) {
            if (inst->src[0].equals(src_reg(prev->dst))) {
               inst->opcode = SHADER_OPCODE_RSQ;
               inst->src[0] = prev->src[0];
               progress = true;
            }
         }
         break;
      }
      case SHADER_OPCODE_BROADCAST:
         /* Broadcasting an already-uniform value (or channel 0) is
          * just a MOV across all channels.
          */
         if (is_uniform(inst->src[0]) ||
             inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = src_reg();
            inst->force_writemask_all = true;
            progress = true;
         }
         break;

      default:
         break;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}
  705.  
  706. /**
  707.  * Only a limited number of hardware registers may be used for push
  708.  * constants, so this turns access to the overflowed constants into
  709.  * pull constants.
  710.  */
void
vec4_visitor::move_push_constants_to_pull_constants()
{
   /* Final pull-constant slot (in vec4s) per uniform; -1 = stays push. */
   int pull_constant_loc[this->uniforms];

   /* Only allow 32 registers (256 uniform components) as push constants,
    * which is the limit on gen6.
    *
    * If changing this value, note the limitation about total_regs in
    * brw_curbe.c.
    */
   int max_uniform_components = 32 * 8;
   if (this->uniforms * 4 <= max_uniform_components)
      return;

   /* Make some sort of choice as to which uniforms get sent to pull
    * constants.  We could potentially do something clever here like
    * look for the most infrequently used uniform vec4s, but leave
    * that for later.
    */
   for (int i = 0; i < this->uniforms * 4; i += 4) {
      pull_constant_loc[i / 4] = -1;

      /* Everything past the push limit overflows to pull constants. */
      if (i >= max_uniform_components) {
         const gl_constant_value **values = &stage_prog_data->param[i];

         /* Try to find an existing copy of this uniform in the pull
          * constants if it was part of an array access already.
          */
         for (unsigned int j = 0; j < stage_prog_data->nr_pull_params; j += 4) {
            int matches;

            for (matches = 0; matches < 4; matches++) {
               if (stage_prog_data->pull_param[j + matches] != values[matches])
                  break;
            }

            if (matches == 4) {
               pull_constant_loc[i / 4] = j / 4;
               break;
            }
         }

         /* No existing copy: append all four components to pull_param. */
         if (pull_constant_loc[i / 4] == -1) {
            assert(stage_prog_data->nr_pull_params % 4 == 0);
            pull_constant_loc[i / 4] = stage_prog_data->nr_pull_params / 4;

            for (int j = 0; j < 4; j++) {
               stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
                  values[j];
            }
         }
      }
   }

   /* Now actually rewrite usage of the things we've moved to pull
    * constants.
    */
   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      for (int i = 0 ; i < 3; i++) {
         if (inst->src[i].file != UNIFORM ||
             pull_constant_loc[inst->src[i].reg] == -1)
            continue;

         int uniform = inst->src[i].reg;

         /* Load the pull constant into a fresh temporary and make the
          * source read that temporary instead.
          */
         dst_reg temp = dst_reg(this, glsl_type::vec4_type);

         emit_pull_constant_load(block, inst, temp, inst->src[i],
                                 pull_constant_loc[uniform]);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }

   /* Repack push constants to remove the now-unused ones. */
   pack_uniform_registers();
}
  792.  
/* Conditions for which we want to avoid setting the dependency control bits */
bool
vec4_visitor::is_dep_ctrl_unsafe(const vec4_instruction *inst)
{
#define IS_DWORD(reg) \
   (reg.type == BRW_REGISTER_TYPE_UD || \
    reg.type == BRW_REGISTER_TYPE_D)

   /* "When source or destination datatype is 64b or operation is integer DWord
    * multiply, DepCtrl must not be used."
    * May apply to future SoCs as well.
    */
   if (devinfo->is_cherryview) {
      if (inst->opcode == BRW_OPCODE_MUL &&
         IS_DWORD(inst->src[0]) &&
         IS_DWORD(inst->src[1]))
         return true;
   }
#undef IS_DWORD

   /* NOTE(review): gen8+ also avoids DepCtrl around F32TO16; the
    * hardware rationale is not documented here -- confirm against the
    * relevant PRM erratum.
    */
   if (devinfo->gen >= 8) {
      if (inst->opcode == BRW_OPCODE_F32TO16)
         return true;
   }

   /*
    * mlen:
    * In the presence of send messages, totally interrupt dependency
    * control. They're long enough that the chance of dependency
    * control around them just doesn't matter.
    *
    * predicate:
    * From the Ivy Bridge PRM, volume 4 part 3.7, page 80:
    * When a sequence of NoDDChk and NoDDClr are used, the last instruction that
    * completes the scoreboard clear must have a non-zero execution mask. This
    * means, if any kind of predication can change the execution mask or channel
    * enable of the last instruction, the optimization must be avoided. This is
    * to avoid instructions being shot down the pipeline when no writes are
    * required.
    *
    * math:
    * Dependency control does not work well over math instructions.
    * NB: Discovered empirically
    */
   return (inst->mlen || inst->predicate || inst->is_math());
}
  839.  
  840. /**
  841.  * Sets the dependency control fields on instructions after register
  842.  * allocation and before the generator is run.
  843.  *
  844.  * When you have a sequence of instructions like:
  845.  *
  846.  * DP4 temp.x vertex uniform[0]
  847.  * DP4 temp.y vertex uniform[0]
  848.  * DP4 temp.z vertex uniform[0]
  849.  * DP4 temp.w vertex uniform[0]
  850.  *
  851.  * The hardware doesn't know that it can actually run the later instructions
  852.  * while the previous ones are in flight, producing stalls.  However, we have
  853.  * manual fields we can set in the instructions that let it do so.
  854.  */
  855. void
  856. vec4_visitor::opt_set_dependency_control()
  857. {
  858.    vec4_instruction *last_grf_write[BRW_MAX_GRF];
  859.    uint8_t grf_channels_written[BRW_MAX_GRF];
  860.    vec4_instruction *last_mrf_write[BRW_MAX_GRF];
  861.    uint8_t mrf_channels_written[BRW_MAX_GRF];
  862.  
  863.    assert(prog_data->total_grf ||
  864.           !"Must be called after register allocation");
  865.  
  866.    foreach_block (block, cfg) {
  867.       memset(last_grf_write, 0, sizeof(last_grf_write));
  868.       memset(last_mrf_write, 0, sizeof(last_mrf_write));
  869.  
  870.       foreach_inst_in_block (vec4_instruction, inst, block) {
  871.          /* If we read from a register that we were doing dependency control
  872.           * on, don't do dependency control across the read.
  873.           */
  874.          for (int i = 0; i < 3; i++) {
  875.             int reg = inst->src[i].reg + inst->src[i].reg_offset;
  876.             if (inst->src[i].file == GRF) {
  877.                last_grf_write[reg] = NULL;
  878.             } else if (inst->src[i].file == HW_REG) {
  879.                memset(last_grf_write, 0, sizeof(last_grf_write));
  880.                break;
  881.             }
  882.             assert(inst->src[i].file != MRF);
  883.          }
  884.  
  885.          if (is_dep_ctrl_unsafe(inst)) {
  886.             memset(last_grf_write, 0, sizeof(last_grf_write));
  887.             memset(last_mrf_write, 0, sizeof(last_mrf_write));
  888.             continue;
  889.          }
  890.  
  891.          /* Now, see if we can do dependency control for this instruction
  892.           * against a previous one writing to its destination.
  893.           */
  894.          int reg = inst->dst.reg + inst->dst.reg_offset;
  895.          if (inst->dst.file == GRF) {
  896.             if (last_grf_write[reg] &&
  897.                 !(inst->dst.writemask & grf_channels_written[reg])) {
  898.                last_grf_write[reg]->no_dd_clear = true;
  899.                inst->no_dd_check = true;
  900.             } else {
  901.                grf_channels_written[reg] = 0;
  902.             }
  903.  
  904.             last_grf_write[reg] = inst;
  905.             grf_channels_written[reg] |= inst->dst.writemask;
  906.          } else if (inst->dst.file == MRF) {
  907.             if (last_mrf_write[reg] &&
  908.                 !(inst->dst.writemask & mrf_channels_written[reg])) {
  909.                last_mrf_write[reg]->no_dd_clear = true;
  910.                inst->no_dd_check = true;
  911.             } else {
  912.                mrf_channels_written[reg] = 0;
  913.             }
  914.  
  915.             last_mrf_write[reg] = inst;
  916.             mrf_channels_written[reg] |= inst->dst.writemask;
  917.          } else if (inst->dst.reg == HW_REG) {
  918.             if (inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE)
  919.                memset(last_grf_write, 0, sizeof(last_grf_write));
  920.             if (inst->dst.fixed_hw_reg.file == BRW_MESSAGE_REGISTER_FILE)
  921.                memset(last_mrf_write, 0, sizeof(last_mrf_write));
  922.          }
  923.       }
  924.    }
  925. }
  926.  
  927. bool
  928. vec4_instruction::can_reswizzle(int dst_writemask,
  929.                                 int swizzle,
  930.                                 int swizzle_mask)
  931. {
  932.    /* If this instruction sets anything not referenced by swizzle, then we'd
  933.     * totally break it when we reswizzle.
  934.     */
  935.    if (dst.writemask & ~swizzle_mask)
  936.       return false;
  937.  
  938.    if (mlen > 0)
  939.       return false;
  940.  
  941.    return true;
  942. }
  943.  
  944. /**
  945.  * For any channels in the swizzle's source that were populated by this
  946.  * instruction, rewrite the instruction to put the appropriate result directly
  947.  * in those channels.
  948.  *
  949.  * e.g. for swizzle=yywx, MUL a.xy b c -> MUL a.yy_x b.yy z.yy_x
  950.  */
  951. void
  952. vec4_instruction::reswizzle(int dst_writemask, int swizzle)
  953. {
  954.    /* Destination write mask doesn't correspond to source swizzle for the dot
  955.     * product and pack_bytes instructions.
  956.     */
  957.    if (opcode != BRW_OPCODE_DP4 && opcode != BRW_OPCODE_DPH &&
  958.        opcode != BRW_OPCODE_DP3 && opcode != BRW_OPCODE_DP2 &&
  959.        opcode != VEC4_OPCODE_PACK_BYTES) {
  960.       for (int i = 0; i < 3; i++) {
  961.          if (src[i].file == BAD_FILE || src[i].file == IMM)
  962.             continue;
  963.  
  964.          src[i].swizzle = brw_compose_swizzle(swizzle, src[i].swizzle);
  965.       }
  966.    }
  967.  
  968.    /* Apply the specified swizzle and writemask to the original mask of
  969.     * written components.
  970.     */
  971.    dst.writemask = dst_writemask &
  972.                    brw_apply_swizzle_to_mask(swizzle, dst.writemask);
  973. }
  974.  
  975. /*
  976.  * Tries to reduce extra MOV instructions by taking temporary GRFs that get
  977.  * just written and then MOVed into another reg and making the original write
  978.  * of the GRF write directly to the final destination instead.
  979.  */
bool
vec4_visitor::opt_register_coalesce()
{
   bool progress = false;
   int next_ip = 0;

   calculate_live_intervals();

   foreach_block_and_inst_safe (block, vec4_instruction, inst, cfg) {
      int ip = next_ip;
      next_ip++;

      /* Only a plain, unpredicated MOV from a GRF with no source modifiers,
       * no reladdr, and matching types can be coalesced into its producer.
       */
      if (inst->opcode != BRW_OPCODE_MOV ||
          (inst->dst.file != GRF && inst->dst.file != MRF) ||
          inst->predicate ||
          inst->src[0].file != GRF ||
          inst->dst.type != inst->src[0].type ||
          inst->src[0].abs || inst->src[0].negate || inst->src[0].reladdr)
         continue;

      bool to_mrf = (inst->dst.file == MRF);

      /* Can't coalesce this GRF if someone else was going to
       * read it later.
       */
      if (var_range_end(var_from_reg(alloc, inst->src[0]), 4) > ip)
         continue;

      /* We need to check interference with the final destination between this
       * instruction and the earliest instruction involved in writing the GRF
       * we're eliminating.  To do that, keep track of which of our source
       * channels we've seen initialized.
       */
      const unsigned chans_needed =
         brw_apply_inv_swizzle_to_mask(inst->src[0].swizzle,
                                       inst->dst.writemask);
      unsigned chans_remaining = chans_needed;

      /* Now walk up the instruction stream trying to see if we can rewrite
       * everything writing to the temporary to write into the destination
       * instead.
       */
      vec4_instruction *_scan_inst = (vec4_instruction *)inst->prev;
      foreach_inst_in_block_reverse_starting_from(vec4_instruction, scan_inst,
                                                  inst, block) {
         _scan_inst = scan_inst;

         if (inst->src[0].in_range(scan_inst->dst, scan_inst->regs_written)) {
            /* Found something writing to the reg we want to coalesce away. */
            if (to_mrf) {
               /* SEND instructions can't have MRF as a destination. */
               if (scan_inst->mlen)
                  break;

               if (devinfo->gen == 6) {
                  /* gen6 math instructions must have the destination be
                   * GRF, so no compute-to-MRF for them.
                   */
                  if (scan_inst->is_math()) {
                     break;
                  }
               }
            }

            /* If we can't handle the swizzle, bail. */
            if (!scan_inst->can_reswizzle(inst->dst.writemask,
                                          inst->src[0].swizzle,
                                          chans_needed)) {
               break;
            }

            /* This doesn't handle coalescing of multiple registers. */
            if (scan_inst->regs_written > 1)
               break;

            /* Mark which channels we found unconditional writes for. */
            if (!scan_inst->predicate)
               chans_remaining &= ~scan_inst->dst.writemask;

            /* All source channels accounted for: the scan succeeded. */
            if (chans_remaining == 0)
               break;
         }

         /* You can't read from an MRF, so if someone else reads our MRF's
          * source GRF that we wanted to rewrite, that stops us.  If it's a
          * GRF we're trying to coalesce to, we don't actually handle
          * rewriting sources so bail in that case as well.
          */
         bool interfered = false;
         for (int i = 0; i < 3; i++) {
            if (inst->src[0].in_range(scan_inst->src[i],
                                      scan_inst->regs_read(i)))
               interfered = true;
         }
         if (interfered)
            break;

         /* If somebody else writes our destination here, we can't coalesce
          * before that.
          */
         if (inst->dst.in_range(scan_inst->dst, scan_inst->regs_written))
            break;

         /* Check for reads of the register we're trying to coalesce into.  We
          * can't go rewriting instructions above that to put some other value
          * in the register instead.
          */
         if (to_mrf && scan_inst->mlen > 0) {
            /* SEND payloads read MRFs implicitly via base_mrf..base_mrf+mlen. */
            if (inst->dst.reg >= scan_inst->base_mrf &&
                inst->dst.reg < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
         } else {
            for (int i = 0; i < 3; i++) {
               if (inst->dst.in_range(scan_inst->src[i],
                                      scan_inst->regs_read(i)))
                  interfered = true;
            }
            if (interfered)
               break;
         }
      }

      if (chans_remaining == 0) {
         /* If we've made it here, we have an MOV we want to coalesce out, and
          * a scan_inst pointing to the earliest instruction involved in
          * computing the value.  Now go rewrite the instruction stream
          * between the two.
          */
         vec4_instruction *scan_inst = _scan_inst;
         while (scan_inst != inst) {
            if (scan_inst->dst.file == GRF &&
                scan_inst->dst.reg == inst->src[0].reg &&
                scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
               /* Retarget the producer at the MOV's destination, folding in
                * the MOV's swizzle, writemask and saturate.
                */
               scan_inst->reswizzle(inst->dst.writemask,
                                    inst->src[0].swizzle);
               scan_inst->dst.file = inst->dst.file;
               scan_inst->dst.reg = inst->dst.reg;
               scan_inst->dst.reg_offset = inst->dst.reg_offset;
               scan_inst->saturate |= inst->saturate;
            }
            scan_inst = (vec4_instruction *)scan_inst->next;
         }
         inst->remove(block);
         progress = true;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}
  1133.  
  1134. /**
  1135.  * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
  1136.  * flow.  We could probably do better here with some form of divergence
  1137.  * analysis.
  1138.  */
  1139. bool
  1140. vec4_visitor::eliminate_find_live_channel()
  1141. {
  1142.    bool progress = false;
  1143.    unsigned depth = 0;
  1144.  
  1145.    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
  1146.       switch (inst->opcode) {
  1147.       case BRW_OPCODE_IF:
  1148.       case BRW_OPCODE_DO:
  1149.          depth++;
  1150.          break;
  1151.  
  1152.       case BRW_OPCODE_ENDIF:
  1153.       case BRW_OPCODE_WHILE:
  1154.          depth--;
  1155.          break;
  1156.  
  1157.       case SHADER_OPCODE_FIND_LIVE_CHANNEL:
  1158.          if (depth == 0) {
  1159.             inst->opcode = BRW_OPCODE_MOV;
  1160.             inst->src[0] = src_reg(0);
  1161.             inst->force_writemask_all = true;
  1162.             progress = true;
  1163.          }
  1164.          break;
  1165.  
  1166.       default:
  1167.          break;
  1168.       }
  1169.    }
  1170.  
  1171.    return progress;
  1172. }
  1173.  
  1174. /**
  1175.  * Splits virtual GRFs requesting more than one contiguous physical register.
  1176.  *
  1177.  * We initially create large virtual GRFs for temporary structures, arrays,
  1178.  * and matrices, so that the dereference visitor functions can add reg_offsets
  1179.  * to work their way down to the actual member being accessed.  But when it
  1180.  * comes to optimization, we'd like to treat each register as individual
  1181.  * storage if possible.
  1182.  *
  1183.  * So far, the only thing that might prevent splitting is a send message from
  1184.  * a GRF on IVB.
  1185.  */
void
vec4_visitor::split_virtual_grfs()
{
   int num_vars = this->alloc.count;
   int new_virtual_grf[num_vars];
   bool split_grf[num_vars];

   memset(new_virtual_grf, 0, sizeof(new_virtual_grf));

   /* Try to split anything > 0 sized. */
   for (int i = 0; i < num_vars; i++) {
      split_grf[i] = this->alloc.sizes[i] != 1;
   }

   /* Check that the instructions are compatible with the registers we're trying
    * to split.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      /* A multi-register write or read needs its registers contiguous, so
       * that vgrf must stay whole.
       */
      if (inst->dst.file == GRF && inst->regs_written > 1)
         split_grf[inst->dst.reg] = false;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF && inst->regs_read(i) > 1)
            split_grf[inst->src[i].reg] = false;
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (!split_grf[i])
         continue;

      /* The original vgrf keeps reg_offset 0; new_virtual_grf[i] is the
       * base of the contiguous run covering offsets 1..size-1.
       */
      new_virtual_grf[i] = alloc.allocate(1);
      for (unsigned j = 2; j < this->alloc.sizes[i]; j++) {
         unsigned reg = alloc.allocate(1);
         assert(reg == new_virtual_grf[i] + j - 1);
         (void) reg;
      }
      this->alloc.sizes[i] = 1;
   }

   /* Rewrite every reference with a nonzero reg_offset to the matching
    * single-register vgrf allocated above.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      if (inst->dst.file == GRF && split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF && split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   invalidate_live_intervals();
}
  1247.  
void
vec4_visitor::dump_instruction(backend_instruction *be_inst)
{
   /* Convenience overload: dump the instruction to stderr. */
   dump_instruction(be_inst, stderr);
}
  1253.  
/* Print a human-readable form of a single vec4 IR instruction to `file`:
 * predicate, opcode with modifiers, destination (with writemask and type),
 * then up to three sources (with swizzle and type).
 */
void
vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
{
   vec4_instruction *inst = (vec4_instruction *)be_inst;

   /* Predication prefix, e.g. "(+f0.0) " or "(-f0.1) " when inverted. */
   if (inst->predicate) {
      fprintf(file, "(%cf0.%d) ",
              inst->predicate_inverse ? '-' : '+',
              inst->flag_subreg);
   }

   fprintf(file, "%s", brw_instruction_name(inst->opcode));
   if (inst->saturate)
      fprintf(file, ".sat");
   if (inst->conditional_mod) {
      fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
      /* Print the flag register unless it is implied by the predicate or
       * by SEL/IF/WHILE semantics on gen5+.
       */
      if (!inst->predicate &&
          (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
                                inst->opcode != BRW_OPCODE_IF &&
                                inst->opcode != BRW_OPCODE_WHILE))) {
         fprintf(file, ".f0.%d", inst->flag_subreg);
      }
   }
   fprintf(file, " ");

   /* Destination operand. */
   switch (inst->dst.file) {
   case GRF:
      fprintf(file, "vgrf%d.%d", inst->dst.reg, inst->dst.reg_offset);
      break;
   case MRF:
      fprintf(file, "m%d", inst->dst.reg);
      break;
   case HW_REG:
      if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
         /* Well-known architecture registers get symbolic names. */
         switch (inst->dst.fixed_hw_reg.nr) {
         case BRW_ARF_NULL:
            fprintf(file, "null");
            break;
         case BRW_ARF_ADDRESS:
            fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
            break;
         case BRW_ARF_ACCUMULATOR:
            fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
            break;
         case BRW_ARF_FLAG:
            fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
                             inst->dst.fixed_hw_reg.subnr);
            break;
         default:
            fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
                               inst->dst.fixed_hw_reg.subnr);
            break;
         }
      } else {
         fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
      }
      if (inst->dst.fixed_hw_reg.subnr)
         fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
      break;
   case BAD_FILE:
      fprintf(file, "(null)");
      break;
   default:
      fprintf(file, "???");
      break;
   }
   /* Writemask suffix, omitted when all of xyzw are written. */
   if (inst->dst.writemask != WRITEMASK_XYZW) {
      fprintf(file, ".");
      if (inst->dst.writemask & 1)
         fprintf(file, "x");
      if (inst->dst.writemask & 2)
         fprintf(file, "y");
      if (inst->dst.writemask & 4)
         fprintf(file, "z");
      if (inst->dst.writemask & 8)
         fprintf(file, "w");
   }
   fprintf(file, ":%s", brw_reg_type_letters(inst->dst.type));

   if (inst->src[0].file != BAD_FILE)
      fprintf(file, ", ");

   /* Source operands, up to the first BAD_FILE. */
   for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
      if (inst->src[i].negate)
         fprintf(file, "-");
      if (inst->src[i].abs)
         fprintf(file, "|");
      switch (inst->src[i].file) {
      case GRF:
         fprintf(file, "vgrf%d", inst->src[i].reg);
         break;
      case ATTR:
         fprintf(file, "attr%d", inst->src[i].reg);
         break;
      case UNIFORM:
         fprintf(file, "u%d", inst->src[i].reg);
         break;
      case IMM:
         /* Immediates are printed per register type; VF packs four
          * 8-bit restricted floats into one dword.
          */
         switch (inst->src[i].type) {
         case BRW_REGISTER_TYPE_F:
            fprintf(file, "%fF", inst->src[i].fixed_hw_reg.dw1.f);
            break;
         case BRW_REGISTER_TYPE_D:
            fprintf(file, "%dD", inst->src[i].fixed_hw_reg.dw1.d);
            break;
         case BRW_REGISTER_TYPE_UD:
            fprintf(file, "%uU", inst->src[i].fixed_hw_reg.dw1.ud);
            break;
         case BRW_REGISTER_TYPE_VF:
            fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  0) & 0xff),
                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  8) & 0xff),
                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
            break;
         default:
            fprintf(file, "???");
            break;
         }
         break;
      case HW_REG:
         if (inst->src[i].fixed_hw_reg.negate)
            fprintf(file, "-");
         if (inst->src[i].fixed_hw_reg.abs)
            fprintf(file, "|");
         if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
            switch (inst->src[i].fixed_hw_reg.nr) {
            case BRW_ARF_NULL:
               fprintf(file, "null");
               break;
            case BRW_ARF_ADDRESS:
               fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
               break;
            case BRW_ARF_ACCUMULATOR:
               fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
               break;
            case BRW_ARF_FLAG:
               fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
                                inst->src[i].fixed_hw_reg.subnr);
               break;
            default:
               fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
                                  inst->src[i].fixed_hw_reg.subnr);
               break;
            }
         } else {
            fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
         }
         if (inst->src[i].fixed_hw_reg.subnr)
            fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
         if (inst->src[i].fixed_hw_reg.abs)
            fprintf(file, "|");
         break;
      case BAD_FILE:
         fprintf(file, "(null)");
         break;
      default:
         fprintf(file, "???");
         break;
      }

      /* Don't print .0; and only VGRFs have reg_offsets and sizes */
      if (inst->src[i].reg_offset != 0 &&
          inst->src[i].file == GRF &&
          alloc.sizes[inst->src[i].reg] != 1)
         fprintf(file, ".%d", inst->src[i].reg_offset);

      if (inst->src[i].file != IMM) {
         static const char *chans[4] = {"x", "y", "z", "w"};
         fprintf(file, ".");
         for (int c = 0; c < 4; c++) {
            fprintf(file, "%s", chans[BRW_GET_SWZ(inst->src[i].swizzle, c)]);
         }
      }

      if (inst->src[i].abs)
         fprintf(file, "|");

      if (inst->src[i].file != IMM) {
         fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
      }

      if (i < 2 && inst->src[i + 1].file != BAD_FILE)
         fprintf(file, ", ");
   }

   fprintf(file, "\n");
}
  1442.  
  1443.  
  1444. static inline struct brw_reg
  1445. attribute_to_hw_reg(int attr, bool interleaved)
  1446. {
  1447.    if (interleaved)
  1448.       return stride(brw_vec4_grf(attr / 2, (attr % 2) * 4), 0, 4, 1);
  1449.    else
  1450.       return brw_vec8_grf(attr, 0);
  1451. }
  1452.  
  1453.  
  1454. /**
  1455.  * Replace each register of type ATTR in this->instructions with a reference
  1456.  * to a fixed HW register.
  1457.  *
  1458.  * If interleaved is true, then each attribute takes up half a register, with
  1459.  * register N containing attribute 2*N in its first half and attribute 2*N+1
  1460.  * in its second half (this corresponds to the payload setup used by geometry
  1461.  * shaders in "single" or "dual instanced" dispatch mode).  If interleaved is
  1462.  * false, then each attribute takes up a whole register, with register N
  1463.  * containing attribute N (this corresponds to the payload setup used by
  1464.  * vertex shaders, and by geometry shaders in "dual object" dispatch mode).
  1465.  */
void
vec4_visitor::lower_attributes_to_hw_regs(const int *attribute_map,
                                          bool interleaved)
{
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      /* We have to support ATTR as a destination for GL_FIXED fixup. */
      if (inst->dst.file == ATTR) {
         int grf = attribute_map[inst->dst.reg + inst->dst.reg_offset];

         /* All attributes used in the shader need to have been assigned a
          * hardware register by the caller
          */
         assert(grf != 0);

         struct brw_reg reg = attribute_to_hw_reg(grf, interleaved);
         /* Preserve the IR destination's type and writemask on the fixed reg. */
         reg.type = inst->dst.type;
         reg.dw1.bits.writemask = inst->dst.writemask;

         inst->dst.file = HW_REG;
         inst->dst.fixed_hw_reg = reg;
      }

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != ATTR)
            continue;

         int grf = attribute_map[inst->src[i].reg + inst->src[i].reg_offset];

         /* All attributes used in the shader need to have been assigned a
          * hardware register by the caller
          */
         assert(grf != 0);

         struct brw_reg reg = attribute_to_hw_reg(grf, interleaved);
         /* Carry the IR source's swizzle, type, and modifiers over. */
         reg.dw1.bits.swizzle = inst->src[i].swizzle;
         reg.type = inst->src[i].type;
         if (inst->src[i].abs)
            reg = brw_abs(reg);
         if (inst->src[i].negate)
            reg = negate(reg);

         inst->src[i].file = HW_REG;
         inst->src[i].fixed_hw_reg = reg;
      }
   }
}
  1512.  
  1513. int
  1514. vec4_vs_visitor::setup_attributes(int payload_reg)
  1515. {
  1516.    int nr_attributes;
  1517.    int attribute_map[VERT_ATTRIB_MAX + 1];
  1518.    memset(attribute_map, 0, sizeof(attribute_map));
  1519.  
  1520.    nr_attributes = 0;
  1521.    for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
  1522.       if (vs_prog_data->inputs_read & BITFIELD64_BIT(i)) {
  1523.          attribute_map[i] = payload_reg + nr_attributes;
  1524.          nr_attributes++;
  1525.       }
  1526.    }
  1527.  
  1528.    /* VertexID is stored by the VF as the last vertex element, but we
  1529.     * don't represent it with a flag in inputs_read, so we call it
  1530.     * VERT_ATTRIB_MAX.
  1531.     */
  1532.    if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid) {
  1533.       attribute_map[VERT_ATTRIB_MAX] = payload_reg + nr_attributes;
  1534.       nr_attributes++;
  1535.    }
  1536.  
  1537.    lower_attributes_to_hw_regs(attribute_map, false /* interleaved */);
  1538.  
  1539.    /* The BSpec says we always have to read at least one thing from
  1540.     * the VF, and it appears that the hardware wedges otherwise.
  1541.     */
  1542.    if (nr_attributes == 0)
  1543.       nr_attributes = 1;
  1544.  
  1545.    prog_data->urb_read_length = (nr_attributes + 1) / 2;
  1546.  
  1547.    unsigned vue_entries =
  1548.       MAX2(nr_attributes, prog_data->vue_map.num_slots);
  1549.  
  1550.    if (devinfo->gen == 6)
  1551.       prog_data->urb_entry_size = ALIGN(vue_entries, 8) / 8;
  1552.    else
  1553.       prog_data->urb_entry_size = ALIGN(vue_entries, 4) / 4;
  1554.  
  1555.    return payload_reg + nr_attributes;
  1556. }
  1557.  
int
vec4_visitor::setup_uniforms(int reg)
{
   prog_data->base.dispatch_grf_start_reg = reg;

   /* The pre-gen6 VS requires that some push constants get loaded no
    * matter what, or the GPU would hang.
    */
   if (devinfo->gen < 6 && this->uniforms == 0) {
      assert(this->uniforms < this->uniform_array_size);
      this->uniform_vector_size[this->uniforms] = 1;

      /* Push a single dummy vec4 of zeros so the CURB read is nonzero. */
      stage_prog_data->param =
         reralloc(NULL, stage_prog_data->param, const gl_constant_value *, 4);
      for (unsigned int i = 0; i < 4; i++) {
         unsigned int slot = this->uniforms * 4 + i;
         static gl_constant_value zero = { 0.0 };
         stage_prog_data->param[slot] = &zero;
      }

      this->uniforms++;
      reg++;
   } else {
      /* Two vec4 uniforms are packed per payload register. */
      reg += ALIGN(uniforms, 2) / 2;
   }

   /* Each uniform occupies four param slots (one per component). */
   stage_prog_data->nr_params = this->uniforms * 4;

   prog_data->base.curb_read_length =
      reg - prog_data->base.dispatch_grf_start_reg;

   return reg;
}
  1591.  
  1592. void
  1593. vec4_vs_visitor::setup_payload(void)
  1594. {
  1595.    int reg = 0;
  1596.  
  1597.    /* The payload always contains important data in g0, which contains
  1598.     * the URB handles that are passed on to the URB write at the end
  1599.     * of the thread.  So, we always start push constants at g1.
  1600.     */
  1601.    reg++;
  1602.  
  1603.    reg = setup_uniforms(reg);
  1604.  
  1605.    reg = setup_attributes(reg);
  1606.  
  1607.    this->first_non_payload_grf = reg;
  1608. }
  1609.  
void
vec4_visitor::assign_binding_table_offsets()
{
   /* Use the common binding-table layout, starting at offset 0. */
   assign_common_binding_table_offsets(0);
}
  1615.  
/* Read the TIMESTAMP architecture register into a fresh uvec4 temporary
 * and return it as a source.  gen7+ only (see assert).
 */
src_reg
vec4_visitor::get_timestamp()
{
   assert(devinfo->gen >= 7);

   src_reg ts = src_reg(brw_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                BRW_ARF_TIMESTAMP,
                                0,
                                0,
                                0,
                                BRW_REGISTER_TYPE_UD,
                                BRW_VERTICAL_STRIDE_0,
                                BRW_WIDTH_4,
                                BRW_HORIZONTAL_STRIDE_4,
                                BRW_SWIZZLE_XYZW,
                                WRITEMASK_XYZW));

   dst_reg dst = dst_reg(this, glsl_type::uvec4_type);

   vec4_instruction *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;

   return src_reg(dst);
}
  1643.  
void
vec4_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   /* Snapshot the timestamp; emit_shader_time_end() reads it again and
    * records the delta.
    */
   shader_start_time = get_timestamp();
}
  1650.  
void
vec4_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";
   src_reg shader_end_time = get_timestamp();


   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   src_reg reset_end = shader_end_time;
   /* Field 2 (swizzle z) of the timestamp read is tested for the reset bit. */
   reset_end.swizzle = BRW_SWIZZLE_ZZZZ;
   vec4_instruction *test = emit(AND(dst_null_d(), reset_end, src_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;

   emit(IF(BRW_PREDICATE_NORMAL));

   /* Take the current timestamp and get the delta. */
   shader_start_time.negate = true;
   dst_reg diff = dst_reg(this, glsl_type::uint_type);
   emit(ADD(diff, shader_start_time, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, src_reg(diff), src_reg(-2u)));

   /* Accumulate the delta, or record a reset if the timestamp wrapped. */
   emit_shader_time_write(st_base, src_reg(diff));
   emit_shader_time_write(st_written, src_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(st_reset, src_reg(1u));
   emit(BRW_OPCODE_ENDIF);
}
  1685.  
/* Emit a SHADER_TIME_ADD with a two-register payload: the buffer offset
 * for `type` in the first vec4, `value` in the second.
 */
void
vec4_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                     src_reg value)
{
   int shader_time_index =
      brw_get_shader_time_index(brw, shader_prog, prog, type);

   dst_reg dst =
      dst_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type, 2));

   /* offset and time alias the two halves of the payload array. */
   dst_reg offset = dst;
   dst_reg time = dst;
   time.reg_offset++;

   offset.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(offset, src_reg(shader_time_index * SHADER_TIME_STRIDE)));

   time.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(time, src_reg(value)));

   vec4_instruction *inst =
      emit(SHADER_OPCODE_SHADER_TIME_ADD, dst_reg(), src_reg(dst));
   inst->mlen = 2;
}
  1710.  
/* Top-level driver for vec4 VS compilation: emits IR, runs the optimization
 * loop, allocates registers, and schedules.  Returns false on failure (with
 * fail_msg set by whichever stage failed).
 */
bool
vec4_visitor::run()
{
   /* Snapshot the parameter count so we can assert below that compilation
    * didn't append state parameters (which could realloc ParameterValues).
    */
   sanity_param_count = prog->Parameters->NumParameters;

   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      emit_shader_time_begin();

   assign_binding_table_offsets();

   emit_prolog();

   /* Generate VS IR for main().  (the visitor only descends into
    * functions called "main").
    */
   if (shader) {
      visit_instructions(shader->base.ir);
   } else {
      /* No GLSL shader: this is an ARB_vertex_program / fixed-function path. */
      emit_program_code();
   }
   base_ir = NULL;

   if (key->userclip_active && !prog->UsesClipDistanceOut)
      setup_uniform_clipplane_values();

   emit_thread_end();

   calculate_cfg();

   /* Before any optimization, push array accesses out to scratch
    * space where we need them to be.  This pass may allocate new
    * virtual GRFs, so we want to do it early.  It also makes sure
    * that we have reladdr computations available for CSE, since we'll
    * often do repeated subexpressions for those.
    */
   if (shader) {
      move_grf_array_access_to_scratch();
      move_uniform_array_access_to_pull_constants();
   } else {
      /* The ARB_vertex_program frontend emits pull constant loads directly
       * rather than using reladdr, so we don't need to walk through all the
       * instructions looking for things to move.  There isn't anything.
       *
       * We do still need to split things to vec4 size.
       */
      split_uniform_registers();
   }
   pack_uniform_registers();
   move_push_constants_to_pull_constants();
   split_virtual_grfs();

/* Run one optimization pass, accumulate its progress into `progress`, and
 * (under INTEL_DEBUG=optimizer) dump the instruction list after any pass
 * that made progress.  Relies on `progress`, `iteration` and `pass_num`
 * being in scope at the use site; evaluates to the pass's own progress.
 */
#define OPT(pass, args...) ({                                          \
      pass_num++;                                                      \
      bool this_progress = pass(args);                                 \
                                                                       \
      if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {  \
         char filename[64];                                            \
         snprintf(filename, 64, "%s-%04d-%02d-%02d-" #pass,            \
                  stage_abbrev, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
                                                                       \
         backend_visitor::dump_instructions(filename);                 \
      }                                                                \
                                                                       \
      progress = progress || this_progress;                            \
      this_progress;                                                   \
   })


   if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
      char filename[64];
      snprintf(filename, 64, "%s-%04d-00-start",
               stage_abbrev, shader_prog ? shader_prog->Name : 0);

      backend_visitor::dump_instructions(filename);
   }

   bool progress;
   int iteration = 0;
   int pass_num = 0;
   /* Iterate the main pass pipeline until a full round makes no progress. */
   do {
      progress = false;
      pass_num = 0;
      iteration++;

      OPT(opt_reduce_swizzle);
      OPT(dead_code_eliminate);
      OPT(dead_control_flow_eliminate, this);
      OPT(opt_copy_propagation);
      OPT(opt_cse);
      OPT(opt_algebraic);
      OPT(opt_register_coalesce);
      OPT(eliminate_find_live_channel);
   } while (progress);

   pass_num = 0;

   /* Vector-float packing runs once, after the fixpoint loop; clean up
    * anything it exposed.
    */
   if (OPT(opt_vector_float)) {
      OPT(opt_cse);
      OPT(opt_copy_propagation, false);
      OPT(opt_copy_propagation, true);
      OPT(dead_code_eliminate);
   }

   if (failed)
      return false;

   setup_payload();

   /* Dead branch kept for debugging: flip to true to force-spill every
    * spillable GRF and exercise the spilling code.
    */
   if (false) {
      /* Debug of register spilling: Go spill everything. */
      const int grf_count = alloc.count;
      float spill_costs[alloc.count];
      bool no_spill[alloc.count];
      evaluate_spill_costs(spill_costs, no_spill);
      for (int i = 0; i < grf_count; i++) {
         if (no_spill[i])
            continue;
         spill_reg(i);
      }
   }

   /* reg_allocate() returns false when it had to spill; retry until it
    * succeeds or compilation fails outright.
    */
   while (!reg_allocate()) {
      if (failed)
         return false;
   }

   opt_schedule_instructions();

   opt_set_dependency_control();

   /* If any state parameters were appended, then ParameterValues could have
    * been realloced, in which case the driver uniform storage set up by
    * _mesa_associate_uniform_storage() would point to freed memory.  Make
    * sure that didn't happen.
    */
   assert(sanity_param_count == prog->Parameters->NumParameters);

   return !failed;
}
  1850.  
  1851. } /* namespace brw */
  1852.  
  1853. extern "C" {
  1854.  
  1855. /**
  1856.  * Compile a vertex shader.
  1857.  *
  1858.  * Returns the final assembly and the program's size.
  1859.  */
const unsigned *
brw_vs_emit(struct brw_context *brw,
            struct gl_shader_program *prog,
            struct brw_vs_compile *c,
            struct brw_vs_prog_data *prog_data,
            void *mem_ctx,
            unsigned *final_assembly_size)
{
   bool start_busy = false;
   double start_time = 0;
   const unsigned *assembly = NULL;
   bool use_nir =
      brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].NirOptions != NULL;

   /* Under perf_debug, record whether the GPU was busy and when we started,
    * so we can report compiles that stalled the GPU.
    */
   if (unlikely(brw->perf_debug)) {
      start_busy = (brw->batch.last_bo &&
                    drm_intel_bo_busy(brw->batch.last_bo));
      start_time = get_time();
   }

   /* prog is NULL for ARB_vertex_program / fixed-function, in which case
    * there is no linked GLSL shader.
    */
   struct brw_shader *shader = NULL;
   if (prog)
      shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_VERTEX];

   /* NOTE(review): when prog is NULL, shader is NULL here and &shader->base
    * is a null-based member address — presumably brw_dump_ir ignores it on
    * that path; confirm.
    */
   if (unlikely(INTEL_DEBUG & DEBUG_VS))
      brw_dump_ir("vertex", prog, &shader->base, &c->vp->program.Base);

   if (use_nir && !c->vp->program.Base.nir) {
      /* Normally we generate NIR in LinkShader() or ProgramStringNotify(), but
       * Mesa's fixed-function vertex program handling doesn't notify the driver
       * at all.  Just do it here, at the last minute, even though it's lame.
       */
      assert(c->vp->program.Base.Id == 0 && prog == NULL);
      c->vp->program.Base.nir =
         brw_create_nir(brw, NULL, &c->vp->program.Base, MESA_SHADER_VERTEX);
   }

   /* Preferred path: compile the VS as SIMD8 scalar code via the FS backend
    * when the hardware/driver supports it.  On success this sets `assembly`,
    * which skips the vec4 fallback below.
    */
   if (brw->scalar_vs && (prog || use_nir)) {
      fs_visitor v(brw, mem_ctx, MESA_SHADER_VERTEX, &c->key,
                   &prog_data->base.base, prog, &c->vp->program.Base, 8);
      if (!v.run_vs()) {
         if (prog) {
            prog->LinkStatus = false;
            ralloc_strcat(&prog->InfoLog, v.fail_msg);
         }

         _mesa_problem(NULL, "Failed to compile vertex shader: %s\n",
                       v.fail_msg);

         return NULL;
      }

      fs_generator g(brw, mem_ctx, (void *) &c->key, &prog_data->base.base,
                     &c->vp->program.Base, v.promoted_constants,
                     v.runtime_check_aads_emit, "VS");
      if (INTEL_DEBUG & DEBUG_VS) {
         char *name;
         if (prog) {
            name = ralloc_asprintf(mem_ctx, "%s vertex shader %d",
                                   prog->Label ? prog->Label : "unnamed",
                                   prog->Name);
         } else {
            name = ralloc_asprintf(mem_ctx, "vertex program %d",
                                   c->vp->program.Base.Id);
         }
         g.enable_debug(name);
      }
      g.generate_code(v.cfg, 8);
      assembly = g.get_assembly(final_assembly_size);

      prog_data->base.simd8 = true;
      c->base.last_scratch = v.last_scratch;
   }

   /* Fallback: the classic vec4 backend. */
   if (!assembly) {
      vec4_vs_visitor v(brw, c, prog_data, prog, mem_ctx);
      if (!v.run()) {
         if (prog) {
            prog->LinkStatus = false;
            ralloc_strcat(&prog->InfoLog, v.fail_msg);
         }

         _mesa_problem(NULL, "Failed to compile vertex shader: %s\n",
                       v.fail_msg);

         return NULL;
      }

      vec4_generator g(brw, prog, &c->vp->program.Base, &prog_data->base,
                       mem_ctx, INTEL_DEBUG & DEBUG_VS, "vertex", "VS");
      assembly = g.generate_assembly(v.cfg, final_assembly_size);
   }

   /* Perf reporting: flag recompiles and compiles that stalled a busy GPU. */
   if (unlikely(brw->perf_debug) && shader) {
      if (shader->compiled_once) {
         brw_vs_debug_recompile(brw, prog, &c->key);
      }
      if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
         perf_debug("VS compile took %.03f ms and stalled the GPU\n",
                    (get_time() - start_time) * 1000);
      }
      shader->compiled_once = true;
   }

   return assembly;
}
  1966.  
  1967.  
  1968. void
  1969. brw_vue_setup_prog_key_for_precompile(struct gl_context *ctx,
  1970.                                       struct brw_vue_prog_key *key,
  1971.                                       GLuint id, struct gl_program *prog)
  1972. {
  1973.    struct brw_context *brw = brw_context(ctx);
  1974.    key->program_string_id = id;
  1975.  
  1976.    brw_setup_tex_for_precompile(brw, &key->tex, prog);
  1977. }
  1978.  
  1979. } /* extern "C" */
  1980.