/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */

#include <sys/types.h>

#include "util/hash_table.h"
#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/fbobject.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "util/register_allocate.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
#include "brw_fs.h"
#include "brw_cfg.h"
#include "brw_dead_control_flow.h"
#include "main/uniforms.h"
#include "brw_fs_live_variables.h"
#include "glsl/glsl_types.h"
#include "program/sampler.h"

void
fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
              const fs_reg *src, unsigned sources)
{
   memset(this, 0, sizeof(*this));

   this->src = new fs_reg[MAX2(sources, 3)];
   for (unsigned i = 0; i < sources; i++)
      this->src[i] = src[i];

   this->opcode = opcode;
   this->dst = dst;
   this->sources = sources;
   this->exec_size = exec_size;

   assert(dst.file != IMM && dst.file != UNIFORM);

   /* If exec_size == 0, try to guess it from the registers.  Since all
    * manner of things may use hardware registers, we first try to guess
    * based on GRF registers.  If this fails, we will go ahead and take the
    * width from the destination register.
    */
   if (this->exec_size == 0) {
      if (dst.file == GRF) {
         this->exec_size = dst.width;
      } else {
         for (unsigned i = 0; i < sources; ++i) {
            if (src[i].file != GRF && src[i].file != ATTR)
               continue;

            if (this->exec_size <= 1)
               this->exec_size = src[i].width;
            assert(src[i].width == 1 || src[i].width == this->exec_size);
         }
      }

      if (this->exec_size == 0 && dst.file != BAD_FILE)
         this->exec_size = dst.width;
   }
   assert(this->exec_size != 0);

   this->conditional_mod = BRW_CONDITIONAL_NONE;

   /* This will be the case for almost all instructions. */
   switch (dst.file) {
   case GRF:
   case HW_REG:
   case MRF:
   case ATTR:
      this->regs_written =
         DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
      break;
   case BAD_FILE:
      this->regs_written = 0;
      break;
   case IMM:
   case UNIFORM:
      unreachable("Invalid destination register file");
   default:
      unreachable("Invalid register file");
   }

   this->writes_accumulator = false;
}

fs_inst::fs_inst()
{
   init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
{
   init(opcode, exec_size, reg_undef, NULL, 0);
}

fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
{
   init(opcode, 0, dst, NULL, 0);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
                 const fs_reg &src0)
{
   const fs_reg src[1] = { src0 };
   init(opcode, exec_size, dst, src, 1);
}

fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
{
   const fs_reg src[1] = { src0 };
   init(opcode, 0, dst, src, 1);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
                 const fs_reg &src0, const fs_reg &src1)
{
   const fs_reg src[2] = { src0, src1 };
   init(opcode, exec_size, dst, src, 2);
}

fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
                 const fs_reg &src1)
{
   const fs_reg src[2] = { src0, src1 };
   init(opcode, 0, dst, src, 2);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
                 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
{
   const fs_reg src[3] = { src0, src1, src2 };
   init(opcode, exec_size, dst, src, 3);
}

fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
                 const fs_reg &src1, const fs_reg &src2)
{
   const fs_reg src[3] = { src0, src1, src2 };
   init(opcode, 0, dst, src, 3);
}

fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
                 const fs_reg src[], unsigned sources)
{
   init(opcode, 0, dst, src, sources);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
                 const fs_reg src[], unsigned sources)
{
   init(opcode, exec_width, dst, src, sources);
}

fs_inst::fs_inst(const fs_inst &that)
{
   memcpy(this, &that, sizeof(that));

   this->src = new fs_reg[MAX2(that.sources, 3)];

   for (unsigned i = 0; i < that.sources; i++)
      this->src[i] = that.src[i];
}

fs_inst::~fs_inst()
{
   delete[] this->src;
}

void
fs_inst::resize_sources(uint8_t num_sources)
{
   if (this->sources != num_sources) {
      fs_reg *src = new fs_reg[MAX2(num_sources, 3)];

      for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
         src[i] = this->src[i];

      delete[] this->src;
      this->src = src;
      this->sources = num_sources;
   }
}

#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(const fs_reg &dst, const fs_reg &src0)                \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
                  const fs_reg &src1)                                   \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

#define ALU2_ACC(op)                                                    \
   fs_inst *                                                            \
   fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
                  const fs_reg &src1)                                   \
   {                                                                    \
      fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
      inst->writes_accumulator = true;                                  \
      return inst;                                                      \
   }

#define ALU3(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
                  const fs_reg &src1, const fs_reg &src2)               \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
   }
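
/* Each invocation below defines one trivial emit helper; for example,
 * ALU2(ADD) expands (roughly) to:
 *
 *    fs_inst *
 *    fs_visitor::ADD(const fs_reg &dst, const fs_reg &src0,
 *                    const fs_reg &src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 */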

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2_ACC(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU3(MAD)
ALU2_ACC(ADDC)
ALU2_ACC(SUBB)
ALU2(SEL)
ALU2(MAC)

/** Gen4 predicated IF. */
fs_inst *
fs_visitor::IF(enum brw_predicate predicate)
{
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
   inst->predicate = predicate;
   return inst;
}

/** Gen6 IF with embedded comparison. */
fs_inst *
fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
               enum brw_conditional_mod condition)
{
   assert(devinfo->gen == 6);
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
                                        reg_null_d, src0, src1);
   inst->conditional_mod = condition;
   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
                enum brw_conditional_mod condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    *
    * The destination type doesn't matter on newer generations, so we set the
    * type to match src0 so we can compact the instruction.
    */
   dst.type = src0.type;
   if (dst.file == HW_REG)
      dst.fixed_hw_reg.type = dst.type;

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}
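
/* A typical use, sketched for illustration (the registers a and b are
 * hypothetical): to set the flag register when a < b, a visitor would emit
 *
 *    emit(CMP(reg_null_d, a, b, BRW_CONDITIONAL_L));
 *
 * and then predicate a following instruction on BRW_PREDICATE_NORMAL.
 */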

fs_inst *
fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources,
                         int header_size)
{
   assert(dst.width % 8 == 0);
   fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst.width,
                                        dst, src, sources);
   inst->header_size = header_size;

   for (int i = 0; i < header_size; i++)
      assert(src[i].file != GRF || src[i].width * type_sz(src[i].type) == 32);
   inst->regs_written = header_size;

   for (int i = header_size; i < sources; ++i)
      assert(src[i].file != GRF || src[i].width == dst.width);
   inst->regs_written += (sources - header_size) * (dst.width / 8);

   return inst;
}
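
/* Worked example (hypothetical numbers): with one header source and two
 * SIMD16 payload sources (dst.width == 16), regs_written comes out to
 * 1 + (3 - 1) * (16 / 8) = 5 registers.
 */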

exec_list
fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
                                       const fs_reg &surf_index,
                                       const fs_reg &varying_offset,
                                       uint32_t const_offset)
{
   exec_list instructions;
   fs_inst *inst;

   /* We have our constant surface use a pitch of 4 bytes, so our index can
    * be any component of a vector, and then we load 4 contiguous
    * components starting from that.
    *
    * We break down the const_offset into a portion added to the variable
    * offset and a portion done using reg_offset, which means that if you
    * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
    * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
    * CSE can later notice that those loads are all the same and eliminate
    * the redundant ones.
    */
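   /* For instance (a hypothetical value), const_offset == 10 splits into a
    * vec4-aligned part (10 & ~3 == 8) that is added into vec4_offset below,
    * and a component part (10 & 3 == 2) that is applied via offset() on the
    * result at the end of this function.
    */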
   fs_reg vec4_offset = vgrf(glsl_type::int_type);
   instructions.push_tail(ADD(vec4_offset,
                              varying_offset, fs_reg(const_offset & ~3)));

   int scale = 1;
   if (devinfo->gen == 4 && dst.width == 8) {
      /* Pre-gen5, we can either use a SIMD8 message that requires (header,
       * u, v, r) as parameters, or we can just use the SIMD16 message
       * consisting of (header, u).  We choose the second, at the cost of a
       * longer return length.
       */
      scale = 2;
   }

   enum opcode op;
   if (devinfo->gen >= 7)
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
   else
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;

   assert(dst.width % 8 == 0);
   int regs_written = 4 * (dst.width / 8) * scale;
   fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
                               dst.type, dst.width);
   inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
   inst->regs_written = regs_written;
   instructions.push_tail(inst);

   if (devinfo->gen < 7) {
      inst->base_mrf = 13;
      inst->header_size = 1;
      if (devinfo->gen == 4)
         inst->mlen = 3;
      else
         inst->mlen = 1 + dispatch_width / 8;
   }

   fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
   instructions.push_tail(MOV(dst, result));

   return instructions;
}

/**
 * A helper for MOV generation for fixing up broken hardware SEND dependency
 * handling.
 */
fs_inst *
fs_visitor::DEP_RESOLVE_MOV(int grf)
{
   fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));

   inst->ir = NULL;
   inst->annotation = "send dependency resolve";

   /* The caller always wants this MOV uncompressed, both to emit the minimal
    * extra dependencies and to avoid having to deal with aligning its regs
    * to 2.
    */
   inst->exec_size = 8;

   return inst;
}

bool
fs_inst::equals(fs_inst *inst) const
{
   return (opcode == inst->opcode &&
           dst.equals(inst->dst) &&
           src[0].equals(inst->src[0]) &&
           src[1].equals(inst->src[1]) &&
           src[2].equals(inst->src[2]) &&
           saturate == inst->saturate &&
           predicate == inst->predicate &&
           conditional_mod == inst->conditional_mod &&
           mlen == inst->mlen &&
           base_mrf == inst->base_mrf &&
           target == inst->target &&
           eot == inst->eot &&
           header_size == inst->header_size &&
           shadow_compare == inst->shadow_compare &&
           exec_size == inst->exec_size &&
           offset == inst->offset);
}

bool
fs_inst::overwrites_reg(const fs_reg &reg) const
{
   return reg.in_range(dst, regs_written);
}

bool
fs_inst::is_send_from_grf() const
{
   switch (opcode) {
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
   case SHADER_OPCODE_SHADER_TIME_ADD:
   case FS_OPCODE_INTERPOLATE_AT_CENTROID:
   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
   case SHADER_OPCODE_UNTYPED_ATOMIC:
   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
   case SHADER_OPCODE_TYPED_ATOMIC:
   case SHADER_OPCODE_TYPED_SURFACE_READ:
   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
   case SHADER_OPCODE_URB_WRITE_SIMD8:
      return true;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
      return src[1].file == GRF;
   case FS_OPCODE_FB_WRITE:
      return src[0].file == GRF;
   default:
      if (is_tex())
         return src[0].file == GRF;

      return false;
   }
}

bool
fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
{
   if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
      return false;

   fs_reg reg = this->src[0];
   if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0)
      return false;

   if (grf_alloc.sizes[reg.reg] != this->regs_written)
      return false;

   for (int i = 0; i < this->sources; i++) {
      reg.type = this->src[i].type;
      reg.width = this->src[i].width;
      if (!this->src[i].equals(reg))
         return false;
      reg = ::offset(reg, 1);
   }

   return true;
}

bool
fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
{
   if (devinfo->gen == 6 && is_math())
      return false;

   if (is_send_from_grf())
      return false;

   if (!backend_instruction::can_do_source_mods())
      return false;

   return true;
}

bool
fs_inst::has_side_effects() const
{
   return this->eot || backend_instruction::has_side_effects();
}

void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   stride = 1;
}

/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

/** Immediate value constructor. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->fixed_hw_reg.dw1.f = f;
   this->width = 1;
}

/** Immediate value constructor. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->fixed_hw_reg.dw1.d = i;
   this->width = 1;
}

/** Immediate value constructor. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->fixed_hw_reg.dw1.ud = u;
   this->width = 1;
}

/** Vector float immediate value constructor. */
fs_reg::fs_reg(uint8_t vf[4])
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_VF;
   memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
}

/** Vector float immediate value constructor. */
fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_VF;
   this->fixed_hw_reg.dw1.ud = (vf0 <<  0) |
                               (vf1 <<  8) |
                               (vf2 << 16) |
                               (vf3 << 24);
}

/** Fixed brw_reg. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
   this->width = 1 << fixed_hw_reg.width;
}

bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           subreg_offset == r.subreg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           !reladdr && !r.reladdr &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
           width == r.width &&
           stride == r.stride);
}

fs_reg &
fs_reg::set_smear(unsigned subreg)
{
   assert(file != HW_REG && file != IMM);
   subreg_offset = subreg * type_sz(type);
   stride = 0;
   return *this;
}
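
/* For example (illustrative only): on a UD register, set_smear(2) sets
 * subreg_offset to 2 * 4 == 8 bytes and stride to 0, so every channel reads
 * dword 2 of the register -- this is how get_timestamp() below picks a
 * single field out of the timestamp register.
 */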

bool
fs_reg::is_contiguous() const
{
   return stride == 1;
}

int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   case GLSL_TYPE_ATOMIC_UINT:
      return 0;
   case GLSL_TYPE_IMAGE:
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_INTERFACE:
   case GLSL_TYPE_DOUBLE:
      unreachable("not reached");
   }

   return 0;
}
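
/* A few illustrative results of the recursion above: a float is 1 slot, a
 * vec4 is 4, a mat3 is 9 (components() == columns * rows), and
 * "vec4 a[20]" is 4 * 20 == 80.
 */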

/**
 * Create a MOV to read the timestamp register.
 *
 * The caller is responsible for emitting the MOV.  The return value is
 * the destination of the MOV, with extra parameters set.
 */
fs_reg
fs_visitor::get_timestamp(fs_inst **out_mov)
{
   assert(devinfo->gen >= 7);

   fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);

   fs_inst *mov = MOV(dst, ts);
   /* We want to read the 3 fields we care about even if the corresponding
    * channels aren't enabled in the dispatch.
    */
   mov->force_writemask_all = true;

   /* The caller wants the low 32 bits of the timestamp.  Since it's running
    * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes.  It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    *
    * The caller could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
   dst.set_smear(0);

   *out_mov = mov;
   return dst;
}

void
fs_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   fs_inst *mov;
   shader_start_time = get_timestamp(&mov);
   emit(mov);
}

void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   enum shader_time_shader_type type, written_type, reset_type;
   switch (stage) {
   case MESA_SHADER_VERTEX:
      type = ST_VS;
      written_type = ST_VS_WRITTEN;
      reset_type = ST_VS_RESET;
      break;
   case MESA_SHADER_GEOMETRY:
      type = ST_GS;
      written_type = ST_GS_WRITTEN;
      reset_type = ST_GS_RESET;
      break;
   case MESA_SHADER_FRAGMENT:
      if (dispatch_width == 8) {
         type = ST_FS8;
         written_type = ST_FS8_WRITTEN;
         reset_type = ST_FS8_RESET;
      } else {
         assert(dispatch_width == 16);
         type = ST_FS16;
         written_type = ST_FS16_WRITTEN;
         reset_type = ST_FS16_RESET;
      }
      break;
   case MESA_SHADER_COMPUTE:
      type = ST_CS;
      written_type = ST_CS_WRITTEN;
      reset_type = ST_CS_RESET;
      break;
   default:
      unreachable("fs_visitor::emit_shader_time_end missing code");
   }

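   /* Roughly, the code inserted below amounts to:
    *
    *    end_time = timestamp
    *    if ((end_time.reset_field & 1) == 0) {
    *       diff = end_time - shader_start_time - 2
    *       SHADER_TIME_ADD(type, diff)
    *       SHADER_TIME_ADD(written_type, 1)
    *    } else {
    *       SHADER_TIME_ADD(reset_type, 1)
    *    }
    *
    * (pseudocode only; the actual instructions follow.)
    */
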
   /* Insert our code just before the final SEND with EOT. */
   exec_node *end = this->instructions.get_tail();
   assert(end && ((fs_inst *) end)->eot);

   fs_inst *tm_read;
   fs_reg shader_end_time = get_timestamp(&tm_read);
   end->insert_before(tm_read);

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.set_smear(2);
   fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   test->force_writemask_all = true;
   end->insert_before(test);
   end->insert_before(IF(BRW_PREDICATE_NORMAL));

   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
   diff.set_smear(0);
   fs_inst *add = ADD(diff, start, shader_end_time);
   add->force_writemask_all = true;
   end->insert_before(add);

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so we can ignore it when trying to
    * determine the time taken by single instructions.
    */
   add = ADD(diff, diff, fs_reg(-2u));
   add->force_writemask_all = true;
   end->insert_before(add);

   end->insert_before(SHADER_TIME_ADD(type, diff));
   end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
   end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
   end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
   end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
}

fs_inst *
fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
{
   int shader_time_index =
      brw_get_shader_time_index(brw, shader_prog, prog, type);
   fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);

   fs_reg payload;
   if (dispatch_width == 8)
      payload = vgrf(glsl_type::uvec2_type);
   else
      payload = vgrf(glsl_type::uint_type);

   return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
                               fs_reg(), payload, offset, value);
}

void
fs_visitor::vfail(const char *format, va_list va)
{
   char *msg;

   if (failed)
      return;

   failed = true;

   msg = ralloc_vasprintf(mem_ctx, format, va);
   msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);

   this->fail_msg = msg;

   if (debug_enabled) {
      fprintf(stderr, "%s", msg);
   }
}

void
fs_visitor::fail(const char *format, ...)
{
   va_list va;

   va_start(va, format);
   vfail(format, va);
   va_end(va);
}

/**
 * Mark this program as impossible to compile in SIMD16 mode.
 *
 * During the SIMD8 compile (which happens first), we can detect and flag
 * things that are unsupported in SIMD16 mode, so the compiler can skip
 * the SIMD16 compile altogether.
 *
 * During a SIMD16 compile (if one happens anyway), this just calls fail().
 */
void
fs_visitor::no16(const char *format, ...)
{
   va_list va;

   va_start(va, format);

   if (dispatch_width == 16) {
      vfail(format, va);
   } else {
      simd16_unsupported = true;

      if (brw->perf_debug) {
         if (no16_msg)
            ralloc_vasprintf_append(&no16_msg, format, va);
         else
            no16_msg = ralloc_vasprintf(mem_ctx, format, va);
      }
   }

   va_end(va);
}

fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
{
   return emit(new(mem_ctx) fs_inst(opcode, dst));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
{
   return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
                 const fs_reg &src1)
{
   return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
                 const fs_reg &src1, const fs_reg &src2)
{
   return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
                 fs_reg src[], int sources)
{
   return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
}

/**
 * Returns true if the instruction has a flag that means it won't
 * update an entire destination register.
 *
 * For example, dead code elimination and live variable analysis want to know
 * when a write to a variable screens off any preceding values that were in
 * it.
 */
bool
fs_inst::is_partial_write() const
{
   return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
           (this->dst.width * type_sz(this->dst.type)) < 32 ||
           !this->dst.is_contiguous());
}
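
/* Illustrative cases: a predicated MOV is partial (unlike SEL, the unwritten
 * channels keep their old values), and so is a SIMD8 write of a 16-bit type
 * (8 * 2 == 16 bytes, half of a 32-byte register).
 */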

int
fs_inst::regs_read(int arg) const
{
   if (arg == 0) {
      if (is_tex() && src[0].file == GRF)
         return mlen;

      switch (opcode) {
      case FS_OPCODE_FB_WRITE:
      case SHADER_OPCODE_URB_WRITE_SIMD8:
      case SHADER_OPCODE_UNTYPED_ATOMIC:
      case SHADER_OPCODE_UNTYPED_SURFACE_READ:
      case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
      case SHADER_OPCODE_TYPED_ATOMIC:
      case SHADER_OPCODE_TYPED_SURFACE_READ:
      case SHADER_OPCODE_TYPED_SURFACE_WRITE:
      case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
         return mlen;
      case FS_OPCODE_LINTERP:
         return exec_size / 4;
      default:
         break;
      }
   }

   switch (src[arg].file) {
   case BAD_FILE:
   case UNIFORM:
   case IMM:
      return 1;
   case GRF:
   case HW_REG:
      if (src[arg].stride == 0) {
         return 1;
      } else {
         int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
         return (size + 31) / 32;
      }
   case MRF:
      unreachable("MRF registers are not allowed as sources");
   default:
      unreachable("Invalid register file");
   }
}

bool
fs_inst::reads_flag() const
{
   return predicate;
}

bool
fs_inst::writes_flag() const
{
   return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
                               opcode != BRW_OPCODE_IF &&
                               opcode != BRW_OPCODE_WHILE)) ||
          opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   if (inst->base_mrf == -1)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_CMS:
   case SHADER_OPCODE_TXF_MCS:
   case SHADER_OPCODE_TG4:
   case SHADER_OPCODE_TG4_OFFSET:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_LOD:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case SHADER_OPCODE_GEN4_SCRATCH_READ:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      return inst->mlen;
   case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
      return inst->mlen;
   case SHADER_OPCODE_UNTYPED_ATOMIC:
   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
   case SHADER_OPCODE_TYPED_ATOMIC:
   case SHADER_OPCODE_TYPED_SURFACE_READ:
   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
   case SHADER_OPCODE_URB_WRITE_SIMD8:
   case FS_OPCODE_INTERPOLATE_AT_CENTROID:
   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
      return 0;
   default:
      unreachable("not reached");
   }
}

fs_reg
fs_visitor::vgrf(const glsl_type *const type)
{
   int reg_width = dispatch_width / 8;
   return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
                 brw_type_for_base_type(type), dispatch_width);
}

fs_reg
fs_visitor::vgrf(int num_components)
{
   int reg_width = dispatch_width / 8;
   return fs_reg(GRF, alloc.allocate(num_components * reg_width),
                 BRW_REGISTER_TYPE_F, dispatch_width);
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;

   switch (file) {
   case UNIFORM:
      this->width = 1;
      break;
   default:
      this->width = 8;
   }
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;

   switch (file) {
   case UNIFORM:
      this->width = 1;
      break;
   default:
      this->width = 8;
   }
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
               uint8_t width)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
   this->width = width;
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

void
import_uniforms_callback(const void *key,
                         void *data,
                         void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}

/* For SIMD16, we need to reuse the uniform setup from the SIMD8 dispatch.
 * This brings in those uniform definitions.
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   hash_table_call_foreach(v->variable_ht,
                           import_uniforms_callback,
                           variable_ht);
   this->push_constant_loc = v->push_constant_loc;
   this->pull_constant_loc = v->pull_constant_loc;
   this->uniforms = v->uniforms;
   this->param_size = v->param_size;
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
void
fs_visitor::setup_uniform_values(ir_variable *ir)
{
   int namelen = strlen(ir->name);

   /* The data for our (non-builtin) uniforms is stored in a series of
    * gl_uniform_driver_storage structs for each subcomponent that
    * glGetUniformLocation() could name.  We know it's been set up in the same
    * order we'd walk the type, so walk the list of storage and find anything
    * with our name, or the prefix of a component that starts with our name.
    */
   unsigned params_before = uniforms;
   for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
      struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];

      if (strncmp(ir->name, storage->name, namelen) != 0 ||
          (storage->name[namelen] != 0 &&
           storage->name[namelen] != '.' &&
           storage->name[namelen] != '[')) {
         continue;
      }

      unsigned slots = storage->type->component_slots();
      if (storage->array_elements)
         slots *= storage->array_elements;

      for (unsigned i = 0; i < slots; i++) {
         stage_prog_data->param[uniforms++] = &storage->storage[i];
      }
   }

   /* Make sure we actually initialized the right amount of stuff here. */
   assert(params_before + ir->type->component_slots() == uniforms);
   (void)params_before;
}


/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->get_state_slots();
   assert(slots != NULL);

   for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->prog->Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         stage_prog_data->param[uniforms++] =
            &prog->Parameters->ParameterValues[index][swiz];
      }
   }
}

fs_reg *
fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
                                         bool origin_upper_left)
{
   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
   fs_reg wpos = *reg;
   bool flip = !origin_upper_left ^ key->render_to_fbo;

   /* gl_FragCoord.x */
   if (pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos = offset(wpos, 1);

   /* gl_FragCoord.y */
   if (!flip && pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
         pixel_y.negate = true;
         offset += key->drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos = offset(wpos, 1);

   /* gl_FragCoord.z */
   if (devinfo->gen >= 6) {
      emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
   } else {
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(VARYING_SLOT_POS, 2));
   }
   wpos = offset(wpos, 1);

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}

fs_inst *
fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                         glsl_interp_qualifier interpolation_mode,
                         bool is_centroid, bool is_sample)
{
   brw_wm_barycentric_interp_mode barycoord_mode;
   if (devinfo->gen >= 6) {
      if (is_centroid) {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
      } else if (is_sample) {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
      } else {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
      }
   } else {
      /* On Ironlake and below, there is only one interpolation mode.
       * Centroid interpolation doesn't mean anything on this hardware --
       * there is no multisampling.
       */
      barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
   }
   return emit(FS_OPCODE_LINTERP, attr,
               this->delta_xy[barycoord_mode], interp);
}

void
fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
                                       const glsl_type *type,
                                       glsl_interp_qualifier interpolation_mode,
                                       int location, bool mod_centroid,
                                       bool mod_sample)
{
   attr.type = brw_type_for_base_type(type->get_scalar_type());

   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;

   unsigned int array_elements;

   if (type->is_array()) {
      array_elements = type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", name);
      }
      type = type->fields.array;
   } else {
      array_elements = 1;
   }

   if (interpolation_mode == INTERP_QUALIFIER_NONE) {
      bool is_gl_Color =
         location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
      if (key->flat_shade && is_gl_Color) {
         interpolation_mode = INTERP_QUALIFIER_FLAT;
      } else {
         interpolation_mode = INTERP_QUALIFIER_SMOOTH;
      }
   }

   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (prog_data->urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr = offset(attr, type->vector_elements);
            location++;
            continue;
         }

         if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
            /* Constant interpolation (flat shading) case. The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               interp.type = attr.type;
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr = offset(attr, 1);
            }
         } else {
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
                  /* Get the pixel/sample mask into f0 so that we know
                   * which pixels are lit.  Then, for each channel that is
                   * unlit, replace the centroid data with non-centroid
                   * data.
                   */
                  emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);

                  fs_inst *inst;
                  inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
                                      false, false);
                  inst->predicate = BRW_PREDICATE_NORMAL;
                  inst->predicate_inverse = true;
                  if (devinfo->has_pln)
                     inst->no_dd_clear = true;

                  inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
                                      mod_centroid && !key->persample_shading,
                                      mod_sample || key->persample_shading);
                  inst->predicate = BRW_PREDICATE_NORMAL;
                  inst->predicate_inverse = false;
                  if (devinfo->has_pln)
                     inst->no_dd_check = true;

               } else {
                  emit_linterp(attr, fs_reg(interp), interpolation_mode,
                               mod_centroid && !key->persample_shading,
                               mod_sample || key->persample_shading);
               }
               if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
                  emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
               }
               attr = offset(attr, 1);
            }

         }
         location++;
      }
   }
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation()
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));

   if (devinfo->gen >= 6) {
      /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
       * a boolean result from this (~0/true or 0/false).
       *
       * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
       * this task in only one instruction:
       *    - a negation source modifier will flip the bit; and
       *    - a W -> D type conversion will sign extend the bit into the high
       *      word of the destination.
       *
       * An ASR 15 fills the low word of the destination.
       */
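      /* Roughly (values are illustrative): for a front-facing polygon,
       * g0.0:W has bit 15 clear; the negation makes the value negative, the
       * W -> D conversion sign-extends it, and ASR 15 then yields all ones
       * (~0, i.e. true).  A back-facing polygon yields 0.
       */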
      fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
      g0.negate = true;

      emit(ASR(*reg, g0, fs_reg(15)));
   } else {
      /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
       * a boolean result from this (1/true or 0/false).
       *
       * Like in the above case, since the bit is the MSB of g1.6:UD we can use
       * the negation source modifier to flip it. Unfortunately the SHR
       * instruction only operates on UD (or D with an abs source modifier)
       * sources without negation.
       *
       * Instead, use ASR (which will give ~0/true or 0/false).
       */
      fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
      g1_6.negate = true;

      emit(ASR(*reg, g1_6, fs_reg(31)));
   }

   return reg;
}

void
fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
{
   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   assert(dst.type == BRW_REGISTER_TYPE_F);

   if (key->compute_pos_offset) {
      /* Convert int_sample_pos to floating point */
      emit(MOV(dst, int_sample_pos));
      /* Scale to the range [0, 1] */
      emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
   } else {
      /* From the ARB_sample_shading specification:
       * "When rendering to a non-multisample buffer, or if multisample
       *  rasterization is disabled, gl_SamplePosition will always be
       *  (0.5, 0.5)."
       */
      emit(MOV(dst, fs_reg(0.5f)));
   }
}

fs_reg *
fs_visitor::emit_samplepos_setup()
{
   assert(devinfo->gen >= 6);

   this->current_annotation = "compute sample position";
   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
   fs_reg pos = *reg;
   fs_reg int_sample_x = vgrf(glsl_type::int_type);
   fs_reg int_sample_y = vgrf(glsl_type::int_type);

   /* WM will be run in MSDISPMODE_PERSAMPLE.  So, only one of SIMD8 or SIMD16
    * mode will be enabled.
    *
    * From the Ivy Bridge PRM, volume 2 part 1, page 344:
    * R31.1:0         Position Offset X/Y for Slot[3:0]
    * R31.3:2         Position Offset X/Y for Slot[7:4]
    * .....
    *
    * The X, Y sample positions come in as bytes in the thread payload.  So,
    * read the positions using vstride=16, width=8, hstride=2.
    */
   struct brw_reg sample_pos_reg =
      stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
                    BRW_REGISTER_TYPE_B), 16, 8, 2);

   if (dispatch_width == 8) {
      emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
   } else {
      emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
      emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
         ->force_sechalf = true;
   }
   /* Compute gl_SamplePosition.x */
   compute_sample_position(pos, int_sample_x);
   pos = offset(pos, 1);
   if (dispatch_width == 8) {
      emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
   } else {
      emit(MOV(half(int_sample_y, 0),
               fs_reg(suboffset(sample_pos_reg, 1))));
      emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
         ->force_sechalf = true;
   }
   /* Compute gl_SamplePosition.y */
   compute_sample_position(pos, int_sample_y);
   return reg;
}

fs_reg *
fs_visitor::emit_sampleid_setup()
{
   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   assert(devinfo->gen >= 6);

   this->current_annotation = "compute sample id";
   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));

   if (key->compute_sample_id) {
      fs_reg t1 = vgrf(glsl_type::int_type);
      fs_reg t2 = vgrf(glsl_type::int_type);
      t2.type = BRW_REGISTER_TYPE_UW;

  1558.       /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
  1559.        * 8x multisampling, subspan 0 will represent sample N (where N
  1560.        * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
  1561.        * 7. We can find the value of N by looking at R0.0 bits 7:6
  1562.        * ("Starting Sample Pair Index (SSPI)") and multiplying by two
  1563.        * (since samples are always delivered in pairs). That is, we
  1564.        * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
  1565.        * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
  1566.        * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
  1567.        * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
  1568.        * populating a temporary variable with the sequence (0, 1, 2, 3),
  1569.        * and then reading from it using vstride=1, width=4, hstride=0.
   1570.        * These computations also hold for 4x multisampling.
  1571.        *
  1572.        * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
  1573.        * the first four slots are sample 0 of subspan 0; the next four
  1574.        * are sample 1 of subspan 0; the third group is sample 0 of
  1575.        * subspan 1, and finally sample 1 of subspan 1.
  1576.        */
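               /* A worked example with illustrative values: if R0.0 bits 7:6
                * hold SSPI = 2, then N = (R0.0 & 0xc0) >> 5 = 4, and adding
                * the SIMD8 sequence (0, 0, 0, 0, 1, 1, 1, 1) yields sample
                * IDs (4, 4, 4, 4, 5, 5, 5, 5).
                */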
  1577.       fs_inst *inst;
  1578.       inst = emit(BRW_OPCODE_AND, t1,
  1579.                   fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
  1580.                   fs_reg(0xc0));
  1581.       inst->force_writemask_all = true;
  1582.       inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
  1583.       inst->force_writemask_all = true;
  1584.       /* This works for both SIMD8 and SIMD16 */
  1585.       inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
  1586.       inst->force_writemask_all = true;
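               /* brw_imm_v packs 4-bit values low nibble first, so 0x3210
                * encodes the sequence (0, 1, 2, 3) and 0x1010 encodes
                * (0, 1, 0, 1).
                */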
   1587.       /* This special instruction applies a vstride=1, width=4, hstride=0
   1588.        * region to t2 while performing the ADD.
   1589.        */
  1590.       emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
  1591.    } else {
  1592.       /* As per GL_ARB_sample_shading specification:
  1593.        * "When rendering to a non-multisample buffer, or if multisample
  1594.        *  rasterization is disabled, gl_SampleID will always be zero."
  1595.        */
  1596.       emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
  1597.    }
  1598.  
  1599.    return reg;
  1600. }
  1601.  
  1602. void
  1603. fs_visitor::resolve_source_modifiers(fs_reg *src)
  1604. {
  1605.    if (!src->abs && !src->negate)
  1606.       return;
  1607.  
  1608.    fs_reg temp = retype(vgrf(1), src->type);
  1609.    emit(MOV(temp, *src));
  1610.    *src = temp;
  1611. }
  1612.  
  1613. fs_reg
  1614. fs_visitor::fix_math_operand(fs_reg src)
  1615. {
  1616.    /* Can't do hstride == 0 args on gen6 math, so expand it out. We
  1617.     * might be able to do better by doing execsize = 1 math and then
  1618.     * expanding that result out, but we would need to be careful with
  1619.     * masking.
  1620.     *
  1621.     * The hardware ignores source modifiers (negate and abs) on math
  1622.     * instructions, so we also move to a temp to set those up.
  1623.     */
  1624.    if (devinfo->gen == 6 && src.file != UNIFORM && src.file != IMM &&
  1625.        !src.abs && !src.negate)
  1626.       return src;
  1627.  
  1628.    /* Gen7 relaxes most of the above restrictions, but still can't use IMM
  1629.     * operands to math
  1630.     */
  1631.    if (devinfo->gen >= 7 && src.file != IMM)
  1632.       return src;
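            /* For example (a hypothetical call): on Gen7,
             * emit_math(SHADER_OPCODE_POW, dst, x, fs_reg(2.0f)) would reach
             * this point for the immediate operand and copy 2.0f into the
             * VGRF allocated below.
             */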
  1633.  
  1634.    fs_reg expanded = vgrf(glsl_type::float_type);
  1635.    expanded.type = src.type;
  1636.    emit(BRW_OPCODE_MOV, expanded, src);
  1637.    return expanded;
  1638. }
  1639.  
  1640. fs_inst *
  1641. fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
  1642. {
  1643.    switch (opcode) {
  1644.    case SHADER_OPCODE_RCP:
  1645.    case SHADER_OPCODE_RSQ:
  1646.    case SHADER_OPCODE_SQRT:
  1647.    case SHADER_OPCODE_EXP2:
  1648.    case SHADER_OPCODE_LOG2:
  1649.    case SHADER_OPCODE_SIN:
  1650.    case SHADER_OPCODE_COS:
  1651.       break;
  1652.    default:
  1653.       unreachable("not reached: bad math opcode");
  1654.    }
  1655.  
  1656.    /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
  1657.     * might be able to do better by doing execsize = 1 math and then
  1658.     * expanding that result out, but we would need to be careful with
  1659.     * masking.
  1660.     *
  1661.     * Gen 6 hardware ignores source modifiers (negate and abs) on math
  1662.     * instructions, so we also move to a temp to set those up.
  1663.     */
  1664.    if (devinfo->gen == 6 || devinfo->gen == 7)
  1665.       src = fix_math_operand(src);
  1666.  
  1667.    fs_inst *inst = emit(opcode, dst, src);
  1668.  
  1669.    if (devinfo->gen < 6) {
  1670.       inst->base_mrf = 2;
  1671.       inst->mlen = dispatch_width / 8;
  1672.    }
  1673.  
  1674.    return inst;
  1675. }
  1676.  
  1677. fs_inst *
  1678. fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
  1679. {
  1680.    int base_mrf = 2;
  1681.    fs_inst *inst;
  1682.  
  1683.    if (devinfo->gen >= 8) {
  1684.       inst = emit(opcode, dst, src0, src1);
  1685.    } else if (devinfo->gen >= 6) {
  1686.       src0 = fix_math_operand(src0);
  1687.       src1 = fix_math_operand(src1);
  1688.  
  1689.       inst = emit(opcode, dst, src0, src1);
  1690.    } else {
  1691.       /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
  1692.        * "Message Payload":
  1693.        *
  1694.        * "Operand0[7].  For the INT DIV functions, this operand is the
  1695.        *  denominator."
  1696.        *  ...
  1697.        * "Operand1[7].  For the INT DIV functions, this operand is the
  1698.        *  numerator."
  1699.        */
  1700.       bool is_int_div = opcode != SHADER_OPCODE_POW;
  1701.       fs_reg &op0 = is_int_div ? src1 : src0;
  1702.       fs_reg &op1 = is_int_div ? src0 : src1;
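               /* E.g. for SHADER_OPCODE_INT_QUOTIENT, src0 is the numerator
                * and src1 the denominator, so the swap above sends the
                * denominator as the instruction operand and the numerator
                * through the MRF.
                */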
  1703.  
  1704.       emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
  1705.       inst = emit(opcode, dst, op0, reg_null_f);
  1706.  
  1707.       inst->base_mrf = base_mrf;
  1708.       inst->mlen = 2 * dispatch_width / 8;
  1709.    }
  1710.    return inst;
  1711. }
  1712.  
  1713. void
  1714. fs_visitor::emit_discard_jump()
  1715. {
  1716.    assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
  1717.  
  1718.    /* For performance, after a discard, jump to the end of the
  1719.     * shader if all relevant channels have been discarded.
  1720.     */
  1721.    fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
  1722.    discard_jump->flag_subreg = 1;
  1723.  
  1724.    discard_jump->predicate = (dispatch_width == 8)
  1725.                              ? BRW_PREDICATE_ALIGN1_ANY8H
  1726.                              : BRW_PREDICATE_ALIGN1_ANY16H;
  1727.    discard_jump->predicate_inverse = true;
  1728. }
  1729.  
  1730. void
  1731. fs_visitor::assign_curb_setup()
  1732. {
  1733.    if (dispatch_width == 8) {
  1734.       prog_data->dispatch_grf_start_reg = payload.num_regs;
  1735.    } else {
  1736.       if (stage == MESA_SHADER_FRAGMENT) {
  1737.          brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
  1738.          prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
  1739.       } else if (stage == MESA_SHADER_COMPUTE) {
  1740.          brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
  1741.          prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
  1742.       } else {
  1743.          unreachable("Unsupported shader type!");
  1744.       }
  1745.    }
  1746.  
  1747.    prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
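            /* E.g. 20 uniform components round up to ALIGN(20, 8) = 24, i.e. a
             * curb_read_length of 3 registers (8 floats each).
             */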
  1748.  
  1749.    /* Map the offsets in the UNIFORM file to fixed HW regs. */
  1750.    foreach_block_and_inst(block, fs_inst, inst, cfg) {
  1751.       for (unsigned int i = 0; i < inst->sources; i++) {
  1752.          if (inst->src[i].file == UNIFORM) {
  1753.             int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
  1754.             int constant_nr;
  1755.             if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
  1756.                constant_nr = push_constant_loc[uniform_nr];
  1757.             } else {
  1758.                /* Section 5.11 of the OpenGL 4.1 spec says:
  1759.                 * "Out-of-bounds reads return undefined values, which include
  1760.                 *  values from other variables of the active program or zero."
  1761.                 * Just return the first push constant.
  1762.                 */
  1763.                constant_nr = 0;
  1764.             }
  1765.  
  1766.             struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
  1767.                                                   constant_nr / 8,
  1768.                                                   constant_nr % 8);
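                              /* E.g. (illustrative) constant_nr = 13 lands in
                               * GRF payload.num_regs + 1, subregister 5.
                               */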
  1769.  
  1770.             inst->src[i].file = HW_REG;
  1771.             inst->src[i].fixed_hw_reg = byte_offset(
  1772.                retype(brw_reg, inst->src[i].type),
  1773.                inst->src[i].subreg_offset);
  1774.          }
  1775.       }
  1776.    }
  1777. }
  1778.  
  1779. void
  1780. fs_visitor::calculate_urb_setup()
  1781. {
  1782.    assert(stage == MESA_SHADER_FRAGMENT);
  1783.    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
  1784.    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
  1785.  
  1786.    memset(prog_data->urb_setup, -1,
  1787.           sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
  1788.  
  1789.    int urb_next = 0;
  1790.    /* Figure out where each of the incoming setup attributes lands. */
  1791.    if (devinfo->gen >= 6) {
  1792.       if (_mesa_bitcount_64(prog->InputsRead &
  1793.                             BRW_FS_VARYING_INPUT_MASK) <= 16) {
  1794.          /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
  1795.           * first 16 varying inputs, so we can put them wherever we want.
  1796.           * Just put them in order.
  1797.           *
  1798.           * This is useful because it means that (a) inputs not used by the
  1799.           * fragment shader won't take up valuable register space, and (b) we
  1800.           * won't have to recompile the fragment shader if it gets paired with
  1801.           * a different vertex (or geometry) shader.
  1802.           */
  1803.          for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
  1804.             if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
  1805.                 BITFIELD64_BIT(i)) {
  1806.                prog_data->urb_setup[i] = urb_next++;
  1807.             }
  1808.          }
  1809.       } else {
  1810.          /* We have enough input varyings that the SF/SBE pipeline stage can't
  1811.           * arbitrarily rearrange them to suit our whim; we have to put them
  1812.           * in an order that matches the output of the previous pipeline stage
  1813.           * (geometry or vertex shader).
  1814.           */
  1815.          struct brw_vue_map prev_stage_vue_map;
  1816.          brw_compute_vue_map(devinfo, &prev_stage_vue_map,
  1817.                              key->input_slots_valid);
  1818.          int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
  1819.          assert(prev_stage_vue_map.num_slots <= first_slot + 32);
  1820.          for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
  1821.               slot++) {
  1822.             int varying = prev_stage_vue_map.slot_to_varying[slot];
  1823.             /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
  1824.              * unused.
  1825.              */
  1826.             if (varying != BRW_VARYING_SLOT_COUNT &&
  1827.                 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
  1828.                  BITFIELD64_BIT(varying))) {
  1829.                prog_data->urb_setup[varying] = slot - first_slot;
  1830.             }
  1831.          }
  1832.          urb_next = prev_stage_vue_map.num_slots - first_slot;
  1833.       }
  1834.    } else {
  1835.       /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
  1836.       for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
  1837.          /* Point size is packed into the header, not as a general attribute */
  1838.          if (i == VARYING_SLOT_PSIZ)
  1839.             continue;
  1840.  
  1841.          if (key->input_slots_valid & BITFIELD64_BIT(i)) {
  1842.             /* The back color slot is skipped when the front color is
  1843.              * also written to.  In addition, some slots can be
  1844.              * written in the vertex shader and not read in the
  1845.              * fragment shader.  So the register number must always be
  1846.              * incremented, mapped or not.
  1847.              */
  1848.             if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
  1849.                prog_data->urb_setup[i] = urb_next;
  1850.             urb_next++;
  1851.          }
  1852.       }
  1853.  
  1854.       /*
   1855.        * It's an FS-only attribute, and the SF thread did the interpolation
   1856.        * for this attribute. So, count it here, too.
  1857.        *
  1858.        * See compile_sf_prog() for more info.
  1859.        */
  1860.       if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
  1861.          prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
  1862.    }
  1863.  
  1864.    prog_data->num_varying_inputs = urb_next;
  1865. }
  1866.  
  1867. void
  1868. fs_visitor::assign_urb_setup()
  1869. {
  1870.    assert(stage == MESA_SHADER_FRAGMENT);
  1871.    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
  1872.  
  1873.    int urb_start = payload.num_regs + prog_data->base.curb_read_length;
  1874.  
  1875.    /* Offset all the urb_setup[] index by the actual position of the
  1876.     * setup regs, now that the location of the constants has been chosen.
  1877.     */
  1878.    foreach_block_and_inst(block, fs_inst, inst, cfg) {
  1879.       if (inst->opcode == FS_OPCODE_LINTERP) {
  1880.          assert(inst->src[1].file == HW_REG);
  1881.          inst->src[1].fixed_hw_reg.nr += urb_start;
  1882.       }
  1883.  
  1884.       if (inst->opcode == FS_OPCODE_CINTERP) {
  1885.          assert(inst->src[0].file == HW_REG);
  1886.          inst->src[0].fixed_hw_reg.nr += urb_start;
  1887.       }
  1888.    }
  1889.  
  1890.    /* Each attribute is 4 setup channels, each of which is half a reg. */
  1891.    this->first_non_payload_grf =
  1892.       urb_start + prog_data->num_varying_inputs * 2;
  1893. }
  1894.  
  1895. void
  1896. fs_visitor::assign_vs_urb_setup()
  1897. {
  1898.    brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
  1899.    int grf, count, slot, channel, attr;
  1900.  
  1901.    assert(stage == MESA_SHADER_VERTEX);
  1902.    count = _mesa_bitcount_64(vs_prog_data->inputs_read);
  1903.    if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
  1904.       count++;
  1905.  
  1906.    /* Each attribute is 4 regs. */
  1907.    this->first_non_payload_grf =
  1908.       payload.num_regs + prog_data->curb_read_length + count * 4;
  1909.  
  1910.    unsigned vue_entries =
  1911.       MAX2(count, vs_prog_data->base.vue_map.num_slots);
  1912.  
  1913.    vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
  1914.    vs_prog_data->base.urb_read_length = (count + 1) / 2;
  1915.  
  1916.    assert(vs_prog_data->base.urb_read_length <= 15);
  1917.  
  1918.    /* Rewrite all ATTR file references to the hw grf that they land in. */
  1919.    foreach_block_and_inst(block, fs_inst, inst, cfg) {
  1920.       for (int i = 0; i < inst->sources; i++) {
  1921.          if (inst->src[i].file == ATTR) {
  1922.  
  1923.             if (inst->src[i].reg == VERT_ATTRIB_MAX) {
  1924.                slot = count - 1;
  1925.             } else {
   1926.                /* Attributes arrive in a contiguous block, ordered by their
  1927.                 * gl_vert_attrib value.  That means we can compute the slot
  1928.                 * number for an attribute by masking out the enabled
  1929.                 * attributes before it and counting the bits.
  1930.                 */
  1931.                attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
  1932.                slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
  1933.                                         BITFIELD64_MASK(attr));
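                                 /* E.g. (illustrative) with inputs_read = 0b1011
                                  * and attr = 3, BITFIELD64_MASK(3) = 0b111 and
                                  * popcount(0b1011 & 0b111) = 2, so the
                                  * attribute lands in slot 2.
                                  */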
  1934.             }
  1935.  
  1936.             channel = inst->src[i].reg_offset & 3;
  1937.  
  1938.             grf = payload.num_regs +
  1939.                prog_data->curb_read_length +
  1940.                slot * 4 + channel;
  1941.  
  1942.             inst->src[i].file = HW_REG;
  1943.             inst->src[i].fixed_hw_reg =
  1944.                retype(brw_vec8_grf(grf, 0), inst->src[i].type);
  1945.          }
  1946.       }
  1947.    }
  1948. }
  1949.  
  1950. /**
  1951.  * Split large virtual GRFs into separate components if we can.
  1952.  *
  1953.  * This is mostly duplicated with what brw_fs_vector_splitting does,
  1954.  * but that's really conservative because it's afraid of doing
  1955.  * splitting that doesn't result in real progress after the rest of
  1956.  * the optimization phases, which would cause infinite looping in
  1957.  * optimization.  We can do it once here, safely.  This also has the
  1958.  * opportunity to split interpolated values, or maybe even uniforms,
  1959.  * which we don't have at the IR level.
  1960.  *
  1961.  * We want to split, because virtual GRFs are what we register
  1962.  * allocate and spill (due to contiguousness requirements for some
  1963.  * instructions), and they're what we naturally generate in the
  1964.  * codegen process, but most virtual GRFs don't actually need to be
  1965.  * contiguous sets of GRFs.  If we split, we'll end up with reduced
  1966.  * live intervals and better dead code elimination and coalescing.
  1967.  */
  1968. void
  1969. fs_visitor::split_virtual_grfs()
  1970. {
  1971.    int num_vars = this->alloc.count;
  1972.  
  1973.    /* Count the total number of registers */
  1974.    int reg_count = 0;
  1975.    int vgrf_to_reg[num_vars];
  1976.    for (int i = 0; i < num_vars; i++) {
  1977.       vgrf_to_reg[i] = reg_count;
  1978.       reg_count += alloc.sizes[i];
  1979.    }
  1980.  
  1981.    /* An array of "split points".  For each register slot, this indicates
  1982.     * if this slot can be separated from the previous slot.  Every time an
  1983.     * instruction uses multiple elements of a register (as a source or
  1984.     * destination), we mark the used slots as inseparable.  Then we go
  1985.     * through and split the registers into the smallest pieces we can.
  1986.     */
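            /* For example (illustrative): a size-4 VGRF whose slots 0-1 are
             * always written together by one SIMD16 instruction keeps
             * split_points[1] false; slots 2 and 3 remain split points, so the
             * VGRF splits into one size-2 piece and two size-1 pieces.
             */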
  1987.    bool split_points[reg_count];
  1988.    memset(split_points, 0, sizeof(split_points));
  1989.  
  1990.    /* Mark all used registers as fully splittable */
  1991.    foreach_block_and_inst(block, fs_inst, inst, cfg) {
  1992.       if (inst->dst.file == GRF) {
  1993.          int reg = vgrf_to_reg[inst->dst.reg];
  1994.          for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
  1995.             split_points[reg + j] = true;
  1996.       }
  1997.  
  1998.       for (int i = 0; i < inst->sources; i++) {
  1999.          if (inst->src[i].file == GRF) {
  2000.             int reg = vgrf_to_reg[inst->src[i].reg];
  2001.             for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
  2002.                split_points[reg + j] = true;
  2003.          }
  2004.       }
  2005.    }
  2006.  
  2007.    foreach_block_and_inst(block, fs_inst, inst, cfg) {
  2008.       if (inst->dst.file == GRF) {
  2009.          int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
  2010.          for (int j = 1; j < inst->regs_written; j++)
  2011.             split_points[reg + j] = false;
  2012.       }
  2013.       for (int i = 0; i < inst->sources; i++) {
  2014.          if (inst->src[i].file == GRF) {
  2015.             int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
  2016.             for (int j = 1; j < inst->regs_read(i); j++)
  2017.                split_points[reg + j] = false;
  2018.          }
  2019.       }
  2020.    }
  2021.  
  2022.    int new_virtual_grf[reg_count];
  2023.    int new_reg_offset[reg_count];
  2024.  
  2025.    int reg = 0;
  2026.    for (int i = 0; i < num_vars; i++) {
  2027.       /* The first one should always be 0 as a quick sanity check. */
  2028.       assert(split_points[reg] == false);
  2029.  
  2030.       /* j = 0 case */
  2031.       new_reg_offset[reg] = 0;
  2032.       reg++;
  2033.       int offset = 1;
  2034.  
  2035.       /* j > 0 case */
  2036.       for (unsigned j = 1; j < alloc.sizes[i]; j++) {
   2037.          /* If this is a split point, reset the offset to 0 and allocate a
   2038.           * new virtual GRF spanning the preceding 'offset' registers.
   2039.           */
  2040.          if (split_points[reg]) {
  2041.             assert(offset <= MAX_VGRF_SIZE);
  2042.             int grf = alloc.allocate(offset);
  2043.             for (int k = reg - offset; k < reg; k++)
  2044.                new_virtual_grf[k] = grf;
  2045.             offset = 0;
  2046.          }
  2047.          new_reg_offset[reg] = offset;
  2048.          offset++;
  2049.          reg++;
  2050.       }
  2051.  
  2052.       /* The last one gets the original register number */
  2053.       assert(offset <= MAX_VGRF_SIZE);
  2054.       alloc.sizes[i] = offset;
  2055.       for (int k = reg - offset; k < reg; k++)
  2056.          new_virtual_grf[k] = i;
  2057.    }
  2058.    assert(reg == reg_count);
  2059.  
  2060.    foreach_block_and_inst(block, fs_inst, inst, cfg) {
  2061.       if (inst->dst.file == GRF) {
  2062.          reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
  2063.          inst->dst.reg = new_virtual_grf[reg];
  2064.          inst->dst.reg_offset = new_reg_offset[reg];
  2065.          assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
  2066.       }
  2067.       for (int i = 0; i < inst->sources; i++) {
  2068.          if (inst->src[i].file == GRF) {
  2069.             reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
  2070.             inst->src[i].reg = new_virtual_grf[reg];
  2071.             inst->src[i].reg_offset = new_reg_offset[reg];
  2072.             assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
  2073.          }
  2074.       }
  2075.    }
  2076.    invalidate_live_intervals();
  2077. }
  2078.  
  2079. /**
  2080.  * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
  2081.  *
  2082.  * During code generation, we create tons of temporary variables, many of
  2083.  * which get immediately killed and are never used again.  Yet, in later
  2084.  * optimization and analysis passes, such as compute_live_intervals, we need
  2085.  * to loop over all the virtual GRFs.  Compacting them can save a lot of
  2086.  * overhead.
  2087.  */
  2088. bool
  2089. fs_visitor::compact_virtual_grfs()
  2090. {
  2091.    bool progress = false;
  2092.    int remap_table[this->alloc.count];
  2093.    memset(remap_table, -1, sizeof(remap_table));
  2094.  
  2095.    /* Mark which virtual GRFs are used. */
  2096.    foreach_block_and_inst(block, const fs_inst, inst, cfg) {
  2097.       if (inst->dst.file == GRF)
  2098.          remap_table[inst->dst.reg] = 0;
  2099.  
  2100.       for (int i = 0; i < inst->sources; i++) {
  2101.          if (inst->src[i].file == GRF)
  2102.             remap_table[inst->src[i].reg] = 0;
  2103.       }
  2104.    }
  2105.  
  2106.    /* Compact the GRF arrays. */
  2107.    int new_index = 0;
  2108.    for (unsigned i = 0; i < this->alloc.count; i++) {
  2109.       if (remap_table[i] == -1) {
  2110.          /* We just found an unused register.  This means that we are
  2111.           * actually going to compact something.
  2112.           */
  2113.          progress = true;
  2114.       } else {
  2115.          remap_table[i] = new_index;
  2116.          alloc.sizes[new_index] = alloc.sizes[i];
  2117.          invalidate_live_intervals();
  2118.          ++new_index;
  2119.       }
  2120.    }
  2121.  
  2122.    this->alloc.count = new_index;
  2123.  
  2124.    /* Patch all the instructions to use the newly renumbered registers */
  2125.    foreach_block_and_inst(block, fs_inst, inst, cfg) {
  2126.       if (inst->dst.file == GRF)
  2127.          inst->dst.reg = remap_table[inst->dst.reg];
  2128.  
  2129.       for (int i = 0; i < inst->sources; i++) {
  2130.          if (inst->src[i].file == GRF)
  2131.             inst->src[i].reg = remap_table[inst->src[i].reg];
  2132.       }
  2133.    }
  2134.  
  2135.    /* Patch all the references to delta_xy, since they're used in register
  2136.     * allocation.  If they're unused, switch them to BAD_FILE so we don't
  2137.     * think some random VGRF is delta_xy.
  2138.     */
  2139.    for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
  2140.       if (delta_xy[i].file == GRF) {
  2141.          if (remap_table[delta_xy[i].reg] != -1) {
  2142.             delta_xy[i].reg = remap_table[delta_xy[i].reg];
  2143.          } else {
  2144.             delta_xy[i].file = BAD_FILE;
  2145.          }
  2146.       }
  2147.    }
  2148.  
  2149.    return progress;
  2150. }
  2151.  
  2152. /*
  2153.  * Implements array access of uniforms by inserting a
  2154.  * PULL_CONSTANT_LOAD instruction.
  2155.  *
  2156.  * Unlike temporary GRF array access (where we don't support it due to
  2157.  * the difficulty of doing relative addressing on instruction
  2158.  * destinations), we could potentially do array access of uniforms
  2159.  * that were loaded in GRF space as push constants.  In real-world
  2160.  * usage we've seen, though, the arrays being used are always larger
  2161.  * than we could load as push constants, so just always move all
  2162.  * uniform array access out to a pull constant buffer.
  2163.  */
  2164. void
  2165. fs_visitor::move_uniform_array_access_to_pull_constants()
  2166. {
  2167.    if (dispatch_width != 8)
  2168.       return;
  2169.  
  2170.    pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
  2171.    memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
  2172.  
  2173.    /* Walk through and find array access of uniforms.  Put a copy of that
  2174.     * uniform in the pull constant buffer.
  2175.     *
  2176.     * Note that we don't move constant-indexed accesses to arrays.  No
  2177.     * testing has been done of the performance impact of this choice.
  2178.     */
  2179.    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
  2180.       for (int i = 0 ; i < inst->sources; i++) {
  2181.          if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
  2182.             continue;
  2183.  
  2184.          int uniform = inst->src[i].reg;
  2185.  
  2186.          /* If this array isn't already present in the pull constant buffer,
  2187.           * add it.
  2188.           */
  2189.          if (pull_constant_loc[uniform] == -1) {
  2190.             const gl_constant_value **values = &stage_prog_data->param[uniform];
  2191.  
  2192.             assert(param_size[uniform]);
  2193.  
  2194.             for (int j = 0; j < param_size[uniform]; j++) {
  2195.                pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
  2196.  
  2197.                stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
  2198.                   values[j];
  2199.             }
  2200.          }
  2201.       }
  2202.    }
  2203. }
  2204.  
  2205. /**
  2206.  * Assign UNIFORM file registers to either push constants or pull constants.
  2207.  *
   2208.  * We allow a fragment shader to have more than the GL-specified minimum
   2209.  * maximum number of fragment shader uniform components (64).  If there
   2210.  * are too many, they'd fill up all of the register space.
  2211.  * So, this will push some of them out to the pull constant buffer and
  2212.  * update the program to load them.
  2213.  */
  2214. void
  2215. fs_visitor::assign_constant_locations()
  2216. {
  2217.    /* Only the first compile (SIMD8 mode) gets to decide on locations. */
  2218.    if (dispatch_width != 8)
  2219.       return;
  2220.  
  2221.    /* Find which UNIFORM registers are still in use. */
  2222.    bool is_live[uniforms];
  2223.    for (unsigned int i = 0; i < uniforms; i++) {
  2224.       is_live[i] = false;
  2225.    }
  2226.  
  2227.    foreach_block_and_inst(block, fs_inst, inst, cfg) {
  2228.       for (int i = 0; i < inst->sources; i++) {
  2229.          if (inst->src[i].file != UNIFORM)
  2230.             continue;
  2231.  
  2232.          int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
  2233.          if (constant_nr >= 0 && constant_nr < (int) uniforms)
  2234.             is_live[constant_nr] = true;
  2235.       }
  2236.    }
  2237.  
  2238.    /* Only allow 16 registers (128 uniform components) as push constants.
  2239.     *
  2240.     * Just demote the end of the list.  We could probably do better
  2241.     * here, demoting things that are rarely used in the program first.
  2242.     *
  2243.     * If changing this value, note the limitation about total_regs in
  2244.     * brw_curbe.c.
  2245.     */
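            /* E.g. a shader with 200 live uniform components keeps the first 128
             * as push constants and demotes the remaining 72 to pull constants.
             */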
  2246.    unsigned int max_push_components = 16 * 8;
  2247.    unsigned int num_push_constants = 0;
  2248.  
  2249.    push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
  2250.  
  2251.    for (unsigned int i = 0; i < uniforms; i++) {
  2252.       if (!is_live[i] || pull_constant_loc[i] != -1) {
  2253.          /* This UNIFORM register is either dead, or has already been demoted
  2254.           * to a pull const.  Mark it as no longer living in the param[] array.
  2255.           */
  2256.          push_constant_loc[i] = -1;
  2257.          continue;
  2258.       }
  2259.  
  2260.       if (num_push_constants < max_push_components) {
  2261.          /* Retain as a push constant.  Record the location in the params[]
  2262.           * array.
  2263.           */
  2264.          push_constant_loc[i] = num_push_constants++;
  2265.       } else {
  2266.          /* Demote to a pull constant. */
  2267.          push_constant_loc[i] = -1;
  2268.  
  2269.          int pull_index = stage_prog_data->nr_pull_params++;
  2270.          stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
  2271.          pull_constant_loc[i] = pull_index;
  2272.       }
  2273.    }
  2274.  
  2275.    stage_prog_data->nr_params = num_push_constants;
  2276.  
  2277.    /* Up until now, the param[] array has been indexed by reg + reg_offset
  2278.     * of UNIFORM registers.  Condense it to only contain the uniforms we
  2279.     * chose to upload as push constants.
  2280.     */
  2281.    for (unsigned int i = 0; i < uniforms; i++) {
  2282.       int remapped = push_constant_loc[i];
  2283.  
  2284.       if (remapped == -1)
  2285.          continue;
  2286.  
  2287.       assert(remapped <= (int)i);
  2288.       stage_prog_data->param[remapped] = stage_prog_data->param[i];
  2289.    }
  2290. }
  2291.  
  2292. /**
  2293.  * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
  2294.  * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
  2295.  */
  2296. void
  2297. fs_visitor::demote_pull_constants()
  2298. {
  2299.    foreach_block_and_inst (block, fs_inst, inst, cfg) {
  2300.       for (int i = 0; i < inst->sources; i++) {
  2301.          if (inst->src[i].file != UNIFORM)
  2302.             continue;
  2303.  
  2304.          int pull_index;
  2305.          unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
  2306.          if (location >= uniforms) /* Out of bounds access */
  2307.             pull_index = -1;
  2308.          else
  2309.             pull_index = pull_constant_loc[location];
  2310.  
  2311.          if (pull_index == -1)
  2312.             continue;
  2313.  
   2314.          /* Set up the annotation tracking for newly generated instructions. */
  2315.          base_ir = inst->ir;
  2316.          current_annotation = inst->annotation;
  2317.  
  2318.          fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
  2319.          fs_reg dst = vgrf(glsl_type::float_type);
  2320.  
  2321.          /* Generate a pull load into dst. */
  2322.          if (inst->src[i].reladdr) {
  2323.             exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
  2324.                                                         surf_index,
  2325.                                                         *inst->src[i].reladdr,
  2326.                                                         pull_index);
  2327.             inst->insert_before(block, &list);
  2328.             inst->src[i].reladdr = NULL;
  2329.          } else {
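                              /* E.g. (illustrative) pull_index = 6: byte offset
                               * (6 * 4) & ~15 = 16 selects the aligned 16-byte
                               * block, and set_smear(6 & 3) = set_smear(2) picks
                               * component 2 within it.
                               */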
  2330.             fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
  2331.             fs_inst *pull =
  2332.                new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
  2333.                                     dst, surf_index, offset);
  2334.             inst->insert_before(block, pull);
  2335.             inst->src[i].set_smear(pull_index & 3);
  2336.          }
  2337.  
  2338.          /* Rewrite the instruction to use the temporary VGRF. */
  2339.          inst->src[i].file = GRF;
  2340.          inst->src[i].reg = dst.reg;
  2341.          inst->src[i].reg_offset = 0;
  2342.          inst->src[i].width = dispatch_width;
  2343.       }
  2344.    }
  2345.    invalidate_live_intervals();
  2346. }
  2347.  
  2348. bool
  2349. fs_visitor::opt_algebraic()
  2350. {
  2351.    bool progress = false;
  2352.  
  2353.    foreach_block_and_inst(block, fs_inst, inst, cfg) {
  2354.       switch (inst->opcode) {
  2355.       case BRW_OPCODE_MOV:
  2356.          if (inst->src[0].file != IMM)
  2357.             break;
  2358.  
  2359.          if (inst->saturate) {
  2360.             if (inst->dst.type != inst->src[0].type)
  2361.                assert(!"unimplemented: saturate mixed types");
  2362.  
  2363.             if (brw_saturate_immediate(inst->dst.type,
  2364.                                        &inst->src[0].fixed_hw_reg)) {
  2365.                inst->saturate = false;
  2366.                progress = true;
  2367.             }
  2368.          }
  2369.          break;
  2370.  
  2371.       case BRW_OPCODE_MUL:
  2372.          if (inst->src[1].file != IMM)
  2373.             continue;
  2374.  
  2375.          /* a * 1.0 = a */
  2376.          if (inst->src[1].is_one()) {
  2377.             inst->opcode = BRW_OPCODE_MOV;
  2378.             inst->src[1] = reg_undef;
  2379.             progress = true;
  2380.             break;
  2381.          }
  2382.  
  2383.          /* a * -1.0 = -a */
  2384.          if (inst->src[1].is_negative_one()) {
  2385.             inst->opcode = BRW_OPCODE_MOV;
  2386.             inst->src[0].negate = !inst->src[0].negate;
  2387.             inst->src[1] = reg_undef;
  2388.             progress = true;
  2389.             break;
  2390.          }
  2391.  
  2392.          /* a * 0.0 = 0.0 */
  2393.          if (inst->src[1].is_zero()) {
  2394.             inst->opcode = BRW_OPCODE_MOV;
  2395.             inst->src[0] = inst->src[1];
  2396.             inst->src[1] = reg_undef;
  2397.             progress = true;
  2398.             break;
  2399.          }
  2400.  
  2401.          if (inst->src[0].file == IMM) {
  2402.             assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
  2403.             inst->opcode = BRW_OPCODE_MOV;
  2404.             inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
  2405.             inst->src[1] = reg_undef;
  2406.             progress = true;
  2407.             break;
  2408.          }
  2409.          break;
  2410.       case BRW_OPCODE_ADD:
  2411.          if (inst->src[1].file != IMM)
  2412.             continue;
  2413.  
  2414.          /* a + 0.0 = a */
  2415.          if (inst->src[1].is_zero()) {
  2416.             inst->opcode = BRW_OPCODE_MOV;
  2417.             inst->src[1] = reg_undef;
  2418.             progress = true;
  2419.             break;
  2420.          }
  2421.  
  2422.          if (inst->src[0].file == IMM) {
  2423.             assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
  2424.             inst->opcode = BRW_OPCODE_MOV;
  2425.             inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
  2426.             inst->src[1] = reg_undef;
  2427.             progress = true;
  2428.             break;
  2429.          }
  2430.          break;
  2431.       case BRW_OPCODE_OR:
  2432.          if (inst->src[0].equals(inst->src[1])) {
  2433.             inst->opcode = BRW_OPCODE_MOV;
  2434.             inst->src[1] = reg_undef;
  2435.             progress = true;
  2436.             break;
  2437.          }
  2438.          break;
  2439.       case BRW_OPCODE_LRP:
  2440.          if (inst->src[1].equals(inst->src[2])) {
  2441.             inst->opcode = BRW_OPCODE_MOV;
  2442.             inst->src[0] = inst->src[1];
  2443.             inst->src[1] = reg_undef;
  2444.             inst->src[2] = reg_undef;
  2445.             progress = true;
  2446.             break;
  2447.          }
  2448.          break;
  2449.       case BRW_OPCODE_CMP:
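                  /* cmp.ge -|x|, 0 holds only when x == 0, so drop the source
                   * modifiers and compare for equality with zero instead.
                   */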
  2450.          if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
  2451.              inst->src[0].abs &&
  2452.              inst->src[0].negate &&
  2453.              inst->src[1].is_zero()) {
  2454.             inst->src[0].abs = false;
  2455.             inst->src[0].negate = false;
  2456.             inst->conditional_mod = BRW_CONDITIONAL_Z;
  2457.             progress = true;
  2458.             break;
  2459.          }
  2460.          break;
  2461.       case BRW_OPCODE_SEL:
  2462.          if (inst->src[0].equals(inst->src[1])) {
  2463.             inst->opcode = BRW_OPCODE_MOV;
  2464.             inst->src[1] = reg_undef;
  2465.             inst->predicate = BRW_PREDICATE_NONE;
  2466.             inst->predicate_inverse = false;
  2467.             progress = true;
  2468.          } else if (inst->saturate && inst->src[1].file == IMM) {
  2469.             switch (inst->conditional_mod) {
  2470.             case BRW_CONDITIONAL_LE:
  2471.             case BRW_CONDITIONAL_L:
  2472.                switch (inst->src[1].type) {
  2473.                case BRW_REGISTER_TYPE_F:
  2474.                   if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
  2475.                      inst->opcode = BRW_OPCODE_MOV;
  2476.                      inst->src[1] = reg_undef;
  2477.                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
  2478.                      progress = true;
  2479.                   }
  2480.                   break;
  2481.                default:
  2482.                   break;
  2483.                }
  2484.                break;
  2485.             case BRW_CONDITIONAL_GE:
  2486.             case BRW_CONDITIONAL_G:
  2487.                switch (inst->src[1].type) {
  2488.                case BRW_REGISTER_TYPE_F:
  2489.                   if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
  2490.                      inst->opcode = BRW_OPCODE_MOV;
  2491.                      inst->src[1] = reg_undef;
  2492.                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
  2493.                      progress = true;
  2494.                   }
  2495.                   break;
  2496.                default:
  2497.                   break;
  2498.                }
  2499.             default:
  2500.                break;
  2501.             }
  2502.          }
  2503.          break;
  2504.       case BRW_OPCODE_MAD:
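                  /* The cases below use mad(a, b, c) = a + b * c: a zero b or c
                   * leaves just a, a zero a leaves b * c, a unit b or c turns
                   * the MAD into an ADD, and two immediates fold into one.
                   */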
  2505.          if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
  2506.             inst->opcode = BRW_OPCODE_MOV;
  2507.             inst->src[1] = reg_undef;
  2508.             inst->src[2] = reg_undef;
  2509.             progress = true;
  2510.          } else if (inst->src[0].is_zero()) {
  2511.             inst->opcode = BRW_OPCODE_MUL;
  2512.             inst->src[0] = inst->src[2];
  2513.             inst->src[2] = reg_undef;
  2514.             progress = true;
  2515.          } else if (inst->src[1].is_one()) {
  2516.             inst->opcode = BRW_OPCODE_ADD;
  2517.             inst->src[1] = inst->src[2];
  2518.             inst->src[2] = reg_undef;
  2519.             progress = true;
  2520.          } else if (inst->src[2].is_one()) {
  2521.             inst->opcode = BRW_OPCODE_ADD;
  2522.             inst->src[2] = reg_undef;
  2523.             progress = true;
  2524.          } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
  2525.             inst->opcode = BRW_OPCODE_ADD;
  2526.             inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
  2527.             inst->src[2] = reg_undef;
  2528.             progress = true;
  2529.          }
  2530.          break;
  2531.       case SHADER_OPCODE_RCP: {
  2532.          fs_inst *prev = (fs_inst *)inst->prev;
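                  /* rcp(sqrt(x)) == rsq(x), so fold the SQRT+RCP pair into a
                   * single RSQ reading SQRT's original source.
                   */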
  2533.          if (prev->opcode == SHADER_OPCODE_SQRT) {
  2534.             if (inst->src[0].equals(prev->dst)) {
  2535.                inst->opcode = SHADER_OPCODE_RSQ;
  2536.                inst->src[0] = prev->src[0];
  2537.                progress = true;
  2538.             }
  2539.          }
  2540.          break;
  2541.       }
  2542.       case SHADER_OPCODE_BROADCAST:
  2543.          if (is_uniform(inst->src[0])) {
  2544.             inst->opcode = BRW_OPCODE_MOV;
  2545.             inst->sources = 1;
  2546.             inst->force_writemask_all = true;
  2547.             progress = true;
  2548.          } else if (inst->src[1].file == IMM) {
  2549.             inst->opcode = BRW_OPCODE_MOV;
  2550.             inst->src[0] = component(inst->src[0],
  2551.                                      inst->src[1].fixed_hw_reg.dw1.ud);
  2552.             inst->sources = 1;
  2553.             inst->force_writemask_all = true;
  2554.             progress = true;
  2555.          }
  2556.          break;
  2557.  
  2558.       default:
  2559.          break;
  2560.       }
  2561.  
  2562.       /* Swap if src[0] is immediate. */
  2563.       if (progress && inst->is_commutative()) {
  2564.          if (inst->src[0].file == IMM) {
  2565.             fs_reg tmp = inst->src[1];
  2566.             inst->src[1] = inst->src[0];
  2567.             inst->src[0] = tmp;
  2568.          }
  2569.       }
  2570.    }
  2571.    return progress;
  2572. }
  2573.  
  2574. /**
  2575.  * Optimize sample messages that have constant zero values for the trailing
  2576.  * texture coordinates. We can just reduce the message length for these
  2577.  * instructions instead of reserving a register for it. Trailing parameters
  2578.  * that aren't sent default to zero anyway. This will cause the dead code
  2579.  * eliminator to remove the MOV instruction that would otherwise be emitted to
  2580.  * set up the zero value.
  2581.  */
  2582. bool
  2583. fs_visitor::opt_zero_samples()
  2584. {
  2585.    /* Gen4 infers the texturing opcode based on the message length so we can't
  2586.     * change it.
  2587.     */
  2588.    if (devinfo->gen < 5)
  2589.       return false;
  2590.  
  2591.    bool progress = false;
  2592.  
  2593.    foreach_block_and_inst(block, fs_inst, inst, cfg) {
  2594.       if (!inst->is_tex())
  2595.          continue;
  2596.  
  2597.       fs_inst *load_payload = (fs_inst *) inst->prev;
  2598.  
  2599.       if (load_payload->is_head_sentinel() ||
  2600.           load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
  2601.          continue;
  2602.  
  2603.       /* We don't want to remove the message header or the first parameter.
  2604.        * Removing the first parameter is not allowed, see the Haswell PRM
  2605.        * volume 7, page 149:
  2606.        *
  2607.        *     "Parameter 0 is required except for the sampleinfo message, which
  2608.        *      has no parameter 0"
  2609.        */
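            /* E.g. (illustrative) a SIMD8 message with header_size = 1 and
             * mlen = 5 whose last payload source is zero shrinks to mlen = 4,
             * and the loop repeats while the trailing sources stay zero.
             */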
  2610.       while (inst->mlen > inst->header_size + dispatch_width / 8 &&
  2611.              load_payload->src[(inst->mlen - inst->header_size) /
  2612.                                (dispatch_width / 8) +
  2613.                                inst->header_size - 1].is_zero()) {
  2614.          inst->mlen -= dispatch_width / 8;
  2615.          progress = true;
  2616.       }
  2617.    }
  2618.  
  2619.    if (progress)
  2620.       invalidate_live_intervals();
  2621.  
  2622.    return progress;
  2623. }
  2624.  
  2625. /**
  2626.  * Optimize sample messages which are followed by the final RT write.
  2627.  *
  2628.  * CHV, and GEN9+ can mark a texturing SEND instruction with EOT to have its
  2629.  * results sent directly to the framebuffer, bypassing the EU.  Recognize the
  2630.  * final texturing results copied to the framebuffer write payload and modify
  2631.  * them to write to the framebuffer directly.
  2632.  */
  2633. bool
  2634. fs_visitor::opt_sampler_eot()
  2635. {
  2636.    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
  2637.  
  2638.    if (stage != MESA_SHADER_FRAGMENT)
  2639.       return false;
  2640.  
  2641.    if (devinfo->gen < 9 && !devinfo->is_cherryview)
  2642.       return false;
  2643.  
  2644.    /* FINISHME: It should be possible to implement this optimization when there
  2645.     * are multiple drawbuffers.
  2646.     */
  2647.    if (key->nr_color_regions != 1)
  2648.       return false;
  2649.  
  2650.    /* Look for a texturing instruction immediately before the final FB_WRITE. */
  2651.    fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
  2652.    assert(fb_write->eot);
  2653.    assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
  2654.  
  2655.    fs_inst *tex_inst = (fs_inst *) fb_write->prev;
  2656.  
  2657.    /* There wasn't one; nothing to do. */
  2658.    if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
  2659.       return false;
  2660.  
  2661.    /* This optimisation doesn't seem to work for textureGather for some
  2662.     * reason. I can't find any documentation or known workarounds to indicate
  2663.     * that this is expected, but considering that it is probably pretty
  2664.     * unlikely that a shader would directly write out the results from
  2665.     * textureGather we might as well just disable it.
  2666.     */
  2667.    if (tex_inst->opcode == SHADER_OPCODE_TG4 ||
  2668.        tex_inst->opcode == SHADER_OPCODE_TG4_OFFSET)
  2669.       return false;
  2670.  
  2671.    /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
  2672.     * It's very likely to be the previous instruction.
  2673.     */
  2674.    fs_inst *load_payload = (fs_inst *) tex_inst->prev;
  2675.    if (load_payload->is_head_sentinel() ||
  2676.        load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
  2677.       return false;
  2678.  
  2679.    assert(!tex_inst->eot); /* We can't get here twice */
  2680.    assert((tex_inst->offset & (0xff << 24)) == 0);
  2681.  
  2682.    tex_inst->offset |= fb_write->target << 24;
  2683.    tex_inst->eot = true;
  2684.    tex_inst->dst = reg_null_ud;
  2685.    fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
  2686.  
  2687.    /* If a header is present, marking the eot is sufficient. Otherwise, we need
  2688.     * to create a new LOAD_PAYLOAD command with the same sources and a space
   2689.     * saved for the header. Using a new destination register not only ensures
   2690.     * we have enough space, but also lets the dead code eliminator kill the
   2691.     * instruction this replaces.
  2692.     */
  2693.    if (tex_inst->header_size != 0)
  2694.       return true;
  2695.  
  2696.    fs_reg send_header = vgrf(load_payload->sources + 1);
  2697.    fs_reg *new_sources =
  2698.       ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
  2699.  
  2700.    new_sources[0] = fs_reg();
  2701.    for (int i = 0; i < load_payload->sources; i++)
  2702.       new_sources[i+1] = load_payload->src[i];
  2703.  
   2704.    /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
   2705.     * requires a lot of information about the sources to figure out how many
   2706.     * registers need to be used. Given this stage in our optimization, we may
   2707.     * not have the appropriate GRFs required by LOAD_PAYLOAD at this point
   2708.     * (copy propagation has not yet settled). Therefore, we need to emit the
   2709.     * instruction manually.
   2710.     */
  2711.    fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
  2712.                                                     load_payload->exec_size,
  2713.                                                     send_header,
  2714.                                                     new_sources,
  2715.                                                     load_payload->sources + 1);
  2716.  
  2717.    new_load_payload->regs_written = load_payload->regs_written + 1;
  2718.    new_load_payload->header_size = 1;
  2719.    tex_inst->mlen++;
  2720.    tex_inst->header_size = 1;
  2721.    tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
  2722.    tex_inst->src[0] = send_header;
  2723.  
  2724.    return true;
  2725. }
  2726.  
  2727. bool
  2728. fs_visitor::opt_register_renaming()
  2729. {
  2730.    bool progress = false;
  2731.    int depth = 0;
  2732.  
  2733.    int remap[alloc.count];
  2734.    memset(remap, -1, sizeof(int) * alloc.count);
  2735.  
  2736.    foreach_block_and_inst(block, fs_inst, inst, cfg) {
  2737.       if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
  2738.          depth++;
  2739.       } else if (inst->opcode == BRW_OPCODE_ENDIF ||
  2740.                  inst->opcode == BRW_OPCODE_WHILE) {
  2741.          depth--;
  2742.       }
  2743.  
  2744.       /* Rewrite instruction sources. */
  2745.       for (int i = 0; i < inst->sources; i++) {
  2746.          if (inst->src[i].file == GRF &&
  2747.              remap[inst->src[i].reg] != -1 &&
  2748.              remap[inst->src[i].reg] != inst->src[i].reg) {
  2749.             inst->src[i].reg = remap[inst->src[i].reg];
  2750.             progress = true;
  2751.          }
  2752.       }
  2753.  
  2754.       const int dst = inst->dst.reg;
  2755.  
  2756.       if (depth == 0 &&
  2757.           inst->dst.file == GRF &&
  2758.           alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
  2759.           !inst->is_partial_write()) {
  2760.          if (remap[dst] == -1) {
  2761.             remap[dst] = dst;
  2762.          } else {
  2763.             remap[dst] = alloc.allocate(inst->dst.width / 8);
  2764.             inst->dst.reg = remap[dst];
  2765.             progress = true;
  2766.          }
  2767.       } else if (inst->dst.file == GRF &&
  2768.                  remap[dst] != -1 &&
  2769.                  remap[dst] != dst) {
  2770.          inst->dst.reg = remap[dst];
  2771.          progress = true;
  2772.       }
  2773.    }
  2774.  
  2775.    if (progress) {
  2776.       invalidate_live_intervals();
  2777.  
  2778.       for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
  2779.          if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
  2780.             delta_xy[i].reg = remap[delta_xy[i].reg];
  2781.          }
  2782.       }
  2783.    }
  2784.  
  2785.    return progress;
  2786. }
  2787.  
  2788. /**
  2789.  * Remove redundant or useless discard jumps.
  2790.  *
  2791.  * For example, we can eliminate jumps in the following sequence:
  2792.  *
  2793.  * discard-jump       (redundant with the next jump)
  2794.  * discard-jump       (useless; jumps to the next instruction)
  2795.  * placeholder-halt
  2796.  */
  2797. bool
  2798. fs_visitor::opt_redundant_discard_jumps()
  2799. {
  2800.    bool progress = false;
  2801.  
  2802.    bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
  2803.  
  2804.    fs_inst *placeholder_halt = NULL;
  2805.    foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
  2806.       if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
  2807.          placeholder_halt = inst;
  2808.          break;
  2809.       }
  2810.    }
  2811.  
  2812.    if (!placeholder_halt)
  2813.       return false;
  2814.  
  2815.    /* Delete any HALTs immediately before the placeholder halt. */
  2816.    for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
  2817.         !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
  2818.         prev = (fs_inst *) placeholder_halt->prev) {
  2819.       prev->remove(last_bblock);
  2820.       progress = true;
  2821.    }
  2822.  
  2823.    if (progress)
  2824.       invalidate_live_intervals();
  2825.  
  2826.    return progress;
  2827. }
  2828.  
  2829. bool
  2830. fs_visitor::compute_to_mrf()
  2831. {
  2832.    bool progress = false;
  2833.    int next_ip = 0;
  2834.  
  2835.    /* No MRFs on Gen >= 7. */
  2836.    if (devinfo->gen >= 7)
  2837.       return false;
  2838.  
  2839.    calculate_live_intervals();
  2840.  
  2841.    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
  2842.       int ip = next_ip;
  2843.       next_ip++;
  2844.  
  2845.       if (inst->opcode != BRW_OPCODE_MOV ||
  2846.           inst->is_partial_write() ||
  2847.           inst->dst.file != MRF || inst->src[0].file != GRF ||
  2848.           inst->dst.type != inst->src[0].type ||
  2849.           inst->src[0].abs || inst->src[0].negate ||
  2850.           !inst->src[0].is_contiguous() ||
  2851.           inst->src[0].subreg_offset)
  2852.          continue;
  2853.  
  2854.       /* Work out which hardware MRF registers are written by this
  2855.        * instruction.
  2856.        */
  2857.       int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
  2858.       int mrf_high;
  2859.       if (inst->dst.reg & BRW_MRF_COMPR4) {
  2860.          mrf_high = mrf_low + 4;
  2861.       } else if (inst->exec_size == 16) {
  2862.          mrf_high = mrf_low + 1;
  2863.       } else {
  2864.          mrf_high = mrf_low;
  2865.       }
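            /* E.g. a SIMD16 write to m3 spans m3..m4, while a COMPR4 write to
             * m3 is tracked as the range m3..m7 (mrf_high = mrf_low + 4).
             */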
  2866.  
  2867.       /* Can't compute-to-MRF this GRF if someone else was going to
  2868.        * read it later.
  2869.        */
  2870.       if (this->virtual_grf_end[inst->src[0].reg] > ip)
  2871.          continue;
  2872.  
   2873.       /* Found a move of a GRF to a MRF.  Let's see if we can rewrite the
   2874.        * instruction that produced this GRF to write into the MRF instead.
   2875.        */
  2876.       foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
  2877.          if (scan_inst->dst.file == GRF &&
  2878.              scan_inst->dst.reg == inst->src[0].reg) {
  2879.             /* Found the last instruction to write the register we want to turn
  2880.              * into a compute-to-MRF.
  2881.              */
  2882.  
  2883.             /* If this one instruction didn't populate all the
  2884.              * channels, bail.  We might be able to rewrite everything
  2885.              * that writes that reg, but it would require smarter
  2886.              * tracking to delay the rewriting until complete success.
  2887.              */
  2888.             if (scan_inst->is_partial_write())
  2889.                break;
  2890.  
  2891.             /* Instructions writing more than one register would require us
  2892.              * to coalesce out more than one MOV at a time.
  2893.              */
  2894.             if (scan_inst->regs_written > scan_inst->dst.width / 8)
  2895.                break;
  2896.  
  2897.             /* SEND instructions can't have MRF as a destination. */
  2898.             if (scan_inst->mlen)
  2899.                break;
  2900.  
  2901.             if (devinfo->gen == 6) {
  2902.                /* gen6 math instructions must have the destination be
  2903.                 * GRF, so no compute-to-MRF for them.
  2904.                 */
  2905.                if (scan_inst->is_math()) {
  2906.                   break;
  2907.                }
  2908.             }
  2909.  
  2910.             if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
  2911.                /* Found the creator of our MRF's source value. */
  2912.                scan_inst->dst.file = MRF;
  2913.                scan_inst->dst.reg = inst->dst.reg;
  2914.                scan_inst->saturate |= inst->saturate;
  2915.                inst->remove(block);
  2916.                progress = true;
  2917.             }
  2918.             break;
  2919.          }
  2920.  
  2921.          /* We don't handle control flow here.  Most computation of
  2922.           * values that end up in MRFs happens shortly before the MRF
  2923.           * write anyway.
  2924.           */
  2925.          if (block->start() == scan_inst)
  2926.             break;
  2927.  
  2928.          /* You can't read from an MRF, so if someone else reads our
  2929.           * MRF's source GRF that we wanted to rewrite, that stops us.
  2930.           */
  2931.          bool interfered = false;
  2932.          for (int i = 0; i < scan_inst->sources; i++) {
  2933.             if (scan_inst->src[i].file == GRF &&
  2934.                 scan_inst->src[i].reg == inst->src[0].reg &&
  2935.                 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
  2936.                interfered = true;
  2937.             }
  2938.          }
  2939.          if (interfered)
  2940.             break;
  2941.  
  2942.          if (scan_inst->dst.file == MRF) {
  2943.             /* If somebody else writes our MRF here, we can't
  2944.              * compute-to-MRF before that.
  2945.              */
  2946.             int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
  2947.             int scan_mrf_high;
  2948.  
  2949.             if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
  2950.                scan_mrf_high = scan_mrf_low + 4;
  2951.             } else if (scan_inst->exec_size == 16) {
  2952.                scan_mrf_high = scan_mrf_low + 1;
  2953.             } else {
  2954.                scan_mrf_high = scan_mrf_low;
  2955.             }
  2956.  
  2957.             if (mrf_low == scan_mrf_low ||
  2958.                 mrf_low == scan_mrf_high ||
  2959.                 mrf_high == scan_mrf_low ||
  2960.                 mrf_high == scan_mrf_high) {
  2961.                break;
  2962.             }
  2963.          }
  2964.  
  2965.          if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
  2966.             /* Found a SEND instruction, which means that there are
  2967.              * live values in MRFs from base_mrf to base_mrf +
  2968.              * scan_inst->mlen - 1.  Don't go pushing our MRF write up
  2969.              * above it.
  2970.              */
  2971.             if (mrf_low >= scan_inst->base_mrf &&
  2972.                 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
  2973.                break;
  2974.             }
  2975.             if (mrf_high >= scan_inst->base_mrf &&
  2976.                 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
  2977.                break;
  2978.             }
  2979.          }
  2980.       }
  2981.    }
  2982.  
  2983.    if (progress)
  2984.       invalidate_live_intervals();
  2985.  
  2986.    return progress;
  2987. }
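
/* An illustrative sketch of the rewrite, with hypothetical register numbers.
 * Assuming vgrf7 dies at the MOV and the backwards scan finds no
 * interference:
 *
 *    before:   add(8)  vgrf7, vgrf3, vgrf4
 *              mov(8)  m4, vgrf7
 *
 *    after:    add(8)  m4, vgrf3, vgrf4
 *
 * The MOV is removed and its saturate flag, if set, is folded into the
 * producing instruction.
 */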
  2988.  
  2989. /**
  2990.  * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
  2991.  * flow.  We could probably do better here with some form of divergence
  2992.  * analysis.
  2993.  */
  2994. bool
  2995. fs_visitor::eliminate_find_live_channel()
  2996. {
  2997.    bool progress = false;
  2998.    unsigned depth = 0;
  2999.  
  3000.    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
  3001.       switch (inst->opcode) {
  3002.       case BRW_OPCODE_IF:
  3003.       case BRW_OPCODE_DO:
  3004.          depth++;
  3005.          break;
  3006.  
  3007.       case BRW_OPCODE_ENDIF:
  3008.       case BRW_OPCODE_WHILE:
  3009.          depth--;
  3010.          break;
  3011.  
  3012.       case FS_OPCODE_DISCARD_JUMP:
  3013.          /* This can potentially make control flow non-uniform until the end
  3014.           * of the program.
  3015.           */
  3016.          return progress;
  3017.  
  3018.       case SHADER_OPCODE_FIND_LIVE_CHANNEL:
  3019.          if (depth == 0) {
  3020.             inst->opcode = BRW_OPCODE_MOV;
  3021.             inst->src[0] = fs_reg(0);
  3022.             inst->sources = 1;
  3023.             inst->force_writemask_all = true;
  3024.             progress = true;
  3025.          }
  3026.          break;
  3027.  
  3028.       default:
  3029.          break;
  3030.       }
  3031.    }
  3032.  
  3033.    return progress;
  3034. }
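
/* An illustrative sketch, with a hypothetical register: outside of all
 * control flow the execution mask still matches the dispatch mask, so
 * channel 0 can stand in for the lowest live channel:
 *
 *    before:   find_live_channel(8)  vgrf5
 *    after:    mov(8)  vgrf5, 0d   (force_writemask_all)
 */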
  3035.  
  3036. /**
  3037.  * Emit a replicated-data clear shader: a MOV of the clear color into the
  3038.  * message registers followed by one FS_OPCODE_REP_FB_WRITE per color region.
  3039.  */
  3040. void
  3041. fs_visitor::emit_repclear_shader()
  3042. {
  3043.    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
  3044.    int base_mrf = 1;
  3045.    int color_mrf = base_mrf + 2;
  3046.  
  3047.    fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
  3048.                            fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
  3049.    mov->force_writemask_all = true;
  3050.  
  3051.    fs_inst *write;
  3052.    if (key->nr_color_regions == 1) {
  3053.       write = emit(FS_OPCODE_REP_FB_WRITE);
  3054.       write->saturate = key->clamp_fragment_color;
  3055.       write->base_mrf = color_mrf;
  3056.       write->target = 0;
  3057.       write->header_size = 0;
  3058.       write->mlen = 1;
  3059.    } else {
  3060.       assume(key->nr_color_regions > 0);
  3061.       for (int i = 0; i < key->nr_color_regions; ++i) {
  3062.          write = emit(FS_OPCODE_REP_FB_WRITE);
  3063.          write->saturate = key->clamp_fragment_color;
  3064.          write->base_mrf = base_mrf;
  3065.          write->target = i;
  3066.          write->header_size = 2;
  3067.          write->mlen = 3;
  3068.       }
  3069.    }
  3070.    write->eot = true;
  3071.  
  3072.    calculate_cfg();
  3073.  
  3074.    assign_constant_locations();
  3075.    assign_curb_setup();
  3076.  
  3077.    /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
  3078.    assert(mov->src[0].file == HW_REG);
  3079.    mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
  3080. }
  3081.  
  3082. /**
  3083.  * Walks through basic blocks, looking for repeated MRF writes and
  3084.  * removing the later ones.
  3085.  */
  3086. bool
  3087. fs_visitor::remove_duplicate_mrf_writes()
  3088. {
  3089.    fs_inst *last_mrf_move[16];
  3090.    bool progress = false;
  3091.  
  3092.    /* We'd need to update the MRF tracking for compressed (SIMD16) instructions, so bail. */
  3093.    if (dispatch_width == 16)
  3094.       return false;
  3095.  
  3096.    memset(last_mrf_move, 0, sizeof(last_mrf_move));
  3097.  
  3098.    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
  3099.       if (inst->is_control_flow()) {
  3100.          memset(last_mrf_move, 0, sizeof(last_mrf_move));
  3101.       }
  3102.  
  3103.       if (inst->opcode == BRW_OPCODE_MOV &&
  3104.           inst->dst.file == MRF) {
  3105.          fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
  3106.          if (prev_inst && inst->equals(prev_inst)) {
  3107.             inst->remove(block);
  3108.             progress = true;
  3109.             continue;
  3110.          }
  3111.       }
  3112.  
  3113.       /* Clear out the last-write records for MRFs that were overwritten. */
  3114.       if (inst->dst.file == MRF) {
  3115.          last_mrf_move[inst->dst.reg] = NULL;
  3116.       }
  3117.  
  3118.       if (inst->mlen > 0 && inst->base_mrf != -1) {
  3119.          /* Found a SEND instruction, which will include two or fewer
  3120.           * implied MRF writes.  We could do better here.
  3121.           */
  3122.          for (int i = 0; i < implied_mrf_writes(inst); i++) {
  3123.             last_mrf_move[inst->base_mrf + i] = NULL;
  3124.          }
  3125.       }
  3126.  
  3127.       /* Clear out any MRF move records whose sources got overwritten. */
  3128.       if (inst->dst.file == GRF) {
  3129.          for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
  3130.             if (last_mrf_move[i] &&
  3131.                 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
  3132.                last_mrf_move[i] = NULL;
  3133.             }
  3134.          }
  3135.       }
  3136.  
  3137.       if (inst->opcode == BRW_OPCODE_MOV &&
  3138.           inst->dst.file == MRF &&
  3139.           inst->src[0].file == GRF &&
  3140.           !inst->is_partial_write()) {
  3141.          last_mrf_move[inst->dst.reg] = inst;
  3142.       }
  3143.    }
  3144.  
  3145.    if (progress)
  3146.       invalidate_live_intervals();
  3147.  
  3148.    return progress;
  3149. }
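
/* An illustrative sketch of the pattern removed, with hypothetical
 * registers.  The second MOV is deleted because neither m2 nor vgrf5 is
 * written in between and no control flow intervenes:
 *
 *    mov(8)  m2, vgrf5
 *    ...
 *    mov(8)  m2, vgrf5        <- removed
 */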
  3150.  
  3151. static void
  3152. clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
  3153. {
  3154.    /* Clear the flag for registers that actually got read (as expected). */
  3155.    for (int i = 0; i < inst->sources; i++) {
  3156.       int grf;
  3157.       if (inst->src[i].file == GRF) {
  3158.          grf = inst->src[i].reg;
  3159.       } else if (inst->src[i].file == HW_REG &&
  3160.                  inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
  3161.          grf = inst->src[i].fixed_hw_reg.nr;
  3162.       } else {
  3163.          continue;
  3164.       }
  3165.  
  3166.       if (grf >= first_grf &&
  3167.           grf < first_grf + grf_len) {
  3168.          deps[grf - first_grf] = false;
  3169.          if (inst->exec_size == 16)
  3170.             deps[grf - first_grf + 1] = false;
  3171.       }
  3172.    }
  3173. }
  3174.  
  3175. /**
  3176.  * Implements this workaround for the original 965:
  3177.  *
  3178.  *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
  3179.  *      check for post destination dependencies on this instruction, software
  3180.  *      must ensure that there is no destination hazard for the case of ‘write
  3181.  *      followed by a posted write’ shown in the following example.
  3182.  *
  3183.  *      1. mov r3 0
  3184.  *      2. send r3.xy <rest of send instruction>
  3185.  *      3. mov r2 r3
  3186.  *
  3187.  *      Due to no post-destination dependency check on the ‘send’, the above
  3188.  *      code sequence could have two instructions (1 and 2) in flight at the
  3189.  *      same time that both consider ‘r3’ as the target of their final writes."
  3190.  */
  3191. void
  3192. fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
  3193.                                                         fs_inst *inst)
  3194. {
  3195.    int write_len = inst->regs_written;
  3196.    int first_write_grf = inst->dst.reg;
  3197.    bool needs_dep[BRW_MAX_MRF];
  3198.    assert(write_len < (int)sizeof(needs_dep) - 1);
  3199.  
  3200.    memset(needs_dep, false, sizeof(needs_dep));
  3201.    memset(needs_dep, true, write_len);
  3202.  
  3203.    clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
  3204.  
  3205.    /* Walk backwards looking for writes to registers we're writing which
  3206.     * aren't read since being written.  If we hit the start of the program,
  3207.     * we assume that there are no outstanding dependencies on entry to the
  3208.     * program.
  3209.     */
  3210.    foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
  3211.       /* If we hit control flow, assume that there *are* outstanding
  3212.        * dependencies, and force their cleanup before our instruction.
  3213.        */
  3214.       if (block->start() == scan_inst) {
  3215.          for (int i = 0; i < write_len; i++) {
  3216.             if (needs_dep[i]) {
  3217.                inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
  3218.             }
  3219.          }
  3220.          return;
  3221.       }
  3222.  
  3223.       /* We insert our reads as late as possible, on the assumption that any
  3224.        * non-MOV instruction that might have left us an outstanding
  3225.        * dependency has more latency than a MOV.
  3226.        */
  3227.       if (scan_inst->dst.file == GRF) {
  3228.          for (int i = 0; i < scan_inst->regs_written; i++) {
  3229.             int reg = scan_inst->dst.reg + i;
  3230.  
  3231.             if (reg >= first_write_grf &&
  3232.                 reg < first_write_grf + write_len &&
  3233.                 needs_dep[reg - first_write_grf]) {
  3234.                inst->insert_before(block, DEP_RESOLVE_MOV(reg));
  3235.                needs_dep[reg - first_write_grf] = false;
  3236.                if (scan_inst->exec_size == 16)
  3237.                   needs_dep[reg - first_write_grf + 1] = false;
  3238.             }
  3239.          }
  3240.       }
  3241.  
  3242.       /* Clear the flag for registers that actually got read (as expected). */
  3243.       clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
  3244.  
  3245.       /* Continue the loop only if we haven't resolved all the dependencies */
  3246.       int i;
  3247.       for (i = 0; i < write_len; i++) {
  3248.          if (needs_dep[i])
  3249.             break;
  3250.       }
  3251.       if (i == write_len)
  3252.          return;
  3253.    }
  3254. }
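
/* An illustrative sketch of the fix for the erratum example above: a
 * dependency-resolving MOV that sources the hazardous register is inserted
 * before the send, so write (1) must retire before the send's posted write
 * can begin:
 *
 *    1. mov  r3, 0
 *       mov  ..., r3          <- DEP_RESOLVE_MOV inserted by this pass
 *    2. send r3.xy <rest of send instruction>
 */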
  3255.  
  3256. /**
  3257.  * Implements this workaround for the original 965:
  3258.  *
  3259.  *     "[DevBW, DevCL] Errata: A destination register from a send can not be
  3260.  *      used as a destination register until after it has been sourced by an
  3261.  *      instruction with a different destination register."
  3262.  */
  3263. void
  3264. fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
  3265. {
  3266.    int write_len = inst->regs_written;
  3267.    int first_write_grf = inst->dst.reg;
  3268.    bool needs_dep[BRW_MAX_MRF];
  3269.    assert(write_len < (int)sizeof(needs_dep) - 1);
  3270.  
  3271.    memset(needs_dep, false, sizeof(needs_dep));
  3272.    memset(needs_dep, true, write_len);
  3273.    /* Walk forwards looking for writes to registers we're writing which aren't
  3274.     * read before being written.
  3275.     */
  3276.    foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
  3277.       /* If we hit control flow, force resolve all remaining dependencies. */
  3278.       if (block->end() == scan_inst) {
  3279.          for (int i = 0; i < write_len; i++) {
  3280.             if (needs_dep[i])
  3281.                scan_inst->insert_before(block,
  3282.                                         DEP_RESOLVE_MOV(first_write_grf + i));
  3283.          }
  3284.          return;
  3285.       }
  3286.  
  3287.       /* Clear the flag for registers that actually got read (as expected). */
  3288.       clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
  3289.  
  3290.       /* We insert our reads as late as possible since they're reading the
  3291.        * result of a SEND, which has massive latency.
  3292.        */
  3293.       if (scan_inst->dst.file == GRF &&
  3294.           scan_inst->dst.reg >= first_write_grf &&
  3295.           scan_inst->dst.reg < first_write_grf + write_len &&
  3296.           needs_dep[scan_inst->dst.reg - first_write_grf]) {
  3297.          scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
  3298.          needs_dep[scan_inst->dst.reg - first_write_grf] = false;
  3299.       }
  3300.  
  3301.       /* Continue the loop only if we haven't resolved all the dependencies */
  3302.       int i;
  3303.       for (i = 0; i < write_len; i++) {
  3304.          if (needs_dep[i])
  3305.             break;
  3306.       }
  3307.       if (i == write_len)
  3308.          return;
  3309.    }
  3310. }
  3311.  
  3312. void
  3313. fs_visitor::insert_gen4_send_dependency_workarounds()
  3314. {
  3315.    if (devinfo->gen != 4 || devinfo->is_g4x)
  3316.       return;
  3317.  
  3318.    bool progress = false;
  3319.  
  3320.    /* Note that we're done with register allocation, so GRF fs_regs always
  3321.     * have a .reg_offset of 0.
  3322.     */
  3323.  
  3324.    foreach_block_and_inst(block, fs_inst, inst, cfg) {
  3325.       if (inst->mlen != 0 && inst->dst.file == GRF) {
  3326.          insert_gen4_pre_send_dependency_workarounds(block, inst);
  3327.          insert_gen4_post_send_dependency_workarounds(block, inst);
  3328.          progress = true;
  3329.       }
  3330.    }
  3331.  
  3332.    if (progress)
  3333.       invalidate_live_intervals();
  3334. }
  3335.  
  3336. /**
  3337.  * Turns the generic expression-style uniform pull constant load instruction
  3338.  * into a hardware-specific series of instructions for loading a pull
  3339.  * constant.
  3340.  *
  3341.  * The expression style allows the CSE pass before this to optimize out
  3342.  * repeated loads from the same offset, and gives the pre-register-allocation
  3343.  * scheduling full flexibility, while the conversion to native instructions
  3344.  * gives the post-register-allocation scheduler the best information
  3345.  * possible.
  3346.  *
  3347.  * Note that execution masking for setting up pull constant loads is special:
  3348.  * the channels that need to be written are unrelated to the current execution
  3349.  * mask, since a later instruction will use one of the result channels as a
  3350.  * source operand for all 8 or 16 of its channels.
  3351.  */
  3352. void
  3353. fs_visitor::lower_uniform_pull_constant_loads()
  3354. {
  3355.    foreach_block_and_inst (block, fs_inst, inst, cfg) {
  3356.       if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
  3357.          continue;
  3358.  
  3359.       if (devinfo->gen >= 7) {
  3360.          /* The offset arg before was a vec4-aligned byte offset.  We need to
  3361.           * turn it into a dword offset.
  3362.           */
  3363.          fs_reg const_offset_reg = inst->src[1];
  3364.          assert(const_offset_reg.file == IMM &&
  3365.                 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
  3366.          const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
  3367.          fs_reg payload = fs_reg(GRF, alloc.allocate(1));
  3368.  
  3369.          /* We have to use a message header on Skylake to get SIMD4x2 mode.
  3370.           * Reserve space for the register.
  3371.           */
  3372.          if (devinfo->gen >= 9) {
  3373.             payload.reg_offset++;
  3374.             alloc.sizes[payload.reg] = 2;
  3375.          }
  3376.  
  3377.          /* This is actually going to be a MOV, but since only the first dword
  3378.           * is accessed, we have a special opcode to do just that one.  Note
  3379.           * that this needs to be an operation that will be considered a def
  3380.           * by live variable analysis, or register allocation will explode.
  3381.           */
  3382.          fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
  3383.                                                8, payload, const_offset_reg);
  3384.          setup->force_writemask_all = true;
  3385.  
  3386.          setup->ir = inst->ir;
  3387.          setup->annotation = inst->annotation;
  3388.          inst->insert_before(block, setup);
  3389.  
  3390.          /* Similarly, this will only populate the first 4 channels of the
  3391.           * result register (since we only use smear values from 0-3), but we
  3392.           * don't tell the optimizer.
  3393.           */
  3394.          inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
  3395.          inst->src[1] = payload;
  3396.  
  3397.          invalidate_live_intervals();
  3398.       } else {
  3399.          /* Before register allocation, we didn't tell the scheduler about the
  3400.           * MRF we use.  We know it's safe to use this MRF because nothing
  3401.           * else does except for register spill/unspill, which generates and
  3402.           * uses its MRF within a single IR instruction.
  3403.           */
  3404.          inst->base_mrf = 14;
  3405.          inst->mlen = 1;
  3406.       }
  3407.    }
  3408. }
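
/* An illustrative sketch of the Gen7+ lowering, with hypothetical registers.
 * A vec4-aligned byte offset of 32 becomes dword offset 8:
 *
 *    before:   uniform_pull_const_load       vgrf6, surf, 32u
 *
 *    after:    set_simd4x2_offset(8)         vgrf9, 8u   (writemask_all)
 *              uniform_pull_const_load_gen7  vgrf6, surf, vgrf9
 *
 * On Gen < 7 the opcode is kept and the load is simply given base_mrf 14
 * and mlen 1.
 */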
  3409.  
  3410. bool
  3411. fs_visitor::lower_load_payload()
  3412. {
  3413.    bool progress = false;
  3414.  
  3415.    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
  3416.       if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
  3417.          continue;
  3418.  
  3419.       assert(inst->dst.file == MRF || inst->dst.file == GRF);
  3420.       assert(inst->saturate == false);
  3421.  
  3422.       fs_reg dst = inst->dst;
  3423.  
  3424.       /* Get rid of COMPR4.  We'll add it back in if we need it */
  3425.       if (dst.file == MRF)
  3426.          dst.reg = dst.reg & ~BRW_MRF_COMPR4;
  3427.  
  3428.       dst.width = 8;
  3429.       for (uint8_t i = 0; i < inst->header_size; i++) {
  3430.          if (inst->src[i].file != BAD_FILE) {
  3431.             fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
  3432.             fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
  3433.             mov_src.width = 8;
  3434.             fs_inst *mov = MOV(mov_dst, mov_src);
  3435.             mov->force_writemask_all = true;
  3436.             inst->insert_before(block, mov);
  3437.          }
  3438.          dst = offset(dst, 1);
  3439.       }
  3440.  
  3441.       dst.width = inst->exec_size;
  3442.       if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
  3443.           inst->exec_size > 8) {
  3444.          /* In this case, the payload portion of the LOAD_PAYLOAD isn't
  3445.           * a straightforward copy.  Instead, the result of the
  3446.           * LOAD_PAYLOAD is treated as interleaved and the first four
  3447.           * non-header sources are unpacked as:
  3448.           *
  3449.           * m + 0: r0
  3450.           * m + 1: g0
  3451.           * m + 2: b0
  3452.           * m + 3: a0
  3453.           * m + 4: r1
  3454.           * m + 5: g1
  3455.           * m + 6: b1
  3456.           * m + 7: a1
  3457.           *
  3458.           * This is used for gen <= 5 fb writes.
  3459.           */
  3460.          assert(inst->exec_size == 16);
  3461.          assert(inst->header_size + 4 <= inst->sources);
  3462.          for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
  3463.             if (inst->src[i].file != BAD_FILE) {
  3464.                if (devinfo->has_compr4) {
  3465.                   fs_reg compr4_dst = retype(dst, inst->src[i].type);
  3466.                   compr4_dst.reg |= BRW_MRF_COMPR4;
  3467.  
  3468.                   fs_inst *mov = MOV(compr4_dst, inst->src[i]);
  3469.                   mov->force_writemask_all = inst->force_writemask_all;
  3470.                   inst->insert_before(block, mov);
  3471.                } else {
  3472.                   /* Platform doesn't have COMPR4.  We have to fake it */
  3473.                   fs_reg mov_dst = retype(dst, inst->src[i].type);
  3474.                   mov_dst.width = 8;
  3475.  
  3476.                   fs_inst *mov = MOV(mov_dst, half(inst->src[i], 0));
  3477.                   mov->force_writemask_all = inst->force_writemask_all;
  3478.                   inst->insert_before(block, mov);
  3479.  
  3480.                   mov = MOV(offset(mov_dst, 4), half(inst->src[i], 1));
  3481.                   mov->force_writemask_all = inst->force_writemask_all;
  3482.                   mov->force_sechalf = true;
  3483.                   inst->insert_before(block, mov);
  3484.                }
  3485.             }
  3486.  
  3487.             dst.reg++;
  3488.          }
  3489.  
  3490.          /* The loop above only ever incremented us through the first set
  3491.           * of 4 registers.  However, thanks to the magic of COMPR4, we
  3492.           * actually wrote to the first 8 registers, so we need to take
  3493.           * that into account now.
  3494.           */
  3495.          dst.reg += 4;
  3496.  
  3497.          /* The COMPR4 code took care of the first 4 sources.  We'll let
  3498.           * the regular path handle any remaining sources.  Yes, we are
  3499.           * modifying the instruction but we're about to delete it so
  3500.           * this really doesn't hurt anything.
  3501.           */
  3502.          inst->header_size += 4;
  3503.       }
  3504.  
  3505.       for (uint8_t i = inst->header_size; i < inst->sources; i++) {
  3506.          if (inst->src[i].file != BAD_FILE) {
  3507.             fs_inst *mov = MOV(retype(dst, inst->src[i].type),
  3508.                                inst->src[i]);
  3509.             mov->force_writemask_all = inst->force_writemask_all;
  3510.             mov->force_sechalf = inst->force_sechalf;
  3511.             inst->insert_before(block, mov);
  3512.          }
  3513.          dst = offset(dst, 1);
  3514.       }
  3515.  
  3516.       inst->remove(block);
  3517.       progress = true;
  3518.    }
  3519.  
  3520.    if (progress)
  3521.       invalidate_live_intervals();
  3522.  
  3523.    return progress;
  3524. }
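
/* An illustrative sketch of the common (non-COMPR4) lowering, with
 * hypothetical registers and one header source:
 *
 *    before:   load_payload(16)  vgrf8, vgrf1, vgrf2, vgrf3
 *
 *    after:    mov(8)   vgrf8,            vgrf1   (writemask_all, UD)
 *              mov(16)  offset(vgrf8, 1), vgrf2
 *              mov(16)  offset(vgrf8, 2), vgrf3
 *
 * and the LOAD_PAYLOAD itself is deleted.
 */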
  3525.  
  3526. bool
  3527. fs_visitor::lower_integer_multiplication()
  3528. {
  3529.    bool progress = false;
  3530.  
  3531.    /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
  3532.     * directly, but Cherryview cannot.
  3533.     */
  3534.    if (devinfo->gen >= 8 && !devinfo->is_cherryview)
  3535.       return false;
  3536.  
  3537.    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
  3538.       if (inst->opcode != BRW_OPCODE_MUL ||
  3539.           inst->dst.is_accumulator() ||
  3540.           (inst->dst.type != BRW_REGISTER_TYPE_D &&
  3541.            inst->dst.type != BRW_REGISTER_TYPE_UD))
  3542.          continue;
  3543.  
  3544. #define insert(instr) inst->insert_before(block, instr)
  3545.  
  3546.       /* The MUL instruction isn't commutative. On Gen <= 6, only the low
  3547.        * 16 bits of src0 are read, and on Gen >= 7 only the low 16 bits of
  3548.        * src1 are used.
  3549.        *
  3550.        * If multiplying by an immediate value that fits in 16 bits, do a
  3551.        * single MUL instruction with that value in the proper location.
  3552.        */
  3553.       if (inst->src[1].file == IMM &&
  3554.           inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
  3555.          if (devinfo->gen < 7) {
  3556.             fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
  3557.                        inst->dst.type, dispatch_width);
  3558.             insert(MOV(imm, inst->src[1]));
  3559.             insert(MUL(inst->dst, imm, inst->src[0]));
  3560.          } else {
  3561.             insert(MUL(inst->dst, inst->src[0], inst->src[1]));
  3562.          }
  3563.       } else {
  3564.          /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
  3565.           * do 32-bit integer multiplication in one instruction, but instead
  3566.           * must do a sequence (which actually calculates a 64-bit result):
  3567.           *
  3568.           *    mul(8)  acc0<1>D   g3<8,8,1>D      g4<8,8,1>D
  3569.           *    mach(8) null       g3<8,8,1>D      g4<8,8,1>D
  3570.           *    mov(8)  g2<1>D     acc0<8,8,1>D
  3571.           *
  3572.           * But on Gen > 6, the ability to use the second accumulator register
  3573.           * (acc1) for non-float data types was removed, preventing a simple
  3574.           * implementation in SIMD16. A 16-channel result can be calculated by
  3575.           * executing the three instructions twice in SIMD8, once with quarter
  3576.           * control of 1Q for the first eight channels and again with 2Q for
  3577.           * the second eight channels.
  3578.           *
  3579.           * Which accumulator register is implicitly accessed (by AccWrEnable
  3580.           * for instance) is determined by the quarter control. Unfortunately
  3581.           * Ivybridge (and presumably Baytrail) has a hardware bug in which an
  3582.           * implicit accumulator access by an instruction with 2Q will access
  3583.           * acc1 regardless of whether the data type is usable in acc1.
  3584.           *
  3585.           * Specifically, the 2Q mach(8) writes acc1 which does not exist for
  3586.           * integer data types.
  3587.           *
  3588.           * Since we only want the low 32 bits of the result, we can do two
  3589.           * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
  3590.           * adjust the high result and add them (like the mach is doing):
  3591.           *
  3592.           *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<8,8,1>UW
  3593.           *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<8,8,1>UW
  3594.           *    shl(8)  g9<1>D     g8<8,8,1>D      16D
  3595.          *    add(8)  g2<1>D     g7<8,8,1>D      g9<8,8,1>D
  3596.           *
  3597.           * We avoid the shl instruction by realizing that we only want to add
  3598.           * the low 16 bits of the "high" result to the high 16 bits of the
  3599.           * "low" result and using proper regioning on the add:
  3600.           *
  3601.           *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<16,8,2>UW
  3602.           *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<16,8,2>UW
  3603.           *    add(8)  g7.1<2>UW  g7.1<16,8,2>UW  g8<16,8,2>UW
  3604.           *
  3605.           * Since it does not use the (single) accumulator register, we can
  3606.           * schedule multi-component multiplications much better.
  3607.           */
  3608.  
  3609.          fs_reg low = inst->dst;
  3610.          fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
  3611.                      inst->dst.type, dispatch_width);
  3612.  
  3613.          if (devinfo->gen >= 7) {
  3614.             fs_reg src1_0_w = inst->src[1];
  3615.             fs_reg src1_1_w = inst->src[1];
  3616.  
  3617.             if (inst->src[1].file == IMM) {
  3618.                src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
  3619.                src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
  3620.             } else {
  3621.                src1_0_w.type = BRW_REGISTER_TYPE_UW;
  3622.                src1_0_w.stride = 2;
  3623.  
  3624.                src1_1_w.type = BRW_REGISTER_TYPE_UW;
  3625.                src1_1_w.stride = 2;
  3626.                src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
  3627.             }
  3628.             insert(MUL(low, inst->src[0], src1_0_w));
  3629.             insert(MUL(high, inst->src[0], src1_1_w));
  3630.          } else {
  3631.             fs_reg src0_0_w = inst->src[0];
  3632.             fs_reg src0_1_w = inst->src[0];
  3633.  
  3634.             src0_0_w.type = BRW_REGISTER_TYPE_UW;
  3635.             src0_0_w.stride = 2;
  3636.  
  3637.             src0_1_w.type = BRW_REGISTER_TYPE_UW;
  3638.             src0_1_w.stride = 2;
  3639.             src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
  3640.  
  3641.             insert(MUL(low, src0_0_w, inst->src[1]));
  3642.             insert(MUL(high, src0_1_w, inst->src[1]));
  3643.          }
  3644.  
  3645.          fs_reg dst = inst->dst;
  3646.          dst.type = BRW_REGISTER_TYPE_UW;
  3647.          dst.subreg_offset = 2;
  3648.          dst.stride = 2;
  3649.  
  3650.          high.type = BRW_REGISTER_TYPE_UW;
  3651.          high.stride = 2;
  3652.  
  3653.          low.type = BRW_REGISTER_TYPE_UW;
  3654.          low.subreg_offset = 2;
  3655.          low.stride = 2;
  3656.  
  3657.          insert(ADD(dst, low, high));
  3658.       }
  3659. #undef insert
  3660.  
  3661.       inst->remove(block);
  3662.       progress = true;
  3663.    }
  3664.  
  3665.    if (progress)
  3666.       invalidate_live_intervals();
  3667.  
  3668.    return progress;
  3669. }
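
#if 0
/* A minimal sketch of the arithmetic identity behind the lowering above,
 * as plain C (illustrative only; the function name is invented).  Since
 * b == ((b >> 16) << 16) + (b & 0xffff), the high partial product only
 * contributes to bits 16 and up, so the low 32 bits of the result are:
 */
static inline uint32_t
mul32_via_16bit_halves(uint32_t a, uint32_t b)
{
   uint32_t low  = a * (b & 0xffff);   /* like the first mul    */
   uint32_t high = a * (b >> 16);      /* like the second mul   */
   return low + (high << 16);          /* like the regioned add */
}
#endif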
  3670.  
  3671. void
  3672. fs_visitor::dump_instructions()
  3673. {
  3674.    dump_instructions(NULL);
  3675. }
  3676.  
  3677. void
  3678. fs_visitor::dump_instructions(const char *name)
  3679. {
  3680.    FILE *file = stderr;
  3681.    if (name && geteuid() != 0) {
  3682.       file = fopen(name, "w");
  3683.       if (!file)
  3684.          file = stderr;
  3685.    }
  3686.  
  3687.    if (cfg) {
  3688.       calculate_register_pressure();
  3689.       int ip = 0, max_pressure = 0;
  3690.       foreach_block_and_inst(block, backend_instruction, inst, cfg) {
  3691.          max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
  3692.          fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
  3693.          dump_instruction(inst, file);
  3694.          ip++;
  3695.       }
  3696.       fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
  3697.    } else {
  3698.       int ip = 0;
  3699.       foreach_in_list(backend_instruction, inst, &instructions) {
  3700.          fprintf(file, "%4d: ", ip++);
  3701.          dump_instruction(inst, file);
  3702.       }
  3703.    }
  3704.  
  3705.    if (file != stderr) {
  3706.       fclose(file);
  3707.    }
  3708. }
  3709.  
  3710. void
  3711. fs_visitor::dump_instruction(backend_instruction *be_inst)
  3712. {
  3713.    dump_instruction(be_inst, stderr);
  3714. }
  3715.  
  3716. void
  3717. fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
  3718. {
  3719.    fs_inst *inst = (fs_inst *)be_inst;
  3720.  
  3721.    if (inst->predicate) {
  3722.       fprintf(file, "(%cf0.%d) ",
  3723.              inst->predicate_inverse ? '-' : '+',
  3724.              inst->flag_subreg);
  3725.    }
  3726.  
  3727.    fprintf(file, "%s", brw_instruction_name(inst->opcode));
  3728.    if (inst->saturate)
  3729.       fprintf(file, ".sat");
  3730.    if (inst->conditional_mod) {
  3731.       fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
  3732.       if (!inst->predicate &&
  3733.           (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
  3734.                               inst->opcode != BRW_OPCODE_IF &&
  3735.                               inst->opcode != BRW_OPCODE_WHILE))) {
  3736.          fprintf(file, ".f0.%d", inst->flag_subreg);
  3737.       }
  3738.    }
  3739.    fprintf(file, "(%d) ", inst->exec_size);
  3740.  
  3741.  
  3742.    switch (inst->dst.file) {
  3743.    case GRF:
  3744.       fprintf(file, "vgrf%d", inst->dst.reg);
  3745.       if (inst->dst.width != dispatch_width)
  3746.          fprintf(file, "@%d", inst->dst.width);
  3747.       if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
  3748.           inst->dst.subreg_offset)
  3749.          fprintf(file, "+%d.%d",
  3750.                  inst->dst.reg_offset, inst->dst.subreg_offset);
  3751.       break;
  3752.    case MRF:
  3753.       fprintf(file, "m%d", inst->dst.reg);
  3754.       break;
  3755.    case BAD_FILE:
  3756.       fprintf(file, "(null)");
  3757.       break;
  3758.    case UNIFORM:
  3759.       fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
  3760.       break;
  3761.    case ATTR:
  3762.       fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
  3763.       break;
  3764.    case HW_REG:
  3765.       if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
  3766.          switch (inst->dst.fixed_hw_reg.nr) {
  3767.          case BRW_ARF_NULL:
  3768.             fprintf(file, "null");
  3769.             break;
  3770.          case BRW_ARF_ADDRESS:
  3771.             fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
  3772.             break;
  3773.          case BRW_ARF_ACCUMULATOR:
  3774.             fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
  3775.             break;
  3776.          case BRW_ARF_FLAG:
  3777.             fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
  3778.                              inst->dst.fixed_hw_reg.subnr);
  3779.             break;
  3780.          default:
  3781.             fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
  3782.                                inst->dst.fixed_hw_reg.subnr);
  3783.             break;
  3784.          }
  3785.       } else {
  3786.          fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
  3787.       }
  3788.       if (inst->dst.fixed_hw_reg.subnr)
  3789.          fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
  3790.       break;
  3791.    default:
  3792.       fprintf(file, "???");
  3793.       break;
  3794.    }
  3795.    fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
  3796.  
  3797.    for (int i = 0; i < inst->sources; i++) {
  3798.       if (inst->src[i].negate)
  3799.          fprintf(file, "-");
  3800.       if (inst->src[i].abs)
  3801.          fprintf(file, "|");
  3802.       switch (inst->src[i].file) {
  3803.       case GRF:
  3804.          fprintf(file, "vgrf%d", inst->src[i].reg);
  3805.          if (inst->src[i].width != dispatch_width)
  3806.             fprintf(file, "@%d", inst->src[i].width);
  3807.          if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
  3808.              inst->src[i].subreg_offset)
  3809.             fprintf(file, "+%d.%d", inst->src[i].reg_offset,
  3810.                     inst->src[i].subreg_offset);
  3811.          break;
  3812.       case MRF:
  3813.          fprintf(file, "***m%d***", inst->src[i].reg);
  3814.          break;
  3815.       case ATTR:
  3816.          fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
  3817.          break;
  3818.       case UNIFORM:
  3819.          fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
  3820.          if (inst->src[i].reladdr) {
  3821.             fprintf(file, "+reladdr");
  3822.          } else if (inst->src[i].subreg_offset) {
  3823.             fprintf(file, "+%d.%d", inst->src[i].reg_offset,
  3824.                     inst->src[i].subreg_offset);
  3825.          }
  3826.          break;
  3827.       case BAD_FILE:
  3828.          fprintf(file, "(null)");
  3829.          break;
  3830.       case IMM:
  3831.          switch (inst->src[i].type) {
  3832.          case BRW_REGISTER_TYPE_F:
  3833.             fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
  3834.             break;
  3835.          case BRW_REGISTER_TYPE_W:
  3836.          case BRW_REGISTER_TYPE_D:
  3837.             fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
  3838.             break;
  3839.          case BRW_REGISTER_TYPE_UW:
  3840.          case BRW_REGISTER_TYPE_UD:
  3841.             fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
  3842.             break;
  3843.          case BRW_REGISTER_TYPE_VF:
  3844.             fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
  3845.                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  0) & 0xff),
  3846.                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  8) & 0xff),
  3847.                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
  3848.                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
  3849.             break;
  3850.          default:
  3851.             fprintf(file, "???");
  3852.             break;
  3853.          }
  3854.          break;
  3855.       case HW_REG:
  3856.          if (inst->src[i].fixed_hw_reg.negate)
  3857.             fprintf(file, "-");
  3858.          if (inst->src[i].fixed_hw_reg.abs)
  3859.             fprintf(file, "|");
  3860.          if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
  3861.             switch (inst->src[i].fixed_hw_reg.nr) {
  3862.             case BRW_ARF_NULL:
  3863.                fprintf(file, "null");
  3864.                break;
  3865.             case BRW_ARF_ADDRESS:
  3866.                fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
  3867.                break;
  3868.             case BRW_ARF_ACCUMULATOR:
  3869.                fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
  3870.                break;
  3871.             case BRW_ARF_FLAG:
  3872.                fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
  3873.                                 inst->src[i].fixed_hw_reg.subnr);
  3874.                break;
  3875.             default:
  3876.                fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
  3877.                                   inst->src[i].fixed_hw_reg.subnr);
  3878.                break;
  3879.             }
  3880.          } else {
  3881.             fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
  3882.          }
  3883.          if (inst->src[i].fixed_hw_reg.subnr)
  3884.             fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
  3885.          if (inst->src[i].fixed_hw_reg.abs)
  3886.             fprintf(file, "|");
  3887.          break;
  3888.       default:
  3889.          fprintf(file, "???");
  3890.          break;
  3891.       }
  3892.       if (inst->src[i].abs)
  3893.          fprintf(file, "|");
  3894.  
  3895.       if (inst->src[i].file != IMM) {
  3896.          fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
  3897.       }
  3898.  
  3899.       if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
  3900.          fprintf(file, ", ");
  3901.    }
  3902.  
  3903.    fprintf(file, " ");
  3904.  
  3905.    if (dispatch_width == 16 && inst->exec_size == 8) {
  3906.       if (inst->force_sechalf)
  3907.          fprintf(file, "2ndhalf ");
  3908.       else
  3909.          fprintf(file, "1sthalf ");
  3910.    }
  3911.  
  3912.    fprintf(file, "\n");
  3913. }
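
/* An illustrative sketch of the output format, for a hypothetical
 * predicated, saturating SIMD8 ADD with a negated second source:
 *
 *    (+f0.1) add.sat(8) vgrf7:F, vgrf3:F, -vgrf4:F
 */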
  3914.  
  3915. /**
  3916.  * Possibly returns an instruction that set up @param reg.
  3917.  *
  3918.  * Sometimes we want to take the result of some expression/variable
  3919.  * dereference tree and rewrite the instruction generating the result
  3920.  * of the tree.  When processing the tree, we know that the
  3921.  * instructions generated are all writing temporaries that are dead
  3922.  * outside of this tree.  So, if we have some instructions that write
  3923.  * a temporary, we're free to point that temp write somewhere else.
  3924.  *
  3925.  * Note that this doesn't guarantee that the instruction generated wrote
  3926.  * only reg -- it might be the size=4 destination of a texture instruction.
  3927.  */
  3928. fs_inst *
  3929. fs_visitor::get_instruction_generating_reg(fs_inst *start,
  3930.                                            fs_inst *end,
  3931.                                            const fs_reg &reg)
  3932. {
  3933.    if (end == start ||
  3934.        end->is_partial_write() ||
  3935.        reg.reladdr ||
  3936.        !reg.equals(end->dst)) {
  3937.       return NULL;
  3938.    } else {
  3939.       return end;
  3940.    }
  3941. }
  3942.  
  3943. void
  3944. fs_visitor::setup_payload_gen6()
  3945. {
  3946.    bool uses_depth =
  3947.       (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
  3948.    unsigned barycentric_interp_modes =
  3949.       (stage == MESA_SHADER_FRAGMENT) ?
  3950.       ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
  3951.  
  3952.    assert(devinfo->gen >= 6);
  3953.  
  3954.    /* R0-1: masks, pixel X/Y coordinates. */
  3955.    payload.num_regs = 2;
  3956.    /* R2: only for 32-pixel dispatch. */
  3957.  
  3958.    /* R3-26: barycentric interpolation coordinates.  These appear in the
  3959.     * same order that they appear in the brw_wm_barycentric_interp_mode
  3960.     * enum.  Each set of coordinates occupies 2 registers if dispatch width
  3961.     * == 8 and 4 registers if dispatch width == 16.  Coordinates only
  3962.     * appear if they were enabled using the "Barycentric Interpolation
  3963.     * Mode" bits in WM_STATE.
  3964.     */
  3965.    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
  3966.       if (barycentric_interp_modes & (1 << i)) {
  3967.          payload.barycentric_coord_reg[i] = payload.num_regs;
  3968.          payload.num_regs += 2;
  3969.          if (dispatch_width == 16) {
  3970.             payload.num_regs += 2;
  3971.          }
  3972.       }
  3973.    }
  3974.  
  3975.    /* R27: interpolated depth if uses source depth */
  3976.    if (uses_depth) {
  3977.       payload.source_depth_reg = payload.num_regs;
  3978.       payload.num_regs++;
  3979.       if (dispatch_width == 16) {
  3980.          /* R28: interpolated depth if not SIMD8. */
  3981.          payload.num_regs++;
  3982.       }
  3983.    }
  3984.    /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
  3985.    if (uses_depth) {
  3986.       payload.source_w_reg = payload.num_regs;
  3987.       payload.num_regs++;
  3988.       if (dispatch_width == 16) {
  3989.          /* R30: interpolated W if not SIMD8. */
  3990.          payload.num_regs++;
  3991.       }
  3992.    }
  3993.  
  3994.    if (stage == MESA_SHADER_FRAGMENT) {
  3995.       brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
  3996.       brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
  3997.       prog_data->uses_pos_offset = key->compute_pos_offset;
  3998.       /* R31: MSAA position offsets. */
  3999.       if (prog_data->uses_pos_offset) {
  4000.          payload.sample_pos_reg = payload.num_regs;
  4001.          payload.num_regs++;
  4002.       }
  4003.    }
  4004.  
  4005.    /* R32: MSAA input coverage mask */
  4006.    if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
  4007.       assert(devinfo->gen >= 7);
  4008.       payload.sample_mask_in_reg = payload.num_regs;
  4009.       payload.num_regs++;
  4010.       if (dispatch_width == 16) {
  4011.          /* R33: input coverage mask if not SIMD8. */
  4012.          payload.num_regs++;
  4013.       }
  4014.    }
  4015.  
  4016.    /* R34-: bary for 32-pixel. */
  4017.    /* R58-59: interp W for 32-pixel. */
  4018.  
  4019.    if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
  4020.       source_depth_to_render_target = true;
  4021.    }
  4022. }
  4023.  
  4024. void
  4025. fs_visitor::setup_vs_payload()
  4026. {
  4027.    /* R0: thread header, R1: urb handles */
  4028.    payload.num_regs = 2;
  4029. }
  4030.  
  4031. void
  4032. fs_visitor::setup_cs_payload()
  4033. {
  4034.    assert(devinfo->gen >= 7);
  4035.  
  4036.    payload.num_regs = 1;
  4037. }
  4038.  
  4039. void
  4040. fs_visitor::assign_binding_table_offsets()
  4041. {
  4042.    assert(stage == MESA_SHADER_FRAGMENT);
  4043.    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
  4044.    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
  4045.    uint32_t next_binding_table_offset = 0;
  4046.  
  4047.    /* If there are no color regions, we still perform an FB write to a null
  4048.     * renderbuffer, which we place at surface index 0.
  4049.     */
  4050.    prog_data->binding_table.render_target_start = next_binding_table_offset;
  4051.    next_binding_table_offset += MAX2(key->nr_color_regions, 1);
  4052.  
  4053.    assign_common_binding_table_offsets(next_binding_table_offset);
  4054. }
  4055.  
  4056. void
  4057. fs_visitor::calculate_register_pressure()
  4058. {
  4059.    invalidate_live_intervals();
  4060.    calculate_live_intervals();
  4061.  
  4062.    unsigned num_instructions = 0;
  4063.    foreach_block(block, cfg)
  4064.       num_instructions += block->instructions.length();
  4065.  
  4066.    regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
  4067.  
  4068.    for (unsigned reg = 0; reg < alloc.count; reg++) {
  4069.       for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
  4070.          regs_live_at_ip[ip] += alloc.sizes[reg];
  4071.    }
  4072. }
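
/* An illustrative sketch with hypothetical live ranges: a 2-register vgrf
 * live over ip 3..5 plus a 1-register vgrf live over ip 4..6 yield
 *
 *    regs_live_at_ip[3..6] = { 2, 3, 3, 1 }
 */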
  4073.  
  4074. void
  4075. fs_visitor::optimize()
  4076. {
  4077.    split_virtual_grfs();
  4078.  
  4079.    move_uniform_array_access_to_pull_constants();
  4080.    assign_constant_locations();
  4081.    demote_pull_constants();
  4082.  
  4083. #define OPT(pass, args...) ({                                           \
  4084.       pass_num++;                                                       \
  4085.       bool this_progress = pass(args);                                  \
  4086.                                                                         \
  4087.       if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {   \
  4088.          char filename[64];                                             \
  4089.          snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass,              \
  4090.                   stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
  4091.                                                                         \
  4092.          backend_visitor::dump_instructions(filename);                  \
  4093.       }                                                                 \
  4094.                                                                         \
  4095.       progress = progress || this_progress;                             \
  4096.       this_progress;                                                    \
  4097.    })
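
/* An illustrative sketch of the dump filenames the OPT macro produces (all
 * values hypothetical): a fragment shader at dispatch width 8, program 4,
 * iteration 1, pass 3 would dump to
 *
 *    FS8-0004-01-03-opt_cse
 */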
  4098.  
  4099.    if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
  4100.       char filename[64];
  4101.       snprintf(filename, 64, "%s%d-%04d-00-start",
  4102.                stage_abbrev, dispatch_width,
  4103.                shader_prog ? shader_prog->Name : 0);
  4104.  
  4105.       backend_visitor::dump_instructions(filename);
  4106.    }
  4107.  
  4108.    bool progress;
  4109.    int iteration = 0;
  4110.    int pass_num = 0;
  4111.    do {
  4112.       progress = false;
  4113.       pass_num = 0;
  4114.       iteration++;
  4115.  
  4116.       OPT(remove_duplicate_mrf_writes);
  4117.  
  4118.       OPT(opt_algebraic);
  4119.       OPT(opt_cse);
  4120.       OPT(opt_copy_propagate);
  4121.       OPT(opt_peephole_predicated_break);
  4122.       OPT(opt_cmod_propagation);
  4123.       OPT(dead_code_eliminate);
  4124.       OPT(opt_peephole_sel);
  4125.       OPT(dead_control_flow_eliminate, this);
  4126.       OPT(opt_register_renaming);
  4127.       OPT(opt_redundant_discard_jumps);
  4128.       OPT(opt_saturate_propagation);
  4129.       OPT(opt_zero_samples);
  4130.       OPT(register_coalesce);
  4131.       OPT(compute_to_mrf);
  4132.       OPT(eliminate_find_live_channel);
  4133.  
  4134.       OPT(compact_virtual_grfs);
  4135.    } while (progress);
  4136.  
  4137.    pass_num = 0;
  4138.  
  4139.    OPT(opt_sampler_eot);
  4140.  
  4141.    if (OPT(lower_load_payload)) {
  4142.       split_virtual_grfs();
  4143.       OPT(register_coalesce);
  4144.       OPT(compute_to_mrf);
  4145.       OPT(dead_code_eliminate);
  4146.    }
  4147.  
  4148.    OPT(opt_combine_constants);
  4149.    OPT(lower_integer_multiplication);
  4150.  
  4151.    lower_uniform_pull_constant_loads();
  4152. }
  4153.  
  4154. /**
  4155.  * Three-source instructions must have a GRF/MRF destination register.
  4156.  * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
  4157.  */
  4158. void
  4159. fs_visitor::fixup_3src_null_dest()
  4160. {
  4161.    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
  4162.       if (inst->is_3src() && inst->dst.is_null()) {
  4163.          inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
  4164.                             inst->dst.type);
  4165.       }
  4166.    }
  4167. }
  4168.  
  4169. void
  4170. fs_visitor::allocate_registers()
  4171. {
  4172.    bool allocated_without_spills;
  4173.  
  4174.    static const enum instruction_scheduler_mode pre_modes[] = {
  4175.       SCHEDULE_PRE,
  4176.       SCHEDULE_PRE_NON_LIFO,
  4177.       SCHEDULE_PRE_LIFO,
  4178.    };
  4179.  
  4180.    /* Try each scheduling heuristic to see if it can successfully register
  4181.     * allocate without spilling.  They should be ordered by decreasing
  4182.     * performance but increasing likelihood of allocating.
  4183.     */
  4184.    for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
  4185.       schedule_instructions(pre_modes[i]);
  4186.  
  4187.       if (0) {
  4188.          assign_regs_trivial();
  4189.          allocated_without_spills = true;
  4190.       } else {
  4191.          allocated_without_spills = assign_regs(false);
  4192.       }
  4193.       if (allocated_without_spills)
  4194.          break;
  4195.    }
  4196.  
  4197.    if (!allocated_without_spills) {
  4198.       /* We assume that any spilling is worse than just dropping back to
  4199.        * SIMD8.  There's probably actually some intermediate point where
  4200.        * SIMD16 with a couple of spills is still better.
  4201.        */
  4202.       if (dispatch_width == 16) {
  4203.          fail("Failure to register allocate.  Reduce number of "
  4204.               "live scalar values to avoid this.");
  4205.       } else {
  4206.          perf_debug("%s shader triggered register spilling.  "
  4207.                     "Try reducing the number of live scalar values to "
  4208.                     "improve performance.\n", stage_name);
  4209.       }
  4210.  
  4211.       /* Since we're out of heuristics, just go spill registers until we
  4212.        * get an allocation.
  4213.        */
  4214.       while (!assign_regs(true)) {
  4215.          if (failed)
  4216.             break;
  4217.       }
  4218.    }
  4219.  
  4220.    /* This must come after all optimization and register allocation, since
  4221.     * it inserts dead code that happens to have side effects, and it does
  4222.     * so based on the actual physical registers in use.
  4223.     */
  4224.    insert_gen4_send_dependency_workarounds();
  4225.  
  4226.    if (failed)
  4227.       return;
  4228.  
  4229.    if (!allocated_without_spills)
  4230.       schedule_instructions(SCHEDULE_POST);
  4231.  
  4232.    if (last_scratch > 0)
  4233.       prog_data->total_scratch = brw_get_scratch_size(last_scratch);
  4234. }
  4235.  
  4236. bool
  4237. fs_visitor::run_vs()
  4238. {
  4239.    assert(stage == MESA_SHADER_VERTEX);
  4240.  
  4241.    assign_common_binding_table_offsets(0);
  4242.    setup_vs_payload();
  4243.  
  4244.    if (INTEL_DEBUG & DEBUG_SHADER_TIME)
  4245.       emit_shader_time_begin();
  4246.  
  4247.    if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].NirOptions) {
  4248.       emit_nir_code();
  4249.    } else {
  4250.       foreach_in_list(ir_instruction, ir, shader->base.ir) {
  4251.          base_ir = ir;
  4252.          this->result = reg_undef;
  4253.          ir->accept(this);
  4254.       }
  4255.       base_ir = NULL;
  4256.    }
  4257.  
  4258.    if (failed)
  4259.       return false;
  4260.  
  4261.    emit_urb_writes();
  4262.  
  4263.    if (INTEL_DEBUG & DEBUG_SHADER_TIME)
  4264.       emit_shader_time_end();
  4265.  
  4266.    calculate_cfg();
  4267.  
  4268.    optimize();
  4269.  
  4270.    assign_curb_setup();
  4271.    assign_vs_urb_setup();
  4272.  
  4273.    fixup_3src_null_dest();
  4274.    allocate_registers();
  4275.  
  4276.    return !failed;
  4277. }
  4278.  
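/* Compile a fragment shader at the current dispatch_width (8 or 16).
 * Returns false on failure, with the reason recorded in fail_msg.
 */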
  4279. bool
  4280. fs_visitor::run_fs()
  4281. {
  4282.    brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
  4283.    brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
  4284.  
  4285.    assert(stage == MESA_SHADER_FRAGMENT);
  4286.  
  4287.    sanity_param_count = prog->Parameters->NumParameters;
  4288.  
  4289.    assign_binding_table_offsets();
  4290.  
  4291.    if (devinfo->gen >= 6)
  4292.       setup_payload_gen6();
  4293.    else
  4294.       setup_payload_gen4();
  4295.  
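   /* Another "if (0)" debug switch: enabling it swaps in a trivial dummy
    * shader.  The rep_send path instead emits a specialized replicated-data
    * clear shader (see emit_repclear_shader).
    */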
  4296.    if (0) {
  4297.       emit_dummy_fs();
  4298.    } else if (brw->use_rep_send && dispatch_width == 16) {
  4299.       emit_repclear_shader();
  4300.    } else {
  4301.       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
  4302.          emit_shader_time_begin();
  4303.  
  4304.       calculate_urb_setup();
  4305.       if (prog->InputsRead > 0) {
  4306.          if (devinfo->gen < 6)
  4307.             emit_interpolation_setup_gen4();
  4308.          else
  4309.             emit_interpolation_setup_gen6();
  4310.       }
  4311.  
  4312.       /* We handle discards by keeping track of the still-live pixels in f0.1.
  4313.        * Initialize it with the dispatched pixels.
  4314.        */
  4315.       if (wm_prog_data->uses_kill) {
  4316.          fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
  4317.          discard_init->flag_subreg = 1;
  4318.       }
  4319.  
  4320.       /* Generate FS IR for main().  (The visitor only descends into
  4321.        * functions called "main".)
  4322.        */
  4323.       if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_FRAGMENT].NirOptions) {
  4324.          emit_nir_code();
  4325.       } else if (shader) {
  4326.          foreach_in_list(ir_instruction, ir, shader->base.ir) {
  4327.             base_ir = ir;
  4328.             this->result = reg_undef;
  4329.             ir->accept(this);
  4330.          }
  4331.       } else {
  4332.          emit_fragment_program_code();
  4333.       }
  4334.       base_ir = NULL;
  4335.       if (failed)
  4336.          return false;
  4337.  
  4338.       if (wm_prog_data->uses_kill)
  4339.          emit(FS_OPCODE_PLACEHOLDER_HALT);
  4340.  
  4341.       if (wm_key->alpha_test_func)
  4342.          emit_alpha_test();
  4343.  
  4344.       emit_fb_writes();
  4345.  
  4346.       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
  4347.          emit_shader_time_end();
  4348.  
  4349.       calculate_cfg();
  4350.  
  4351.       optimize();
  4352.  
  4353.       assign_curb_setup();
  4354.       assign_urb_setup();
  4355.  
  4356.       fixup_3src_null_dest();
  4357.       allocate_registers();
  4358.  
  4359.       if (failed)
  4360.          return false;
  4361.    }
  4362.  
  4363.    if (dispatch_width == 8)
  4364.       wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
  4365.    else
  4366.       wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
  4367.  
  4368.    /* If any state parameters were appended, then ParameterValues could have
  4369.     * been reallocated, in which case the driver uniform storage set up by
  4370.     * _mesa_associate_uniform_storage() would point to freed memory.  Make
  4371.     * sure that didn't happen.
  4372.     */
  4373.    assert(sanity_param_count == prog->Parameters->NumParameters);
  4374.  
  4375.    return !failed;
  4376. }
  4377.  
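/* Compile a compute shader.  Compute shaders are NIR-only here, so there is
 * no GLSL IR visitor loop, and the program ends with an explicit CS
 * terminate message instead of URB or framebuffer writes.
 */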
  4378. bool
  4379. fs_visitor::run_cs()
  4380. {
  4381.    assert(stage == MESA_SHADER_COMPUTE);
  4382.    assert(shader);
  4383.  
  4384.    sanity_param_count = prog->Parameters->NumParameters;
  4385.  
  4386.    assign_common_binding_table_offsets(0);
  4387.  
  4388.    setup_cs_payload();
  4389.  
  4390.    if (INTEL_DEBUG & DEBUG_SHADER_TIME)
  4391.       emit_shader_time_begin();
  4392.  
  4393.    emit_nir_code();
  4394.  
  4395.    if (failed)
  4396.       return false;
  4397.  
  4398.    emit_cs_terminate();
  4399.  
  4400.    if (INTEL_DEBUG & DEBUG_SHADER_TIME)
  4401.       emit_shader_time_end();
  4402.  
  4403.    calculate_cfg();
  4404.  
  4405.    optimize();
  4406.  
  4407.    assign_curb_setup();
  4408.  
  4409.    fixup_3src_null_dest();
  4410.    allocate_registers();
  4411.  
  4412.    if (failed)
  4413.       return false;
  4414.  
  4415.    /* If any state parameters were appended, then ParameterValues could have
  4416.     * been reallocated, in which case the driver uniform storage set up by
  4417.     * _mesa_associate_uniform_storage() would point to freed memory.  Make
  4418.     * sure that didn't happen.
  4419.     */
  4420.    assert(sanity_param_count == prog->Parameters->NumParameters);
  4421.  
  4422.    return !failed;
  4423. }
  4424.  
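/* Top-level fragment shader compile: always runs a SIMD8 compile, tries
 * SIMD16 when allowed, then hands the surviving CFG(s) to the generator
 * for native code emission.
 */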
  4425. const unsigned *
  4426. brw_wm_fs_emit(struct brw_context *brw,
  4427.                void *mem_ctx,
  4428.                const struct brw_wm_prog_key *key,
  4429.                struct brw_wm_prog_data *prog_data,
  4430.                struct gl_fragment_program *fp,
  4431.                struct gl_shader_program *prog,
  4432.                unsigned *final_assembly_size)
  4433. {
  4434.    bool start_busy = false;
  4435.    double start_time = 0;
  4436.  
  4437.    if (unlikely(brw->perf_debug)) {
  4438.       start_busy = (brw->batch.last_bo &&
  4439.                     drm_intel_bo_busy(brw->batch.last_bo));
  4440.       start_time = get_time();
  4441.    }
  4442.  
  4443.    struct brw_shader *shader = NULL;
  4444.    if (prog)
  4445.       shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
  4446.  
  4447.    if (unlikely(INTEL_DEBUG & DEBUG_WM))
  4448.       brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
  4449.  
  4450.    /* Now the main event: Visit the shader IR and generate our FS IR for it.
  4451.     */
  4452.    fs_visitor v(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
  4453.                 prog, &fp->Base, 8);
  4454.    if (!v.run_fs()) {
  4455.       if (prog) {
  4456.          prog->LinkStatus = false;
  4457.          ralloc_strcat(&prog->InfoLog, v.fail_msg);
  4458.       }
  4459.  
  4460.       _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
  4461.                     v.fail_msg);
  4462.  
  4463.       return NULL;
  4464.    }
  4465.  
  4466.    cfg_t *simd16_cfg = NULL;
  4467.    fs_visitor v2(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
  4468.                  prog, &fp->Base, 16);
  4469.    if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
  4470.       if (!v.simd16_unsupported) {
  4471.          /* Try a SIMD16 compile */
  4472.          v2.import_uniforms(&v);
  4473.          if (!v2.run_fs()) {
  4474.             perf_debug("SIMD16 shader failed to compile, falling back to "
  4475.                        "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
  4476.          } else {
  4477.             simd16_cfg = v2.cfg;
  4478.          }
  4479.       } else {
  4480.          perf_debug("SIMD16 shader unsupported, falling back to "
  4481.                     "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
  4482.       }
  4483.    }
  4484.  
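   /* Decide whether to ship a SIMD8 program at all: it is dropped only when
    * a SIMD16 program exists and SIMD8 is either disabled for debugging or
    * we are on Gen4 (gen < 5), where only one of the two widths is used.
    */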
  4485.    cfg_t *simd8_cfg;
  4486.    int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
  4487.    if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
  4488.       simd8_cfg = NULL;
  4489.       prog_data->no_8 = true;
  4490.    } else {
  4491.       simd8_cfg = v.cfg;
  4492.       prog_data->no_8 = false;
  4493.    }
  4494.  
  4495.    fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
  4496.                   &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
  4497.  
  4498.    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
  4499.       char *name;
  4500.       if (prog)
  4501.          name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
  4502.                                 prog->Label ? prog->Label : "unnamed",
  4503.                                 prog->Name);
  4504.       else
  4505.          name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
  4506.  
  4507.       g.enable_debug(name);
  4508.    }
  4509.  
  4510.    if (simd8_cfg)
  4511.       g.generate_code(simd8_cfg, 8);
  4512.    if (simd16_cfg)
  4513.       prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
  4514.  
  4515.    if (unlikely(brw->perf_debug) && shader) {
  4516.       if (shader->compiled_once)
  4517.          brw_wm_debug_recompile(brw, prog, key);
  4518.       shader->compiled_once = true;
  4519.  
  4520.       if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
  4521.          perf_debug("FS compile took %.03f ms and stalled the GPU\n",
  4522.                     (get_time() - start_time) * 1000);
  4523.       }
  4524.    }
  4525.  
  4526.    return g.get_assembly(final_assembly_size);
  4527. }
  4528.  
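/* Precompile at link time with a mostly-zeroed guess at the program key, so
 * that in the common case the key computed at draw time matches and no
 * recompile is needed.  brw->wm.base state is saved and restored around
 * brw_codegen_wm_prog(), which updates it as a side effect.
 */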
  4529. extern "C" bool
  4530. brw_fs_precompile(struct gl_context *ctx,
  4531.                   struct gl_shader_program *shader_prog,
  4532.                   struct gl_program *prog)
  4533. {
  4534.    struct brw_context *brw = brw_context(ctx);
  4535.    struct brw_wm_prog_key key;
  4536.  
  4537.    struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
  4538.    struct brw_fragment_program *bfp = brw_fragment_program(fp);
  4539.    bool program_uses_dfdy = fp->UsesDFdy;
  4540.  
  4541.    memset(&key, 0, sizeof(key));
  4542.  
  4543.    if (brw->gen < 6) {
  4544.       if (fp->UsesKill)
  4545.          key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
  4546.  
  4547.       if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
  4548.          key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
  4549.  
  4550.       /* Just assume depth testing. */
  4551.       key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
  4552.       key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
  4553.    }
  4554.  
  4555.    if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
  4556.                                          BRW_FS_VARYING_INPUT_MASK) > 16)
  4557.       key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
  4558.  
  4559.    brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
  4560.  
  4561.    if (fp->Base.InputsRead & VARYING_BIT_POS) {
  4562.       key.drawable_height = ctx->DrawBuffer->Height;
  4563.    }
  4564.  
  4565.    key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
  4566.          ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
  4567.          BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
  4568.  
  4569.    if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
  4570.       key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
  4571.                           key.nr_color_regions > 1;
  4572.    }
  4573.  
  4574.    key.program_string_id = bfp->id;
  4575.  
  4576.    uint32_t old_prog_offset = brw->wm.base.prog_offset;
  4577.    struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
  4578.  
  4579.    bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
  4580.  
  4581.    brw->wm.base.prog_offset = old_prog_offset;
  4582.    brw->wm.prog_data = old_prog_data;
  4583.  
  4584.    return success;
  4585. }
  4586.  
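/* Fill in the sampler swizzle portion of a precompile key.  Hardware without
 * shader channel select (older than Haswell) needs the legacy
 * DEPTH_TEXTURE_MODE swizzle of (d, d, d, 1) baked into the key for shadow
 * samplers; all other samplers are assumed to use the identity swizzle.
 */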
  4587. void
  4588. brw_setup_tex_for_precompile(struct brw_context *brw,
  4589.                              struct brw_sampler_prog_key_data *tex,
  4590.                              struct gl_program *prog)
  4591. {
  4592.    const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
  4593.    unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
  4594.    for (unsigned i = 0; i < sampler_count; i++) {
  4595.       if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
  4596.          /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
  4597.          tex->swizzles[i] =
  4598.             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
  4599.       } else {
  4600.          /* Color sampler: assume no swizzling. */
  4601.          tex->swizzles[i] = SWIZZLE_XYZW;
  4602.       }
  4603.    }
  4604. }
  4605.