Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Blame | Last modification | View Log | RSS feed

  1. /*
  2.  * Copyright © 2010 Intel Corporation
  3.  *
  4.  * Permission is hereby granted, free of charge, to any person obtaining a
  5.  * copy of this software and associated documentation files (the "Software"),
  6.  * to deal in the Software without restriction, including without limitation
  7.  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  8.  * and/or sell copies of the Software, and to permit persons to whom the
  9.  * Software is furnished to do so, subject to the following conditions:
  10.  *
  11.  * The above copyright notice and this permission notice (including the next
  12.  * paragraph) shall be included in all copies or substantial portions of the
  13.  * Software.
  14.  *
  15.  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16.  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17.  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18.  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19.  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20.  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21.  * IN THE SOFTWARE.
  22.  */
  23.  
  24. /** @file brw_fs.cpp
  25.  *
  26.  * This file drives the GLSL IR -> LIR translation, contains the
  27.  * optimizations on the LIR, and drives the generation of native code
  28.  * from the LIR.
  29.  */
  30.  
  31. extern "C" {
  32.  
  33. #include <sys/types.h>
  34.  
  35. #include "main/hash_table.h"
  36. #include "main/macros.h"
  37. #include "main/shaderobj.h"
  38. #include "main/uniforms.h"
  39. #include "main/fbobject.h"
  40. #include "program/prog_parameter.h"
  41. #include "program/prog_print.h"
  42. #include "program/register_allocate.h"
  43. #include "program/sampler.h"
  44. #include "program/hash_table.h"
  45. #include "brw_context.h"
  46. #include "brw_eu.h"
  47. #include "brw_wm.h"
  48. }
  49. #include "brw_fs.h"
  50. #include "glsl/glsl_types.h"
  51.  
/* Reset every field of this instruction to its default state.
 * NOTE(review): the memset assumes fs_inst is trivially copyable (no
 * vtable, no non-trivial members) — confirm against brw_fs.h if the
 * class gains such members.
 */
void
fs_inst::init()
{
   memset(this, 0, sizeof(*this));
   this->opcode = BRW_OPCODE_NOP;
   this->conditional_mod = BRW_CONDITIONAL_NONE;

   this->dst = reg_undef;
   this->src[0] = reg_undef;
   this->src[1] = reg_undef;
   this->src[2] = reg_undef;

   /* This will be the case for almost all instructions. */
   this->regs_written = 1;
}
  67.  
/** Default constructor: a NOP with undefined dst/src registers. */
fs_inst::fs_inst()
{
   init();
}
  72.  
/** Construct an instruction with the given opcode and no operands. */
fs_inst::fs_inst(enum opcode opcode)
{
   init();
   this->opcode = opcode;
}
  78.  
/** Construct an instruction with a destination but no sources. */
fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
{
   init();
   this->opcode = opcode;
   this->dst = dst;

   /* Virtual GRF destinations must have a non-negative register offset. */
   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
}
  88.  
/** Construct a one-source instruction. */
fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;

   /* Virtual GRF operands must have non-negative register offsets. */
   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
}
  101.  
/** Construct a two-source instruction. */
fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;

   /* Virtual GRF operands must have non-negative register offsets. */
   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
}
  117.  
/** Construct a three-source instruction. */
fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;

   /* Virtual GRF operands must have non-negative register offsets. */
   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
   if (src[2].file == GRF)
      assert(src[2].reg_offset >= 0);
}
  137.  
/* ALUn(op) defines a convenience emitter fs_visitor::op() that allocates
 * (on mem_ctx) and returns a new n-source fs_inst with opcode
 * BRW_OPCODE_<op>.  The caller is responsible for inserting it into the
 * instruction stream.
 */
#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0)                              \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

#define ALU3(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)    \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
   }

/* Instantiate an emitter for each ALU opcode used by the visitor. */
ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
  182.  
  183. /** Gen4 predicated IF. */
  184. fs_inst *
  185. fs_visitor::IF(uint32_t predicate)
  186. {
  187.    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
  188.    inst->predicate = predicate;
  189.    return inst;
  190. }
  191.  
  192. /** Gen6+ IF with embedded comparison. */
  193. fs_inst *
  194. fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
  195. {
  196.    assert(brw->gen >= 6);
  197.    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
  198.                                         reg_null_d, src0, src1);
  199.    inst->conditional_mod = condition;
  200.    return inst;
  201. }
  202.  
/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    * gen5 does the comparison on the execution type (resolved source types),
    * so dst type doesn't matter.  gen6 does comparison and then uses the
    * result as if it was the dst type with no conversion, which happens to
    * mostly work out for float-interpreted-as-int since our comparisons are
    * for >0, =0, <0.
    */
   if (brw->gen == 4) {
      dst.type = src0.type;
      if (dst.file == HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   /* Presumably rewrites negated-UD sources into a form the hardware
    * compares correctly — see resolve_ud_negate's definition to confirm.
    */
   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}
  239.  
/* Emit the instruction sequence for a pull-constant load whose offset is
 * computed at run time (e.g. indexing a uniform array with a varying).
 * Returns the list of generated instructions; the caller inserts them.
 */
exec_list
fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
                                       fs_reg varying_offset,
                                       uint32_t const_offset)
{
   exec_list instructions;
   fs_inst *inst;

   /* We have our constant surface use a pitch of 4 bytes, so our index can
    * be any component of a vector, and then we load 4 contiguous
    * components starting from that.
    *
    * We break down the const_offset to a portion added to the variable
    * offset and a portion done using reg_offset, which means that if you
    * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
    * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
    * CSE can later notice that those loads are all the same and eliminate
    * the redundant ones.
    */
   fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
   instructions.push_tail(ADD(vec4_offset,
                              varying_offset, const_offset & ~3));

   int scale = 1;
   if (brw->gen == 4 && dispatch_width == 8) {
      /* Pre-gen5, we can either use a SIMD8 message that requires (header,
       * u, v, r) as parameters, or we can just use the SIMD16 message
       * consisting of (header, u).  We choose the second, at the cost of a
       * longer return length.
       */
      scale = 2;
   }

   /* Gen7+ has a dedicated send-from-GRF opcode for this load. */
   enum opcode op;
   if (brw->gen >= 7)
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
   else
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
   fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
   inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
   inst->regs_written = 4 * scale;
   instructions.push_tail(inst);

   /* Pre-gen7 the load goes through MRFs and needs a message header. */
   if (brw->gen < 7) {
      inst->base_mrf = 13;
      inst->header_present = true;
      if (brw->gen == 4)
         inst->mlen = 3;
      else
         inst->mlen = 1 + dispatch_width / 8;
   }

   /* Select the requested component out of the loaded vec4. */
   vec4_result.reg_offset += (const_offset & 3) * scale;
   instructions.push_tail(MOV(dst, vec4_result));

   return instructions;
}
  297.  
  298. /**
  299.  * A helper for MOV generation for fixing up broken hardware SEND dependency
  300.  * handling.
  301.  */
  302. fs_inst *
  303. fs_visitor::DEP_RESOLVE_MOV(int grf)
  304. {
  305.    fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
  306.  
  307.    inst->ir = NULL;
  308.    inst->annotation = "send dependency resolve";
  309.  
  310.    /* The caller always wants uncompressed to emit the minimal extra
  311.     * dependencies, and to avoid having to deal with aligning its regs to 2.
  312.     */
  313.    inst->force_uncompressed = true;
  314.  
  315.    return inst;
  316. }
  317.  
  318. bool
  319. fs_inst::equals(fs_inst *inst)
  320. {
  321.    return (opcode == inst->opcode &&
  322.            dst.equals(inst->dst) &&
  323.            src[0].equals(inst->src[0]) &&
  324.            src[1].equals(inst->src[1]) &&
  325.            src[2].equals(inst->src[2]) &&
  326.            saturate == inst->saturate &&
  327.            predicate == inst->predicate &&
  328.            conditional_mod == inst->conditional_mod &&
  329.            mlen == inst->mlen &&
  330.            base_mrf == inst->base_mrf &&
  331.            sampler == inst->sampler &&
  332.            target == inst->target &&
  333.            eot == inst->eot &&
  334.            header_present == inst->header_present &&
  335.            shadow_compare == inst->shadow_compare &&
  336.            offset == inst->offset);
  337. }
  338.  
  339. bool
  340. fs_inst::overwrites_reg(const fs_reg &reg)
  341. {
  342.    return (reg.file == dst.file &&
  343.            reg.reg == dst.reg &&
  344.            reg.reg_offset >= dst.reg_offset  &&
  345.            reg.reg_offset < dst.reg_offset + regs_written);
  346. }
  347.  
  348. bool
  349. fs_inst::is_send_from_grf()
  350. {
  351.    return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
  352.            opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
  353.            (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
  354.             src[1].file == GRF));
  355. }
  356.  
  357. bool
  358. fs_visitor::can_do_source_mods(fs_inst *inst)
  359. {
  360.    if (brw->gen == 6 && inst->is_math())
  361.       return false;
  362.  
  363.    if (inst->is_send_from_grf())
  364.       return false;
  365.  
  366.    return true;
  367. }
  368.  
/* Zero all fields; smear's "no smear" state is -1, not 0.
 * NOTE(review): memset on `this` assumes fs_reg stays trivially copyable.
 */
void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->smear = -1;
}
  375.  
/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}
  382.  
/** Immediate value constructor (32-bit float immediate). */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}
  391.  
/** Immediate value constructor (signed 32-bit integer immediate). */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}
  400.  
/** Immediate value constructor (unsigned 32-bit integer immediate). */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}
  409.  
/** Fixed brw_reg Immediate value constructor: wraps a hardware register. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   /* Mirror the hardware register's type so later code agrees with it. */
   this->type = fixed_hw_reg.type;
}
  418.  
/* Field-by-field register equality.  The fixed_hw_reg comparison is a raw
 * memcmp; init() memsets the whole fs_reg, so padding inside brw_reg
 * compares equal for registers built via the normal constructors.
 * Registers using reladdr never compare equal to anything.
 */
bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           !reladdr && !r.reladdr &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0 &&
           smear == r.smear &&
           imm.u == r.imm.u);
}
  434.  
  435. bool
  436. fs_reg::is_zero() const
  437. {
  438.    if (file != IMM)
  439.       return false;
  440.  
  441.    return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
  442. }
  443.  
  444. bool
  445. fs_reg::is_one() const
  446. {
  447.    if (file != IMM)
  448.       return false;
  449.  
  450.    return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
  451. }
  452.  
  453. bool
  454. fs_reg::is_valid_3src() const
  455. {
  456.    return file == GRF || file == UNIFORM;
  457. }
  458.  
  459. int
  460. fs_visitor::type_size(const struct glsl_type *type)
  461. {
  462.    unsigned int size, i;
  463.  
  464.    switch (type->base_type) {
  465.    case GLSL_TYPE_UINT:
  466.    case GLSL_TYPE_INT:
  467.    case GLSL_TYPE_FLOAT:
  468.    case GLSL_TYPE_BOOL:
  469.       return type->components();
  470.    case GLSL_TYPE_ARRAY:
  471.       return type_size(type->fields.array) * type->length;
  472.    case GLSL_TYPE_STRUCT:
  473.       size = 0;
  474.       for (i = 0; i < type->length; i++) {
  475.          size += type_size(type->fields.structure[i].type);
  476.       }
  477.       return size;
  478.    case GLSL_TYPE_SAMPLER:
  479.       /* Samplers take up no register space, since they're baked in at
  480.        * link time.
  481.        */
  482.       return 0;
  483.    case GLSL_TYPE_VOID:
  484.    case GLSL_TYPE_ERROR:
  485.    case GLSL_TYPE_INTERFACE:
  486.       assert(!"not reached");
  487.       break;
  488.    }
  489.  
  490.    return 0;
  491. }
  492.  
/* Read the GPU timestamp architecture register into a fresh UD virtual GRF
 * and return it (smeared to component 0, the low 32 bits).
 */
fs_reg
fs_visitor::get_timestamp()
{
   assert(brw->gen >= 7);

   fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(this, glsl_type::uint_type);

   fs_inst *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;
   mov->force_uncompressed = true;

   /* The caller wants the low 32 bits of the timestamp.  Since it's running
    * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes.  It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    *
    * The caller could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
   dst.smear = 0;

   return dst;
}
  526.  
/* Snapshot the GPU timestamp at shader entry for INTEL_DEBUG shader-time
 * profiling; emit_shader_time_end() reads it back to compute the delta.
 */
void
fs_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}
  533.  
/* Emit code that computes elapsed GPU cycles since emit_shader_time_begin()
 * and accumulates them into the shader-time buffer, discarding samples that
 * straddle a timestamp reset.
 */
void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   /* Pick the accumulation slots matching this dispatch width. */
   enum shader_time_shader_type type, written_type, reset_type;
   if (dispatch_width == 8) {
      type = ST_FS8;
      written_type = ST_FS8_WRITTEN;
      reset_type = ST_FS8_RESET;
   } else {
      assert(dispatch_width == 16);
      type = ST_FS16;
      written_type = ST_FS16_WRITTEN;
      reset_type = ST_FS16_RESET;
   }

   fs_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.smear = 2;
   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   emit(IF(BRW_PREDICATE_NORMAL));

   push_force_uncompressed();
   /* Negate the start time so the ADD below computes end - start. */
   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(this, glsl_type::uint_type);
   emit(ADD(diff, start, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, diff, fs_reg(-2u)));

   emit_shader_time_write(type, diff);
   emit_shader_time_write(written_type, fs_reg(1u));
   emit(BRW_OPCODE_ELSE);
   /* A reset happened between our two reads: count the sample as reset. */
   emit_shader_time_write(reset_type, fs_reg(1u));
   emit(BRW_OPCODE_ENDIF);

   pop_force_uncompressed();
}
  582.  
/* Emit a SHADER_TIME_ADD that atomically adds \p value into the buffer slot
 * for \p type.  The payload register is only a scratch destination sized
 * for the message (uvec2 in SIMD8, uint in SIMD16).
 */
void
fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                   fs_reg value)
{
   int shader_time_index =
      brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
   fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);

   fs_reg payload;
   if (dispatch_width == 8)
      payload = fs_reg(this, glsl_type::uvec2_type);
   else
      payload = fs_reg(this, glsl_type::uint_type);

   emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
                fs_reg(), payload, offset, value));
}
  600.  
  601. void
  602. fs_visitor::fail(const char *format, ...)
  603. {
  604.    va_list va;
  605.    char *msg;
  606.  
  607.    if (failed)
  608.       return;
  609.  
  610.    failed = true;
  611.  
  612.    va_start(va, format);
  613.    msg = ralloc_vasprintf(mem_ctx, format, va);
  614.    va_end(va);
  615.    msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
  616.  
  617.    this->fail_msg = msg;
  618.  
  619.    if (INTEL_DEBUG & DEBUG_WM) {
  620.       fprintf(stderr, "%s",  msg);
  621.    }
  622. }
  623.  
/* Convenience wrapper: build a zero-operand fs_inst and emit it. */
fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(fs_inst(opcode));
}
  629.  
/* Convenience wrapper: build a destination-only fs_inst and emit it. */
fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst)
{
   return emit(fs_inst(opcode, dst));
}
  635.  
/* Convenience wrapper: build a one-source fs_inst and emit it. */
fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   return emit(fs_inst(opcode, dst, src0));
}
  641.  
/* Convenience wrapper: build a two-source fs_inst and emit it. */
fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   return emit(fs_inst(opcode, dst, src0, src1));
}
  647.  
/* Convenience wrapper: build a three-source fs_inst and emit it. */
fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   return emit(fs_inst(opcode, dst, src0, src1, src2));
}
  654.  
/* Enter a region where emitted instructions are forced uncompressed
 * (SIMD8); nestable, balanced by pop_force_uncompressed().
 */
void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}
  660.  
/* Leave a force-uncompressed region; asserts push/pop calls are balanced. */
void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   assert(force_uncompressed_stack >= 0);
}
  667.  
/* Enter a region where emitted instructions target the second half of a
 * SIMD16 execution; nestable, balanced by pop_force_sechalf().
 */
void
fs_visitor::push_force_sechalf()
{
   force_sechalf_stack++;
}
  673.  
/* Leave a force-sechalf region; asserts push/pop calls are balanced. */
void
fs_visitor::pop_force_sechalf()
{
   force_sechalf_stack--;
   assert(force_sechalf_stack >= 0);
}
  680.  
  681. /**
  682.  * Returns true if the instruction has a flag that means it won't
  683.  * update an entire destination register.
  684.  *
  685.  * For example, dead code elimination and live variable analysis want to know
  686.  * when a write to a variable screens off any preceding values that were in
  687.  * it.
  688.  */
  689. bool
  690. fs_inst::is_partial_write()
  691. {
  692.    return (this->predicate ||
  693.            this->force_uncompressed ||
  694.            this->force_sechalf);
  695. }
  696.  
/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   /* Messages with no length write no MRFs. */
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   /* Unary math: one MRF per 8 channels of dispatch. */
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   /* Binary math: two operands' worth of MRFs. */
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   /* Texturing messages write a single MRF here (payload MOVs aside). */
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_MS:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_LOD:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      return inst->mlen;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}
  745.  
  746. int
  747. fs_visitor::virtual_grf_alloc(int size)
  748. {
  749.    if (virtual_grf_array_size <= virtual_grf_count) {
  750.       if (virtual_grf_array_size == 0)
  751.          virtual_grf_array_size = 16;
  752.       else
  753.          virtual_grf_array_size *= 2;
  754.       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
  755.                                    virtual_grf_array_size);
  756.    }
  757.    virtual_grf_sizes[virtual_grf_count] = size;
  758.    return virtual_grf_count++;
  759. }
  760.  
/** Fixed HW reg constructor; the type defaults to float. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
}
  769.  
/** Fixed HW reg constructor with an explicit register type. */
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
}
  778.  
/** Automatic reg constructor: allocates a virtual GRF sized for \p type. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(v->type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}
  789.  
/* Look up the register backing \p var; NULL if none has been assigned. */
fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}
  795.  
  796. void
  797. import_uniforms_callback(const void *key,
  798.                          void *data,
  799.                          void *closure)
  800. {
  801.    struct hash_table *dst_ht = (struct hash_table *)closure;
  802.    const fs_reg *reg = (const fs_reg *)data;
  803.  
  804.    if (reg->file != UNIFORM)
  805.       return;
  806.  
  807.    hash_table_insert(dst_ht, data, key);
  808. }
  809.  
/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 * This brings in those uniform definitions
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   /* Copy only the UNIFORM entries of v's variable table (see the
    * import_uniforms_callback filter) into ours.
    */
   hash_table_call_foreach(v->variable_ht,
                           import_uniforms_callback,
                           variable_ht);
   this->params_remap = v->params_remap;
   this->nr_params_remap = v->nr_params_remap;
}
  822.  
/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
void
fs_visitor::setup_uniform_values(ir_variable *ir)
{
   int namelen = strlen(ir->name);

   /* The data for our (non-builtin) uniforms is stored in a series of
    * gl_uniform_driver_storage structs for each subcomponent that
    * glGetUniformLocation() could name.  We know it's been set up in the same
    * order we'd walk the type, so walk the list of storage and find anything
    * with our name, or the prefix of a component that starts with our name.
    */
   unsigned params_before = c->prog_data.nr_params;
   for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
      struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];

      /* Match "name" exactly, or "name." / "name[" prefixes for members
       * and array elements; reject unrelated names sharing the prefix.
       */
      if (strncmp(ir->name, storage->name, namelen) != 0 ||
          (storage->name[namelen] != 0 &&
           storage->name[namelen] != '.' &&
           storage->name[namelen] != '[')) {
         continue;
      }

      unsigned slots = storage->type->component_slots();
      if (storage->array_elements)
         slots *= storage->array_elements;

      /* Point each param slot at the driver storage for that component. */
      for (unsigned i = 0; i < slots; i++) {
         c->prog_data.param[c->prog_data.nr_params++] =
            &storage->storage[i].f;
      }
   }

   /* Make sure we actually initialized the right amount of stuff here. */
   assert(params_before + ir->type->component_slots() ==
          c->prog_data.nr_params);
   (void)params_before;
}
  865.  

/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         /* A repeated swizzle marks the end of the element's components. */
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         c->prog_data.param[c->prog_data.nr_params++] =
            &fp->Base.Parameters->ParameterValues[index][swiz].f;
      }
   }
}
  900.  
/* Emit the code that computes gl_FragCoord (x, y, z, w) into a newly
 * allocated register and return it.  Handles the pixel-center-integer and
 * origin-upper-left layout qualifiers, plus FBO y-flip.
 */
fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   /* Flip y when the shader's origin convention disagrees with the render
    * target orientation (window system vs. FBO rendering).
    */
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
         /* y' = height - 1 - y, folded into negate + offset. */
         pixel_y.negate = true;
         offset += c->key.drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (brw->gen >= 6) {
      /* Gen6+ delivers source depth in the payload; just copy it. */
      emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      /* Older gens interpolate z from the position varying. */
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(VARYING_SLOT_POS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}
  948.  
  949. fs_inst *
  950. fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
  951.                          glsl_interp_qualifier interpolation_mode,
  952.                          bool is_centroid)
  953. {
  954.    brw_wm_barycentric_interp_mode barycoord_mode;
  955.    if (brw->gen >= 6) {
  956.       if (is_centroid) {
  957.          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
  958.             barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
  959.          else
  960.             barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
  961.       } else {
  962.          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
  963.             barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
  964.          else
  965.             barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
  966.       }
  967.    } else {
  968.       /* On Ironlake and below, there is only one interpolation mode.
  969.        * Centroid interpolation doesn't mean anything on this hardware --
  970.        * there is no multisampling.
  971.        */
  972.       barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
  973.    }
  974.    return emit(FS_OPCODE_LINTERP, attr,
  975.                this->delta_x[barycoord_mode],
  976.                this->delta_y[barycoord_mode], interp);
  977. }
  978.  
fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   /* Emit interpolation (or flat-shaded fetch) code for a varying FS
    * input and return the virtual register that holds its value.
    * Arrays and matrices are walked element-by-element / column-by-
    * column; each column consumes one varying slot (`location`).
    */
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   /* Components are produced in the variable's scalar base type. */
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   /* Flatten one level of array-ness; `type` is the per-element type. */
   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
            /* Constant interpolation (flat shading) case. The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               /* The constant lives at sub-offset 3 of the setup reg. */
               interp = suboffset(interp, 3);
               interp.type = reg->type;
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr.reg_offset++;
            }
         } else {
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               /* FINISHME: At some point we probably want to push
                * this farther by giving similar treatment to the
                * other potentially constant components of the
                * attribute, as well as making brw_vs_constval.c
                * handle varyings other than gl_TexCoord.
                */
               struct brw_reg interp = interp_reg(location, k);
               emit_linterp(attr, fs_reg(interp), interpolation_mode,
                            ir->centroid);
               if (brw->needs_unlit_centroid_workaround && ir->centroid) {
                  /* Get the pixel/sample mask into f0 so that we know
                   * which pixels are lit.  Then, for each channel that is
                   * unlit, replace the centroid data with non-centroid
                   * data.
                   */
                  emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
                  fs_inst *inst = emit_linterp(attr, fs_reg(interp),
                                               interpolation_mode, false);
                  inst->predicate = BRW_PREDICATE_NORMAL;
                  inst->predicate_inverse = true;
               }
               if (brw->gen < 6) {
                  /* Pre-Gen6 LINTERP yields attr/w; multiply by w to
                   * recover the perspective-correct value.
                   */
                  emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
               }
               attr.reg_offset++;
            }

         }
         location++;
      }
   }

   return reg;
}
  1064.  
  1065. fs_reg *
  1066. fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
  1067. {
  1068.    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
  1069.  
  1070.    /* The frontfacing comes in as a bit in the thread payload. */
  1071.    if (brw->gen >= 6) {
  1072.       emit(BRW_OPCODE_ASR, *reg,
  1073.            fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
  1074.            fs_reg(15));
  1075.       emit(BRW_OPCODE_NOT, *reg, *reg);
  1076.       emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
  1077.    } else {
  1078.       struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
  1079.       /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
  1080.        * us front face
  1081.        */
  1082.       emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
  1083.       emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
  1084.    }
  1085.  
  1086.    return reg;
  1087. }
  1088.  
  1089. fs_reg
  1090. fs_visitor::fix_math_operand(fs_reg src)
  1091. {
  1092.    /* Can't do hstride == 0 args on gen6 math, so expand it out. We
  1093.     * might be able to do better by doing execsize = 1 math and then
  1094.     * expanding that result out, but we would need to be careful with
  1095.     * masking.
  1096.     *
  1097.     * The hardware ignores source modifiers (negate and abs) on math
  1098.     * instructions, so we also move to a temp to set those up.
  1099.     */
  1100.    if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
  1101.        !src.abs && !src.negate)
  1102.       return src;
  1103.  
  1104.    /* Gen7 relaxes most of the above restrictions, but still can't use IMM
  1105.     * operands to math
  1106.     */
  1107.    if (brw->gen >= 7 && src.file != IMM)
  1108.       return src;
  1109.  
  1110.    fs_reg expanded = fs_reg(this, glsl_type::float_type);
  1111.    expanded.type = src.type;
  1112.    emit(BRW_OPCODE_MOV, expanded, src);
  1113.    return expanded;
  1114. }
  1115.  
fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   /* Emit a unary math-function instruction.  Only the listed opcodes
    * are valid single-operand math functions; anything else is a
    * caller bug.
    */
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (brw->gen >= 6)
      src = fix_math_operand(src);

   fs_inst *inst = emit(opcode, dst, src);

   /* Pre-Gen6, math is a send to the shared math unit and needs an MRF
    * message payload: one register per 8 channels of dispatch.
    */
   if (brw->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}
  1153.  
fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   /* Emit a binary math-function instruction (POW or integer
    * division/remainder).  Other opcodes are caller bugs.
    */
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      /* 16-wide INTDIV is not supported on Gen7+; record the failure. */
      if (brw->gen >= 7 && dispatch_width == 16)
         fail("16-wide INTDIV unsupported\n");
      break;
   case SHADER_OPCODE_POW:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (brw->gen >= 6) {
      /* Gen6+ math takes both operands directly (after legalizing them). */
      src0 = fix_math_operand(src0);
      src1 = fix_math_operand(src1);

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      /* So for INT DIV the operands must be swapped relative to POW:
       * src1 (denominator) goes first, src0 (numerator) second.
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      /* The second operand travels in the MRF message payload; the send
       * reads base_mrf..base_mrf+1 (mlen covers both operands).
       */
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}
  1200.  
  1201. void
  1202. fs_visitor::assign_curb_setup()
  1203. {
  1204.    c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
  1205.    if (dispatch_width == 8) {
  1206.       c->prog_data.first_curbe_grf = c->nr_payload_regs;
  1207.    } else {
  1208.       c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
  1209.    }
  1210.  
  1211.    /* Map the offsets in the UNIFORM file to fixed HW regs. */
  1212.    foreach_list(node, &this->instructions) {
  1213.       fs_inst *inst = (fs_inst *)node;
  1214.  
  1215.       for (unsigned int i = 0; i < 3; i++) {
  1216.          if (inst->src[i].file == UNIFORM) {
  1217.             int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
  1218.             struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
  1219.                                                   constant_nr / 8,
  1220.                                                   constant_nr % 8);
  1221.  
  1222.             inst->src[i].file = HW_REG;
  1223.             inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
  1224.          }
  1225.       }
  1226.    }
  1227. }
  1228.  
void
fs_visitor::calculate_urb_setup()
{
   /* Decide which URB slot each fragment-shader input varying lands in,
    * filling urb_setup[] (-1 means "no setup data for this slot") and
    * computing prog_data.urb_read_length.
    */
   for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
      urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (brw->gen >= 6) {
      /* Gen6+: slots are packed in order of the inputs the FS reads. */
      for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
         if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
            urb_setup[i] = urb_next++;
         }
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VARYING_SLOT_PSIZ)
            continue;

         if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
            /* The back color slot is skipped when the front color is
             * also written to.  In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader.  So the register number must always be
             * incremented, mapped or not.
             */
            if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
               urb_setup[i] = urb_next;
            urb_next++;
         }
      }

      /*
       * It's a FS only attribute, and we did interpolation for this attribute
       * in SF thread. So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
         urb_setup[VARYING_SLOT_PNTC] = urb_next++;
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   c->prog_data.urb_read_length = urb_next * 2;
}
  1277.  
  1278. void
  1279. fs_visitor::assign_urb_setup()
  1280. {
  1281.    int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
  1282.  
  1283.    /* Offset all the urb_setup[] index by the actual position of the
  1284.     * setup regs, now that the location of the constants has been chosen.
  1285.     */
  1286.    foreach_list(node, &this->instructions) {
  1287.       fs_inst *inst = (fs_inst *)node;
  1288.  
  1289.       if (inst->opcode == FS_OPCODE_LINTERP) {
  1290.          assert(inst->src[2].file == HW_REG);
  1291.          inst->src[2].fixed_hw_reg.nr += urb_start;
  1292.       }
  1293.  
  1294.       if (inst->opcode == FS_OPCODE_CINTERP) {
  1295.          assert(inst->src[0].file == HW_REG);
  1296.          inst->src[0].fixed_hw_reg.nr += urb_start;
  1297.       }
  1298.    }
  1299.  
  1300.    this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
  1301. }
  1302.  
  1303. /**
  1304.  * Split large virtual GRFs into separate components if we can.
  1305.  *
  1306.  * This is mostly duplicated with what brw_fs_vector_splitting does,
  1307.  * but that's really conservative because it's afraid of doing
  1308.  * splitting that doesn't result in real progress after the rest of
  1309.  * the optimization phases, which would cause infinite looping in
  1310.  * optimization.  We can do it once here, safely.  This also has the
  1311.  * opportunity to split interpolated values, or maybe even uniforms,
  1312.  * which we don't have at the IR level.
  1313.  *
  1314.  * We want to split, because virtual GRFs are what we register
  1315.  * allocate and spill (due to contiguousness requirements for some
  1316.  * instructions), and they're what we naturally generate in the
  1317.  * codegen process, but most virtual GRFs don't actually need to be
  1318.  * contiguous sets of GRFs.  If we split, we'll end up with reduced
  1319.  * live intervals and better dead code elimination and coalescing.
  1320.  */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   /* NOTE: variable-length arrays are a GCC extension in C++. */
   bool split_grf[num_vars];
   int new_virtual_grf[num_vars];

   /* Try to split anything > 0 sized. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
         split_grf[i] = true;
      else
         split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* If there's a SEND message that requires contiguous destination
       * registers, no splitting is allowed.
       */
      if (inst->regs_written > 1) {
         split_grf[inst->dst.reg] = false;
      }

      /* If we're sending from a GRF, don't split it, on the assumption that
       * the send is reading the whole thing.
       */
      if (inst->is_send_from_grf()) {
         for (int i = 0; i < 3; i++) {
            if (inst->src[i].file == GRF) {
               split_grf[inst->src[i].reg] = false;
            }
         }
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
         /* Offset 0 keeps the original register number; offsets 1..size-1
          * map to new_virtual_grf[i] + (offset - 1), allocated here as a
          * contiguous run of size-1 single-register GRFs.
          */
         new_virtual_grf[i] = virtual_grf_alloc(1);
         for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
            int reg = virtual_grf_alloc(1);
            assert(reg == new_virtual_grf[i] + j - 1);
            (void) reg;
         }
         this->virtual_grf_sizes[i] = 1;
      }
   }

   /* Rewrite every dst/src whose reg_offset is non-zero to point at the
    * corresponding newly allocated single register.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF &&
          split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF &&
             split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   /* Register numbering changed, so liveness data is stale. */
   this->live_intervals_valid = false;
}
  1406.  
  1407. /**
  1408.  * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
  1409.  *
  1410.  * During code generation, we create tons of temporary variables, many of
  1411.  * which get immediately killed and are never used again.  Yet, in later
  1412.  * optimization and analysis passes, such as compute_live_intervals, we need
  1413.  * to loop over all the virtual GRFs.  Compacting them can save a lot of
  1414.  * overhead.
  1415.  */
void
fs_visitor::compact_virtual_grfs()
{
   /* Mark which virtual GRFs are used, and count how many. */
   int remap_table[this->virtual_grf_count];
   /* -1 == unused; any use below overwrites with 0 as a placeholder. */
   memset(remap_table, -1, sizeof(remap_table));

   foreach_list(node, &this->instructions) {
      const fs_inst *inst = (const fs_inst *) node;

      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* In addition to registers used in instructions, fs_visitor keeps
    * direct references to certain special values which must be patched:
    */
   fs_reg *special[] = {
      &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
      &outputs[0], &outputs[1], &outputs[2], &outputs[3],
      &outputs[4], &outputs[5], &outputs[6], &outputs[7],
      &delta_x[0], &delta_x[1], &delta_x[2],
      &delta_x[3], &delta_x[4], &delta_x[5],
      &delta_y[0], &delta_y[1], &delta_y[2],
      &delta_y[3], &delta_y[4], &delta_y[5],
   };
   /* Keep the list above in sync with the sizes of delta_x/y and outputs. */
   STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);

   /* Treat all special values as used, to be conservative */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF)
         remap_table[special[i]->reg] = 0;
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (int i = 0; i < this->virtual_grf_count; i++) {
      if (remap_table[i] != -1) {
         /* Assign the next dense index and slide the per-GRF metadata down. */
         remap_table[i] = new_index;
         virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
         if (live_intervals_valid) {
            virtual_grf_start[new_index] = virtual_grf_start[i];
            virtual_grf_end[new_index] = virtual_grf_end[i];
         }
         ++new_index;
      }
   }

   this->virtual_grf_count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to special values */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
         special[i]->reg = remap_table[special[i]->reg];
   }
}
  1491.  
  1492. bool
  1493. fs_visitor::remove_dead_constants()
  1494. {
  1495.    if (dispatch_width == 8) {
  1496.       this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
  1497.       this->nr_params_remap = c->prog_data.nr_params;
  1498.  
  1499.       for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
  1500.          this->params_remap[i] = -1;
  1501.  
  1502.       /* Find which params are still in use. */
  1503.       foreach_list(node, &this->instructions) {
  1504.          fs_inst *inst = (fs_inst *)node;
  1505.  
  1506.          for (int i = 0; i < 3; i++) {
  1507.             int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
  1508.  
  1509.             if (inst->src[i].file != UNIFORM)
  1510.                continue;
  1511.  
  1512.             /* Section 5.11 of the OpenGL 4.3 spec says:
  1513.              *
  1514.              *     "Out-of-bounds reads return undefined values, which include
  1515.              *     values from other variables of the active program or zero."
  1516.              */
  1517.             if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
  1518.                constant_nr = 0;
  1519.             }
  1520.  
  1521.             /* For now, set this to non-negative.  We'll give it the
  1522.              * actual new number in a moment, in order to keep the
  1523.              * register numbers nicely ordered.
  1524.              */
  1525.             this->params_remap[constant_nr] = 0;
  1526.          }
  1527.       }
  1528.  
  1529.       /* Figure out what the new numbers for the params will be.  At some
  1530.        * point when we're doing uniform array access, we're going to want
  1531.        * to keep the distinction between .reg and .reg_offset, but for
  1532.        * now we don't care.
  1533.        */
  1534.       unsigned int new_nr_params = 0;
  1535.       for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
  1536.          if (this->params_remap[i] != -1) {
  1537.             this->params_remap[i] = new_nr_params++;
  1538.          }
  1539.       }
  1540.  
  1541.       /* Update the list of params to be uploaded to match our new numbering. */
  1542.       for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
  1543.          int remapped = this->params_remap[i];
  1544.  
  1545.          if (remapped == -1)
  1546.             continue;
  1547.  
  1548.          c->prog_data.param[remapped] = c->prog_data.param[i];
  1549.       }
  1550.  
  1551.       c->prog_data.nr_params = new_nr_params;
  1552.    } else {
  1553.       /* This should have been generated in the 8-wide pass already. */
  1554.       assert(this->params_remap);
  1555.    }
  1556.  
  1557.    /* Now do the renumbering of the shader to remove unused params. */
  1558.    foreach_list(node, &this->instructions) {
  1559.       fs_inst *inst = (fs_inst *)node;
  1560.  
  1561.       for (int i = 0; i < 3; i++) {
  1562.          int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
  1563.  
  1564.          if (inst->src[i].file != UNIFORM)
  1565.             continue;
  1566.  
  1567.          /* as above alias to 0 */
  1568.          if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
  1569.             constant_nr = 0;
  1570.          }
  1571.          assert(this->params_remap[constant_nr] != -1);
  1572.          inst->src[i].reg = this->params_remap[constant_nr];
  1573.          inst->src[i].reg_offset = 0;
  1574.       }
  1575.    }
  1576.  
  1577.    return true;
  1578. }
  1579.  
  1580. /*
  1581.  * Implements array access of uniforms by inserting a
  1582.  * PULL_CONSTANT_LOAD instruction.
  1583.  *
  1584.  * Unlike temporary GRF array access (where we don't support it due to
  1585.  * the difficulty of doing relative addressing on instruction
  1586.  * destinations), we could potentially do array access of uniforms
  1587.  * that were loaded in GRF space as push constants.  In real-world
  1588.  * usage we've seen, though, the arrays being used are always larger
  1589.  * than we could load as push constants, so just always move all
  1590.  * uniform array access out to a pull constant buffer.
  1591.  */
void
fs_visitor::move_uniform_array_access_to_pull_constants()
{
   /* Per-uniform location in the pull constant buffer, or -1 if it
    * hasn't been copied there yet.  NOTE: VLA is a GCC extension.
    */
   int pull_constant_loc[c->prog_data.nr_params];

   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0 ; i < 3; i++) {
         /* Only uniform sources with a relative-address qualify. */
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            /* Copy param_size[uniform] consecutive param pointers into
             * the pull-param list, starting at this uniform.
             */
            const float **values = &c->prog_data.param[uniform];

            pull_constant_loc[uniform] = c->prog_data.nr_pull_params;

            assert(param_size[uniform]);

            for (int j = 0; j < param_size[uniform]; j++) {
               c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
                  values[j];
            }
         }

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         /* Load the value through a varying pull-constant load into a
          * temporary, inserted right before the consuming instruction.
          */
         fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg temp = fs_reg(this, glsl_type::float_type);
         exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
                                                     surf_index,
                                                     *inst->src[i].reladdr,
                                                     pull_constant_loc[uniform] +
                                                     inst->src[i].reg_offset);
         inst->insert_before(&list);

         /* Redirect the source at the temporary; the reladdr is consumed. */
         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }
}
  1652.  
  1653. /**
  1654.  * Choose accesses from the UNIFORM file to demote to using the pull
  1655.  * constant buffer.
  1656.  *
  1657.  * We allow a fragment shader to have more than the specified minimum
  1658.  * maximum number of fragment shader uniform components (64).  If
  1659.  * there are too many of these, they'd fill up all of register space.
  1660.  * So, this will push some of them out to the pull constant buffer and
  1661.  * update the program to load them.
  1662.  */
  1663. void
  1664. fs_visitor::setup_pull_constants()
  1665. {
  1666.    /* Only allow 16 registers (128 uniform components) as push constants. */
  1667.    unsigned int max_uniform_components = 16 * 8;
  1668.    if (c->prog_data.nr_params <= max_uniform_components)
  1669.       return;
  1670.  
  1671.    if (dispatch_width == 16) {
  1672.       fail("Pull constants not supported in 16-wide\n");
  1673.       return;
  1674.    }
  1675.  
  1676.    /* Just demote the end of the list.  We could probably do better
  1677.     * here, demoting things that are rarely used in the program first.
  1678.     */
  1679.    unsigned int pull_uniform_base = max_uniform_components;
  1680.  
  1681.    int pull_constant_loc[c->prog_data.nr_params];
  1682.    for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
  1683.       if (i < pull_uniform_base) {
  1684.          pull_constant_loc[i] = -1;
  1685.       } else {
  1686.          pull_constant_loc[i] = -1;
  1687.          /* If our constant is already being uploaded for reladdr purposes,
  1688.           * reuse it.
  1689.           */
  1690.          for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
  1691.             if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
  1692.                pull_constant_loc[i] = j;
  1693.                break;
  1694.             }
  1695.          }
  1696.          if (pull_constant_loc[i] == -1) {
  1697.             int pull_index = c->prog_data.nr_pull_params++;
  1698.             c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
  1699.             pull_constant_loc[i] = pull_index;;
  1700.          }
  1701.       }
  1702.    }
  1703.    c->prog_data.nr_params = pull_uniform_base;
  1704.  
  1705.    foreach_list(node, &this->instructions) {
  1706.       fs_inst *inst = (fs_inst *)node;
  1707.  
  1708.       for (int i = 0; i < 3; i++) {
  1709.          if (inst->src[i].file != UNIFORM)
  1710.             continue;
  1711.  
  1712.          int pull_index = pull_constant_loc[inst->src[i].reg +
  1713.                                             inst->src[i].reg_offset];
  1714.          if (pull_index == -1)
  1715.             continue;
  1716.  
  1717.          assert(!inst->src[i].reladdr);
  1718.  
  1719.          fs_reg dst = fs_reg(this, glsl_type::float_type);
  1720.          fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
  1721.          fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
  1722.          fs_inst *pull =
  1723.             new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
  1724.                                  dst, index, offset);
  1725.          pull->ir = inst->ir;
  1726.          pull->annotation = inst->annotation;
  1727.  
  1728.          inst->insert_before(pull);
  1729.  
  1730.          inst->src[i].file = GRF;
  1731.          inst->src[i].reg = dst.reg;
  1732.          inst->src[i].reg_offset = 0;
  1733.          inst->src[i].smear = pull_index & 3;
  1734.       }
  1735.    }
  1736. }
  1737.  
  1738. bool
  1739. fs_visitor::opt_algebraic()
  1740. {
  1741.    bool progress = false;
  1742.  
  1743.    foreach_list(node, &this->instructions) {
  1744.       fs_inst *inst = (fs_inst *)node;
  1745.  
  1746.       switch (inst->opcode) {
  1747.       case BRW_OPCODE_MUL:
  1748.          if (inst->src[1].file != IMM)
  1749.             continue;
  1750.  
  1751.          /* a * 1.0 = a */
  1752.          if (inst->src[1].is_one()) {
  1753.             inst->opcode = BRW_OPCODE_MOV;
  1754.             inst->src[1] = reg_undef;
  1755.             progress = true;
  1756.             break;
  1757.          }
  1758.  
  1759.          /* a * 0.0 = 0.0 */
  1760.          if (inst->src[1].is_zero()) {
  1761.             inst->opcode = BRW_OPCODE_MOV;
  1762.             inst->src[0] = inst->src[1];
  1763.             inst->src[1] = reg_undef;
  1764.             progress = true;
  1765.             break;
  1766.          }
  1767.  
  1768.          break;
  1769.       case BRW_OPCODE_ADD:
  1770.          if (inst->src[1].file != IMM)
  1771.             continue;
  1772.  
  1773.          /* a + 0.0 = a */
  1774.          if (inst->src[1].is_zero()) {
  1775.             inst->opcode = BRW_OPCODE_MOV;
  1776.             inst->src[1] = reg_undef;
  1777.             progress = true;
  1778.             break;
  1779.          }
  1780.          break;
  1781.       default:
  1782.          break;
  1783.       }
  1784.    }
  1785.  
  1786.    return progress;
  1787. }
  1788.  
  1789. /**
  1790.  * Removes any instructions writing a VGRF where that VGRF is not used by any
  1791.  * later instruction.
  1792.  */
  1793. bool
  1794. fs_visitor::dead_code_eliminate()
  1795. {
  1796.    bool progress = false;
  1797.    int pc = 0;
  1798.  
  1799.    calculate_live_intervals();
  1800.  
  1801.    foreach_list_safe(node, &this->instructions) {
  1802.       fs_inst *inst = (fs_inst *)node;
  1803.  
  1804.       if (inst->dst.file == GRF) {
  1805.          assert(this->virtual_grf_end[inst->dst.reg] >= pc);
  1806.          if (this->virtual_grf_end[inst->dst.reg] == pc) {
  1807.             inst->remove();
  1808.             progress = true;
  1809.          }
  1810.       }
  1811.  
  1812.       pc++;
  1813.    }
  1814.  
  1815.    if (progress)
  1816.       live_intervals_valid = false;
  1817.  
  1818.    return progress;
  1819. }
  1820.  
/* Key for the local dead-code-elimination hash table: identifies one
 * register's worth (reg_offset) of a virtual GRF.
 */
struct dead_code_hash_key
{
   int vgrf;        /* virtual GRF number */
   int reg_offset;  /* register offset within that virtual GRF */
};
  1826.  
  1827. static bool
  1828. dead_code_hash_compare(const void *a, const void *b)
  1829. {
  1830.    return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
  1831. }
  1832.  
/* Empties the dead-code hash table by removing every entry.
 *
 * NOTE(review): entries are removed while iterating with
 * hash_table_foreach; this assumes the mesa hash table tolerates removal
 * during iteration (tombstoning) — confirm against main/hash_table.h.
 */
static void
clear_dead_code_hash(struct hash_table *ht)
{
   struct hash_entry *entry;

   hash_table_foreach(ht, entry) {
      _mesa_hash_table_remove(ht, entry);
   }
}
  1842.  
  1843. static void
  1844. insert_dead_code_hash(struct hash_table *ht,
  1845.                       int vgrf, int reg_offset, fs_inst *inst)
  1846. {
  1847.    /* We don't bother freeing keys, because they'll be GCed with the ht. */
  1848.    struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
  1849.  
  1850.    key->vgrf = vgrf;
  1851.    key->reg_offset = reg_offset;
  1852.  
  1853.    _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
  1854. }
  1855.  
  1856. static struct hash_entry *
  1857. get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
  1858. {
  1859.    struct dead_code_hash_key key;
  1860.  
  1861.    key.vgrf = vgrf;
  1862.    key.reg_offset = reg_offset;
  1863.  
  1864.    return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
  1865. }
  1866.  
  1867. static void
  1868. remove_dead_code_hash(struct hash_table *ht,
  1869.                       int vgrf, int reg_offset)
  1870. {
  1871.    struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
  1872.    if (!entry)
  1873.       return;
  1874.  
  1875.    _mesa_hash_table_remove(ht, entry);
  1876. }
  1877.  
  1878. /**
  1879.  * Walks basic blocks, removing any regs that are written but not read before
  1880.  * being redefined.
  1881.  *
  1882.  * The dead_code_eliminate() function implements a global dead code
 * elimination, but it only handles removing the last write to a register
 * if that register is never read.  This one can handle intermediate writes, but only
  1885.  * within a basic block.
  1886.  */
bool
fs_visitor::dead_code_eliminate_local()
{
   struct hash_table *ht;
   bool progress = false;

   /* Maps (vgrf, reg_offset) -> the most recent instruction writing that
    * register which has not been seen to be read since.
    */
   ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* At a basic block, empty the HT since we don't understand dataflow
       * here.
       */
      if (inst->is_control_flow()) {
         clear_dead_code_hash(ht);
         continue;
      }

      /* Clear the HT of any instructions that got read. */
      for (int i = 0; i < 3; i++) {
         fs_reg src = inst->src[i];
         if (src.file != GRF)
            continue;

         /* A send-from-GRF reads its payload from reg_offset through the
          * end of the virtual GRF, not just one register's worth.
          */
         int read = 1;
         if (inst->is_send_from_grf())
            read = virtual_grf_sizes[src.reg] - src.reg_offset;

         for (int reg_offset = src.reg_offset;
              reg_offset < src.reg_offset + read;
              reg_offset++) {
            remove_dead_code_hash(ht, src.reg, reg_offset);
         }
      }

      /* Add any update of a GRF to the HT, removing a previous write if it
       * wasn't read.
       */
      if (inst->dst.file == GRF) {
         if (inst->regs_written > 1) {
            /* We don't know how to trim channels from an instruction's
             * writes, so we can't incrementally remove unread channels from
             * it.  Just remove whatever it overwrites from the table
             */
            for (int i = 0; i < inst->regs_written; i++) {
               remove_dead_code_hash(ht,
                                     inst->dst.reg,
                                     inst->dst.reg_offset + i);
            }
         } else {
            struct hash_entry *entry =
               get_dead_code_hash_entry(ht, inst->dst.reg,
                                        inst->dst.reg_offset);

            if (inst->is_partial_write()) {
               /* For a partial write, we can't remove any previous dead code
                * candidate, since we're just modifying their result, but we can
                * be dead code eliminated ourselves.
                */
               if (entry) {
                  entry->data = inst;
               } else {
                  insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
                                        inst);
               }
            } else {
               if (entry) {
                  /* We're completely updating a channel, and there was a
                   * previous write to the channel that wasn't read.  Kill it!
                   */
                  fs_inst *inst = (fs_inst *)entry->data;
                  inst->remove();
                  progress = true;
                  _mesa_hash_table_remove(ht, entry);
               }

               insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
                                     inst);
            }
         }
      }
   }

   _mesa_hash_table_destroy(ht, NULL);

   if (progress)
      live_intervals_valid = false;

   return progress;
}
  1978.  
  1979. /**
  1980.  * Implements a second type of register coalescing: This one checks if
  1981.  * the two regs involved in a raw move don't interfere, in which case
  1982.  * they can both by stored in the same place and the MOV removed.
  1983.  */
bool
fs_visitor::register_coalesce_2()
{
   bool progress = false;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* Only coalesce a raw, full, unsaturated GRF->GRF MOV of a
       * single-register vgrf whose live range doesn't overlap the dst's.
       */
      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->is_partial_write() ||
          inst->saturate ||
          inst->src[0].file != GRF ||
          inst->src[0].negate ||
          inst->src[0].abs ||
          inst->src[0].smear != -1 ||
          inst->dst.file != GRF ||
          inst->dst.type != inst->src[0].type ||
          virtual_grf_sizes[inst->src[0].reg] != 1 ||
          virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
         continue;
      }

      int reg_from = inst->src[0].reg;
      assert(inst->src[0].reg_offset == 0);
      int reg_to = inst->dst.reg;
      int reg_to_offset = inst->dst.reg_offset;

      /* Rewrite every def and use of reg_from to reg_to, then delete
       * the now-redundant MOV.
       */
      foreach_list(node, &this->instructions) {
         fs_inst *scan_inst = (fs_inst *)node;

         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == reg_from) {
            scan_inst->dst.reg = reg_to;
            scan_inst->dst.reg_offset = reg_to_offset;
         }
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == reg_from) {
               scan_inst->src[i].reg = reg_to;
               scan_inst->src[i].reg_offset = reg_to_offset;
            }
         }
      }

      inst->remove();

      /* We don't need to recalculate live intervals inside the loop despite
       * flagging live_intervals_valid because we only use live intervals for
       * the interferes test, and we must have had a situation where the
       * intervals were:
       *
       *  from  to
       *  ^
       *  |
       *  v
       *        ^
       *        |
       *        v
       *
       * Some register R that might get coalesced with one of these two could
       * only be referencing "to", otherwise "from"'s range would have been
       * longer.  R's range could also only start at the end of "to" or later,
       * otherwise it will conflict with "to" when we try to coalesce "to"
       * into R anyway.
       */
      live_intervals_valid = false;

      progress = true;
      continue;
   }

   return progress;
}
  2059.  
/* Coalesces a GRF (or UNIFORM) -> GRF MOV by rewriting later uses of the
 * destination to read the MOV's source directly, then deleting the MOV.
 */
bool
fs_visitor::register_coalesce()
{
   bool progress = false;
   int if_depth = 0;
   int loop_depth = 0;

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* Make sure that we dominate the instructions we're going to
       * scan for interfering with our coalescing, or we won't have
       * scanned enough to see if anything interferes with our
       * coalescing.  We don't dominate the following instructions if
       * we're in a loop or an if block.
       */
      switch (inst->opcode) {
      case BRW_OPCODE_DO:
         loop_depth++;
         break;
      case BRW_OPCODE_WHILE:
         loop_depth--;
         break;
      case BRW_OPCODE_IF:
         if_depth++;
         break;
      case BRW_OPCODE_ENDIF:
         if_depth--;
         break;
      default:
         break;
      }
      if (loop_depth || if_depth)
         continue;

      /* Candidate: a raw, full, unsaturated MOV into a GRF whose source is
       * a GRF or UNIFORM of the same type.
       */
      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->is_partial_write() ||
          inst->saturate ||
          inst->dst.file != GRF || (inst->src[0].file != GRF &&
                                    inst->src[0].file != UNIFORM)||
          inst->dst.type != inst->src[0].type)
         continue;

      bool has_source_modifiers = (inst->src[0].abs ||
                                   inst->src[0].negate ||
                                   inst->src[0].smear != -1 ||
                                   inst->src[0].file == UNIFORM);

      /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
       * them: check for no writes to either one until the exit of the
       * program.
       */
      bool interfered = false;

      for (fs_inst *scan_inst = (fs_inst *)inst->next;
           !scan_inst->is_tail_sentinel();
           scan_inst = (fs_inst *)scan_inst->next) {
         if (scan_inst->dst.file == GRF) {
            if (scan_inst->overwrites_reg(inst->dst) ||
                scan_inst->overwrites_reg(inst->src[0])) {
               interfered = true;
               break;
            }
         }

         /* With source modifiers, a later use of the dst at a different type
          * would change the meaning once we substitute the modified source.
          */
         if (has_source_modifiers) {
            for (int i = 0; i < 3; i++) {
               if (scan_inst->src[i].file == GRF &&
                   scan_inst->src[i].reg == inst->dst.reg &&
                   scan_inst->src[i].reg_offset == inst->dst.reg_offset &&
                   inst->dst.type != scan_inst->src[i].type)
               {
                 interfered = true;
                 break;
               }
            }
         }


         /* The gen6 MATH instruction can't handle source modifiers or
          * unusual register regions, so avoid coalescing those for
          * now.  We should do something more specific.
          */
         if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
            interfered = true;
            break;
         }

         /* The accumulator result appears to get used for the
          * conditional modifier generation.  When negating a UD
          * value, there is a 33rd bit generated for the sign in the
          * accumulator value, so now you can't check, for example,
          * equality with a 32-bit value.  See piglit fs-op-neg-uint.
          */
         if (scan_inst->conditional_mod &&
             inst->src[0].negate &&
             inst->src[0].type == BRW_REGISTER_TYPE_UD) {
            interfered = true;
            break;
         }
      }
      if (interfered) {
         continue;
      }

      /* Rewrite the later usage to point at the source of the move to
       * be removed.
       */
      for (fs_inst *scan_inst = inst;
           !scan_inst->is_tail_sentinel();
           scan_inst = (fs_inst *)scan_inst->next) {
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == inst->dst.reg &&
                scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
               fs_reg new_src = inst->src[0];
               /* abs on the use wins over any negate on the def's source. */
               if (scan_inst->src[i].abs) {
                  new_src.negate = 0;
                  new_src.abs = 1;
               }
               new_src.negate ^= scan_inst->src[i].negate;
               scan_inst->src[i] = new_src;
            }
         }
      }

      inst->remove();
      progress = true;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}
  2195.  
  2196.  
/* Tries to replace a GRF-to-MRF MOV with rewriting the GRF's producer to
 * write into the MRF directly, eliminating the MOV.
 */
bool
fs_visitor::compute_to_mrf()
{
   bool progress = false;
   int next_ip = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      int ip = next_ip;
      next_ip++;

      /* Candidate: a raw, full GRF -> MRF MOV with no source modifiers. */
      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->is_partial_write() ||
          inst->dst.file != MRF || inst->src[0].file != GRF ||
          inst->dst.type != inst->src[0].type ||
          inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
         continue;

      /* Work out which hardware MRF registers are written by this
       * instruction.
       */
      int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
      int mrf_high;
      if (inst->dst.reg & BRW_MRF_COMPR4) {
         mrf_high = mrf_low + 4;
      } else if (dispatch_width == 16 &&
                 (!inst->force_uncompressed && !inst->force_sechalf)) {
         mrf_high = mrf_low + 1;
      } else {
         mrf_high = mrf_low;
      }

      /* Can't compute-to-MRF this GRF if someone else was going to
       * read it later.
       */
      if (this->virtual_grf_end[inst->src[0].reg] > ip)
         continue;

      /* Found a move of a GRF to a MRF.  Let's see if we can go
       * rewrite the thing that made this GRF to write into the MRF.
       */
      fs_inst *scan_inst;
      for (scan_inst = (fs_inst *)inst->prev;
           scan_inst->prev != NULL;
           scan_inst = (fs_inst *)scan_inst->prev) {
         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == inst->src[0].reg) {
            /* Found the last thing to write our reg we want to turn
             * into a compute-to-MRF.
             */

            /* If this one instruction didn't populate all the
             * channels, bail.  We might be able to rewrite everything
             * that writes that reg, but it would require smarter
             * tracking to delay the rewriting until complete success.
             */
            if (scan_inst->is_partial_write())
               break;

            /* Things returning more than one register would need us to
             * understand coalescing out more than one MOV at a time.
             */
            if (scan_inst->regs_written > 1)
               break;

            /* SEND instructions can't have MRF as a destination. */
            if (scan_inst->mlen)
               break;

            if (brw->gen == 6) {
               /* gen6 math instructions must have the destination be
                * GRF, so no compute-to-MRF for them.
                */
               if (scan_inst->is_math()) {
                  break;
               }
            }

            if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
               /* Found the creator of our MRF's source value. */
               scan_inst->dst.file = MRF;
               scan_inst->dst.reg = inst->dst.reg;
               scan_inst->saturate |= inst->saturate;
               inst->remove();
               progress = true;
            }
            break;
         }

         /* We don't handle control flow here.  Most computation of
          * values that end up in MRFs are shortly before the MRF
          * write anyway.
          */
         if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
            break;

         /* You can't read from an MRF, so if someone else reads our
          * MRF's source GRF that we wanted to rewrite, that stops us.
          */
         bool interfered = false;
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == inst->src[0].reg &&
                scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
               interfered = true;
            }
         }
         if (interfered)
            break;

         if (scan_inst->dst.file == MRF) {
            /* If somebody else writes our MRF here, we can't
             * compute-to-MRF before that.
             */
            int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
            int scan_mrf_high;

            if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
               scan_mrf_high = scan_mrf_low + 4;
            } else if (dispatch_width == 16 &&
                       (!scan_inst->force_uncompressed &&
                        !scan_inst->force_sechalf)) {
               scan_mrf_high = scan_mrf_low + 1;
            } else {
               scan_mrf_high = scan_mrf_low;
            }

            if (mrf_low == scan_mrf_low ||
                mrf_low == scan_mrf_high ||
                mrf_high == scan_mrf_low ||
                mrf_high == scan_mrf_high) {
               break;
            }
         }

         if (scan_inst->mlen > 0) {
            /* Found a SEND instruction, which means that there are
             * live values in MRFs from base_mrf to base_mrf +
             * scan_inst->mlen - 1.  Don't go pushing our MRF write up
             * above it.
             */
            if (mrf_low >= scan_inst->base_mrf &&
                mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
            if (mrf_high >= scan_inst->base_mrf &&
                mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
         }
      }
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}
  2358.  
  2359. /**
  2360.  * Walks through basic blocks, looking for repeated MRF writes and
  2361.  * removing the later ones.
  2362.  */
bool
fs_visitor::remove_duplicate_mrf_writes()
{
   /* Per MRF number, the last full GRF->MRF MOV still known to define it. */
   fs_inst *last_mrf_move[16];
   bool progress = false;

   /* Need to update the MRF tracking for compressed instructions. */
   if (dispatch_width == 16)
      return false;

   memset(last_mrf_move, 0, sizeof(last_mrf_move));

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* Tracking is per basic block; forget everything at control flow. */
      if (inst->is_control_flow()) {
         memset(last_mrf_move, 0, sizeof(last_mrf_move));
      }

      /* If this MOV writes the exact same value to the same MRF as the
       * recorded one, it's redundant — drop it.
       */
      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF) {
         fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
         if (prev_inst && inst->equals(prev_inst)) {
            inst->remove();
            progress = true;
            continue;
         }
      }

      /* Clear out the last-write records for MRFs that were overwritten. */
      if (inst->dst.file == MRF) {
         last_mrf_move[inst->dst.reg] = NULL;
      }

      if (inst->mlen > 0) {
         /* Found a SEND instruction, which will include two or fewer
          * implied MRF writes.  We could do better here.
          */
         for (int i = 0; i < implied_mrf_writes(inst); i++) {
            last_mrf_move[inst->base_mrf + i] = NULL;
         }
      }

      /* Clear out any MRF move records whose sources got overwritten. */
      if (inst->dst.file == GRF) {
         for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
            if (last_mrf_move[i] &&
                last_mrf_move[i]->src[0].reg == inst->dst.reg) {
               last_mrf_move[i] = NULL;
            }
         }
      }

      /* Record this MOV as the current definition of its MRF. */
      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF &&
          inst->src[0].file == GRF &&
          !inst->is_partial_write()) {
         last_mrf_move[inst->dst.reg] = inst;
      }
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}
  2429.  
/* Clears, in \p deps, the flags for any GRFs in [first_grf, first_grf +
 * grf_len) that \p inst reads — those dependencies are satisfied by the
 * read itself.  Used by the gen4 send-dependency workarounds below.
 */
static void
clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
                        int first_grf, int grf_len)
{
   /* A 16-wide instruction (unless split into halves) touches two GRFs
    * per logical register.
    */
   bool inst_16wide = (dispatch_width > 8 &&
                       !inst->force_uncompressed &&
                       !inst->force_sechalf);

   /* Clear the flag for registers that actually got read (as expected). */
   for (int i = 0; i < 3; i++) {
      int grf;
      if (inst->src[i].file == GRF) {
         grf = inst->src[i].reg;
      } else if (inst->src[i].file == HW_REG &&
                 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
         grf = inst->src[i].fixed_hw_reg.nr;
      } else {
         continue;
      }

      if (grf >= first_grf &&
          grf < first_grf + grf_len) {
         deps[grf - first_grf] = false;
         if (inst_16wide)
            deps[grf - first_grf + 1] = false;
      }
   }
}
  2458.  
  2459. /**
  2460.  * Implements this workaround for the original 965:
  2461.  *
  2462.  *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
  2463.  *      check for post destination dependencies on this instruction, software
  2464.  *      must ensure that there is no destination hazard for the case of ‘write
  2465.  *      followed by a posted write’ shown in the following example.
  2466.  *
  2467.  *      1. mov r3 0
  2468.  *      2. send r3.xy <rest of send instruction>
  2469.  *      3. mov r2 r3
  2470.  *
  2471.  *      Due to no post-destination dependency check on the ‘send’, the above
  2472.  *      code sequence could have two instructions (1 and 2) in flight at the
  2473.  *      same time that both consider ‘r3’ as the target of their final writes.
  2474.  */
void
fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
{
   int reg_size = dispatch_width / 8;
   int write_len = inst->regs_written * reg_size;
   int first_write_grf = inst->dst.reg;
   /* One flag per GRF that \p inst writes: true while a write to that GRF
    * may still be outstanding without an intervening read.
    */
   bool needs_dep[BRW_MAX_MRF];
   assert(write_len < (int)sizeof(needs_dep) - 1);

   memset(needs_dep, false, sizeof(needs_dep));
   memset(needs_dep, true, write_len);

   /* The send's own source reads already resolve those dependencies. */
   clear_deps_for_inst_src(inst, dispatch_width,
                           needs_dep, first_write_grf, write_len);

   /* Walk backwards looking for writes to registers we're writing which
    * aren't read since being written.  If we hit the start of the program,
    * we assume that there are no outstanding dependencies on entry to the
    * program.
    */
   for (fs_inst *scan_inst = (fs_inst *)inst->prev;
        scan_inst != NULL;
        scan_inst = (fs_inst *)scan_inst->prev) {

      /* If we hit control flow, assume that there *are* outstanding
       * dependencies, and force their cleanup before our instruction.
       */
      if (scan_inst->is_control_flow()) {
         for (int i = 0; i < write_len; i++) {
            if (needs_dep[i]) {
               inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
            }
         }
         return;
      }

      bool scan_inst_16wide = (dispatch_width > 8 &&
                               !scan_inst->force_uncompressed &&
                               !scan_inst->force_sechalf);

      /* We insert our reads as late as possible on the assumption that any
       * instruction but a MOV that might have left us an outstanding
       * dependency has more latency than a MOV.
       */
      if (scan_inst->dst.file == GRF) {
         for (int i = 0; i < scan_inst->regs_written; i++) {
            int reg = scan_inst->dst.reg + i * reg_size;

            if (reg >= first_write_grf &&
                reg < first_write_grf + write_len &&
                needs_dep[reg - first_write_grf]) {
               inst->insert_before(DEP_RESOLVE_MOV(reg));
               needs_dep[reg - first_write_grf] = false;
               if (scan_inst_16wide)
                  needs_dep[reg - first_write_grf + 1] = false;
            }
         }
      }

      /* Clear the flag for registers that actually got read (as expected). */
      clear_deps_for_inst_src(scan_inst, dispatch_width,
                              needs_dep, first_write_grf, write_len);

      /* Continue the loop only if we haven't resolved all the dependencies */
      int i;
      for (i = 0; i < write_len; i++) {
         if (needs_dep[i])
            break;
      }
      if (i == write_len)
         return;
   }
}
  2548.  
/**
 * Implements this workaround for the original 965:
 *
 *     "[DevBW, DevCL] Errata: A destination register from a send can not be
 *      used as a destination register until after it has been sourced by an
 *      instruction with a different destination register."
 */
void
fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
{
   /* Number of hardware GRFs the send writes; in 16-wide dispatch each
    * logical register covers twice as many hardware registers.
    */
   int write_len = inst->regs_written * dispatch_width / 8;
   int first_write_grf = inst->dst.reg;
   /* needs_dep[i] is true while GRF (first_write_grf + i) has not yet been
    * sourced since the send wrote it, i.e. a later write to it would still
    * trip the errata and needs a resolving MOV read inserted first.
    */
   bool needs_dep[BRW_MAX_MRF];
   assert(write_len < (int)sizeof(needs_dep) - 1);

   memset(needs_dep, false, sizeof(needs_dep));
   memset(needs_dep, true, write_len);
   /* Walk forwards looking for writes to registers we're writing which aren't
    * read before being written.
    */
   for (fs_inst *scan_inst = (fs_inst *)inst->next;
        !scan_inst->is_tail_sentinel();
        scan_inst = (fs_inst *)scan_inst->next) {
      /* If we hit control flow, force resolve all remaining dependencies. */
      if (scan_inst->is_control_flow()) {
         for (int i = 0; i < write_len; i++) {
            if (needs_dep[i])
               scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
         }
         return;
      }

      /* Clear the flag for registers that actually got read (as expected). */
      clear_deps_for_inst_src(scan_inst, dispatch_width,
                              needs_dep, first_write_grf, write_len);

      /* We insert our reads as late as possible since they're reading the
       * result of a SEND, which has massive latency.
       */
      if (scan_inst->dst.file == GRF &&
          scan_inst->dst.reg >= first_write_grf &&
          scan_inst->dst.reg < first_write_grf + write_len &&
          needs_dep[scan_inst->dst.reg - first_write_grf]) {
         scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
         needs_dep[scan_inst->dst.reg - first_write_grf] = false;
      }

      /* Continue the loop only if we haven't resolved all the dependencies */
      int i;
      for (i = 0; i < write_len; i++) {
         if (needs_dep[i])
            break;
      }
      if (i == write_len)
         return;
   }

   /* If we hit the end of the program, resolve all remaining dependencies out
    * of paranoia.
    */
   fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
   assert(last_inst->eot);
   for (int i = 0; i < write_len; i++) {
      if (needs_dep[i])
         last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
   }
}
  2616.  
  2617. void
  2618. fs_visitor::insert_gen4_send_dependency_workarounds()
  2619. {
  2620.    if (brw->gen != 4 || brw->is_g4x)
  2621.       return;
  2622.  
  2623.    /* Note that we're done with register allocation, so GRF fs_regs always
  2624.     * have a .reg_offset of 0.
  2625.     */
  2626.  
  2627.    foreach_list_safe(node, &this->instructions) {
  2628.       fs_inst *inst = (fs_inst *)node;
  2629.  
  2630.       if (inst->mlen != 0 && inst->dst.file == GRF) {
  2631.          insert_gen4_pre_send_dependency_workarounds(inst);
  2632.          insert_gen4_post_send_dependency_workarounds(inst);
  2633.       }
  2634.    }
  2635. }
  2636.  
  2637. /**
  2638.  * Turns the generic expression-style uniform pull constant load instruction
  2639.  * into a hardware-specific series of instructions for loading a pull
  2640.  * constant.
  2641.  *
  2642.  * The expression style allows the CSE pass before this to optimize out
  2643.  * repeated loads from the same offset, and gives the pre-register-allocation
  2644.  * scheduling full flexibility, while the conversion to native instructions
  2645.  * allows the post-register-allocation scheduler the best information
  2646.  * possible.
  2647.  *
  2648.  * Note that execution masking for setting up pull constant loads is special:
  2649.  * the channels that need to be written are unrelated to the current execution
  2650.  * mask, since a later instruction will use one of the result channels as a
  2651.  * source operand for all 8 or 16 of its channels.
  2652.  */
  2653. void
  2654. fs_visitor::lower_uniform_pull_constant_loads()
  2655. {
  2656.    foreach_list(node, &this->instructions) {
  2657.       fs_inst *inst = (fs_inst *)node;
  2658.  
  2659.       if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
  2660.          continue;
  2661.  
  2662.       if (brw->gen >= 7) {
  2663.          /* The offset arg before was a vec4-aligned byte offset.  We need to
  2664.           * turn it into a dword offset.
  2665.           */
  2666.          fs_reg const_offset_reg = inst->src[1];
  2667.          assert(const_offset_reg.file == IMM &&
  2668.                 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
  2669.          const_offset_reg.imm.u /= 4;
  2670.          fs_reg payload = fs_reg(this, glsl_type::uint_type);
  2671.  
  2672.          /* This is actually going to be a MOV, but since only the first dword
  2673.           * is accessed, we have a special opcode to do just that one.  Note
  2674.           * that this needs to be an operation that will be considered a def
  2675.           * by live variable analysis, or register allocation will explode.
  2676.           */
  2677.          fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
  2678.                                                payload, const_offset_reg);
  2679.          setup->force_writemask_all = true;
  2680.  
  2681.          setup->ir = inst->ir;
  2682.          setup->annotation = inst->annotation;
  2683.          inst->insert_before(setup);
  2684.  
  2685.          /* Similarly, this will only populate the first 4 channels of the
  2686.           * result register (since we only use smear values from 0-3), but we
  2687.           * don't tell the optimizer.
  2688.           */
  2689.          inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
  2690.          inst->src[1] = payload;
  2691.  
  2692.          this->live_intervals_valid = false;
  2693.       } else {
  2694.          /* Before register allocation, we didn't tell the scheduler about the
  2695.           * MRF we use.  We know it's safe to use this MRF because nothing
  2696.           * else does except for register spill/unspill, which generates and
  2697.           * uses its MRF within a single IR instruction.
  2698.           */
  2699.          inst->base_mrf = 14;
  2700.          inst->mlen = 1;
  2701.       }
  2702.    }
  2703. }
  2704.  
  2705. void
  2706. fs_visitor::dump_instruction(backend_instruction *be_inst)
  2707. {
  2708.    fs_inst *inst = (fs_inst *)be_inst;
  2709.  
  2710.    if (inst->predicate) {
  2711.       printf("(%cf0.%d) ",
  2712.              inst->predicate_inverse ? '-' : '+',
  2713.              inst->flag_subreg);
  2714.    }
  2715.  
  2716.    printf("%s", brw_instruction_name(inst->opcode));
  2717.    if (inst->saturate)
  2718.       printf(".sat");
  2719.    if (inst->conditional_mod) {
  2720.       printf(".cmod");
  2721.       if (!inst->predicate &&
  2722.           (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
  2723.                               inst->opcode != BRW_OPCODE_IF &&
  2724.                               inst->opcode != BRW_OPCODE_WHILE))) {
  2725.          printf(".f0.%d\n", inst->flag_subreg);
  2726.       }
  2727.    }
  2728.    printf(" ");
  2729.  
  2730.  
  2731.    switch (inst->dst.file) {
  2732.    case GRF:
  2733.       printf("vgrf%d", inst->dst.reg);
  2734.       if (inst->dst.reg_offset)
  2735.          printf("+%d", inst->dst.reg_offset);
  2736.       break;
  2737.    case MRF:
  2738.       printf("m%d", inst->dst.reg);
  2739.       break;
  2740.    case BAD_FILE:
  2741.       printf("(null)");
  2742.       break;
  2743.    case UNIFORM:
  2744.       printf("***u%d***", inst->dst.reg);
  2745.       break;
  2746.    default:
  2747.       printf("???");
  2748.       break;
  2749.    }
  2750.    printf(", ");
  2751.  
  2752.    for (int i = 0; i < 3; i++) {
  2753.       if (inst->src[i].negate)
  2754.          printf("-");
  2755.       if (inst->src[i].abs)
  2756.          printf("|");
  2757.       switch (inst->src[i].file) {
  2758.       case GRF:
  2759.          printf("vgrf%d", inst->src[i].reg);
  2760.          if (inst->src[i].reg_offset)
  2761.             printf("+%d", inst->src[i].reg_offset);
  2762.          break;
  2763.       case MRF:
  2764.          printf("***m%d***", inst->src[i].reg);
  2765.          break;
  2766.       case UNIFORM:
  2767.          printf("u%d", inst->src[i].reg);
  2768.          if (inst->src[i].reg_offset)
  2769.             printf(".%d", inst->src[i].reg_offset);
  2770.          break;
  2771.       case BAD_FILE:
  2772.          printf("(null)");
  2773.          break;
  2774.       case IMM:
  2775.          switch (inst->src[i].type) {
  2776.          case BRW_REGISTER_TYPE_F:
  2777.             printf("%ff", inst->src[i].imm.f);
  2778.             break;
  2779.          case BRW_REGISTER_TYPE_D:
  2780.             printf("%dd", inst->src[i].imm.i);
  2781.             break;
  2782.          case BRW_REGISTER_TYPE_UD:
  2783.             printf("%uu", inst->src[i].imm.u);
  2784.             break;
  2785.          default:
  2786.             printf("???");
  2787.             break;
  2788.          }
  2789.          break;
  2790.       default:
  2791.          printf("???");
  2792.          break;
  2793.       }
  2794.       if (inst->src[i].abs)
  2795.          printf("|");
  2796.  
  2797.       if (i < 3)
  2798.          printf(", ");
  2799.    }
  2800.  
  2801.    printf(" ");
  2802.  
  2803.    if (inst->force_uncompressed)
  2804.       printf("1sthalf ");
  2805.  
  2806.    if (inst->force_sechalf)
  2807.       printf("2ndhalf ");
  2808.  
  2809.    printf("\n");
  2810. }
  2811.  
  2812. /**
  2813.  * Possibly returns an instruction that set up @param reg.
  2814.  *
  2815.  * Sometimes we want to take the result of some expression/variable
  2816.  * dereference tree and rewrite the instruction generating the result
  2817.  * of the tree.  When processing the tree, we know that the
  2818.  * instructions generated are all writing temporaries that are dead
  2819.  * outside of this tree.  So, if we have some instructions that write
  2820.  * a temporary, we're free to point that temp write somewhere else.
  2821.  *
  2822.  * Note that this doesn't guarantee that the instruction generated
  2823.  * only reg -- it might be the size=4 destination of a texture instruction.
  2824.  */
  2825. fs_inst *
  2826. fs_visitor::get_instruction_generating_reg(fs_inst *start,
  2827.                                            fs_inst *end,
  2828.                                            fs_reg reg)
  2829. {
  2830.    if (end == start ||
  2831.        end->is_partial_write() ||
  2832.        reg.reladdr ||
  2833.        !reg.equals(end->dst)) {
  2834.       return NULL;
  2835.    } else {
  2836.       return end;
  2837.    }
  2838. }
  2839.  
  2840. void
  2841. fs_visitor::setup_payload_gen6()
  2842. {
  2843.    bool uses_depth =
  2844.       (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
  2845.    unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
  2846.  
  2847.    assert(brw->gen >= 6);
  2848.  
  2849.    /* R0-1: masks, pixel X/Y coordinates. */
  2850.    c->nr_payload_regs = 2;
  2851.    /* R2: only for 32-pixel dispatch.*/
  2852.  
  2853.    /* R3-26: barycentric interpolation coordinates.  These appear in the
  2854.     * same order that they appear in the brw_wm_barycentric_interp_mode
  2855.     * enum.  Each set of coordinates occupies 2 registers if dispatch width
  2856.     * == 8 and 4 registers if dispatch width == 16.  Coordinates only
  2857.     * appear if they were enabled using the "Barycentric Interpolation
  2858.     * Mode" bits in WM_STATE.
  2859.     */
  2860.    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
  2861.       if (barycentric_interp_modes & (1 << i)) {
  2862.          c->barycentric_coord_reg[i] = c->nr_payload_regs;
  2863.          c->nr_payload_regs += 2;
  2864.          if (dispatch_width == 16) {
  2865.             c->nr_payload_regs += 2;
  2866.          }
  2867.       }
  2868.    }
  2869.  
  2870.    /* R27: interpolated depth if uses source depth */
  2871.    if (uses_depth) {
  2872.       c->source_depth_reg = c->nr_payload_regs;
  2873.       c->nr_payload_regs++;
  2874.       if (dispatch_width == 16) {
  2875.          /* R28: interpolated depth if not 8-wide. */
  2876.          c->nr_payload_regs++;
  2877.       }
  2878.    }
  2879.    /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
  2880.    if (uses_depth) {
  2881.       c->source_w_reg = c->nr_payload_regs;
  2882.       c->nr_payload_regs++;
  2883.       if (dispatch_width == 16) {
  2884.          /* R30: interpolated W if not 8-wide. */
  2885.          c->nr_payload_regs++;
  2886.       }
  2887.    }
  2888.    /* R31: MSAA position offsets. */
  2889.    /* R32-: bary for 32-pixel. */
  2890.    /* R58-59: interp W for 32-pixel. */
  2891.  
  2892.    if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
  2893.       c->source_depth_to_render_target = true;
  2894.    }
  2895. }
  2896.  
/**
 * Runs the full FS compilation pipeline for one dispatch width:
 * payload setup, IR -> FS-IR visit, optimization loop, and register
 * allocation.  Returns false (with fail_msg set) on failure.
 */
bool
fs_visitor::run()
{
   /* Snapshot counts so we can assert below that compilation didn't append
    * state parameters (which could realloc ParameterValues and dangle the
    * driver's uniform storage pointers).
    */
   sanity_param_count = fp->Base.Parameters->NumParameters;
   uint32_t orig_nr_params = c->prog_data.nr_params;

   if (brw->gen >= 6)
      setup_payload_gen6();
   else
      setup_payload_gen4();

   if (0) {
      /* Debug path: emit a trivial shader instead of the real program. */
      emit_dummy_fs();
   } else {
      if (INTEL_DEBUG & DEBUG_SHADER_TIME)
         emit_shader_time_begin();

      calculate_urb_setup();
      if (brw->gen < 6)
         emit_interpolation_setup_gen4();
      else
         emit_interpolation_setup_gen6();

      /* We handle discards by keeping track of the still-live pixels in f0.1.
       * Initialize it with the dispatched pixels.
       */
      if (fp->UsesKill) {
         fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
         discard_init->flag_subreg = 1;
      }

      /* Generate FS IR for main().  (the visitor only descends into
       * functions called "main").
       */
      if (shader) {
         /* GLSL path: walk the linked shader's IR. */
         foreach_list(node, &*shader->ir) {
            ir_instruction *ir = (ir_instruction *)node;
            base_ir = ir;
            this->result = reg_undef;
            ir->accept(this);
         }
      } else {
         /* ARB_fragment_program path. */
         emit_fragment_program_code();
      }
      base_ir = NULL;
      if (failed)
         return false;

      emit(FS_OPCODE_PLACEHOLDER_HALT);

      emit_fb_writes();

      split_virtual_grfs();

      move_uniform_array_access_to_pull_constants();
      setup_pull_constants();

      /* Optimization loop: iterate to a fixed point, since each pass can
       * expose opportunities for the others.
       */
      bool progress;
      do {
         progress = false;

         compact_virtual_grfs();

         progress = remove_duplicate_mrf_writes() || progress;

         progress = opt_algebraic() || progress;
         progress = opt_cse() || progress;
         progress = opt_copy_propagate() || progress;
         progress = dead_code_eliminate() || progress;
         progress = dead_code_eliminate_local() || progress;
         progress = register_coalesce() || progress;
         progress = register_coalesce_2() || progress;
         progress = compute_to_mrf() || progress;
      } while (progress);

      remove_dead_constants();

      /* Pre-register-allocation scheduling. */
      schedule_instructions(false);

      lower_uniform_pull_constant_loads();

      assign_curb_setup();
      assign_urb_setup();

      if (0) {
         /* Debug of register spilling: Go spill everything. */
         for (int i = 0; i < virtual_grf_count; i++) {
            spill_reg(i);
         }
      }

      if (0)
         assign_regs_trivial();
      else {
         /* assign_regs() may spill and retry; keep going until it succeeds
          * or compilation has failed.
          */
         while (!assign_regs()) {
            if (failed)
               break;
         }
      }
   }
   assert(force_uncompressed_stack == 0);
   assert(force_sechalf_stack == 0);

   /* This must come after all optimization and register allocation, since
    * it inserts dead code that happens to have side effects, and it does
    * so based on the actual physical registers in use.
    */
   insert_gen4_send_dependency_workarounds();

   if (failed)
      return false;

   /* Post-register-allocation scheduling. */
   schedule_instructions(true);

   if (dispatch_width == 8) {
      c->prog_data.reg_blocks = brw_register_blocks(grf_used);
   } else {
      c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);

      /* Make sure we didn't try to sneak in an extra uniform */
      assert(orig_nr_params == c->prog_data.nr_params);
      (void) orig_nr_params;
   }

   /* If any state parameters were appended, then ParameterValues could have
    * been realloced, in which case the driver uniform storage set up by
    * _mesa_associate_uniform_storage() would point to freed memory.  Make
    * sure that didn't happen.
    */
   assert(sanity_param_count == fp->Base.Parameters->NumParameters);

   return !failed;
}
  3030.  
/**
 * Driver entry point: compiles @param fp (optionally with its GLSL shader
 * program @param prog) into native fragment shader code.
 *
 * Always compiles an 8-wide program; additionally attempts a 16-wide
 * compile when the hardware/debug settings allow, falling back silently
 * to 8-wide only if that fails.  Returns the generated assembly (and its
 * size via @param final_assembly_size), or NULL on failure.
 */
const unsigned *
brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
               struct gl_fragment_program *fp,
               struct gl_shader_program *prog,
               unsigned *final_assembly_size)
{
   bool start_busy = false;
   float start_time = 0;

   /* For perf_debug: record whether the GPU was busy before we started, so
    * we can tell below if this compile stalled it.
    */
   if (unlikely(brw->perf_debug)) {
      start_busy = (brw->batch.last_bo &&
                    drm_intel_bo_busy(brw->batch.last_bo));
      start_time = get_time();
   }

   /* prog is NULL for ARB_fragment_program (non-GLSL) shaders. */
   struct brw_shader *shader = NULL;
   if (prog)
      shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      if (prog) {
         printf("GLSL IR for native fragment shader %d:\n", prog->Name);
         _mesa_print_ir(shader->ir, NULL);
         printf("\n\n");
      } else {
         printf("ARB_fragment_program %d ir for native fragment shader\n",
                fp->Base.Id);
         _mesa_print_program(&fp->Base);
      }
   }

   /* Now the main event: Visit the shader IR and generate our FS IR for it.
    */
   fs_visitor v(brw, c, prog, fp, 8);
   if (!v.run()) {
      if (prog) {
         prog->LinkStatus = false;
         ralloc_strcat(&prog->InfoLog, v.fail_msg);
      }

      _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
                    v.fail_msg);

      return NULL;
   }

   /* Optional 16-wide compile; failure here is only a performance loss. */
   exec_list *simd16_instructions = NULL;
   fs_visitor v2(brw, c, prog, fp, 16);
   bool no16 = INTEL_DEBUG & DEBUG_NO16;
   if (brw->gen >= 5 && c->prog_data.nr_pull_params == 0 && likely(!no16)) {
      v2.import_uniforms(&v);
      if (!v2.run()) {
         perf_debug("16-wide shader failed to compile, falling back to "
                    "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
      } else {
         simd16_instructions = &v2.instructions;
      }
   }

   c->prog_data.dispatch_width = 8;

   fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
   const unsigned *generated = g.generate_assembly(&v.instructions,
                                                   simd16_instructions,
                                                   final_assembly_size);

   if (unlikely(brw->perf_debug) && shader) {
      if (shader->compiled_once)
         brw_wm_debug_recompile(brw, prog, &c->key);
      shader->compiled_once = true;

      /* Report compiles that stalled a previously-busy GPU. */
      if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
         perf_debug("FS compile took %.03f ms and stalled the GPU\n",
                    (get_time() - start_time) * 1000);
      }
   }

   return generated;
}
  3110.  
  3111. bool
  3112. brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
  3113. {
  3114.    struct brw_context *brw = brw_context(ctx);
  3115.    struct brw_wm_prog_key key;
  3116.  
  3117.    if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
  3118.       return true;
  3119.  
  3120.    struct gl_fragment_program *fp = (struct gl_fragment_program *)
  3121.       prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
  3122.    struct brw_fragment_program *bfp = brw_fragment_program(fp);
  3123.    bool program_uses_dfdy = fp->UsesDFdy;
  3124.  
  3125.    memset(&key, 0, sizeof(key));
  3126.  
  3127.    if (brw->gen < 6) {
  3128.       if (fp->UsesKill)
  3129.          key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
  3130.  
  3131.       if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
  3132.          key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
  3133.  
  3134.       /* Just assume depth testing. */
  3135.       key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
  3136.       key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
  3137.    }
  3138.  
  3139.    if (brw->gen < 6)
  3140.       key.input_slots_valid |= BITFIELD64_BIT(VARYING_SLOT_POS);
  3141.  
  3142.    for (int i = 0; i < VARYING_SLOT_MAX; i++) {
  3143.       if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
  3144.          continue;
  3145.  
  3146.       if (brw->gen < 6) {
  3147.          if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
  3148.             key.input_slots_valid |= BITFIELD64_BIT(i);
  3149.       }
  3150.    }
  3151.  
  3152.    key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
  3153.  
  3154.    for (int i = 0; i < MAX_SAMPLERS; i++) {
  3155.       if (fp->Base.ShadowSamplers & (1 << i)) {
  3156.          /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
  3157.          key.tex.swizzles[i] =
  3158.             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
  3159.       } else {
  3160.          /* Color sampler: assume no swizzling. */
  3161.          key.tex.swizzles[i] = SWIZZLE_XYZW;
  3162.       }
  3163.    }
  3164.  
  3165.    if (fp->Base.InputsRead & VARYING_BIT_POS) {
  3166.       key.drawable_height = ctx->DrawBuffer->Height;
  3167.    }
  3168.  
  3169.    if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
  3170.       key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
  3171.    }
  3172.  
  3173.    key.nr_color_regions = 1;
  3174.  
  3175.    key.program_string_id = bfp->id;
  3176.  
  3177.    uint32_t old_prog_offset = brw->wm.prog_offset;
  3178.    struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
  3179.  
  3180.    bool success = do_wm_prog(brw, prog, bfp, &key);
  3181.  
  3182.    brw->wm.prog_offset = old_prog_offset;
  3183.    brw->wm.prog_data = old_prog_data;
  3184.  
  3185.    return success;
  3186. }
  3187.