Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Blame | Last modification | View Log | RSS feed

  1. /*
  2.  * Copyright © 2010 Intel Corporation
  3.  *
  4.  * Permission is hereby granted, free of charge, to any person obtaining a
  5.  * copy of this software and associated documentation files (the "Software"),
  6.  * to deal in the Software without restriction, including without limitation
  7.  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  8.  * and/or sell copies of the Software, and to permit persons to whom the
  9.  * Software is furnished to do so, subject to the following conditions:
  10.  *
  11.  * The above copyright notice and this permission notice (including the next
  12.  * paragraph) shall be included in all copies or substantial portions of the
  13.  * Software.
  14.  *
  15.  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16.  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17.  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18.  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19.  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20.  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21.  * IN THE SOFTWARE.
  22.  */
  23.  
  24. /** @file brw_fs_emit.cpp
  25.  *
  26.  * This file supports emitting code from the FS LIR to the actual
  27.  * native instructions.
  28.  */
  29.  
  30. extern "C" {
  31. #include "main/macros.h"
  32. #include "brw_context.h"
  33. #include "brw_eu.h"
  34. } /* extern "C" */
  35.  
  36. #include "brw_fs.h"
  37. #include "brw_cfg.h"
  38.  
  39. fs_generator::fs_generator(struct brw_context *brw,
  40.                            struct brw_wm_compile *c,
  41.                            struct gl_shader_program *prog,
  42.                            struct gl_fragment_program *fp,
  43.                            bool dual_source_output)
  44.  
  45.    : brw(brw), c(c), prog(prog), fp(fp), dual_source_output(dual_source_output)
  46. {
  47.    ctx = &brw->ctx;
  48.  
  49.    shader = prog ? prog->_LinkedShaders[MESA_SHADER_FRAGMENT] : NULL;
  50.  
  51.    mem_ctx = c;
  52.  
  53.    p = rzalloc(mem_ctx, struct brw_compile);
  54.    brw_init_compile(brw, p, mem_ctx);
  55. }
  56.  
  57. fs_generator::~fs_generator()
  58. {
  59. }
  60.  
  61. void
  62. fs_generator::patch_discard_jumps_to_fb_writes()
  63. {
  64.    if (brw->gen < 6 || this->discard_halt_patches.is_empty())
  65.       return;
  66.  
  67.    /* There is a somewhat strange undocumented requirement of using
  68.     * HALT, according to the simulator.  If some channel has HALTed to
  69.     * a particular UIP, then by the end of the program, every channel
  70.     * must have HALTed to that UIP.  Furthermore, the tracking is a
  71.     * stack, so you can't do the final halt of a UIP after starting
  72.     * halting to a new UIP.
  73.     *
  74.     * Symptoms of not emitting this instruction on actual hardware
  75.     * included GPU hangs and sparkly rendering on the piglit discard
  76.     * tests.
  77.     */
  78.    struct brw_instruction *last_halt = gen6_HALT(p);
  79.    last_halt->bits3.break_cont.uip = 2;
  80.    last_halt->bits3.break_cont.jip = 2;
  81.  
  82.    int ip = p->nr_insn;
  83.  
  84.    foreach_list(node, &this->discard_halt_patches) {
  85.       ip_record *patch_ip = (ip_record *)node;
  86.       struct brw_instruction *patch = &p->store[patch_ip->ip];
  87.  
  88.       assert(patch->header.opcode == BRW_OPCODE_HALT);
  89.       /* HALT takes a half-instruction distance from the pre-incremented IP. */
  90.       patch->bits3.break_cont.uip = (ip - patch_ip->ip) * 2;
  91.    }
  92.  
  93.    this->discard_halt_patches.make_empty();
  94. }
  95.  
  96. void
  97. fs_generator::generate_fb_write(fs_inst *inst)
  98. {
  99.    bool eot = inst->eot;
  100.    struct brw_reg implied_header;
  101.    uint32_t msg_control;
  102.  
  103.    /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied
  104.     * move, here's g1.
  105.     */
  106.    brw_push_insn_state(p);
  107.    brw_set_mask_control(p, BRW_MASK_DISABLE);
  108.    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
  109.  
  110.    if (fp->UsesKill) {
  111.       struct brw_reg pixel_mask;
  112.  
  113.       if (brw->gen >= 6)
  114.          pixel_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
  115.       else
  116.          pixel_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
  117.  
  118.       brw_MOV(p, pixel_mask, brw_flag_reg(0, 1));
  119.    }
  120.  
  121.    if (inst->header_present) {
  122.       if (brw->gen >= 6) {
  123.          brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
  124.          brw_MOV(p,
  125.                  retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD),
  126.                  retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
  127.          brw_set_compression_control(p, BRW_COMPRESSION_NONE);
  128.  
  129.          if (inst->target > 0 && c->key.replicate_alpha) {
  130.             /* Set "Source0 Alpha Present to RenderTarget" bit in message
  131.              * header.
  132.              */
  133.             brw_OR(p,
  134.                    vec1(retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD)),
  135.                    vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
  136.                    brw_imm_ud(0x1 << 11));
  137.          }
  138.  
  139.          if (inst->target > 0) {
  140.             /* Set the render target index for choosing BLEND_STATE. */
  141.             brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
  142.                                            inst->base_mrf, 2),
  143.                               BRW_REGISTER_TYPE_UD),
  144.                     brw_imm_ud(inst->target));
  145.          }
  146.  
  147.          implied_header = brw_null_reg();
  148.       } else {
  149.          implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
  150.  
  151.          brw_MOV(p,
  152.                  brw_message_reg(inst->base_mrf + 1),
  153.                  brw_vec8_grf(1, 0));
  154.       }
  155.    } else {
  156.       implied_header = brw_null_reg();
  157.    }
  158.  
  159.    if (this->dual_source_output)
  160.       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
  161.    else if (dispatch_width == 16)
  162.       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
  163.    else
  164.       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
  165.  
  166.    brw_pop_insn_state(p);
  167.  
  168.    brw_fb_WRITE(p,
  169.                 dispatch_width,
  170.                 inst->base_mrf,
  171.                 implied_header,
  172.                 msg_control,
  173.                 inst->target,
  174.                 inst->mlen,
  175.                 0,
  176.                 eot,
  177.                 inst->header_present);
  178. }
  179.  
  180. /* Computes the integer pixel x,y values from the origin.
  181.  *
  182.  * This is the basis of gl_FragCoord computation, but is also used
  183.  * pre-gen6 for computing the deltas from v0 for computing
  184.  * interpolation.
  185.  */
  186. void
  187. fs_generator::generate_pixel_xy(struct brw_reg dst, bool is_x)
  188. {
  189.    struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
  190.    struct brw_reg src;
  191.    struct brw_reg deltas;
  192.  
  193.    if (is_x) {
  194.       src = stride(suboffset(g1_uw, 4), 2, 4, 0);
  195.       deltas = brw_imm_v(0x10101010);
  196.    } else {
  197.       src = stride(suboffset(g1_uw, 5), 2, 4, 0);
  198.       deltas = brw_imm_v(0x11001100);
  199.    }
  200.  
  201.    if (dispatch_width == 16) {
  202.       dst = vec16(dst);
  203.    }
  204.  
  205.    /* We do this 8 or 16-wide, but since the destination is UW we
  206.     * don't do compression in the 16-wide case.
  207.     */
  208.    brw_push_insn_state(p);
  209.    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
  210.    brw_ADD(p, dst, src, deltas);
  211.    brw_pop_insn_state(p);
  212. }
  213.  
  214. void
  215. fs_generator::generate_linterp(fs_inst *inst,
  216.                              struct brw_reg dst, struct brw_reg *src)
  217. {
  218.    struct brw_reg delta_x = src[0];
  219.    struct brw_reg delta_y = src[1];
  220.    struct brw_reg interp = src[2];
  221.  
  222.    if (brw->has_pln &&
  223.        delta_y.nr == delta_x.nr + 1 &&
  224.        (brw->gen >= 6 || (delta_x.nr & 1) == 0)) {
  225.       brw_PLN(p, dst, interp, delta_x);
  226.    } else {
  227.       brw_LINE(p, brw_null_reg(), interp, delta_x);
  228.       brw_MAC(p, dst, suboffset(interp, 1), delta_y);
  229.    }
  230. }
  231.  
  232. void
  233. fs_generator::generate_math1_gen7(fs_inst *inst,
  234.                                 struct brw_reg dst,
  235.                                 struct brw_reg src0)
  236. {
  237.    assert(inst->mlen == 0);
  238.    brw_math(p, dst,
  239.             brw_math_function(inst->opcode),
  240.             0, src0,
  241.             BRW_MATH_DATA_VECTOR,
  242.             BRW_MATH_PRECISION_FULL);
  243. }
  244.  
  245. void
  246. fs_generator::generate_math2_gen7(fs_inst *inst,
  247.                                 struct brw_reg dst,
  248.                                 struct brw_reg src0,
  249.                                 struct brw_reg src1)
  250. {
  251.    assert(inst->mlen == 0);
  252.    brw_math2(p, dst, brw_math_function(inst->opcode), src0, src1);
  253. }
  254.  
  255. void
  256. fs_generator::generate_math1_gen6(fs_inst *inst,
  257.                                 struct brw_reg dst,
  258.                                 struct brw_reg src0)
  259. {
  260.    int op = brw_math_function(inst->opcode);
  261.  
  262.    assert(inst->mlen == 0);
  263.  
  264.    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
  265.    brw_math(p, dst,
  266.             op,
  267.             0, src0,
  268.             BRW_MATH_DATA_VECTOR,
  269.             BRW_MATH_PRECISION_FULL);
  270.  
  271.    if (dispatch_width == 16) {
  272.       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
  273.       brw_math(p, sechalf(dst),
  274.                op,
  275.                0, sechalf(src0),
  276.                BRW_MATH_DATA_VECTOR,
  277.                BRW_MATH_PRECISION_FULL);
  278.       brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
  279.    }
  280. }
  281.  
  282. void
  283. fs_generator::generate_math2_gen6(fs_inst *inst,
  284.                                 struct brw_reg dst,
  285.                                 struct brw_reg src0,
  286.                                 struct brw_reg src1)
  287. {
  288.    int op = brw_math_function(inst->opcode);
  289.  
  290.    assert(inst->mlen == 0);
  291.  
  292.    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
  293.    brw_math2(p, dst, op, src0, src1);
  294.  
  295.    if (dispatch_width == 16) {
  296.       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
  297.       brw_math2(p, sechalf(dst), op, sechalf(src0), sechalf(src1));
  298.       brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
  299.    }
  300. }
  301.  
  302. void
  303. fs_generator::generate_math_gen4(fs_inst *inst,
  304.                                struct brw_reg dst,
  305.                                struct brw_reg src)
  306. {
  307.    int op = brw_math_function(inst->opcode);
  308.  
  309.    assert(inst->mlen >= 1);
  310.  
  311.    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
  312.    brw_math(p, dst,
  313.             op,
  314.             inst->base_mrf, src,
  315.             BRW_MATH_DATA_VECTOR,
  316.             BRW_MATH_PRECISION_FULL);
  317.  
  318.    if (dispatch_width == 16) {
  319.       brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
  320.       brw_math(p, sechalf(dst),
  321.                op,
  322.                inst->base_mrf + 1, sechalf(src),
  323.                BRW_MATH_DATA_VECTOR,
  324.                BRW_MATH_PRECISION_FULL);
  325.  
  326.       brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
  327.    }
  328. }
  329.  
  330. void
  331. fs_generator::generate_math_g45(fs_inst *inst,
  332.                                 struct brw_reg dst,
  333.                                 struct brw_reg src)
  334. {
  335.    if (inst->opcode == SHADER_OPCODE_POW ||
  336.        inst->opcode == SHADER_OPCODE_INT_QUOTIENT ||
  337.        inst->opcode == SHADER_OPCODE_INT_REMAINDER) {
  338.       generate_math_gen4(inst, dst, src);
  339.       return;
  340.    }
  341.  
  342.    int op = brw_math_function(inst->opcode);
  343.  
  344.    assert(inst->mlen >= 1);
  345.  
  346.    brw_math(p, dst,
  347.             op,
  348.             inst->base_mrf, src,
  349.             BRW_MATH_DATA_VECTOR,
  350.             BRW_MATH_PRECISION_FULL);
  351. }
  352.  
  353. void
  354. fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
  355. {
  356.    int msg_type = -1;
  357.    int rlen = 4;
  358.    uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
  359.    uint32_t return_format;
  360.  
  361.    switch (dst.type) {
  362.    case BRW_REGISTER_TYPE_D:
  363.       return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
  364.       break;
  365.    case BRW_REGISTER_TYPE_UD:
  366.       return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
  367.       break;
  368.    default:
  369.       return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
  370.       break;
  371.    }
  372.  
  373.    if (dispatch_width == 16)
  374.       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
  375.  
  376.    if (brw->gen >= 5) {
  377.       switch (inst->opcode) {
  378.       case SHADER_OPCODE_TEX:
  379.          if (inst->shadow_compare) {
  380.             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
  381.          } else {
  382.             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
  383.          }
  384.          break;
  385.       case FS_OPCODE_TXB:
  386.          if (inst->shadow_compare) {
  387.             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
  388.          } else {
  389.             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
  390.          }
  391.          break;
  392.       case SHADER_OPCODE_TXL:
  393.          if (inst->shadow_compare) {
  394.             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
  395.          } else {
  396.             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
  397.          }
  398.          break;
  399.       case SHADER_OPCODE_TXS:
  400.          msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
  401.          break;
  402.       case SHADER_OPCODE_TXD:
  403.          if (inst->shadow_compare) {
  404.             /* Gen7.5+.  Otherwise, lowered by brw_lower_texture_gradients(). */
  405.             assert(brw->is_haswell);
  406.             msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
  407.          } else {
  408.             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
  409.          }
  410.          break;
  411.       case SHADER_OPCODE_TXF:
  412.          msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
  413.          break;
  414.       case SHADER_OPCODE_TXF_MS:
  415.          if (brw->gen >= 7)
  416.             msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
  417.          else
  418.             msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
  419.          break;
  420.       case SHADER_OPCODE_LOD:
  421.          msg_type = GEN5_SAMPLER_MESSAGE_LOD;
  422.          break;
  423.       default:
  424.          assert(!"not reached");
  425.          break;
  426.       }
  427.    } else {
  428.       switch (inst->opcode) {
  429.       case SHADER_OPCODE_TEX:
  430.          /* Note that G45 and older determines shadow compare and dispatch width
  431.           * from message length for most messages.
  432.           */
  433.          assert(dispatch_width == 8);
  434.          msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
  435.          if (inst->shadow_compare) {
  436.             assert(inst->mlen == 6);
  437.          } else {
  438.             assert(inst->mlen <= 4);
  439.          }
  440.          break;
  441.       case FS_OPCODE_TXB:
  442.          if (inst->shadow_compare) {
  443.             assert(inst->mlen == 6);
  444.             msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
  445.          } else {
  446.             assert(inst->mlen == 9);
  447.             msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
  448.             simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
  449.          }
  450.          break;
  451.       case SHADER_OPCODE_TXL:
  452.          if (inst->shadow_compare) {
  453.             assert(inst->mlen == 6);
  454.             msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
  455.          } else {
  456.             assert(inst->mlen == 9);
  457.             msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
  458.             simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
  459.          }
  460.          break;
  461.       case SHADER_OPCODE_TXD:
  462.          /* There is no sample_d_c message; comparisons are done manually */
  463.          assert(inst->mlen == 7 || inst->mlen == 10);
  464.          msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
  465.          break;
  466.       case SHADER_OPCODE_TXF:
  467.          assert(inst->mlen == 9);
  468.          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
  469.          simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
  470.          break;
  471.       case SHADER_OPCODE_TXS:
  472.          assert(inst->mlen == 3);
  473.          msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO;
  474.          simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
  475.          break;
  476.       default:
  477.          assert(!"not reached");
  478.          break;
  479.       }
  480.    }
  481.    assert(msg_type != -1);
  482.  
  483.    if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
  484.       rlen = 8;
  485.       dst = vec16(dst);
  486.    }
  487.  
  488.    /* Load the message header if present.  If there's a texture offset,
  489.     * we need to set it up explicitly and load the offset bitfield.
  490.     * Otherwise, we can use an implied move from g0 to the first message reg.
  491.     */
  492.    if (inst->texture_offset) {
  493.       brw_push_insn_state(p);
  494.       brw_set_mask_control(p, BRW_MASK_DISABLE);
  495.       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
  496.       /* Explicitly set up the message header by copying g0 to the MRF. */
  497.       brw_MOV(p, retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD),
  498.                  retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
  499.  
  500.       /* Then set the offset bits in DWord 2. */
  501.       brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
  502.                                      inst->base_mrf, 2), BRW_REGISTER_TYPE_UD),
  503.                  brw_imm_ud(inst->texture_offset));
  504.       brw_pop_insn_state(p);
  505.    } else if (inst->header_present) {
  506.       /* Set up an implied move from g0 to the MRF. */
  507.       src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
  508.    }
  509.  
  510.    brw_SAMPLE(p,
  511.               retype(dst, BRW_REGISTER_TYPE_UW),
  512.               inst->base_mrf,
  513.               src,
  514.               SURF_INDEX_TEXTURE(inst->sampler),
  515.               inst->sampler,
  516.               msg_type,
  517.               rlen,
  518.               inst->mlen,
  519.               inst->header_present,
  520.               simd_mode,
  521.               return_format);
  522. }
  523.  
  524.  
  525. /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
  526.  * looking like:
  527.  *
  528.  * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
  529.  *
  530.  * and we're trying to produce:
  531.  *
  532.  *           DDX                     DDY
  533.  * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
  534.  *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
  535.  *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
  536.  *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
  537.  *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
  538.  *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
  539.  *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
  540.  *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
  541.  *
  542.  * and add another set of two more subspans if in 16-pixel dispatch mode.
  543.  *
  544.  * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
  545.  * for each pair, and vertstride = 2 jumps us 2 elements after processing a
  546.  * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
  547.  * between each other.  We could probably do it like ddx and swizzle the right
  548.  * order later, but bail for now and just produce
  549.  * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
  550.  */
  551. void
  552. fs_generator::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
  553. {
  554.    struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
  555.                                  BRW_REGISTER_TYPE_F,
  556.                                  BRW_VERTICAL_STRIDE_2,
  557.                                  BRW_WIDTH_2,
  558.                                  BRW_HORIZONTAL_STRIDE_0,
  559.                                  BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
  560.    struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
  561.                                  BRW_REGISTER_TYPE_F,
  562.                                  BRW_VERTICAL_STRIDE_2,
  563.                                  BRW_WIDTH_2,
  564.                                  BRW_HORIZONTAL_STRIDE_0,
  565.                                  BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
  566.    brw_ADD(p, dst, src0, negate(src1));
  567. }
  568.  
  569. /* The negate_value boolean is used to negate the derivative computation for
  570.  * FBOs, since they place the origin at the upper left instead of the lower
  571.  * left.
  572.  */
  573. void
  574. fs_generator::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
  575.                          bool negate_value)
  576. {
  577.    struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
  578.                                  BRW_REGISTER_TYPE_F,
  579.                                  BRW_VERTICAL_STRIDE_4,
  580.                                  BRW_WIDTH_4,
  581.                                  BRW_HORIZONTAL_STRIDE_0,
  582.                                  BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
  583.    struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
  584.                                  BRW_REGISTER_TYPE_F,
  585.                                  BRW_VERTICAL_STRIDE_4,
  586.                                  BRW_WIDTH_4,
  587.                                  BRW_HORIZONTAL_STRIDE_0,
  588.                                  BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
  589.    if (negate_value)
  590.       brw_ADD(p, dst, src1, negate(src0));
  591.    else
  592.       brw_ADD(p, dst, src0, negate(src1));
  593. }
  594.  
  595. void
  596. fs_generator::generate_discard_jump(fs_inst *inst)
  597. {
  598.    assert(brw->gen >= 6);
  599.  
  600.    /* This HALT will be patched up at FB write time to point UIP at the end of
  601.     * the program, and at brw_uip_jip() JIP will be set to the end of the
  602.     * current block (or the program).
  603.     */
  604.    this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));
  605.  
  606.    brw_push_insn_state(p);
  607.    brw_set_mask_control(p, BRW_MASK_DISABLE);
  608.    gen6_HALT(p);
  609.    brw_pop_insn_state(p);
  610. }
  611.  
  612. void
  613. fs_generator::generate_spill(fs_inst *inst, struct brw_reg src)
  614. {
  615.    assert(inst->mlen != 0);
  616.  
  617.    brw_MOV(p,
  618.            retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD),
  619.            retype(src, BRW_REGISTER_TYPE_UD));
  620.    brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), 1,
  621.                                  inst->offset);
  622. }
  623.  
  624. void
  625. fs_generator::generate_unspill(fs_inst *inst, struct brw_reg dst)
  626. {
  627.    assert(inst->mlen != 0);
  628.  
  629.    brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1,
  630.                                 inst->offset);
  631. }
  632.  
  633. void
  634. fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
  635.                                                   struct brw_reg dst,
  636.                                                   struct brw_reg index,
  637.                                                   struct brw_reg offset)
  638. {
  639.    assert(inst->mlen != 0);
  640.  
  641.    assert(index.file == BRW_IMMEDIATE_VALUE &&
  642.           index.type == BRW_REGISTER_TYPE_UD);
  643.    uint32_t surf_index = index.dw1.ud;
  644.  
  645.    assert(offset.file == BRW_IMMEDIATE_VALUE &&
  646.           offset.type == BRW_REGISTER_TYPE_UD);
  647.    uint32_t read_offset = offset.dw1.ud;
  648.  
  649.    brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
  650.                         read_offset, surf_index);
  651. }
  652.  
  653. void
  654. fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
  655.                                                        struct brw_reg dst,
  656.                                                        struct brw_reg index,
  657.                                                        struct brw_reg offset)
  658. {
  659.    assert(inst->mlen == 0);
  660.  
  661.    assert(index.file == BRW_IMMEDIATE_VALUE &&
  662.           index.type == BRW_REGISTER_TYPE_UD);
  663.    uint32_t surf_index = index.dw1.ud;
  664.  
  665.    assert(offset.file == BRW_GENERAL_REGISTER_FILE);
  666.    /* Reference just the dword we need, to avoid angering validate_reg(). */
  667.    offset = brw_vec1_grf(offset.nr, 0);
  668.  
  669.    brw_push_insn_state(p);
  670.    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
  671.    brw_set_mask_control(p, BRW_MASK_DISABLE);
  672.    struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
  673.    brw_pop_insn_state(p);
  674.  
  675.    /* We use the SIMD4x2 mode because we want to end up with 4 components in
  676.     * the destination loaded consecutively from the same offset (which appears
  677.     * in the first component, and the rest are ignored).
  678.     */
  679.    dst.width = BRW_WIDTH_4;
  680.    brw_set_dest(p, send, dst);
  681.    brw_set_src0(p, send, offset);
  682.    brw_set_sampler_message(p, send,
  683.                            surf_index,
  684.                            0, /* LD message ignores sampler unit */
  685.                            GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
  686.                            1, /* rlen */
  687.                            1, /* mlen */
  688.                            false, /* no header */
  689.                            BRW_SAMPLER_SIMD_MODE_SIMD4X2,
  690.                            0);
  691. }
  692.  
  693. void
  694. fs_generator::generate_varying_pull_constant_load(fs_inst *inst,
  695.                                                   struct brw_reg dst,
  696.                                                   struct brw_reg index,
  697.                                                   struct brw_reg offset)
  698. {
  699.    assert(brw->gen < 7); /* Should use the gen7 variant. */
  700.    assert(inst->header_present);
  701.    assert(inst->mlen);
  702.  
  703.    assert(index.file == BRW_IMMEDIATE_VALUE &&
  704.           index.type == BRW_REGISTER_TYPE_UD);
  705.    uint32_t surf_index = index.dw1.ud;
  706.  
  707.    uint32_t simd_mode, rlen, msg_type;
  708.    if (dispatch_width == 16) {
  709.       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
  710.       rlen = 8;
  711.    } else {
  712.       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
  713.       rlen = 4;
  714.    }
  715.  
  716.    if (brw->gen >= 5)
  717.       msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
  718.    else {
  719.       /* We always use the SIMD16 message so that we only have to load U, and
  720.        * not V or R.
  721.        */
  722.       msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
  723.       assert(inst->mlen == 3);
  724.       assert(inst->regs_written == 8);
  725.       rlen = 8;
  726.       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
  727.    }
  728.  
  729.    struct brw_reg offset_mrf = retype(brw_message_reg(inst->base_mrf + 1),
  730.                                       BRW_REGISTER_TYPE_D);
  731.    brw_MOV(p, offset_mrf, offset);
  732.  
  733.    struct brw_reg header = brw_vec8_grf(0, 0);
  734.    gen6_resolve_implied_move(p, &header, inst->base_mrf);
  735.  
  736.    struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
  737.    send->header.compression_control = BRW_COMPRESSION_NONE;
  738.    brw_set_dest(p, send, dst);
  739.    brw_set_src0(p, send, header);
  740.    if (brw->gen < 6)
  741.       send->header.destreg__conditionalmod = inst->base_mrf;
  742.  
  743.    /* Our surface is set up as floats, regardless of what actual data is
  744.     * stored in it.
  745.     */
  746.    uint32_t return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
  747.    brw_set_sampler_message(p, send,
  748.                            surf_index,
  749.                            0, /* sampler (unused) */
  750.                            msg_type,
  751.                            rlen,
  752.                            inst->mlen,
  753.                            inst->header_present,
  754.                            simd_mode,
  755.                            return_format);
  756. }
  757.  
  758. void
  759. fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
  760.                                                        struct brw_reg dst,
  761.                                                        struct brw_reg index,
  762.                                                        struct brw_reg offset)
  763. {
  764.    assert(brw->gen >= 7);
  765.    /* Varying-offset pull constant loads are treated as a normal expression on
  766.     * gen7, so the fact that it's a send message is hidden at the IR level.
  767.     */
  768.    assert(!inst->header_present);
  769.    assert(!inst->mlen);
  770.  
  771.    assert(index.file == BRW_IMMEDIATE_VALUE &&
  772.           index.type == BRW_REGISTER_TYPE_UD);
  773.    uint32_t surf_index = index.dw1.ud;
  774.  
  775.    uint32_t simd_mode, rlen, mlen;
  776.    if (dispatch_width == 16) {
  777.       mlen = 2;
  778.       rlen = 8;
  779.       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
  780.    } else {
  781.       mlen = 1;
  782.       rlen = 4;
  783.       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
  784.    }
  785.  
  786.    struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
  787.    brw_set_dest(p, send, dst);
  788.    brw_set_src0(p, send, offset);
  789.    brw_set_sampler_message(p, send,
  790.                            surf_index,
  791.                            0, /* LD message ignores sampler unit */
  792.                            GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
  793.                            rlen,
  794.                            mlen,
  795.                            false, /* no header */
  796.                            simd_mode,
  797.                            0);
  798. }
  799.  
  800. /**
  801.  * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred
  802.  * into the flags register (f0.0).
  803.  *
  804.  * Used only on Gen6 and above.
  805.  */
  806. void
  807. fs_generator::generate_mov_dispatch_to_flags(fs_inst *inst)
  808. {
  809.    struct brw_reg flags = brw_flag_reg(0, inst->flag_subreg);
  810.    struct brw_reg dispatch_mask;
  811.  
  812.    if (brw->gen >= 6)
  813.       dispatch_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
  814.    else
  815.       dispatch_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
  816.  
  817.    brw_push_insn_state(p);
  818.    brw_set_mask_control(p, BRW_MASK_DISABLE);
  819.    brw_MOV(p, flags, dispatch_mask);
  820.    brw_pop_insn_state(p);
  821. }
  822.  
  823.  
  824. static uint32_t brw_file_from_reg(fs_reg *reg)
  825. {
  826.    switch (reg->file) {
  827.    case ARF:
  828.       return BRW_ARCHITECTURE_REGISTER_FILE;
  829.    case GRF:
  830.       return BRW_GENERAL_REGISTER_FILE;
  831.    case MRF:
  832.       return BRW_MESSAGE_REGISTER_FILE;
  833.    case IMM:
  834.       return BRW_IMMEDIATE_VALUE;
  835.    default:
  836.       assert(!"not reached");
  837.       return BRW_GENERAL_REGISTER_FILE;
  838.    }
  839. }
  840.  
  841. static struct brw_reg
  842. brw_reg_from_fs_reg(fs_reg *reg)
  843. {
  844.    struct brw_reg brw_reg;
  845.  
  846.    switch (reg->file) {
  847.    case GRF:
  848.    case ARF:
  849.    case MRF:
  850.       if (reg->smear == -1) {
  851.          brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0);
  852.       } else {
  853.          brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->reg, reg->smear);
  854.       }
  855.       brw_reg = retype(brw_reg, reg->type);
  856.       if (reg->sechalf)
  857.          brw_reg = sechalf(brw_reg);
  858.       break;
  859.    case IMM:
  860.       switch (reg->type) {
  861.       case BRW_REGISTER_TYPE_F:
  862.          brw_reg = brw_imm_f(reg->imm.f);
  863.          break;
  864.       case BRW_REGISTER_TYPE_D:
  865.          brw_reg = brw_imm_d(reg->imm.i);
  866.          break;
  867.       case BRW_REGISTER_TYPE_UD:
  868.          brw_reg = brw_imm_ud(reg->imm.u);
  869.          break;
  870.       default:
  871.          assert(!"not reached");
  872.          brw_reg = brw_null_reg();
  873.          break;
  874.       }
  875.       break;
  876.    case HW_REG:
  877.       brw_reg = reg->fixed_hw_reg;
  878.       break;
  879.    case BAD_FILE:
  880.       /* Probably unused. */
  881.       brw_reg = brw_null_reg();
  882.       break;
  883.    case UNIFORM:
  884.       assert(!"not reached");
  885.       brw_reg = brw_null_reg();
  886.       break;
  887.    default:
  888.       assert(!"not reached");
  889.       brw_reg = brw_null_reg();
  890.       break;
  891.    }
  892.    if (reg->abs)
  893.       brw_reg = brw_abs(brw_reg);
  894.    if (reg->negate)
  895.       brw_reg = negate(brw_reg);
  896.  
  897.    return brw_reg;
  898. }
  899.  
  900. /**
  901.  * Sets the first word of a vgrf for gen7+ simd4x2 uniform pull constant
  902.  * sampler LD messages.
  903.  *
  904.  * We don't want to bake it into the send message's code generation because
  905.  * that means we don't get a chance to schedule the instructions.
  906.  */
  907. void
  908. fs_generator::generate_set_simd4x2_offset(fs_inst *inst,
  909.                                           struct brw_reg dst,
  910.                                           struct brw_reg value)
  911. {
  912.    assert(value.file == BRW_IMMEDIATE_VALUE);
  913.  
  914.    brw_push_insn_state(p);
  915.    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
  916.    brw_set_mask_control(p, BRW_MASK_DISABLE);
  917.    brw_MOV(p, retype(brw_vec1_reg(dst.file, dst.nr, 0), value.type), value);
  918.    brw_pop_insn_state(p);
  919. }
  920.  
  921. /**
  922.  * Change the register's data type from UD to W, doubling the strides in order
  923.  * to compensate for halving the data type width.
  924.  */
  925. static struct brw_reg
  926. ud_reg_to_w(struct brw_reg r)
  927. {
  928.    assert(r.type == BRW_REGISTER_TYPE_UD);
  929.    r.type = BRW_REGISTER_TYPE_W;
  930.  
  931.    /* The BRW_*_STRIDE enums are defined so that incrementing the field
  932.     * doubles the real stride.
  933.     */
  934.    if (r.hstride != 0)
  935.       ++r.hstride;
  936.    if (r.vstride != 0)
  937.       ++r.vstride;
  938.  
  939.    return r;
  940. }
  941.  
  942. void
  943. fs_generator::generate_pack_half_2x16_split(fs_inst *inst,
  944.                                             struct brw_reg dst,
  945.                                             struct brw_reg x,
  946.                                             struct brw_reg y)
  947. {
  948.    assert(brw->gen >= 7);
  949.    assert(dst.type == BRW_REGISTER_TYPE_UD);
  950.    assert(x.type == BRW_REGISTER_TYPE_F);
  951.    assert(y.type == BRW_REGISTER_TYPE_F);
  952.  
  953.    /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
  954.     *
  955.     *   Because this instruction does not have a 16-bit floating-point type,
  956.     *   the destination data type must be Word (W).
  957.     *
  958.     *   The destination must be DWord-aligned and specify a horizontal stride
  959.     *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
  960.     *   each destination channel and the upper word is not modified.
  961.     */
  962.    struct brw_reg dst_w = ud_reg_to_w(dst);
  963.  
  964.    /* Give each 32-bit channel of dst the form below , where "." means
  965.     * unchanged.
  966.     *   0x....hhhh
  967.     */
  968.    brw_F32TO16(p, dst_w, y);
  969.  
  970.    /* Now the form:
  971.     *   0xhhhh0000
  972.     */
  973.    brw_SHL(p, dst, dst, brw_imm_ud(16u));
  974.  
  975.    /* And, finally the form of packHalf2x16's output:
  976.     *   0xhhhhllll
  977.     */
  978.    brw_F32TO16(p, dst_w, x);
  979. }
  980.  
  981. void
  982. fs_generator::generate_unpack_half_2x16_split(fs_inst *inst,
  983.                                               struct brw_reg dst,
  984.                                               struct brw_reg src)
  985. {
  986.    assert(brw->gen >= 7);
  987.    assert(dst.type == BRW_REGISTER_TYPE_F);
  988.    assert(src.type == BRW_REGISTER_TYPE_UD);
  989.  
  990.    /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
  991.     *
  992.     *   Because this instruction does not have a 16-bit floating-point type,
  993.     *   the source data type must be Word (W). The destination type must be
  994.     *   F (Float).
  995.     */
  996.    struct brw_reg src_w = ud_reg_to_w(src);
  997.  
  998.    /* Each channel of src has the form of unpackHalf2x16's input: 0xhhhhllll.
  999.     * For the Y case, we wish to access only the upper word; therefore
  1000.     * a 16-bit subregister offset is needed.
  1001.     */
  1002.    assert(inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X ||
  1003.           inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y);
  1004.    if (inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y)
  1005.       src_w.subnr += 2;
  1006.  
  1007.    brw_F16TO32(p, dst, src_w);
  1008. }
  1009.  
  1010. void
  1011. fs_generator::generate_shader_time_add(fs_inst *inst,
  1012.                                        struct brw_reg payload,
  1013.                                        struct brw_reg offset,
  1014.                                        struct brw_reg value)
  1015. {
  1016.    assert(brw->gen >= 7);
  1017.    brw_push_insn_state(p);
  1018.    brw_set_mask_control(p, true);
  1019.  
  1020.    assert(payload.file == BRW_GENERAL_REGISTER_FILE);
  1021.    struct brw_reg payload_offset = retype(brw_vec1_grf(payload.nr, 0),
  1022.                                           offset.type);
  1023.    struct brw_reg payload_value = retype(brw_vec1_grf(payload.nr + 1, 0),
  1024.                                          value.type);
  1025.  
  1026.    assert(offset.file == BRW_IMMEDIATE_VALUE);
  1027.    if (value.file == BRW_GENERAL_REGISTER_FILE) {
  1028.       value.width = BRW_WIDTH_1;
  1029.       value.hstride = BRW_HORIZONTAL_STRIDE_0;
  1030.       value.vstride = BRW_VERTICAL_STRIDE_0;
  1031.    } else {
  1032.       assert(value.file == BRW_IMMEDIATE_VALUE);
  1033.    }
  1034.  
  1035.    /* Trying to deal with setup of the params from the IR is crazy in the FS8
  1036.     * case, and we don't really care about squeezing every bit of performance
  1037.     * out of this path, so we just emit the MOVs from here.
  1038.     */
  1039.    brw_MOV(p, payload_offset, offset);
  1040.    brw_MOV(p, payload_value, value);
  1041.    brw_shader_time_add(p, payload, SURF_INDEX_WM_SHADER_TIME);
  1042.    brw_pop_insn_state(p);
  1043. }
  1044.  
/**
 * Generate native code for every instruction in \p instructions.
 *
 * For each fs_inst the operands are converted to hardware registers, the
 * per-instruction emitter state (conditional modifier, predicate, flag
 * register, saturate, mask control, compression control) is loaded into
 * \c p, and the opcode is dispatched either straight to a brw_* emitter or
 * to one of the generate_*() helpers.  An opcode with no case here is
 * reported through _mesa_problem() and the process aborts.
 *
 * When INTEL_DEBUG has DEBUG_WM set, the function also prints the IR or
 * annotation each instruction came from, basic block START/END markers
 * with their predecessor/successor blocks, and a brw_dump_compile() of the
 * native instructions just emitted.
 *
 * After the whole list is emitted, brw_set_uip_jip() is run on \c p.
 */
void
fs_generator::generate_code(exec_list *instructions)
{
   /* Start of the native range not yet dumped by the debug output. */
   int last_native_insn_offset = p->next_insn_offset;
   /* Last annotation / IR pointer printed, so repeats are printed once. */
   const char *last_annotation_string = NULL;
   const void *last_annotation_ir = NULL;

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      if (shader) {
         printf("Native code for fragment shader %d (%d-wide dispatch):\n",
                prog->Name, dispatch_width);
      } else {
         printf("Native code for fragment program %d (%d-wide dispatch):\n",
                fp->Base.Id, dispatch_width);
      }
   }

   /* The CFG is built only for the debug output, where it is used to label
    * where basic blocks start and end.  It stays NULL otherwise.
    */
   cfg_t *cfg = NULL;
   if (unlikely(INTEL_DEBUG & DEBUG_WM))
      cfg = new(mem_ctx) cfg_t(mem_ctx, instructions);

   foreach_list(node, instructions) {
      fs_inst *inst = (fs_inst *)node;
      struct brw_reg src[3], dst;

      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
         /* Print a START marker, with the predecessor blocks, for every
          * block that begins at this instruction.  Note that the inner
          * loop variable shadows the outer `node`.
          */
         foreach_list(node, &cfg->block_list) {
            bblock_link *link = (bblock_link *)node;
            bblock_t *block = link->block;

            if (block->start == inst) {
               printf("   START B%d", block->block_num);
               foreach_list(predecessor_node, &block->parents) {
                  bblock_link *predecessor_link =
                     (bblock_link *)predecessor_node;
                  bblock_t *predecessor_block = predecessor_link->block;
                  printf(" <-B%d", predecessor_block->block_num);
               }
               printf("\n");
            }
         }

         /* Print the originating IR only when it differs from the one
          * attached to the previous instruction.
          */
         if (last_annotation_ir != inst->ir) {
            last_annotation_ir = inst->ir;
            if (last_annotation_ir) {
               printf("   ");
               if (shader)
                  ((ir_instruction *)inst->ir)->print();
               else {
                  const prog_instruction *fpi;
                  fpi = (const prog_instruction *)inst->ir;
                  printf("%d: ", (int)(fpi - fp->Base.Instructions));
                  _mesa_fprint_instruction_opt(stdout,
                                               fpi,
                                               0, PROG_PRINT_DEBUG, NULL);
               }
               printf("\n");
            }
         }
         /* Same de-duplication for the free-form annotation string. */
         if (last_annotation_string != inst->annotation) {
            last_annotation_string = inst->annotation;
            if (last_annotation_string)
               printf("   %s\n", last_annotation_string);
         }
      }

      /* Convert the IR operands into hardware registers. */
      for (unsigned int i = 0; i < 3; i++) {
         src[i] = brw_reg_from_fs_reg(&inst->src[i]);

         /* The accumulator result appears to get used for the
          * conditional modifier generation.  When negating a UD
          * value, there is a 33rd bit generated for the sign in the
          * accumulator value, so now you can't check, for example,
          * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
          */
         assert(!inst->conditional_mod ||
                inst->src[i].type != BRW_REGISTER_TYPE_UD ||
                !inst->src[i].negate);
      }
      dst = brw_reg_from_fs_reg(&inst->dst);

      /* Load this instruction's modifiers into the emitter's current
       * state before anything is emitted for it.
       */
      brw_set_conditionalmod(p, inst->conditional_mod);
      brw_set_predicate_control(p, inst->predicate);
      brw_set_predicate_inverse(p, inst->predicate_inverse);
      brw_set_flag_reg(p, 0, inst->flag_subreg);
      brw_set_saturate(p, inst->saturate);
      brw_set_mask_control(p, inst->force_writemask_all);

      /* 8-wide programs and force_uncompressed instructions are emitted
       * uncompressed; force_sechalf selects the second half; everything
       * else in a 16-wide program is emitted compressed.
       */
      if (inst->force_uncompressed || dispatch_width == 8) {
         brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      } else if (inst->force_sechalf) {
         brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
      } else {
         brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
      }

      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         brw_MOV(p, dst, src[0]);
         break;
      case BRW_OPCODE_ADD:
         brw_ADD(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_MUL:
         brw_MUL(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_MACH:
         /* Accumulator write control is enabled only around the MACH. */
         brw_set_acc_write_control(p, 1);
         brw_MACH(p, dst, src[0], src[1]);
         brw_set_acc_write_control(p, 0);
         break;

      /* The three-source opcodes below (MAD, LRP, BFE, BFI2) all follow the
       * same pattern: switch to align16 access mode, and in 16-wide
       * dispatch emit two instructions, one uncompressed for the first
       * half and one 2NDHALF on the sechalf() of every operand, before
       * restoring COMPRESSED and align1.
       * NOTE(review): presumably these instructions cannot be emitted
       * compressed -- confirm against the hardware documentation.  Also
       * note the 16-wide path overrides the force_uncompressed /
       * force_sechalf selection made above.
       */
      case BRW_OPCODE_MAD:
         brw_set_access_mode(p, BRW_ALIGN_16);
         if (dispatch_width == 16) {
            brw_set_compression_control(p, BRW_COMPRESSION_NONE);
            brw_MAD(p, dst, src[0], src[1], src[2]);
            brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
            brw_MAD(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
            brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
         } else {
            brw_MAD(p, dst, src[0], src[1], src[2]);
         }
         brw_set_access_mode(p, BRW_ALIGN_1);
         break;

      case BRW_OPCODE_LRP:
         brw_set_access_mode(p, BRW_ALIGN_16);
         if (dispatch_width == 16) {
            brw_set_compression_control(p, BRW_COMPRESSION_NONE);
            brw_LRP(p, dst, src[0], src[1], src[2]);
            brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
            brw_LRP(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
            brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
         } else {
            brw_LRP(p, dst, src[0], src[1], src[2]);
         }
         brw_set_access_mode(p, BRW_ALIGN_1);
         break;

      case BRW_OPCODE_FRC:
         brw_FRC(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDD:
         brw_RNDD(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDE:
         brw_RNDE(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDZ:
         brw_RNDZ(p, dst, src[0]);
         break;

      case BRW_OPCODE_AND:
         brw_AND(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_OR:
         brw_OR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_XOR:
         brw_XOR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_NOT:
         brw_NOT(p, dst, src[0]);
         break;
      case BRW_OPCODE_ASR:
         brw_ASR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_SHR:
         brw_SHR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_SHL:
         brw_SHL(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_F32TO16:
         brw_F32TO16(p, dst, src[0]);
         break;
      case BRW_OPCODE_F16TO32:
         brw_F16TO32(p, dst, src[0]);
         break;
      case BRW_OPCODE_CMP:
         brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
         break;
      case BRW_OPCODE_SEL:
         brw_SEL(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_BFREV:
         /* BFREV only supports UD type for src and dst. */
         brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
                      retype(src[0], BRW_REGISTER_TYPE_UD));
         break;
      case BRW_OPCODE_FBH:
         /* FBH only supports UD type for dst. */
         brw_FBH(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
         break;
      case BRW_OPCODE_FBL:
         /* FBL only supports UD type for dst. */
         brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
         break;
      case BRW_OPCODE_CBIT:
         /* CBIT only supports UD type for dst. */
         brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
         break;

      case BRW_OPCODE_BFE:
         brw_set_access_mode(p, BRW_ALIGN_16);
         if (dispatch_width == 16) {
            brw_set_compression_control(p, BRW_COMPRESSION_NONE);
            brw_BFE(p, dst, src[0], src[1], src[2]);
            brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
            brw_BFE(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
            brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
         } else {
            brw_BFE(p, dst, src[0], src[1], src[2]);
         }
         brw_set_access_mode(p, BRW_ALIGN_1);
         break;

      case BRW_OPCODE_BFI1:
         brw_BFI1(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_BFI2:
         brw_set_access_mode(p, BRW_ALIGN_16);
         if (dispatch_width == 16) {
            brw_set_compression_control(p, BRW_COMPRESSION_NONE);
            brw_BFI2(p, dst, src[0], src[1], src[2]);
            brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
            brw_BFI2(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
            brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
         } else {
            brw_BFI2(p, dst, src[0], src[1], src[2]);
         }
         brw_set_access_mode(p, BRW_ALIGN_1);
         break;

      case BRW_OPCODE_IF:
         /* A valid src[0] means the IF carries its own comparison. */
         if (inst->src[0].file != BAD_FILE) {
            /* The instruction has an embedded compare (only allowed on gen6) */
            assert(brw->gen == 6);
            gen6_IF(p, inst->conditional_mod, src[0], src[1]);
         } else {
            brw_IF(p, dispatch_width == 16 ? BRW_EXECUTE_16 : BRW_EXECUTE_8);
         }
         break;

      case BRW_OPCODE_ELSE:
         brw_ELSE(p);
         break;
      case BRW_OPCODE_ENDIF:
         brw_ENDIF(p);
         break;

      case BRW_OPCODE_DO:
         /* NOTE(review): DO is always emitted with BRW_EXECUTE_8, unlike IF
          * above which follows dispatch_width -- confirm this is intended.
          */
         brw_DO(p, BRW_EXECUTE_8);
         break;

      case BRW_OPCODE_BREAK:
         /* Predication is cleared once the BREAK has been emitted. */
         brw_BREAK(p);
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
         break;
      case BRW_OPCODE_CONTINUE:
         /* FINISHME: We need to write the loop instruction support still. */
         if (brw->gen >= 6)
            gen6_CONT(p);
         else
            brw_CONT(p);
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
         break;

      case BRW_OPCODE_WHILE:
         brw_WHILE(p);
         break;

      /* Single-operand math: pick the generator for this hardware
       * generation.
       */
      case SHADER_OPCODE_RCP:
      case SHADER_OPCODE_RSQ:
      case SHADER_OPCODE_SQRT:
      case SHADER_OPCODE_EXP2:
      case SHADER_OPCODE_LOG2:
      case SHADER_OPCODE_SIN:
      case SHADER_OPCODE_COS:
         if (brw->gen >= 7) {
            generate_math1_gen7(inst, dst, src[0]);
         } else if (brw->gen == 6) {
            generate_math1_gen6(inst, dst, src[0]);
         } else if (brw->gen == 5 || brw->is_g4x) {
            generate_math_g45(inst, dst, src[0]);
         } else {
            generate_math_gen4(inst, dst, src[0]);
         }
         break;
      /* Two-operand math.
       * NOTE(review): the pre-gen6 fallback passes only src[0];
       * presumably generate_math_gen4() finds the second operand
       * elsewhere (e.g. the message payload) -- confirm.
       */
      case SHADER_OPCODE_INT_QUOTIENT:
      case SHADER_OPCODE_INT_REMAINDER:
      case SHADER_OPCODE_POW:
         if (brw->gen >= 7) {
            generate_math2_gen7(inst, dst, src[0], src[1]);
         } else if (brw->gen == 6) {
            generate_math2_gen6(inst, dst, src[0], src[1]);
         } else {
            generate_math_gen4(inst, dst, src[0]);
         }
         break;
      case FS_OPCODE_PIXEL_X:
         generate_pixel_xy(dst, true);
         break;
      case FS_OPCODE_PIXEL_Y:
         generate_pixel_xy(dst, false);
         break;
      case FS_OPCODE_CINTERP:
         brw_MOV(p, dst, src[0]);
         break;
      case FS_OPCODE_LINTERP:
         generate_linterp(inst, dst, src);
         break;
      /* All texturing opcodes share one generator. */
      case SHADER_OPCODE_TEX:
      case FS_OPCODE_TXB:
      case SHADER_OPCODE_TXD:
      case SHADER_OPCODE_TXF:
      case SHADER_OPCODE_TXF_MS:
      case SHADER_OPCODE_TXL:
      case SHADER_OPCODE_TXS:
      case SHADER_OPCODE_LOD:
         generate_tex(inst, dst, src[0]);
         break;
      case FS_OPCODE_DDX:
         generate_ddx(inst, dst, src[0]);
         break;
      case FS_OPCODE_DDY:
         /* Make sure fp->UsesDFdy flag got set (otherwise there's no
          * guarantee that c->key.render_to_fbo is set).
          */
         assert(fp->UsesDFdy);
         generate_ddy(inst, dst, src[0], c->key.render_to_fbo);
         break;

      case FS_OPCODE_SPILL:
         generate_spill(inst, src[0]);
         break;

      case FS_OPCODE_UNSPILL:
         generate_unspill(inst, dst);
         break;

      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
         generate_uniform_pull_constant_load(inst, dst, src[0], src[1]);
         break;

      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
         generate_uniform_pull_constant_load_gen7(inst, dst, src[0], src[1]);
         break;

      case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
         generate_varying_pull_constant_load(inst, dst, src[0], src[1]);
         break;

      case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
         generate_varying_pull_constant_load_gen7(inst, dst, src[0], src[1]);
         break;

      case FS_OPCODE_FB_WRITE:
         generate_fb_write(inst);
         break;

      case FS_OPCODE_MOV_DISPATCH_TO_FLAGS:
         generate_mov_dispatch_to_flags(inst);
         break;

      case FS_OPCODE_DISCARD_JUMP:
         generate_discard_jump(inst);
         break;

      case SHADER_OPCODE_SHADER_TIME_ADD:
         generate_shader_time_add(inst, src[0], src[1], src[2]);
         break;

      case FS_OPCODE_SET_SIMD4X2_OFFSET:
         generate_set_simd4x2_offset(inst, dst, src[0]);
         break;

      case FS_OPCODE_PACK_HALF_2x16_SPLIT:
          generate_pack_half_2x16_split(inst, dst, src[0], src[1]);
          break;

      case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
      case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
         generate_unpack_half_2x16_split(inst, dst, src[0]);
         break;

      case FS_OPCODE_PLACEHOLDER_HALT:
         /* This is the place where the final HALT needs to be inserted if
          * we've emitted any discards.  If not, this will emit no code.
          */
         patch_discard_jumps_to_fb_writes();
         break;

      default:
         /* Unknown opcode: report it (by name when it falls inside the
          * opcode_descs table, by number otherwise) and abort.
          */
         if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) {
            _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
                          opcode_descs[inst->opcode].name);
         } else {
            _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode);
         }
         abort();
      }

      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
         /* Dump only the native range emitted for this IR instruction. */
         brw_dump_compile(p, stdout,
                          last_native_insn_offset, p->next_insn_offset);

         /* Print an END marker, with the successor blocks, for every block
          * that finishes at this instruction.
          */
         foreach_list(node, &cfg->block_list) {
            bblock_link *link = (bblock_link *)node;
            bblock_t *block = link->block;

            if (block->end == inst) {
               printf("   END B%d", block->block_num);
               foreach_list(successor_node, &block->children) {
                  bblock_link *successor_link =
                     (bblock_link *)successor_node;
                  bblock_t *successor_block = successor_link->block;
                  printf(" ->B%d", successor_block->block_num);
               }
               printf("\n");
            }
         }
      }

      /* The next instruction's dump starts where this one ended. */
      last_native_insn_offset = p->next_insn_offset;
   }

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      printf("\n");
   }

   /* Runs once, after every instruction in the list has been emitted. */
   brw_set_uip_jip(p);

   /* OK, while the INTEL_DEBUG=wm above is very nice for debugging FS
    * emit issues, it doesn't get the jump distances into the output,
    * which is often something we want to debug.  So this is here in
    * case you're doing that.
    */
   if (0) {
      brw_dump_compile(p, stdout, 0, p->next_insn_offset);
   }
}
  1488.  
  1489. const unsigned *
  1490. fs_generator::generate_assembly(exec_list *simd8_instructions,
  1491.                                 exec_list *simd16_instructions,
  1492.                                 unsigned *assembly_size)
  1493. {
  1494.    dispatch_width = 8;
  1495.    generate_code(simd8_instructions);
  1496.  
  1497.    if (simd16_instructions) {
  1498.       /* We have to do a compaction pass now, or the one at the end of
  1499.        * execution will squash down where our prog_offset start needs
  1500.        * to be.
  1501.        */
  1502.       brw_compact_instructions(p);
  1503.  
  1504.       /* align to 64 byte boundary. */
  1505.       while ((p->nr_insn * sizeof(struct brw_instruction)) % 64) {
  1506.          brw_NOP(p);
  1507.       }
  1508.  
  1509.       /* Save off the start of this 16-wide program */
  1510.       c->prog_data.prog_offset_16 = p->nr_insn * sizeof(struct brw_instruction);
  1511.  
  1512.       brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
  1513.  
  1514.       dispatch_width = 16;
  1515.       generate_code(simd16_instructions);
  1516.    }
  1517.  
  1518.    return brw_get_program(p, assembly_size);
  1519. }
  1520.