Subversion Repositories Kolibri OS

Rev

Blame | Last modification | View Log | RSS feed

  1. /*
  2.  * Copyright © 2010 Intel Corporation
  3.  *
  4.  * Permission is hereby granted, free of charge, to any person obtaining a
  5.  * copy of this software and associated documentation files (the "Software"),
  6.  * to deal in the Software without restriction, including without limitation
  7.  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  8.  * and/or sell copies of the Software, and to permit persons to whom the
  9.  * Software is furnished to do so, subject to the following conditions:
  10.  *
  11.  * The above copyright notice and this permission notice (including the next
  12.  * paragraph) shall be included in all copies or substantial portions of the
  13.  * Software.
  14.  *
  15.  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16.  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17.  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18.  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19.  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20.  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21.  * IN THE SOFTWARE.
  22.  */
  23.  
  24. /** @file brw_fs_generator.cpp
  25.  *
  26.  * This file supports generating code from the FS LIR to the actual
  27.  * native instructions.
  28.  */
  29.  
  30. #include "main/macros.h"
  31. #include "brw_context.h"
  32. #include "brw_eu.h"
  33. #include "brw_fs.h"
  34. #include "brw_cfg.h"
  35.  
  36. static uint32_t brw_file_from_reg(fs_reg *reg)
  37. {
  38.    switch (reg->file) {
  39.    case GRF:
  40.       return BRW_GENERAL_REGISTER_FILE;
  41.    case MRF:
  42.       return BRW_MESSAGE_REGISTER_FILE;
  43.    case IMM:
  44.       return BRW_IMMEDIATE_VALUE;
  45.    default:
  46.       unreachable("not reached");
  47.    }
  48. }
  49.  
/**
 * Translate an FS IR register (fs_reg) into the hardware brw_reg
 * encoding: register file/number, region (vstride/width/hstride),
 * type, sub-register byte offset, and the abs/negate source modifiers.
 */
static struct brw_reg
brw_reg_from_fs_reg(fs_reg *reg)
{
   struct brw_reg brw_reg;

   switch (reg->file) {
   case GRF:
   case MRF:
      if (reg->stride == 0) {
         /* Stride 0: a single scalar value replicated to all channels. */
         brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->reg, 0);
      } else if (reg->width < 8) {
         /* Narrow region: the whole region fits in one GRF, so width can
          * equal the element count and vstride spans the full region.
          */
         brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0);
         brw_reg = stride(brw_reg, reg->width * reg->stride,
                          reg->width, reg->stride);
      } else {
         /* From the Haswell PRM:
          *
          * VertStride must be used to cross GRF register boundaries. This
          * rule implies that elements within a 'Width' cannot cross GRF
          * boundaries.
          *
          * So, for registers with width > 8, we have to use a width of 8
          * and trust the compression state to sort out the exec size.
          */
         brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0);
         brw_reg = stride(brw_reg, 8 * reg->stride, 8, reg->stride);
      }

      brw_reg = retype(brw_reg, reg->type);
      brw_reg = byte_offset(brw_reg, reg->subreg_offset);
      break;
   case IMM:
      /* Immediates: pick the encoding helper that matches the IR type. */
      switch (reg->type) {
      case BRW_REGISTER_TYPE_F:
         brw_reg = brw_imm_f(reg->fixed_hw_reg.dw1.f);
         break;
      case BRW_REGISTER_TYPE_D:
         brw_reg = brw_imm_d(reg->fixed_hw_reg.dw1.d);
         break;
      case BRW_REGISTER_TYPE_UD:
         brw_reg = brw_imm_ud(reg->fixed_hw_reg.dw1.ud);
         break;
      case BRW_REGISTER_TYPE_W:
         brw_reg = brw_imm_w(reg->fixed_hw_reg.dw1.d);
         break;
      case BRW_REGISTER_TYPE_UW:
         brw_reg = brw_imm_uw(reg->fixed_hw_reg.dw1.ud);
         break;
      case BRW_REGISTER_TYPE_VF:
         brw_reg = brw_imm_vf(reg->fixed_hw_reg.dw1.ud);
         break;
      default:
         unreachable("not reached");
      }
      break;
   case HW_REG:
      /* Pre-encoded hardware register: pass it through unchanged. */
      assert(reg->type == reg->fixed_hw_reg.type);
      brw_reg = reg->fixed_hw_reg;
      break;
   case BAD_FILE:
      /* Probably unused. */
      brw_reg = brw_null_reg();
      break;
   default:
      unreachable("not reached");
   }
   /* Source modifiers apply regardless of the file above. */
   if (reg->abs)
      brw_reg = brw_abs(brw_reg);
   if (reg->negate)
      brw_reg = negate(brw_reg);

   return brw_reg;
}
  123.  
/**
 * Construct a generator that lowers the FS LIR to native instructions.
 *
 * All code-generation state (the brw_codegen "p") is ralloc'ed off
 * mem_ctx, so it is released together with the rest of the context.
 * `key` is the stage-specific program key and is kept opaque here;
 * `stage_abbrev` is used for debug output labeling.
 */
fs_generator::fs_generator(struct brw_context *brw,
                           void *mem_ctx,
                           const void *key,
                           struct brw_stage_prog_data *prog_data,
                           struct gl_program *prog,
                           unsigned promoted_constants,
                           bool runtime_check_aads_emit,
                           const char *stage_abbrev)

   : brw(brw), devinfo(brw->intelScreen->devinfo), key(key),
     prog_data(prog_data),
     prog(prog), promoted_constants(promoted_constants),
     runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(false),
     stage_abbrev(stage_abbrev), mem_ctx(mem_ctx)
{
   p = rzalloc(mem_ctx, struct brw_codegen);
   brw_init_codegen(devinfo, p, mem_ctx);
}
  142.  
fs_generator::~fs_generator()
{
   /* Nothing to free explicitly: everything was ralloc'ed off mem_ctx
    * in the constructor and is released when that context is freed.
    */
}
  146.  
  147. class ip_record : public exec_node {
  148. public:
  149.    DECLARE_RALLOC_CXX_OPERATORS(ip_record)
  150.  
  151.    ip_record(int ip)
  152.    {
  153.       this->ip = ip;
  154.    }
  155.  
  156.    int ip;
  157. };
  158.  
/**
 * Emit a final HALT and retarget the UIP of every discard HALT recorded
 * in discard_halt_patches at it.  Returns false when there is nothing
 * to patch (gen < 6 or no discards were emitted), true otherwise.
 */
bool
fs_generator::patch_discard_jumps_to_fb_writes()
{
   if (devinfo->gen < 6 || this->discard_halt_patches.is_empty())
      return false;

   /* Jump distances are counted in half-instruction units on some gens. */
   int scale = brw_jump_scale(p->devinfo);

   /* There is a somewhat strange undocumented requirement of using
    * HALT, according to the simulator.  If some channel has HALTed to
    * a particular UIP, then by the end of the program, every channel
    * must have HALTed to that UIP.  Furthermore, the tracking is a
    * stack, so you can't do the final halt of a UIP after starting
    * halting to a new UIP.
    *
    * Symptoms of not emitting this instruction on actual hardware
    * included GPU hangs and sparkly rendering on the piglit discard
    * tests.
    */
   brw_inst *last_halt = gen6_HALT(p);
   brw_inst_set_uip(p->devinfo, last_halt, 1 * scale);
   brw_inst_set_jip(p->devinfo, last_halt, 1 * scale);

   /* p->nr_insn already counts the HALT emitted above. */
   int ip = p->nr_insn;

   foreach_in_list(ip_record, patch_ip, &discard_halt_patches) {
      brw_inst *patch = &p->store[patch_ip->ip];

      assert(brw_inst_opcode(p->devinfo, patch) == BRW_OPCODE_HALT);
      /* HALT takes a half-instruction distance from the pre-incremented IP. */
      brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale);
   }

   this->discard_halt_patches.make_empty();
   return true;
}
  195.  
/**
 * Emit a single framebuffer-write SEND.
 *
 * @param payload         first register of the message payload
 * @param implied_header  header register for the implied move (or null reg)
 * @param nr              message length in registers to send
 *
 * Chooses the dataport message control from the opcode, dual-source
 * blend state and dispatch width, then marks the render-target surface
 * as used.
 */
void
fs_generator::fire_fb_write(fs_inst *inst,
                            struct brw_reg payload,
                            struct brw_reg implied_header,
                            GLuint nr)
{
   uint32_t msg_control;

   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;

   if (devinfo->gen < 6) {
      /* Pre-gen6: copy g1 into the second payload register with all
       * execution/predication defaults disabled, so the header is intact
       * regardless of surrounding state.
       */
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, offset(payload, 1), brw_vec8_grf(1, 0));
      brw_pop_insn_state(p);
   }

   if (inst->opcode == FS_OPCODE_REP_FB_WRITE)
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
   else if (prog_data->dual_src_blend) {
      /* Dual-source SIMD16 is split into two SIMD8 messages; the second
       * (EOT) one covers subspans 2/3.
       */
      if (dispatch_width == 8 || !inst->eot)
         msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
      else
         msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
   } else if (dispatch_width == 16)
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
   else
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;

   uint32_t surf_index =
      prog_data->binding_table.render_target_start + inst->target;

   bool last_render_target = inst->eot ||
                             (prog_data->dual_src_blend && dispatch_width == 16);


   brw_fb_WRITE(p,
                dispatch_width,
                payload,
                implied_header,
                msg_control,
                surf_index,
                nr,
                0,
                inst->eot,
                last_render_target,
                inst->header_size != 0);

   brw_mark_surface_used(&prog_data->base, surf_index);
}
  249.  
/**
 * Generate a framebuffer write.  Builds the optional two-register
 * message header (g0 implied move plus render-target control bits in
 * g1), then fires one FB write — or, when runtime_check_aads_emit is
 * set (gen < 6 only), a runtime-predicated pair that sends the payload
 * with or without the AA data.
 */
void
fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload)
{
   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
   const brw_wm_prog_key * const key = (brw_wm_prog_key * const) this->key;
   struct brw_reg implied_header;

   if (devinfo->gen < 8 && !devinfo->is_haswell) {
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   }

   if (inst->base_mrf >= 0)
      payload = brw_message_reg(inst->base_mrf);

   /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied
    * move, here's g1.
    */
   if (inst->header_size != 0) {
      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_flag_reg(p, 0, 0);

      /* On HSW, the GPU will use the predicate on SENDC, unless the header is
       * present.
       */
      if (prog_data->uses_kill) {
         struct brw_reg pixel_mask;

         if (devinfo->gen >= 6)
            pixel_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
         else
            pixel_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);

         /* Store the surviving-pixels mask (flag 0.1) where the FB write
          * message expects it.
          */
         brw_MOV(p, pixel_mask, brw_flag_reg(0, 1));
      }

      if (devinfo->gen >= 6) {
         /* Initialize the whole header from g0. */
         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_16);
         brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
         brw_MOV(p,
                 retype(payload, BRW_REGISTER_TYPE_UD),
                 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
         brw_pop_insn_state(p);

         if (inst->target > 0 && key->replicate_alpha) {
            /* Set "Source0 Alpha Present to RenderTarget" bit in message
             * header.
             */
            brw_OR(p,
                   vec1(retype(payload, BRW_REGISTER_TYPE_UD)),
                   vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
                   brw_imm_ud(0x1 << 11));
         }

         if (inst->target > 0) {
            /* Set the render target index for choosing BLEND_STATE. */
            brw_MOV(p, retype(vec1(suboffset(payload, 2)),
                              BRW_REGISTER_TYPE_UD),
                    brw_imm_ud(inst->target));
         }

         implied_header = brw_null_reg();
      } else {
         implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
      }

      brw_pop_insn_state(p);
   } else {
      implied_header = brw_null_reg();
   }

   if (!runtime_check_aads_emit) {
      fire_fb_write(inst, payload, implied_header, inst->mlen);
   } else {
      /* This can only happen in gen < 6 */
      assert(devinfo->gen < 6);

      struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));

      /* Check runtime bit to detect if we have to send AA data or not */
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_AND(p,
              v1_null_ud,
              retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(1<<26));
      brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);

      /* Jump over the short-payload write when the bit is set. */
      int jmp = brw_JMPI(p, brw_imm_ud(0), BRW_PREDICATE_NORMAL) - p->store;
      brw_inst_set_exec_size(p->devinfo, brw_last_inst, BRW_EXECUTE_1);
      {
         /* Don't send AA data */
         fire_fb_write(inst, offset(payload, 1), implied_header, inst->mlen-1);
      }
      brw_land_fwd_jump(p, jmp);
      fire_fb_write(inst, payload, implied_header, inst->mlen);
   }
}
  350.  
  351. void
  352. fs_generator::generate_urb_write(fs_inst *inst, struct brw_reg payload)
  353. {
  354.    brw_inst *insn;
  355.  
  356.    insn = brw_next_insn(p, BRW_OPCODE_SEND);
  357.  
  358.    brw_set_dest(p, insn, brw_null_reg());
  359.    brw_set_src0(p, insn, payload);
  360.    brw_set_src1(p, insn, brw_imm_d(0));
  361.  
  362.    brw_inst_set_sfid(p->devinfo, insn, BRW_SFID_URB);
  363.    brw_inst_set_urb_opcode(p->devinfo, insn, GEN8_URB_OPCODE_SIMD8_WRITE);
  364.  
  365.    brw_inst_set_mlen(p->devinfo, insn, inst->mlen);
  366.    brw_inst_set_rlen(p->devinfo, insn, 0);
  367.    brw_inst_set_eot(p->devinfo, insn, inst->eot);
  368.    brw_inst_set_header_present(p->devinfo, insn, true);
  369.    brw_inst_set_urb_global_offset(p->devinfo, insn, inst->offset);
  370. }
  371.  
  372. void
  373. fs_generator::generate_cs_terminate(fs_inst *inst, struct brw_reg payload)
  374. {
  375.    struct brw_inst *insn;
  376.  
  377.    insn = brw_next_insn(p, BRW_OPCODE_SEND);
  378.  
  379.    brw_set_dest(p, insn, brw_null_reg());
  380.    brw_set_src0(p, insn, payload);
  381.    brw_set_src1(p, insn, brw_imm_d(0));
  382.  
  383.    /* Terminate a compute shader by sending a message to the thread spawner.
  384.     */
  385.    brw_inst_set_sfid(devinfo, insn, BRW_SFID_THREAD_SPAWNER);
  386.    brw_inst_set_mlen(devinfo, insn, 1);
  387.    brw_inst_set_rlen(devinfo, insn, 0);
  388.    brw_inst_set_eot(devinfo, insn, inst->eot);
  389.    brw_inst_set_header_present(devinfo, insn, false);
  390.  
  391.    brw_inst_set_ts_opcode(devinfo, insn, 0); /* Dereference resource */
  392.    brw_inst_set_ts_request_type(devinfo, insn, 0); /* Root thread */
  393.  
  394.    /* Note that even though the thread has a URB resource associated with it,
  395.     * we set the "do not dereference URB" bit, because the URB resource is
  396.     * managed by the fixed-function unit, so it will free it automatically.
  397.     */
  398.    brw_inst_set_ts_resource_select(devinfo, insn, 1); /* Do not dereference URB */
  399.  
  400.    brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
  401. }
  402.  
  403. void
  404. fs_generator::generate_blorp_fb_write(fs_inst *inst)
  405. {
  406.    brw_fb_WRITE(p,
  407.                 16 /* dispatch_width */,
  408.                 brw_message_reg(inst->base_mrf),
  409.                 brw_reg_from_fs_reg(&inst->src[0]),
  410.                 BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE,
  411.                 inst->target,
  412.                 inst->mlen,
  413.                 0,
  414.                 true,
  415.                 true,
  416.                 inst->header_size != 0);
  417. }
  418.  
/**
 * Emit linear interpolation of a varying: PLN where available, or the
 * LINE+MAC pair otherwise.  src[0] holds the barycentric deltas and
 * src[1] the interpolation coefficients.
 */
void
fs_generator::generate_linterp(fs_inst *inst,
                             struct brw_reg dst, struct brw_reg *src)
{
   /* PLN reads:
    *                      /   in SIMD16   \
    *    -----------------------------------
    *   | src1+0 | src1+1 | src1+2 | src1+3 |
    *   |-----------------------------------|
    *   |(x0, x1)|(y0, y1)|(x2, x3)|(y2, y3)|
    *    -----------------------------------
    *
    * but for the LINE/MAC pair, the LINE reads Xs and the MAC reads Ys:
    *
    *    -----------------------------------
    *   | src1+0 | src1+1 | src1+2 | src1+3 |
    *   |-----------------------------------|
    *   |(x0, x1)|(y0, y1)|        |        | in SIMD8
    *   |-----------------------------------|
    *   |(x0, x1)|(x2, x3)|(y0, y1)|(y2, y3)| in SIMD16
    *    -----------------------------------
    *
    * See also: emit_interpolation_setup_gen4().
    */
   struct brw_reg delta_x = src[0];
   struct brw_reg delta_y = offset(src[0], dispatch_width / 8);
   struct brw_reg interp = src[1];

   /* Use PLN only when the device has it and, pre-Gen7, when delta_x
    * starts in an even-numbered register — the (nr & 1) check reflects
    * that alignment restriction (NOTE(review): presumably a register-pair
    * alignment requirement of early PLN; confirm against the PRM).
    */
   if (devinfo->has_pln &&
       (devinfo->gen >= 7 || (delta_x.nr & 1) == 0)) {
      brw_PLN(p, dst, interp, delta_x);
   } else {
      brw_LINE(p, brw_null_reg(), interp, delta_x);
      brw_MAC(p, dst, suboffset(interp, 1), delta_y);
   }
}
  455.  
  456. void
  457. fs_generator::generate_math_gen6(fs_inst *inst,
  458.                                  struct brw_reg dst,
  459.                                  struct brw_reg src0,
  460.                                  struct brw_reg src1)
  461. {
  462.    int op = brw_math_function(inst->opcode);
  463.    bool binop = src1.file != BRW_ARCHITECTURE_REGISTER_FILE;
  464.  
  465.    if (dispatch_width == 8) {
  466.       gen6_math(p, dst, op, src0, src1);
  467.    } else if (dispatch_width == 16) {
  468.       brw_push_insn_state(p);
  469.       brw_set_default_exec_size(p, BRW_EXECUTE_8);
  470.       brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
  471.       gen6_math(p, firsthalf(dst), op, firsthalf(src0), firsthalf(src1));
  472.       brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
  473.       gen6_math(p, sechalf(dst), op, sechalf(src0),
  474.                 binop ? sechalf(src1) : brw_null_reg());
  475.       brw_pop_insn_state(p);
  476.    }
  477. }
  478.  
  479. void
  480. fs_generator::generate_math_gen4(fs_inst *inst,
  481.                                struct brw_reg dst,
  482.                                struct brw_reg src)
  483. {
  484.    int op = brw_math_function(inst->opcode);
  485.  
  486.    assert(inst->mlen >= 1);
  487.  
  488.    if (dispatch_width == 8) {
  489.       gen4_math(p, dst,
  490.                 op,
  491.                 inst->base_mrf, src,
  492.                 BRW_MATH_PRECISION_FULL);
  493.    } else if (dispatch_width == 16) {
  494.       brw_set_default_exec_size(p, BRW_EXECUTE_8);
  495.       brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
  496.       gen4_math(p, firsthalf(dst),
  497.                 op,
  498.                 inst->base_mrf, firsthalf(src),
  499.                 BRW_MATH_PRECISION_FULL);
  500.       brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
  501.       gen4_math(p, sechalf(dst),
  502.                 op,
  503.                 inst->base_mrf + 1, sechalf(src),
  504.                 BRW_MATH_PRECISION_FULL);
  505.  
  506.       brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
  507.    }
  508. }
  509.  
  510. void
  511. fs_generator::generate_math_g45(fs_inst *inst,
  512.                                 struct brw_reg dst,
  513.                                 struct brw_reg src)
  514. {
  515.    if (inst->opcode == SHADER_OPCODE_POW ||
  516.        inst->opcode == SHADER_OPCODE_INT_QUOTIENT ||
  517.        inst->opcode == SHADER_OPCODE_INT_REMAINDER) {
  518.       generate_math_gen4(inst, dst, src);
  519.       return;
  520.    }
  521.  
  522.    int op = brw_math_function(inst->opcode);
  523.  
  524.    assert(inst->mlen >= 1);
  525.  
  526.    gen4_math(p, dst,
  527.              op,
  528.              inst->base_mrf, src,
  529.              BRW_MATH_PRECISION_FULL);
  530. }
  531.  
/**
 * Generate a texture/sampler message for any of the texturing opcodes.
 *
 * Selects the sampler message type from the opcode (per-generation
 * encodings), the SIMD mode from inst->exec_size and the return format
 * from dst.type; optionally builds a message header (texel offsets,
 * sampler state pointer adjustment); then emits either a direct
 * brw_SAMPLE (immediate sampler index) or an indirect send whose
 * descriptor is computed at runtime (dynamic sampler index).
 */
void
fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
                           struct brw_reg sampler_index)
{
   int msg_type = -1;
   int rlen = 4;
   uint32_t simd_mode;
   uint32_t return_format;
   bool is_combined_send = inst->eot;

   /* Return format follows the destination's register type. */
   switch (dst.type) {
   case BRW_REGISTER_TYPE_D:
      return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
      break;
   case BRW_REGISTER_TYPE_UD:
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
      break;
   default:
      return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
      break;
   }

   switch (inst->exec_size) {
   case 8:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
      break;
   case 16:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
      break;
   default:
      unreachable("Invalid width for texture instruction");
   }

   if (devinfo->gen >= 5) {
      /* Gen5+ message type encodings. */
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
         }
         break;
      case FS_OPCODE_TXB:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
         }
         break;
      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
         }
         break;
      case SHADER_OPCODE_TXS:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
         break;
      case SHADER_OPCODE_TXD:
         if (inst->shadow_compare) {
            /* Gen7.5+.  Otherwise, lowered by brw_lower_texture_gradients(). */
            assert(devinfo->gen >= 8 || devinfo->is_haswell);
            msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
         }
         break;
      case SHADER_OPCODE_TXF:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_TXF_CMS:
         if (devinfo->gen >= 7)
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
         else
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_TXF_UMS:
         assert(devinfo->gen >= 7);
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
         break;
      case SHADER_OPCODE_TXF_MCS:
         assert(devinfo->gen >= 7);
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
         break;
      case SHADER_OPCODE_LOD:
         msg_type = GEN5_SAMPLER_MESSAGE_LOD;
         break;
      case SHADER_OPCODE_TG4:
         if (inst->shadow_compare) {
            assert(devinfo->gen >= 7);
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
         } else {
            assert(devinfo->gen >= 6);
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
         }
         break;
      case SHADER_OPCODE_TG4_OFFSET:
         assert(devinfo->gen >= 7);
         if (inst->shadow_compare) {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
         } else {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
         }
         break;
      default:
         unreachable("not reached");
      }
   } else {
      /* Gen4/G45 message type encodings; mlen asserts double-check the
       * payload layout the visitor was expected to build.
       */
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
         /* Note that G45 and older determines shadow compare and dispatch width
          * from message length for most messages.
          */
         if (dispatch_width == 8) {
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
            if (inst->shadow_compare) {
               assert(inst->mlen == 6);
            } else {
               assert(inst->mlen <= 4);
            }
         } else {
            if (inst->shadow_compare) {
               msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
               assert(inst->mlen == 9);
            } else {
               msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
               assert(inst->mlen <= 7 && inst->mlen % 2 == 1);
            }
         }
         break;
      case FS_OPCODE_TXB:
         if (inst->shadow_compare) {
            assert(dispatch_width == 8);
            assert(inst->mlen == 6);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
         } else {
            assert(inst->mlen == 9);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
            simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         }
         break;
      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            assert(dispatch_width == 8);
            assert(inst->mlen == 6);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
         } else {
            assert(inst->mlen == 9);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
            simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         }
         break;
      case SHADER_OPCODE_TXD:
         /* There is no sample_d_c message; comparisons are done manually */
         assert(dispatch_width == 8);
         assert(inst->mlen == 7 || inst->mlen == 10);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
         break;
      case SHADER_OPCODE_TXF:
         assert(inst->mlen <= 9 && inst->mlen % 2 == 1);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
         simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         break;
      case SHADER_OPCODE_TXS:
         assert(inst->mlen == 3);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO;
         simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         break;
      default:
         unreachable("not reached");
      }
   }
   assert(msg_type != -1);

   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
      /* SIMD16 returns twice as many registers. */
      rlen = 8;
      dst = vec16(dst);
   }

   if (is_combined_send) {
      /* Combined sample+FB-write (converted to SENDC below) returns
       * nothing to the GRF.
       */
      assert(devinfo->gen >= 9 || devinfo->is_cherryview);
      rlen = 0;
   }

   assert(devinfo->gen < 7 || inst->header_size == 0 ||
          src.file == BRW_GENERAL_REGISTER_FILE);

   assert(sampler_index.type == BRW_REGISTER_TYPE_UD);

   /* Load the message header if present.  If there's a texture offset,
    * we need to set it up explicitly and load the offset bitfield.
    * Otherwise, we can use an implied move from g0 to the first message reg.
    */
   if (inst->header_size != 0) {
      if (devinfo->gen < 6 && !inst->offset) {
         /* Set up an implied move from g0 to the MRF. */
         src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
      } else {
         struct brw_reg header_reg;

         if (devinfo->gen >= 7) {
            header_reg = src;
         } else {
            assert(inst->base_mrf != -1);
            header_reg = brw_message_reg(inst->base_mrf);
         }

         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_8);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
         /* Explicitly set up the message header by copying g0 to the MRF. */
         brw_MOV(p, header_reg, brw_vec8_grf(0, 0));

         if (inst->offset) {
            /* Set the offset bits in DWord 2. */
            brw_MOV(p, get_element_ud(header_reg, 2),
                       brw_imm_ud(inst->offset));
         }

         brw_adjust_sampler_state_pointer(p, header_reg, sampler_index);
         brw_pop_insn_state(p);
      }
   }

   /* Gather messages index a separate block of the binding table. */
   uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 ||
         inst->opcode == SHADER_OPCODE_TG4_OFFSET)
         ? prog_data->binding_table.gather_texture_start
         : prog_data->binding_table.texture_start;

   if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
      uint32_t sampler = sampler_index.dw1.ud;

      brw_SAMPLE(p,
                 retype(dst, BRW_REGISTER_TYPE_UW),
                 inst->base_mrf,
                 src,
                 sampler + base_binding_table_index,
                 sampler % 16,
                 msg_type,
                 rlen,
                 inst->mlen,
                 inst->header_size != 0,
                 simd_mode,
                 return_format);

      brw_mark_surface_used(prog_data, sampler + base_binding_table_index);
   } else {
      /* Non-const sampler index */
      /* Note: this clobbers `dst` as a temporary before emitting the send */

      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
      struct brw_reg temp = vec1(retype(dst, BRW_REGISTER_TYPE_UD));

      struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD));

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_access_mode(p, BRW_ALIGN_1);

      /* Some care required: `sampler` and `temp` may alias:
       *    addr = sampler & 0xff
       *    temp = (sampler << 8) & 0xf00
       *    addr = addr | temp
       */
      brw_ADD(p, addr, sampler_reg, brw_imm_ud(base_binding_table_index));
      brw_SHL(p, temp, sampler_reg, brw_imm_ud(8u));
      brw_AND(p, temp, temp, brw_imm_ud(0x0f00));
      brw_AND(p, addr, addr, brw_imm_ud(0x0ff));
      brw_OR(p, addr, addr, temp);

      brw_pop_insn_state(p);

      /* dst = send(offset, a0.0 | <descriptor>) */
      brw_inst *insn = brw_send_indirect_message(
         p, BRW_SFID_SAMPLER, dst, src, addr);
      brw_set_sampler_message(p, insn,
                              0 /* surface */,
                              0 /* sampler */,
                              msg_type,
                              rlen,
                              inst->mlen /* mlen */,
                              inst->header_size != 0 /* header */,
                              simd_mode,
                              return_format);

      /* visitor knows more than we do about the surface limit required,
       * so has already done marking.
       */
   }

   if (is_combined_send) {
      brw_inst_set_eot(p->devinfo, brw_last_inst, true);
      brw_inst_set_opcode(p->devinfo, brw_last_inst, BRW_OPCODE_SENDC);
   }
}
  829.  
  830.  
  831. /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
  832.  * looking like:
  833.  *
  834.  * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
  835.  *
  836.  * Ideally, we want to produce:
  837.  *
  838.  *           DDX                     DDY
  839.  * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
  840.  *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
  841.  *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
  842.  *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
  843.  *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
  844.  *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
  845.  *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
  846.  *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
  847.  *
  848.  * and add another set of two more subspans if in 16-pixel dispatch mode.
  849.  *
  850.  * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
  851.  * for each pair, and vertstride = 2 jumps us 2 elements after processing a
  852.  * pair.  But the ideal approximation may impose a huge performance cost on
  853.  * sample_d.  On at least Haswell, sample_d instruction does some
  854.  * optimizations if the same LOD is used for all pixels in the subspan.
  855.  *
  856.  * For DDY, we need to use ALIGN16 mode since it's capable of doing the
  857.  * appropriate swizzling.
  858.  */
  859. void
  860. fs_generator::generate_ddx(enum opcode opcode,
  861.                            struct brw_reg dst, struct brw_reg src)
  862. {
  863.    unsigned vstride, width;
  864.  
  865.    if (opcode == FS_OPCODE_DDX_FINE) {
  866.       /* produce accurate derivatives */
  867.       vstride = BRW_VERTICAL_STRIDE_2;
  868.       width = BRW_WIDTH_2;
  869.    } else {
  870.       /* replicate the derivative at the top-left pixel to other pixels */
  871.       vstride = BRW_VERTICAL_STRIDE_4;
  872.       width = BRW_WIDTH_4;
  873.    }
  874.  
  875.    struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
  876.                                  src.negate, src.abs,
  877.                                  BRW_REGISTER_TYPE_F,
  878.                                  vstride,
  879.                                  width,
  880.                                  BRW_HORIZONTAL_STRIDE_0,
  881.                                  BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
  882.    struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
  883.                                  src.negate, src.abs,
  884.                                  BRW_REGISTER_TYPE_F,
  885.                                  vstride,
  886.                                  width,
  887.                                  BRW_HORIZONTAL_STRIDE_0,
  888.                                  BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
  889.    brw_ADD(p, dst, src0, negate(src1));
  890. }
  891.  
  892. /* The negate_value boolean is used to negate the derivative computation for
  893.  * FBOs, since they place the origin at the upper left instead of the lower
  894.  * left.
  895.  */
void
fs_generator::generate_ddy(enum opcode opcode,
                           struct brw_reg dst, struct brw_reg src,
                           bool negate_value)
{
   if (opcode == FS_OPCODE_DDY_FINE) {
      /* From the Ivy Bridge PRM, volume 4 part 3, section 3.3.9 (Register
       * Region Restrictions):
       *
       *     In Align16 access mode, SIMD16 is not allowed for DW operations
       *     and SIMD8 is not allowed for DF operations.
       *
       * In this context, "DW operations" means "operations acting on 32-bit
       * values", so it includes operations on floats.
       *
       * Gen4 has a similar restriction.  From the i965 PRM, section 11.5.3
       * (Instruction Compression -> Rules and Restrictions):
       *
       *     A compressed instruction must be in Align1 access mode. Align16
       *     mode instructions cannot be compressed.
       *
       * Similar text exists in the g45 PRM.
       *
       * On these platforms, if we're building a SIMD16 shader, we need to
       * manually unroll to a pair of SIMD8 instructions.
       */
      bool unroll_to_simd8 =
         (dispatch_width == 16 &&
          (devinfo->gen == 4 || (devinfo->gen == 7 && !devinfo->is_haswell)));

      /* produce accurate derivatives */
      /* Both sources read the full subspan; the swizzles below select the
       * top row (XYXY) vs. the bottom row (ZWZW), so the ADD computes
       * top - bottom per pixel.  Align16 mode is required for the swizzles.
       */
      struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
                                    src.negate, src.abs,
                                    BRW_REGISTER_TYPE_F,
                                    BRW_VERTICAL_STRIDE_4,
                                    BRW_WIDTH_4,
                                    BRW_HORIZONTAL_STRIDE_1,
                                    BRW_SWIZZLE_XYXY, WRITEMASK_XYZW);
      struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
                                    src.negate, src.abs,
                                    BRW_REGISTER_TYPE_F,
                                    BRW_VERTICAL_STRIDE_4,
                                    BRW_WIDTH_4,
                                    BRW_HORIZONTAL_STRIDE_1,
                                    BRW_SWIZZLE_ZWZW, WRITEMASK_XYZW);
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_16);
      if (unroll_to_simd8) {
         /* Emit two uncompressed SIMD8 halves; the second half targets the
          * upper channels via sechalf() with 2NDHALF compression control.
          */
         brw_set_default_exec_size(p, BRW_EXECUTE_8);
         brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
         if (negate_value) {
            brw_ADD(p, firsthalf(dst), firsthalf(src1), negate(firsthalf(src0)));
            brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
            brw_ADD(p, sechalf(dst), sechalf(src1), negate(sechalf(src0)));
         } else {
            brw_ADD(p, firsthalf(dst), firsthalf(src0), negate(firsthalf(src1)));
            brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
            brw_ADD(p, sechalf(dst), sechalf(src0), negate(sechalf(src1)));
         }
      } else {
         if (negate_value)
            brw_ADD(p, dst, src1, negate(src0));
         else
            brw_ADD(p, dst, src0, negate(src1));
      }
      brw_pop_insn_state(p);
   } else {
      /* replicate the derivative at the top-left pixel to other pixels */
      /* Coarse DDY: subregister 0 (top-left) minus subregister 2 (bottom-left),
       * with horizontal stride 0 so the single difference is broadcast across
       * the subspan.  No Align16 needed, so this works compressed.
       */
      struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
                                    src.negate, src.abs,
                                    BRW_REGISTER_TYPE_F,
                                    BRW_VERTICAL_STRIDE_4,
                                    BRW_WIDTH_4,
                                    BRW_HORIZONTAL_STRIDE_0,
                                    BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
      struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
                                    src.negate, src.abs,
                                    BRW_REGISTER_TYPE_F,
                                    BRW_VERTICAL_STRIDE_4,
                                    BRW_WIDTH_4,
                                    BRW_HORIZONTAL_STRIDE_0,
                                    BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
      if (negate_value)
         brw_ADD(p, dst, src1, negate(src0));
      else
         brw_ADD(p, dst, src0, negate(src1));
   }
}
  984.  
void
fs_generator::generate_discard_jump(fs_inst *inst)
{
   /* Emit a HALT for a discarded fragment; jump targets are filled in later
    * (see comment below).
    */
   assert(devinfo->gen >= 6);

   /* This HALT will be patched up at FB write time to point UIP at the end of
    * the program, and at brw_uip_jip() JIP will be set to the end of the
    * current block (or the program).
    */
   this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));

   /* Emit the HALT with channel masking disabled so it executes regardless
    * of which channels are currently live.
    */
   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   gen6_HALT(p);
   brw_pop_insn_state(p);
}
  1001.  
  1002. void
  1003. fs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg src)
  1004. {
  1005.    assert(inst->mlen != 0);
  1006.  
  1007.    brw_MOV(p,
  1008.            brw_uvec_mrf(inst->exec_size, (inst->base_mrf + 1), 0),
  1009.            retype(src, BRW_REGISTER_TYPE_UD));
  1010.    brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf),
  1011.                                  inst->exec_size / 8, inst->offset);
  1012. }
  1013.  
  1014. void
  1015. fs_generator::generate_scratch_read(fs_inst *inst, struct brw_reg dst)
  1016. {
  1017.    assert(inst->mlen != 0);
  1018.  
  1019.    brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf),
  1020.                                 inst->exec_size / 8, inst->offset);
  1021. }
  1022.  
void
fs_generator::generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst)
{
   /* Gen7+ scratch fill: no message-register setup is needed; the helper
    * reads exec_size/8 registers from scratch at inst->offset into dst.
    */
   gen7_block_read_scratch(p, dst, inst->exec_size / 8, inst->offset);
}
  1028.  
  1029. void
  1030. fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
  1031.                                                   struct brw_reg dst,
  1032.                                                   struct brw_reg index,
  1033.                                                   struct brw_reg offset)
  1034. {
  1035.    assert(inst->mlen != 0);
  1036.  
  1037.    assert(index.file == BRW_IMMEDIATE_VALUE &&
  1038.           index.type == BRW_REGISTER_TYPE_UD);
  1039.    uint32_t surf_index = index.dw1.ud;
  1040.  
  1041.    assert(offset.file == BRW_IMMEDIATE_VALUE &&
  1042.           offset.type == BRW_REGISTER_TYPE_UD);
  1043.    uint32_t read_offset = offset.dw1.ud;
  1044.  
  1045.    brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
  1046.                         read_offset, surf_index);
  1047.  
  1048.    brw_mark_surface_used(prog_data, surf_index);
  1049. }
  1050.  
/* Gen7+ uniform pull-constant load: a SIMD4x2 sampler LD message from the
 * constant surface.  Handles both an immediate surface index and a
 * dynamically-indexed surface (via an indirect send through a0.0), plus the
 * Skylake requirement of a message header for SIMD4x2.
 */
void
fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
                                                       struct brw_reg dst,
                                                       struct brw_reg index,
                                                       struct brw_reg offset)
{
   assert(inst->mlen == 0);
   assert(index.type == BRW_REGISTER_TYPE_UD);

   assert(offset.file == BRW_GENERAL_REGISTER_FILE);
   /* Reference just the dword we need, to avoid angering validate_reg(). */
   offset = brw_vec1_grf(offset.nr, 0);

   /* We use the SIMD4x2 mode because we want to end up with 4 components in
    * the destination loaded consecutively from the same offset (which appears
    * in the first component, and the rest are ignored).
    */
   dst.width = BRW_WIDTH_4;

   struct brw_reg src = offset;
   bool header_present = false;
   int mlen = 1;

   if (devinfo->gen >= 9) {
      /* Skylake requires a message header in order to use SIMD4x2 mode. */
      /* The header occupies the register just before `offset`; copy r0 into
       * it and set the SIMD4x2 extension bit in DWord 2.
       */
      src = retype(brw_vec4_grf(offset.nr - 1, 0), BRW_REGISTER_TYPE_UD);
      mlen = 2;
      header_present = true;

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_MOV(p, vec8(src), retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
      brw_set_default_access_mode(p, BRW_ALIGN_1);

      brw_MOV(p, get_element_ud(src, 2),
              brw_imm_ud(GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2));
      brw_pop_insn_state(p);
   }

   if (index.file == BRW_IMMEDIATE_VALUE) {
      /* Constant surface index: a plain SEND with the index baked into the
       * message descriptor.
       */
      uint32_t surf_index = index.dw1.ud;

      brw_push_insn_state(p);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
      brw_pop_insn_state(p);

      brw_set_dest(p, send, dst);
      brw_set_src0(p, send, src);
      brw_set_sampler_message(p, send,
                              surf_index,
                              0, /* LD message ignores sampler unit */
                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                              1, /* rlen */
                              mlen,
                              header_present,
                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                              0);

      brw_mark_surface_used(prog_data, surf_index);

   } else {
      /* Dynamic surface index: mask it into the address register and issue
       * an indirect send whose descriptor is OR'd with a0.0.
       */
      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_access_mode(p, BRW_ALIGN_1);

      /* a0.0 = surf_index & 0xff */
      brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
      brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
      brw_set_dest(p, insn_and, addr);
      brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
      brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));

      /* dst = send(payload, a0.0 | <descriptor>) */
      brw_inst *insn = brw_send_indirect_message(
         p, BRW_SFID_SAMPLER, dst, src, addr);
      brw_set_sampler_message(p, insn,
                              0,
                              0, /* LD message ignores sampler unit */
                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                              1, /* rlen */
                              mlen,
                              header_present,
                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                              0);

      brw_pop_insn_state(p);

      /* visitor knows more than we do about the surface limit required,
       * so has already done marking.
       */

   }
}
  1151.  
/* Pre-gen7 varying-offset pull-constant load, implemented as a sampler LD
 * message with a header and a per-channel offset payload in MRF space.
 */
void
fs_generator::generate_varying_pull_constant_load(fs_inst *inst,
                                                  struct brw_reg dst,
                                                  struct brw_reg index,
                                                  struct brw_reg offset)
{
   assert(devinfo->gen < 7); /* Should use the gen7 variant. */
   assert(inst->header_size != 0);
   assert(inst->mlen);

   assert(index.file == BRW_IMMEDIATE_VALUE &&
          index.type == BRW_REGISTER_TYPE_UD);
   uint32_t surf_index = index.dw1.ud;

   uint32_t simd_mode, rlen, msg_type;
   if (dispatch_width == 16) {
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
      rlen = 8;
   } else {
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
      rlen = 4;
   }

   if (devinfo->gen >= 5)
      msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
   else {
      /* We always use the SIMD16 message so that we only have to load U, and
       * not V or R.
       */
      msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
      assert(inst->mlen == 3);
      assert(inst->regs_written == 8);
      rlen = 8;
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
   }

   /* Stage the per-channel offsets into the MRF slot after the header. */
   struct brw_reg offset_mrf = retype(brw_message_reg(inst->base_mrf + 1),
                                      BRW_REGISTER_TYPE_D);
   brw_MOV(p, offset_mrf, offset);

   /* Build the message header from r0 (moved into base_mrf as needed). */
   struct brw_reg header = brw_vec8_grf(0, 0);
   gen6_resolve_implied_move(p, &header, inst->base_mrf);

   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_qtr_control(p->devinfo, send, BRW_COMPRESSION_NONE);
   brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
   brw_set_src0(p, send, header);
   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(p->devinfo, send, inst->base_mrf);

   /* Our surface is set up as floats, regardless of what actual data is
    * stored in it.
    */
   uint32_t return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
   brw_set_sampler_message(p, send,
                           surf_index,
                           0, /* sampler (unused) */
                           msg_type,
                           rlen,
                           inst->mlen,
                           inst->header_size != 0,
                           simd_mode,
                           return_format);

   brw_mark_surface_used(prog_data, surf_index);
}
  1218.  
/* Gen7+ varying-offset pull-constant load: a headerless sampler LD message.
 * Supports both an immediate surface index and a dynamic one via an indirect
 * send through a0.0.
 */
void
fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
                                                       struct brw_reg dst,
                                                       struct brw_reg index,
                                                       struct brw_reg offset)
{
   assert(devinfo->gen >= 7);
   /* Varying-offset pull constant loads are treated as a normal expression on
    * gen7, so the fact that it's a send message is hidden at the IR level.
    */
   assert(inst->header_size == 0);
   assert(!inst->mlen);
   assert(index.type == BRW_REGISTER_TYPE_UD);

   /* Message/response lengths scale with dispatch width. */
   uint32_t simd_mode, rlen, mlen;
   if (dispatch_width == 16) {
      mlen = 2;
      rlen = 8;
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
   } else {
      mlen = 1;
      rlen = 4;
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
   }

   if (index.file == BRW_IMMEDIATE_VALUE) {
      /* Constant surface index: plain SEND with the index in the descriptor. */
      uint32_t surf_index = index.dw1.ud;

      brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
      brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
      brw_set_src0(p, send, offset);
      brw_set_sampler_message(p, send,
                              surf_index,
                              0, /* LD message ignores sampler unit */
                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                              rlen,
                              mlen,
                              false, /* no header */
                              simd_mode,
                              0);

      brw_mark_surface_used(prog_data, surf_index);

   } else {
      /* Dynamic surface index: mask it into a0.0 and issue an indirect send. */
      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_access_mode(p, BRW_ALIGN_1);

      /* a0.0 = surf_index & 0xff */
      brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
      brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
      brw_set_dest(p, insn_and, addr);
      brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
      brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));

      brw_pop_insn_state(p);

      /* dst = send(offset, a0.0 | <descriptor>) */
      brw_inst *insn = brw_send_indirect_message(
         p, BRW_SFID_SAMPLER, retype(dst, BRW_REGISTER_TYPE_UW),
         offset, addr);
      brw_set_sampler_message(p, insn,
                              0 /* surface */,
                              0 /* sampler */,
                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                              rlen /* rlen */,
                              mlen /* mlen */,
                              false /* header */,
                              simd_mode,
                              0);

      /* visitor knows more than we do about the surface limit required,
       * so has already done marking.
       */
   }
}
  1299.  
  1300. /**
  1301.  * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred
  1302.  * into the flags register (f0.0).
  1303.  *
  1304.  * Used only on Gen6 and above.
  1305.  */
  1306. void
  1307. fs_generator::generate_mov_dispatch_to_flags(fs_inst *inst)
  1308. {
  1309.    struct brw_reg flags = brw_flag_reg(0, inst->flag_subreg);
  1310.    struct brw_reg dispatch_mask;
  1311.  
  1312.    if (devinfo->gen >= 6)
  1313.       dispatch_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
  1314.    else
  1315.       dispatch_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
  1316.  
  1317.    brw_push_insn_state(p);
  1318.    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
  1319.    brw_MOV(p, flags, dispatch_mask);
  1320.    brw_pop_insn_state(p);
  1321. }
  1322.  
void
fs_generator::generate_pixel_interpolator_query(fs_inst *inst,
                                                struct brw_reg dst,
                                                struct brw_reg src,
                                                struct brw_reg msg_data,
                                                unsigned msg_type)
{
   /* Thin wrapper around brw_pixel_interpolator_query(): forwards the
    * immediate message data, message type, and the instruction's message
    * and response lengths.
    */
   assert(msg_data.file == BRW_IMMEDIATE_VALUE &&
          msg_data.type == BRW_REGISTER_TYPE_UD);

   brw_pixel_interpolator_query(p,
         retype(dst, BRW_REGISTER_TYPE_UW),
         src,
         inst->pi_noperspective,
         msg_type,
         msg_data.dw1.ud,
         inst->mlen,
         inst->regs_written);
}
  1342.  
  1343.  
  1344. /**
  1345.  * Sets the first word of a vgrf for gen7+ simd4x2 uniform pull constant
  1346.  * sampler LD messages.
  1347.  *
  1348.  * We don't want to bake it into the send message's code generation because
  1349.  * that means we don't get a chance to schedule the instructions.
  1350.  */
  1351. void
  1352. fs_generator::generate_set_simd4x2_offset(fs_inst *inst,
  1353.                                           struct brw_reg dst,
  1354.                                           struct brw_reg value)
  1355. {
  1356.    assert(value.file == BRW_IMMEDIATE_VALUE);
  1357.  
  1358.    brw_push_insn_state(p);
  1359.    brw_set_default_exec_size(p, BRW_EXECUTE_8);
  1360.    brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
  1361.    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
  1362.    brw_MOV(p, retype(brw_vec1_reg(dst.file, dst.nr, 0), value.type), value);
  1363.    brw_pop_insn_state(p);
  1364. }
  1365.  
  1366. /* Sets vstride=16, width=8, hstride=2 or vstride=0, width=1, hstride=0
  1367.  * (when mask is passed as a uniform) of register mask before moving it
  1368.  * to register dst.
  1369.  */
  1370. void
  1371. fs_generator::generate_set_omask(fs_inst *inst,
  1372.                                  struct brw_reg dst,
  1373.                                  struct brw_reg mask)
  1374. {
  1375.    bool stride_8_8_1 =
  1376.     (mask.vstride == BRW_VERTICAL_STRIDE_8 &&
  1377.      mask.width == BRW_WIDTH_8 &&
  1378.      mask.hstride == BRW_HORIZONTAL_STRIDE_1);
  1379.  
  1380.    bool stride_0_1_0 = has_scalar_region(mask);
  1381.  
  1382.    assert(stride_8_8_1 || stride_0_1_0);
  1383.    assert(dst.type == BRW_REGISTER_TYPE_UW);
  1384.  
  1385.    brw_push_insn_state(p);
  1386.    brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
  1387.    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
  1388.  
  1389.    if (stride_8_8_1) {
  1390.       brw_MOV(p, dst, retype(stride(mask, 16, 8, 2), dst.type));
  1391.    } else if (stride_0_1_0) {
  1392.       brw_MOV(p, dst, retype(mask, dst.type));
  1393.    }
  1394.    brw_pop_insn_state(p);
  1395. }
  1396.  
  1397. /* Sets vstride=1, width=4, hstride=0 of register src1 during
  1398.  * the ADD instruction.
  1399.  */
void
fs_generator::generate_set_sample_id(fs_inst *inst,
                                     struct brw_reg dst,
                                     struct brw_reg src0,
                                     struct brw_reg src1)
{
   /* dst = src0 + src1, where src1 is read as UW with a <1;4,0> region so
    * each group of 4 channels reads a consecutive element.
    */
   assert(dst.type == BRW_REGISTER_TYPE_D ||
          dst.type == BRW_REGISTER_TYPE_UD);
   assert(src0.type == BRW_REGISTER_TYPE_D ||
          src0.type == BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);
   brw_set_default_exec_size(p, BRW_EXECUTE_8);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   struct brw_reg reg = retype(stride(src1, 1, 4, 0), BRW_REGISTER_TYPE_UW);
   if (dispatch_width == 8) {
      brw_ADD(p, dst, src0, reg);
   } else if (dispatch_width == 16) {
      /* SIMD16: emit two SIMD8 ADDs; the second half reads src1 at a
       * 2-element suboffset so it sees the next pair of values.
       */
      brw_ADD(p, firsthalf(dst), firsthalf(src0), reg);
      brw_ADD(p, sechalf(dst), sechalf(src0), suboffset(reg, 2));
   }
   brw_pop_insn_state(p);
}
  1424.  
void
fs_generator::generate_pack_half_2x16_split(fs_inst *inst,
                                            struct brw_reg dst,
                                            struct brw_reg x,
                                            struct brw_reg y)
{
   /* Implements packHalf2x16(x, y): y goes to the high half-word of each
    * dst channel, x to the low half-word.  Instruction order below matters:
    * the SHL must happen between the two F32TO16s.
    */
   assert(devinfo->gen >= 7);
   assert(dst.type == BRW_REGISTER_TYPE_UD);
   assert(x.type == BRW_REGISTER_TYPE_F);
   assert(y.type == BRW_REGISTER_TYPE_F);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
    *
    *   Because this instruction does not have a 16-bit floating-point type,
    *   the destination data type must be Word (W).
    *
    *   The destination must be DWord-aligned and specify a horizontal stride
    *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
    *   each destination channel and the upper word is not modified.
    */
   struct brw_reg dst_w = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);

   /* Give each 32-bit channel of dst the form below, where "." means
    * unchanged.
    *   0x....hhhh
    */
   brw_F32TO16(p, dst_w, y);

   /* Now the form:
    *   0xhhhh0000
    */
   brw_SHL(p, dst, dst, brw_imm_ud(16u));

   /* And, finally the form of packHalf2x16's output:
    *   0xhhhhllll
    */
   brw_F32TO16(p, dst_w, x);
}
  1463.  
  1464. void
  1465. fs_generator::generate_unpack_half_2x16_split(fs_inst *inst,
  1466.                                               struct brw_reg dst,
  1467.                                               struct brw_reg src)
  1468. {
  1469.    assert(devinfo->gen >= 7);
  1470.    assert(dst.type == BRW_REGISTER_TYPE_F);
  1471.    assert(src.type == BRW_REGISTER_TYPE_UD);
  1472.  
  1473.    /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
  1474.     *
  1475.     *   Because this instruction does not have a 16-bit floating-point type,
  1476.     *   the source data type must be Word (W). The destination type must be
  1477.     *   F (Float).
  1478.     */
  1479.    struct brw_reg src_w = spread(retype(src, BRW_REGISTER_TYPE_W), 2);
  1480.  
  1481.    /* Each channel of src has the form of unpackHalf2x16's input: 0xhhhhllll.
  1482.     * For the Y case, we wish to access only the upper word; therefore
  1483.     * a 16-bit subregister offset is needed.
  1484.     */
  1485.    assert(inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X ||
  1486.           inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y);
  1487.    if (inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y)
  1488.       src_w.subnr += 2;
  1489.  
  1490.    brw_F16TO32(p, dst, src_w);
  1491. }
  1492.  
  1493. void
  1494. fs_generator::generate_shader_time_add(fs_inst *inst,
  1495.                                        struct brw_reg payload,
  1496.                                        struct brw_reg offset,
  1497.                                        struct brw_reg value)
  1498. {
  1499.    assert(devinfo->gen >= 7);
  1500.    brw_push_insn_state(p);
  1501.    brw_set_default_mask_control(p, true);
  1502.  
  1503.    assert(payload.file == BRW_GENERAL_REGISTER_FILE);
  1504.    struct brw_reg payload_offset = retype(brw_vec1_grf(payload.nr, 0),
  1505.                                           offset.type);
  1506.    struct brw_reg payload_value = retype(brw_vec1_grf(payload.nr + 1, 0),
  1507.                                          value.type);
  1508.  
  1509.    assert(offset.file == BRW_IMMEDIATE_VALUE);
  1510.    if (value.file == BRW_GENERAL_REGISTER_FILE) {
  1511.       value.width = BRW_WIDTH_1;
  1512.       value.hstride = BRW_HORIZONTAL_STRIDE_0;
  1513.       value.vstride = BRW_VERTICAL_STRIDE_0;
  1514.    } else {
  1515.       assert(value.file == BRW_IMMEDIATE_VALUE);
  1516.    }
  1517.  
  1518.    /* Trying to deal with setup of the params from the IR is crazy in the FS8
  1519.     * case, and we don't really care about squeezing every bit of performance
  1520.     * out of this path, so we just emit the MOVs from here.
  1521.     */
  1522.    brw_MOV(p, payload_offset, offset);
  1523.    brw_MOV(p, payload_value, value);
  1524.    brw_shader_time_add(p, payload,
  1525.                        prog_data->binding_table.shader_time_start);
  1526.    brw_pop_insn_state(p);
  1527.  
  1528.    brw_mark_surface_used(prog_data,
  1529.                          prog_data->binding_table.shader_time_start);
  1530. }
  1531.  
  1532. void
  1533. fs_generator::enable_debug(const char *shader_name)
  1534. {
  1535.    debug_flag = true;
  1536.    this->shader_name = shader_name;
  1537. }
  1538.  
/**
 * Walk the given CFG and emit native EU instructions for every IR
 * instruction, appending them to the assembler stream "p".
 *
 * Returns the byte offset within p's store at which the generated
 * (64-byte aligned) code begins; the finished binary is later fetched
 * with get_assembly().
 */
int
fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
{
   /* align to 64 byte boundary. */
   while (p->next_insn_offset % 64)
      brw_NOP(p);

   /* In SIMD16 mode instructions default to the compressed form;
    * individual cases below override this where workarounds require it.
    */
   this->dispatch_width = dispatch_width;
   if (dispatch_width == 16)
      brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);

   int start_offset = p->next_insn_offset;
   int spill_count = 0, fill_count = 0;
   int loop_count = 0;

   /* Per-instruction annotations are only collected when debug output is
    * enabled (see the unlikely(debug_flag) checks below).
    */
   struct annotation_info annotation;
   memset(&annotation, 0, sizeof(annotation));

   foreach_block_and_inst (block, fs_inst, inst, cfg) {
      struct brw_reg src[3], dst;
      unsigned int last_insn_offset = p->next_insn_offset;
      bool multiple_instructions_emitted = false;

      if (unlikely(debug_flag))
         annotate(p->devinfo, &annotation, cfg, inst, p->next_insn_offset);

      /* Convert the IR operands into hardware register descriptions. */
      for (unsigned int i = 0; i < inst->sources; i++) {
         src[i] = brw_reg_from_fs_reg(&inst->src[i]);

         /* The accumulator result appears to get used for the
          * conditional modifier generation.  When negating a UD
          * value, there is a 33rd bit generated for the sign in the
          * accumulator value, so now you can't check, for example,
          * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
          */
         assert(!inst->conditional_mod ||
                inst->src[i].type != BRW_REGISTER_TYPE_UD ||
                !inst->src[i].negate);
      }
      dst = brw_reg_from_fs_reg(&inst->dst);

      /* Transfer this instruction's per-instruction state into the
       * assembler's default instruction state.
       */
      brw_set_default_predicate_control(p, inst->predicate);
      brw_set_default_predicate_inverse(p, inst->predicate_inverse);
      brw_set_default_flag_reg(p, 0, inst->flag_subreg);
      brw_set_default_saturate(p, inst->saturate);
      brw_set_default_mask_control(p, inst->force_writemask_all);
      brw_set_default_acc_write_control(p, inst->writes_accumulator);
      brw_set_default_exec_size(p, cvt(inst->exec_size) - 1);

      /* Derive the compression control from the execution size and which
       * half of a SIMD16 program this instruction targets.
       */
      switch (inst->exec_size) {
      case 1:
      case 2:
      case 4:
         assert(inst->force_writemask_all);
         brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
         break;
      case 8:
         if (inst->force_sechalf) {
            brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
         } else {
            brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
         }
         break;
      case 16:
      case 32:
         /* If the instruction writes to more than one register, it needs to
          * be a "compressed" instruction on Gen <= 5.
          */
         if (inst->exec_size * inst->dst.stride * type_sz(inst->dst.type) > 32)
            brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
         else
            brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
         break;
      default:
         unreachable("Invalid instruction width");
      }

      /* Emit the native instruction(s) for this opcode. */
      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         brw_MOV(p, dst, src[0]);
         break;
      case BRW_OPCODE_ADD:
         brw_ADD(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_MUL:
         brw_MUL(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_AVG:
         brw_AVG(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_MACH:
         brw_MACH(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_LINE:
         brw_LINE(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_MAD:
         assert(devinfo->gen >= 6);
         brw_set_default_access_mode(p, BRW_ALIGN_16);
         /* Without SIMD16 three-source support, emit as two SIMD8 halves. */
         if (dispatch_width == 16 && !devinfo->supports_simd16_3src) {
            brw_set_default_exec_size(p, BRW_EXECUTE_8);
            brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
            brw_inst *f = brw_MAD(p, firsthalf(dst), firsthalf(src[0]), firsthalf(src[1]), firsthalf(src[2]));
            brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
            brw_inst *s = brw_MAD(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
            brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);

            if (inst->conditional_mod) {
               brw_inst_set_cond_modifier(p->devinfo, f, inst->conditional_mod);
               brw_inst_set_cond_modifier(p->devinfo, s, inst->conditional_mod);
               multiple_instructions_emitted = true;
            }
         } else {
            brw_MAD(p, dst, src[0], src[1], src[2]);
         }
         brw_set_default_access_mode(p, BRW_ALIGN_1);
         break;

      case BRW_OPCODE_LRP:
         assert(devinfo->gen >= 6);
         brw_set_default_access_mode(p, BRW_ALIGN_16);
         /* Same SIMD16 three-source split as BRW_OPCODE_MAD above. */
         if (dispatch_width == 16 && !devinfo->supports_simd16_3src) {
            brw_set_default_exec_size(p, BRW_EXECUTE_8);
            brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
            brw_inst *f = brw_LRP(p, firsthalf(dst), firsthalf(src[0]), firsthalf(src[1]), firsthalf(src[2]));
            brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
            brw_inst *s = brw_LRP(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
            brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);

            if (inst->conditional_mod) {
               brw_inst_set_cond_modifier(p->devinfo, f, inst->conditional_mod);
               brw_inst_set_cond_modifier(p->devinfo, s, inst->conditional_mod);
               multiple_instructions_emitted = true;
            }
         } else {
            brw_LRP(p, dst, src[0], src[1], src[2]);
         }
         brw_set_default_access_mode(p, BRW_ALIGN_1);
         break;

      case BRW_OPCODE_FRC:
         brw_FRC(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDD:
         brw_RNDD(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDE:
         brw_RNDE(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDZ:
         brw_RNDZ(p, dst, src[0]);
         break;

      case BRW_OPCODE_AND:
         brw_AND(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_OR:
         brw_OR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_XOR:
         brw_XOR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_NOT:
         brw_NOT(p, dst, src[0]);
         break;
      case BRW_OPCODE_ASR:
         brw_ASR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_SHR:
         brw_SHR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_SHL:
         brw_SHL(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_F32TO16:
         assert(devinfo->gen >= 7);
         brw_F32TO16(p, dst, src[0]);
         break;
      case BRW_OPCODE_F16TO32:
         assert(devinfo->gen >= 7);
         brw_F16TO32(p, dst, src[0]);
         break;
      case BRW_OPCODE_CMP:
         /* The Ivybridge/BayTrail WaCMPInstFlagDepClearedEarly workaround says
          * that when the destination is a GRF that the dependency-clear bit on
          * the flag register is cleared early.
          *
          * Suggested workarounds are to disable coissuing CMP instructions
          * or to split CMP(16) instructions into two CMP(8) instructions.
          *
          * We choose to split into CMP(8) instructions since disabling
          * coissuing would affect CMP instructions not otherwise affected by
          * the errata.
          */
         if (dispatch_width == 16 && devinfo->gen == 7 && !devinfo->is_haswell) {
            if (dst.file == BRW_GENERAL_REGISTER_FILE) {
               brw_set_default_exec_size(p, BRW_EXECUTE_8);
               brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
               brw_CMP(p, firsthalf(dst), inst->conditional_mod,
                          firsthalf(src[0]), firsthalf(src[1]));
               brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
               brw_CMP(p, sechalf(dst), inst->conditional_mod,
                          sechalf(src[0]), sechalf(src[1]));
               brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);

               multiple_instructions_emitted = true;
            } else if (dst.file == BRW_ARCHITECTURE_REGISTER_FILE) {
               /* For unknown reasons, the aforementioned workaround is not
                * sufficient. Overriding the type when the destination is the
                * null register is necessary but not sufficient by itself.
                */
               assert(dst.nr == BRW_ARF_NULL);
               dst.type = BRW_REGISTER_TYPE_D;
               brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
            } else {
               unreachable("not reached");
            }
         } else {
            brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
         }
         break;
      case BRW_OPCODE_SEL:
         brw_SEL(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_BFREV:
         assert(devinfo->gen >= 7);
         /* BFREV only supports UD type for src and dst. */
         brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
                      retype(src[0], BRW_REGISTER_TYPE_UD));
         break;
      case BRW_OPCODE_FBH:
         assert(devinfo->gen >= 7);
         /* FBH only supports UD type for dst. */
         brw_FBH(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
         break;
      case BRW_OPCODE_FBL:
         assert(devinfo->gen >= 7);
         /* FBL only supports UD type for dst. */
         brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
         break;
      case BRW_OPCODE_CBIT:
         assert(devinfo->gen >= 7);
         /* CBIT only supports UD type for dst. */
         brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
         break;
      case BRW_OPCODE_ADDC:
         assert(devinfo->gen >= 7);
         brw_ADDC(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_SUBB:
         assert(devinfo->gen >= 7);
         brw_SUBB(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_MAC:
         brw_MAC(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_BFE:
         assert(devinfo->gen >= 7);
         brw_set_default_access_mode(p, BRW_ALIGN_16);
         /* Same SIMD16 three-source split as BRW_OPCODE_MAD above. */
         if (dispatch_width == 16 && !devinfo->supports_simd16_3src) {
            brw_set_default_exec_size(p, BRW_EXECUTE_8);
            brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
            brw_BFE(p, firsthalf(dst), firsthalf(src[0]), firsthalf(src[1]), firsthalf(src[2]));
            brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
            brw_BFE(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
            brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
         } else {
            brw_BFE(p, dst, src[0], src[1], src[2]);
         }
         brw_set_default_access_mode(p, BRW_ALIGN_1);
         break;

      case BRW_OPCODE_BFI1:
         assert(devinfo->gen >= 7);
         /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we
          * should
          *
          *    "Force BFI instructions to be executed always in SIMD8."
          */
         if (dispatch_width == 16 && devinfo->is_haswell) {
            brw_set_default_exec_size(p, BRW_EXECUTE_8);
            brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
            brw_BFI1(p, firsthalf(dst), firsthalf(src[0]), firsthalf(src[1]));
            brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
            brw_BFI1(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]));
            brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
         } else {
            brw_BFI1(p, dst, src[0], src[1]);
         }
         break;
      case BRW_OPCODE_BFI2:
         assert(devinfo->gen >= 7);
         brw_set_default_access_mode(p, BRW_ALIGN_16);
         /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we
          * should
          *
          *    "Force BFI instructions to be executed always in SIMD8."
          *
          * Otherwise we would be able to emit compressed instructions like we
          * do for the other three-source instructions.
          */
         if (dispatch_width == 16 &&
             (devinfo->is_haswell || !devinfo->supports_simd16_3src)) {
            brw_set_default_exec_size(p, BRW_EXECUTE_8);
            brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
            brw_BFI2(p, firsthalf(dst), firsthalf(src[0]), firsthalf(src[1]), firsthalf(src[2]));
            brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
            brw_BFI2(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
            brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
         } else {
            brw_BFI2(p, dst, src[0], src[1], src[2]);
         }
         brw_set_default_access_mode(p, BRW_ALIGN_1);
         break;

      case BRW_OPCODE_IF:
         if (inst->src[0].file != BAD_FILE) {
            /* The instruction has an embedded compare (only allowed on gen6) */
            assert(devinfo->gen == 6);
            gen6_IF(p, inst->conditional_mod, src[0], src[1]);
         } else {
            brw_IF(p, dispatch_width == 16 ? BRW_EXECUTE_16 : BRW_EXECUTE_8);
         }
         break;

      case BRW_OPCODE_ELSE:
         brw_ELSE(p);
         break;
      case BRW_OPCODE_ENDIF:
         brw_ENDIF(p);
         break;

      case BRW_OPCODE_DO:
         brw_DO(p, BRW_EXECUTE_8);
         break;

      case BRW_OPCODE_BREAK:
         brw_BREAK(p);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
         break;
      case BRW_OPCODE_CONTINUE:
         brw_CONT(p);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
         break;

      case BRW_OPCODE_WHILE:
         brw_WHILE(p);
         loop_count++;
         break;

      case SHADER_OPCODE_RCP:
      case SHADER_OPCODE_RSQ:
      case SHADER_OPCODE_SQRT:
      case SHADER_OPCODE_EXP2:
      case SHADER_OPCODE_LOG2:
      case SHADER_OPCODE_SIN:
      case SHADER_OPCODE_COS:
         /* Single-operand math: dispatch to the generation-specific path. */
         assert(devinfo->gen < 6 || inst->mlen == 0);
         assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
         if (devinfo->gen >= 7) {
            gen6_math(p, dst, brw_math_function(inst->opcode), src[0],
                      brw_null_reg());
         } else if (devinfo->gen == 6) {
            generate_math_gen6(inst, dst, src[0], brw_null_reg());
         } else if (devinfo->gen == 5 || devinfo->is_g4x) {
            generate_math_g45(inst, dst, src[0]);
         } else {
            generate_math_gen4(inst, dst, src[0]);
         }
         break;
      case SHADER_OPCODE_INT_QUOTIENT:
      case SHADER_OPCODE_INT_REMAINDER:
      case SHADER_OPCODE_POW:
         /* Two-operand math: same idea, gen-specific dispatch. */
         assert(devinfo->gen < 6 || inst->mlen == 0);
         assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
         if (devinfo->gen >= 7 && inst->opcode == SHADER_OPCODE_POW) {
            gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]);
         } else if (devinfo->gen >= 6) {
            generate_math_gen6(inst, dst, src[0], src[1]);
         } else {
            generate_math_gen4(inst, dst, src[0]);
         }
         break;
      case FS_OPCODE_CINTERP:
         brw_MOV(p, dst, src[0]);
         break;
      case FS_OPCODE_LINTERP:
         generate_linterp(inst, dst, src);
         break;
      case FS_OPCODE_PIXEL_X:
         assert(src[0].type == BRW_REGISTER_TYPE_UW);
         src[0].subnr = 0 * type_sz(src[0].type);
         brw_MOV(p, dst, stride(src[0], 8, 4, 1));
         break;
      case FS_OPCODE_PIXEL_Y:
         assert(src[0].type == BRW_REGISTER_TYPE_UW);
         src[0].subnr = 4 * type_sz(src[0].type);
         brw_MOV(p, dst, stride(src[0], 8, 4, 1));
         break;
      case SHADER_OPCODE_TEX:
      case FS_OPCODE_TXB:
      case SHADER_OPCODE_TXD:
      case SHADER_OPCODE_TXF:
      case SHADER_OPCODE_TXF_CMS:
      case SHADER_OPCODE_TXF_UMS:
      case SHADER_OPCODE_TXF_MCS:
      case SHADER_OPCODE_TXL:
      case SHADER_OPCODE_TXS:
      case SHADER_OPCODE_LOD:
      case SHADER_OPCODE_TG4:
      case SHADER_OPCODE_TG4_OFFSET:
         generate_tex(inst, dst, src[0], src[1]);
         break;
      case FS_OPCODE_DDX_COARSE:
      case FS_OPCODE_DDX_FINE:
         generate_ddx(inst->opcode, dst, src[0]);
         break;
      case FS_OPCODE_DDY_COARSE:
      case FS_OPCODE_DDY_FINE:
         assert(src[1].file == BRW_IMMEDIATE_VALUE);
         generate_ddy(inst->opcode, dst, src[0], src[1].dw1.ud);
         break;

      case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
         generate_scratch_write(inst, src[0]);
         spill_count++;
         break;

      case SHADER_OPCODE_GEN4_SCRATCH_READ:
         generate_scratch_read(inst, dst);
         fill_count++;
         break;

      case SHADER_OPCODE_GEN7_SCRATCH_READ:
         generate_scratch_read_gen7(inst, dst);
         fill_count++;
         break;

      case SHADER_OPCODE_URB_WRITE_SIMD8:
         generate_urb_write(inst, src[0]);
         break;

      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
         generate_uniform_pull_constant_load(inst, dst, src[0], src[1]);
         break;

      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
         generate_uniform_pull_constant_load_gen7(inst, dst, src[0], src[1]);
         break;

      case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
         generate_varying_pull_constant_load(inst, dst, src[0], src[1]);
         break;

      case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
         generate_varying_pull_constant_load_gen7(inst, dst, src[0], src[1]);
         break;

      case FS_OPCODE_REP_FB_WRITE:
      case FS_OPCODE_FB_WRITE:
         generate_fb_write(inst, src[0]);
         break;

      case FS_OPCODE_BLORP_FB_WRITE:
         generate_blorp_fb_write(inst);
         break;

      case FS_OPCODE_MOV_DISPATCH_TO_FLAGS:
         generate_mov_dispatch_to_flags(inst);
         break;

      case FS_OPCODE_DISCARD_JUMP:
         generate_discard_jump(inst);
         break;

      case SHADER_OPCODE_SHADER_TIME_ADD:
         generate_shader_time_add(inst, src[0], src[1], src[2]);
         break;

      case SHADER_OPCODE_UNTYPED_ATOMIC:
         assert(src[1].file == BRW_IMMEDIATE_VALUE &&
                src[2].file == BRW_IMMEDIATE_VALUE);
         brw_untyped_atomic(p, dst, src[0], src[1], src[2].dw1.ud,
                            inst->mlen, !inst->dst.is_null());
         brw_mark_surface_used(prog_data, src[1].dw1.ud);
         break;

      case SHADER_OPCODE_UNTYPED_SURFACE_READ:
         assert(src[1].file == BRW_IMMEDIATE_VALUE &&
                src[2].file == BRW_IMMEDIATE_VALUE);
         brw_untyped_surface_read(p, dst, src[0], src[1],
                                  inst->mlen, src[2].dw1.ud);
         brw_mark_surface_used(prog_data, src[1].dw1.ud);
         break;

      case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_untyped_surface_write(p, src[0], src[1],
                                   inst->mlen, src[2].dw1.ud);
         break;

      case SHADER_OPCODE_TYPED_ATOMIC:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_typed_atomic(p, dst, src[0], src[1],
                          src[2].dw1.ud, inst->mlen, !inst->dst.is_null());
         break;

      case SHADER_OPCODE_TYPED_SURFACE_READ:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_typed_surface_read(p, dst, src[0], src[1],
                                inst->mlen, src[2].dw1.ud);
         break;

      case SHADER_OPCODE_TYPED_SURFACE_WRITE:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_typed_surface_write(p, src[0], src[1], inst->mlen, src[2].dw1.ud);
         break;

      case SHADER_OPCODE_MEMORY_FENCE:
         brw_memory_fence(p, dst);
         break;

      case FS_OPCODE_SET_SIMD4X2_OFFSET:
         generate_set_simd4x2_offset(inst, dst, src[0]);
         break;

      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
         brw_find_live_channel(p, dst);
         break;

      case SHADER_OPCODE_BROADCAST:
         brw_broadcast(p, dst, src[0], src[1]);
         break;

      case FS_OPCODE_SET_OMASK:
         generate_set_omask(inst, dst, src[0]);
         break;

      case FS_OPCODE_SET_SAMPLE_ID:
         generate_set_sample_id(inst, dst, src[0], src[1]);
         break;

      case FS_OPCODE_PACK_HALF_2x16_SPLIT:
          generate_pack_half_2x16_split(inst, dst, src[0], src[1]);
          break;

      case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
      case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
         generate_unpack_half_2x16_split(inst, dst, src[0]);
         break;

      case FS_OPCODE_PLACEHOLDER_HALT:
         /* This is the place where the final HALT needs to be inserted if
          * we've emitted any discards.  If not, this will emit no code.
          */
         if (!patch_discard_jumps_to_fb_writes()) {
            if (unlikely(debug_flag)) {
               annotation.ann_count--;
            }
         }
         break;

      case FS_OPCODE_INTERPOLATE_AT_CENTROID:
         generate_pixel_interpolator_query(inst, dst, src[0], src[1],
                                           GEN7_PIXEL_INTERPOLATOR_LOC_CENTROID);
         break;

      case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
         generate_pixel_interpolator_query(inst, dst, src[0], src[1],
                                           GEN7_PIXEL_INTERPOLATOR_LOC_SAMPLE);
         break;

      case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
         generate_pixel_interpolator_query(inst, dst, src[0], src[1],
                                           GEN7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET);
         break;

      case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
         generate_pixel_interpolator_query(inst, dst, src[0], src[1],
                                           GEN7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET);
         break;

      case CS_OPCODE_CS_TERMINATE:
         generate_cs_terminate(inst, src[0]);
         break;

      default:
         unreachable("Unsupported opcode");

      case SHADER_OPCODE_LOAD_PAYLOAD:
         unreachable("Should be lowered by lower_load_payload()");
      }

      /* Workaround paths that split one IR instruction into several native
       * instructions have already applied the conditional modifiers, so
       * skip the single-instruction patching below.
       */
      if (multiple_instructions_emitted)
         continue;

      if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
         assert(p->next_insn_offset == last_insn_offset + 16 ||
                !"conditional_mod, no_dd_check, or no_dd_clear set for IR "
                 "emitting more than 1 instruction");

         /* Each brw_inst is 16 bytes; index back to the single instruction
          * just emitted and patch its modifier/dependency-control bits.
          */
         brw_inst *last = &p->store[last_insn_offset / 16];

         if (inst->conditional_mod)
            brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod);
         brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear);
         brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check);
      }
   }

   /* Patch flow-control jump offsets now that the stream is complete. */
   brw_set_uip_jip(p);
   annotation_finalize(&annotation, p->next_insn_offset);

   /* Compact the instruction stream; before/after sizes feed the
    * statistics reported below.
    */
   int before_size = p->next_insn_offset - start_offset;
   brw_compact_instructions(p, start_offset, annotation.ann_count,
                            annotation.ann);
   int after_size = p->next_insn_offset - start_offset;

   if (unlikely(debug_flag)) {
      fprintf(stderr, "Native code for %s\n"
              "SIMD%d shader: %d instructions. %d loops. %d:%d spills:fills. Promoted %u constants. Compacted %d to %d"
              " bytes (%.0f%%)\n",
              shader_name, dispatch_width, before_size / 16, loop_count,
              spill_count, fill_count, promoted_constants, before_size, after_size,
              100.0f * (before_size - after_size) / before_size);

      dump_assembly(p->store, annotation.ann_count, annotation.ann,
                    p->devinfo, prog);
      ralloc_free(annotation.ann);
   }

   /* Always report the shader statistics through the GL debug-output
    * mechanism, independent of the stderr dump above.
    */
   static GLuint msg_id = 0;
   _mesa_gl_debug(&brw->ctx, &msg_id,
                  MESA_DEBUG_SOURCE_SHADER_COMPILER,
                  MESA_DEBUG_TYPE_OTHER,
                  MESA_DEBUG_SEVERITY_NOTIFICATION,
                  "%s SIMD%d shader: %d inst, %d loops, %d:%d spills:fills, "
                  "Promoted %u constants, compacted %d to %d bytes.\n",
                  stage_abbrev, dispatch_width, before_size / 16, loop_count,
                  spill_count, fill_count, promoted_constants, before_size, after_size);

   return start_offset;
}
  2185.  
  2186. const unsigned *
  2187. fs_generator::get_assembly(unsigned int *assembly_size)
  2188. {
  2189.    return brw_get_program(p, assembly_size);
  2190. }
  2191.