Subversion Repositories Kolibri OS

Rev

Blame | Last modification | View Log | RSS feed

  1. /*
  2.  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
  3.  Intel funded Tungsten Graphics to
  4.  develop this 3D driver.
  5.  
  6.  Permission is hereby granted, free of charge, to any person obtaining
  7.  a copy of this software and associated documentation files (the
  8.  "Software"), to deal in the Software without restriction, including
  9.  without limitation the rights to use, copy, modify, merge, publish,
  10.  distribute, sublicense, and/or sell copies of the Software, and to
  11.  permit persons to whom the Software is furnished to do so, subject to
  12.  the following conditions:
  13.  
  14.  The above copyright notice and this permission notice (including the
  15.  next paragraph) shall be included in all copies or substantial
  16.  portions of the Software.
  17.  
  18.  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19.  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20.  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21.  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22.  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23.  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24.  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25.  
  26.  **********************************************************************/
  27.  /*
  28.   * Authors:
  29.   *   Keith Whitwell <keithw@vmware.com>
  30.   */
  31.  
  32.  
  33. #include "brw_context.h"
  34. #include "brw_defines.h"
  35. #include "brw_eu.h"
  36.  
  37. #include "util/ralloc.h"
  38.  
  39. /**
  40.  * Prior to Sandybridge, the SEND instruction accepted non-MRF source
  41.  * registers, implicitly moving the operand to a message register.
  42.  *
  43.  * On Sandybridge, this is no longer the case.  This function performs the
  44.  * explicit move; it should be called before emitting a SEND instruction.
  45.  */
  46. void
  47. gen6_resolve_implied_move(struct brw_codegen *p,
  48.                           struct brw_reg *src,
  49.                           unsigned msg_reg_nr)
  50. {
  51.    const struct brw_device_info *devinfo = p->devinfo;
  52.    if (devinfo->gen < 6)
  53.       return;
  54.  
  55.    if (src->file == BRW_MESSAGE_REGISTER_FILE)
  56.       return;
  57.  
  58.    if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
  59.       brw_push_insn_state(p);
  60.       brw_set_default_exec_size(p, BRW_EXECUTE_8);
  61.       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
  62.       brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
  63.       brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
  64.               retype(*src, BRW_REGISTER_TYPE_UD));
  65.       brw_pop_insn_state(p);
  66.    }
  67.    *src = brw_message_reg(msg_reg_nr);
  68. }
  69.  
  70. static void
  71. gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
  72. {
  73.    /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
  74.     * "The send with EOT should use register space R112-R127 for <src>. This is
  75.     *  to enable loading of a new thread into the same slot while the message
  76.     *  with EOT for current thread is pending dispatch."
  77.     *
  78.     * Since we're pretending to have 16 MRFs anyway, we may as well use the
  79.     * registers required for messages with EOT.
  80.     */
  81.    const struct brw_device_info *devinfo = p->devinfo;
  82.    if (devinfo->gen >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
  83.       reg->file = BRW_GENERAL_REGISTER_FILE;
  84.       reg->nr += GEN7_MRF_HACK_START;
  85.    }
  86. }
  87.  
  88. /**
  89.  * Convert a brw_reg_type enumeration value into the hardware representation.
  90.  *
  91.  * The hardware encoding may depend on whether the value is an immediate.
  92.  */
  93. unsigned
  94. brw_reg_type_to_hw_type(const struct brw_device_info *devinfo,
  95.                         enum brw_reg_type type, unsigned file)
  96. {
  97.    if (file == BRW_IMMEDIATE_VALUE) {
  98.       const static int imm_hw_types[] = {
  99.          [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
  100.          [BRW_REGISTER_TYPE_D]  = BRW_HW_REG_TYPE_D,
  101.          [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
  102.          [BRW_REGISTER_TYPE_W]  = BRW_HW_REG_TYPE_W,
  103.          [BRW_REGISTER_TYPE_F]  = BRW_HW_REG_TYPE_F,
  104.          [BRW_REGISTER_TYPE_UB] = -1,
  105.          [BRW_REGISTER_TYPE_B]  = -1,
  106.          [BRW_REGISTER_TYPE_UV] = BRW_HW_REG_IMM_TYPE_UV,
  107.          [BRW_REGISTER_TYPE_VF] = BRW_HW_REG_IMM_TYPE_VF,
  108.          [BRW_REGISTER_TYPE_V]  = BRW_HW_REG_IMM_TYPE_V,
  109.          [BRW_REGISTER_TYPE_DF] = GEN8_HW_REG_IMM_TYPE_DF,
  110.          [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_IMM_TYPE_HF,
  111.          [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
  112.          [BRW_REGISTER_TYPE_Q]  = GEN8_HW_REG_TYPE_Q,
  113.       };
  114.       assert(type < ARRAY_SIZE(imm_hw_types));
  115.       assert(imm_hw_types[type] != -1);
  116.       assert(devinfo->gen >= 8 || type < BRW_REGISTER_TYPE_DF);
  117.       return imm_hw_types[type];
  118.    } else {
  119.       /* Non-immediate registers */
  120.       const static int hw_types[] = {
  121.          [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
  122.          [BRW_REGISTER_TYPE_D]  = BRW_HW_REG_TYPE_D,
  123.          [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
  124.          [BRW_REGISTER_TYPE_W]  = BRW_HW_REG_TYPE_W,
  125.          [BRW_REGISTER_TYPE_UB] = BRW_HW_REG_NON_IMM_TYPE_UB,
  126.          [BRW_REGISTER_TYPE_B]  = BRW_HW_REG_NON_IMM_TYPE_B,
  127.          [BRW_REGISTER_TYPE_F]  = BRW_HW_REG_TYPE_F,
  128.          [BRW_REGISTER_TYPE_UV] = -1,
  129.          [BRW_REGISTER_TYPE_VF] = -1,
  130.          [BRW_REGISTER_TYPE_V]  = -1,
  131.          [BRW_REGISTER_TYPE_DF] = GEN7_HW_REG_NON_IMM_TYPE_DF,
  132.          [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_NON_IMM_TYPE_HF,
  133.          [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
  134.          [BRW_REGISTER_TYPE_Q]  = GEN8_HW_REG_TYPE_Q,
  135.       };
  136.       assert(type < ARRAY_SIZE(hw_types));
  137.       assert(hw_types[type] != -1);
  138.       assert(devinfo->gen >= 7 || type < BRW_REGISTER_TYPE_DF);
  139.       assert(devinfo->gen >= 8 || type < BRW_REGISTER_TYPE_HF);
  140.       return hw_types[type];
  141.    }
  142. }
  143.  
/**
 * Encode the destination operand \p dest into instruction \p inst.
 *
 * Translates the logical brw_reg description (file, register/subregister
 * number, horizontal stride, writemask, address mode) into the hardware
 * destination fields.  MRF destinations are first rewritten to GRFs on
 * Gen7+ via gen7_convert_mrf_to_grf().  As a side effect, the instruction's
 * exec size is reduced when the destination is narrower than SIMD8 (see
 * the comment at the bottom).
 */
void
brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
{
   const struct brw_device_info *devinfo = p->devinfo;

   /* GRF/MRF register numbers must be below 128; ARF numbers encode the
    * register class and are exempt from this check.
    */
   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.file != BRW_MESSAGE_REGISTER_FILE)
      assert(dest.nr < 128);

   gen7_convert_mrf_to_grf(p, &dest);

   brw_inst_set_dst_reg_file(devinfo, inst, dest.file);
   brw_inst_set_dst_reg_type(devinfo, inst,
                             brw_reg_type_to_hw_type(devinfo, dest.type,
                                                     dest.file));
   brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
         /* A zero horizontal stride is not encodable for a destination;
          * promote it to stride 1.
          */
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
      } else {
         /* Align16: the subregister field is in units of 16 bytes. */
         brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
         brw_inst_set_da16_writemask(devinfo, inst, dest.dw1.bits.writemask);
         if (dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_MESSAGE_REGISTER_FILE) {
            assert(dest.dw1.bits.writemask != 0);
         }
         /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
          *    Although Dst.HorzStride is a don't care for Align16, HW needs
          *    this to be programmed as "01".
          */
         brw_inst_set_dst_hstride(devinfo, inst, 1);
      }
   } else {
      /* Register-indirect addressing. */
      brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);

      /* These are different sizes in align1 vs align16:
       */
      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
                                       dest.dw1.bits.indirect_offset);
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
      } else {
         brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
                                        dest.dw1.bits.indirect_offset);
         /* even ignored in da16, still need to set as '01' */
         brw_inst_set_dst_hstride(devinfo, inst, 1);
      }
   }

   /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
    * or 16 (SIMD16), as that's normally correct.  However, when dealing with
    * small registers, we automatically reduce it to match the register size.
    */
   if (dest.width < BRW_EXECUTE_8)
      brw_inst_set_exec_size(devinfo, inst, dest.width);
}
  208.  
  209. extern int reg_type_size[];
  210.  
  211. static void
  212. validate_reg(const struct brw_device_info *devinfo,
  213.              brw_inst *inst, struct brw_reg reg)
  214. {
  215.    const int hstride_for_reg[] = {0, 1, 2, 4};
  216.    const int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32};
  217.    const int width_for_reg[] = {1, 2, 4, 8, 16};
  218.    const int execsize_for_reg[] = {1, 2, 4, 8, 16, 32};
  219.    int width, hstride, vstride, execsize;
  220.  
  221.    if (reg.file == BRW_IMMEDIATE_VALUE) {
  222.       /* 3.3.6: Region Parameters.  Restriction: Immediate vectors
  223.        * mean the destination has to be 128-bit aligned and the
  224.        * destination horiz stride has to be a word.
  225.        */
  226.       if (reg.type == BRW_REGISTER_TYPE_V) {
  227.          assert(hstride_for_reg[brw_inst_dst_hstride(devinfo, inst)] *
  228.                 reg_type_size[brw_inst_dst_reg_type(devinfo, inst)] == 2);
  229.       }
  230.  
  231.       return;
  232.    }
  233.  
  234.    if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
  235.        reg.file == BRW_ARF_NULL)
  236.       return;
  237.  
  238.    assert(reg.hstride >= 0 && reg.hstride < ARRAY_SIZE(hstride_for_reg));
  239.    hstride = hstride_for_reg[reg.hstride];
  240.  
  241.    if (reg.vstride == 0xf) {
  242.       vstride = -1;
  243.    } else {
  244.       assert(reg.vstride >= 0 && reg.vstride < ARRAY_SIZE(vstride_for_reg));
  245.       vstride = vstride_for_reg[reg.vstride];
  246.    }
  247.  
  248.    assert(reg.width >= 0 && reg.width < ARRAY_SIZE(width_for_reg));
  249.    width = width_for_reg[reg.width];
  250.  
  251.    assert(brw_inst_exec_size(devinfo, inst) >= 0 &&
  252.           brw_inst_exec_size(devinfo, inst) < ARRAY_SIZE(execsize_for_reg));
  253.    execsize = execsize_for_reg[brw_inst_exec_size(devinfo, inst)];
  254.  
  255.    /* Restrictions from 3.3.10: Register Region Restrictions. */
  256.    /* 3. */
  257.    assert(execsize >= width);
  258.  
  259.    /* 4. */
  260.    if (execsize == width && hstride != 0) {
  261.       assert(vstride == -1 || vstride == width * hstride);
  262.    }
  263.  
  264.    /* 5. */
  265.    if (execsize == width && hstride == 0) {
  266.       /* no restriction on vstride. */
  267.    }
  268.  
  269.    /* 6. */
  270.    if (width == 1) {
  271.       assert(hstride == 0);
  272.    }
  273.  
  274.    /* 7. */
  275.    if (execsize == 1 && width == 1) {
  276.       assert(hstride == 0);
  277.       assert(vstride == 0);
  278.    }
  279.  
  280.    /* 8. */
  281.    if (vstride == 0 && hstride == 0) {
  282.       assert(width == 1);
  283.    }
  284.  
  285.    /* 10. Check destination issues. */
  286. }
  287.  
  288. static bool
  289. is_compactable_immediate(unsigned imm)
  290. {
  291.    /* We get the low 12 bits as-is. */
  292.    imm &= ~0xfff;
  293.  
  294.    /* We get one bit replicated through the top 20 bits. */
  295.    return imm == 0 || imm == 0xfffff000;
  296. }
  297.  
/**
 * Encode source operand 0 of \p inst from the logical register \p reg.
 *
 * Handles immediates (which also force src1 to a compatible type for
 * instruction compaction), direct and indirect addressing, and both
 * Align1 region parameters and Align16 swizzles.  MRF sources are
 * rewritten to GRFs on Gen7+ via gen7_convert_mrf_to_grf().
 */
void
brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct brw_device_info *devinfo = p->devinfo;

   /* Non-ARF register numbers must fit below 128. */
   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   gen7_convert_mrf_to_grf(p, &reg);

   if (devinfo->gen >= 6 && (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
                             brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      /* Any source modifiers or regions will be ignored, since this just
       * identifies the MRF/GRF to start reading the message contents from.
       * Check for some likely failures.
       */
      assert(!reg.negate);
      assert(!reg.abs);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
   }

   validate_reg(devinfo, inst, reg);

   brw_inst_set_src0_reg_file(devinfo, inst, reg.file);
   brw_inst_set_src0_reg_type(devinfo, inst,
                              brw_reg_type_to_hw_type(devinfo, reg.type, reg.file));
   brw_inst_set_src0_abs(devinfo, inst, reg.abs);
   brw_inst_set_src0_negate(devinfo, inst, reg.negate);
   brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      brw_inst_set_imm_ud(devinfo, inst, reg.dw1.ud);

      /* The Bspec's section titled "Non-present Operands" claims that if src0
       * is an immediate that src1's type must be the same as that of src0.
       *
       * The SNB+ DataTypeIndex instruction compaction tables contain mappings
       * that do not follow this rule. E.g., from the IVB/HSW table:
       *
       *  DataTypeIndex   18-Bit Mapping       Mapped Meaning
       *        3         001000001011111101   r:f | i:vf | a:ud | <1> | dir |
       *
       * And from the SNB table:
       *
       *  DataTypeIndex   18-Bit Mapping       Mapped Meaning
       *        8         001000000111101100   a:w | i:w | a:ud | <1> | dir |
       *
       * Neither of these cause warnings from the simulator when used,
       * compacted or otherwise. In fact, all compaction mappings that have an
       * immediate in src0 use a:ud for src1.
       *
       * The GM45 instruction compaction tables do not contain mapped meanings
       * so it's not clear whether it has the restriction. We'll assume it was
       * lifted on SNB. (FINISHME: decode the GM45 tables and check.)
       */
      brw_inst_set_src1_reg_file(devinfo, inst, BRW_ARCHITECTURE_REGISTER_FILE);
      if (devinfo->gen < 6) {
         brw_inst_set_src1_reg_type(devinfo, inst,
                                    brw_inst_src0_reg_type(devinfo, inst));
      } else {
         brw_inst_set_src1_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
      }

      /* Compacted instructions only have 12-bits (plus 1 for the other 20)
       * for immediate values. Presumably the hardware engineers realized
       * that the only useful floating-point value that could be represented
       * in this format is 0.0, which can also be represented as a VF-typed
       * immediate, so they gave us the previously mentioned mapping on IVB+.
       *
       * Strangely, we do have a mapping for imm:f in src1, so we don't need
       * to do this there.
       *
       * If we see a 0.0:F, change the type to VF so that it can be compacted.
       */
      if (brw_inst_imm_ud(devinfo, inst) == 0x0 &&
          brw_inst_src0_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_F) {
         brw_inst_set_src0_reg_type(devinfo, inst, BRW_HW_REG_IMM_TYPE_VF);
      }

      /* There are no mappings for dst:d | i:d, so if the immediate is suitable
       * set the types to :UD so the instruction can be compacted.
       */
      if (is_compactable_immediate(brw_inst_imm_ud(devinfo, inst)) &&
          brw_inst_cond_modifier(devinfo, inst) == BRW_CONDITIONAL_NONE &&
          brw_inst_src0_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_D &&
          brw_inst_dst_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_D) {
         brw_inst_set_src0_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
         brw_inst_set_dst_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
      }
   } else {
      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
         brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
             brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
         } else {
            /* Align16 subregister numbers are in 16-byte units. */
            brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
         }
      } else {
         brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.dw1.bits.indirect_offset);
         } else {
            /* NOTE(review): this writes the indirect offset into the
             * subreg-nr field, unlike the Align1 branch which uses the
             * addr-imm setter — verify against the ia16_addr_imm accessor
             * in brw_inst.h; this looks suspicious.
             */
            brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.dw1.bits.indirect_offset);
         }
      }

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         /* A scalar source in a scalar instruction gets the canonical
          * <0;1,0> region regardless of what the caller specified.
          */
         if (reg.width == BRW_WIDTH_1 &&
             brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
            brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
            brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
         } else {
            brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
            brw_inst_set_src0_width(devinfo, inst, reg.width);
            brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
         }
      } else {
         brw_inst_set_src0_da16_swiz_x(devinfo, inst,
            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X));
         brw_inst_set_src0_da16_swiz_y(devinfo, inst,
            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y));
         brw_inst_set_src0_da16_swiz_z(devinfo, inst,
            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z));
         brw_inst_set_src0_da16_swiz_w(devinfo, inst,
            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W));

         /* This is an oddity of the fact we're using the same
          * descriptions for registers in align_16 as align_1:
          */
         if (reg.vstride == BRW_VERTICAL_STRIDE_8)
            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         else
            brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
      }
   }
}
  436.  
  437.  
/**
 * Encode source operand 1 of \p inst from the logical register \p reg.
 *
 * Similar to brw_set_src0(), but src1 may not be an MRF, supports only
 * direct addressing for register operands, and may itself be an immediate
 * (in which case src0 must not be one — see the assertion below).
 */
void
brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct brw_device_info *devinfo = p->devinfo;

   /* Non-ARF register numbers must fit below 128. */
   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   gen7_convert_mrf_to_grf(p, &reg);
   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

   validate_reg(devinfo, inst, reg);

   brw_inst_set_src1_reg_file(devinfo, inst, reg.file);
   brw_inst_set_src1_reg_type(devinfo, inst,
                              brw_reg_type_to_hw_type(devinfo, reg.type, reg.file));
   brw_inst_set_src1_abs(devinfo, inst, reg.abs);
   brw_inst_set_src1_negate(devinfo, inst, reg.negate);

   /* Only src1 can be immediate in two-argument instructions.
    */
   assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      brw_inst_set_imm_ud(devinfo, inst, reg.dw1.ud);
   } else {
      /* This is a hardware restriction, which may or may not be lifted
       * in the future:
       */
      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

      brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
      } else {
         /* Align16 subregister numbers are in 16-byte units. */
         brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
      }

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         /* A scalar source in a scalar instruction gets the canonical
          * <0;1,0> region regardless of what the caller specified.
          */
         if (reg.width == BRW_WIDTH_1 &&
             brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
            brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
            brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
         } else {
            brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
            brw_inst_set_src1_width(devinfo, inst, reg.width);
            brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
         }
      } else {
         brw_inst_set_src1_da16_swiz_x(devinfo, inst,
            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X));
         brw_inst_set_src1_da16_swiz_y(devinfo, inst,
            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y));
         brw_inst_set_src1_da16_swiz_z(devinfo, inst,
            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z));
         brw_inst_set_src1_da16_swiz_w(devinfo, inst,
            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W));

         /* This is an oddity of the fact we're using the same
          * descriptions for registers in align_16 as align_1:
          */
         if (reg.vstride == BRW_VERTICAL_STRIDE_8)
            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         else
            brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
      }
   }
}
  508.  
  509. /**
  510.  * Set the Message Descriptor and Extended Message Descriptor fields
  511.  * for SEND messages.
  512.  *
  513.  * \note This zeroes out the Function Control bits, so it must be called
  514.  *       \b before filling out any message-specific data.  Callers can
  515.  *       choose not to fill in irrelevant bits; they will be zero.
  516.  */
  517. static void
  518. brw_set_message_descriptor(struct brw_codegen *p,
  519.                            brw_inst *inst,
  520.                            enum brw_message_target sfid,
  521.                            unsigned msg_length,
  522.                            unsigned response_length,
  523.                            bool header_present,
  524.                            bool end_of_thread)
  525. {
  526.    const struct brw_device_info *devinfo = p->devinfo;
  527.  
  528.    brw_set_src1(p, inst, brw_imm_d(0));
  529.  
  530.    /* For indirect sends, `inst` will not be the SEND/SENDC instruction
  531.     * itself; instead, it will be a MOV/OR into the address register.
  532.     *
  533.     * In this case, we avoid setting the extended message descriptor bits,
  534.     * since they go on the later SEND/SENDC instead and if set here would
  535.     * instead clobber the conditionalmod bits.
  536.     */
  537.    unsigned opcode = brw_inst_opcode(devinfo, inst);
  538.    if (opcode == BRW_OPCODE_SEND || opcode == BRW_OPCODE_SENDC) {
  539.       brw_inst_set_sfid(devinfo, inst, sfid);
  540.    }
  541.  
  542.    brw_inst_set_mlen(devinfo, inst, msg_length);
  543.    brw_inst_set_rlen(devinfo, inst, response_length);
  544.    brw_inst_set_eot(devinfo, inst, end_of_thread);
  545.  
  546.    if (devinfo->gen >= 5) {
  547.       brw_inst_set_header_present(devinfo, inst, header_present);
  548.    }
  549. }
  550.  
  551. static void brw_set_math_message( struct brw_codegen *p,
  552.                                   brw_inst *inst,
  553.                                   unsigned function,
  554.                                   unsigned integer_type,
  555.                                   bool low_precision,
  556.                                   unsigned dataType )
  557. {
  558.    const struct brw_device_info *devinfo = p->devinfo;
  559.    unsigned msg_length;
  560.    unsigned response_length;
  561.  
  562.    /* Infer message length from the function */
  563.    switch (function) {
  564.    case BRW_MATH_FUNCTION_POW:
  565.    case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
  566.    case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
  567.    case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
  568.       msg_length = 2;
  569.       break;
  570.    default:
  571.       msg_length = 1;
  572.       break;
  573.    }
  574.  
  575.    /* Infer response length from the function */
  576.    switch (function) {
  577.    case BRW_MATH_FUNCTION_SINCOS:
  578.    case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
  579.       response_length = 2;
  580.       break;
  581.    default:
  582.       response_length = 1;
  583.       break;
  584.    }
  585.  
  586.  
  587.    brw_set_message_descriptor(p, inst, BRW_SFID_MATH,
  588.                               msg_length, response_length, false, false);
  589.    brw_inst_set_math_msg_function(devinfo, inst, function);
  590.    brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
  591.    brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
  592.    brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst));
  593.    brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
  594.    brw_inst_set_saturate(devinfo, inst, 0);
  595. }
  596.  
  597.  
  598. static void brw_set_ff_sync_message(struct brw_codegen *p,
  599.                                     brw_inst *insn,
  600.                                     bool allocate,
  601.                                     unsigned response_length,
  602.                                     bool end_of_thread)
  603. {
  604.    const struct brw_device_info *devinfo = p->devinfo;
  605.  
  606.    brw_set_message_descriptor(p, insn, BRW_SFID_URB,
  607.                               1, response_length, true, end_of_thread);
  608.    brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
  609.    brw_inst_set_urb_allocate(devinfo, insn, allocate);
  610.    /* The following fields are not used by FF_SYNC: */
  611.    brw_inst_set_urb_global_offset(devinfo, insn, 0);
  612.    brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
  613.    brw_inst_set_urb_used(devinfo, insn, 0);
  614.    brw_inst_set_urb_complete(devinfo, insn, 0);
  615. }
  616.  
  617. static void brw_set_urb_message( struct brw_codegen *p,
  618.                                  brw_inst *insn,
  619.                                  enum brw_urb_write_flags flags,
  620.                                  unsigned msg_length,
  621.                                  unsigned response_length,
  622.                                  unsigned offset,
  623.                                  unsigned swizzle_control )
  624. {
  625.    const struct brw_device_info *devinfo = p->devinfo;
  626.  
  627.    assert(devinfo->gen < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
  628.    assert(devinfo->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
  629.    assert(devinfo->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
  630.  
  631.    brw_set_message_descriptor(p, insn, BRW_SFID_URB,
  632.                               msg_length, response_length, true,
  633.                               flags & BRW_URB_WRITE_EOT);
  634.  
  635.    if (flags & BRW_URB_WRITE_OWORD) {
  636.       assert(msg_length == 2); /* header + one OWORD of data */
  637.       brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
  638.    } else {
  639.       brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
  640.    }
  641.  
  642.    brw_inst_set_urb_global_offset(devinfo, insn, offset);
  643.    brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);
  644.  
  645.    if (devinfo->gen < 8) {
  646.       brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
  647.    }
  648.  
  649.    if (devinfo->gen < 7) {
  650.       brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
  651.       brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
  652.    } else {
  653.       brw_inst_set_urb_per_slot_offset(devinfo, insn,
  654.          !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
  655.    }
  656. }
  657.  
  658. void
  659. brw_set_dp_write_message(struct brw_codegen *p,
  660.                          brw_inst *insn,
  661.                          unsigned binding_table_index,
  662.                          unsigned msg_control,
  663.                          unsigned msg_type,
  664.                          unsigned msg_length,
  665.                          bool header_present,
  666.                          unsigned last_render_target,
  667.                          unsigned response_length,
  668.                          unsigned end_of_thread,
  669.                          unsigned send_commit_msg)
  670. {
  671.    const struct brw_device_info *devinfo = p->devinfo;
  672.    unsigned sfid;
  673.  
  674.    if (devinfo->gen >= 7) {
  675.       /* Use the Render Cache for RT writes; otherwise use the Data Cache */
  676.       if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
  677.          sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
  678.       else
  679.          sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
  680.    } else if (devinfo->gen == 6) {
  681.       /* Use the render cache for all write messages. */
  682.       sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
  683.    } else {
  684.       sfid = BRW_SFID_DATAPORT_WRITE;
  685.    }
  686.  
  687.    brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
  688.                               header_present, end_of_thread);
  689.  
  690.    brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
  691.    brw_inst_set_dp_write_msg_type(devinfo, insn, msg_type);
  692.    brw_inst_set_dp_write_msg_control(devinfo, insn, msg_control);
  693.    brw_inst_set_rt_last(devinfo, insn, last_render_target);
  694.    if (devinfo->gen < 7) {
  695.       brw_inst_set_dp_write_commit(devinfo, insn, send_commit_msg);
  696.    }
  697. }
  698.  
  699. void
  700. brw_set_dp_read_message(struct brw_codegen *p,
  701.                         brw_inst *insn,
  702.                         unsigned binding_table_index,
  703.                         unsigned msg_control,
  704.                         unsigned msg_type,
  705.                         unsigned target_cache,
  706.                         unsigned msg_length,
  707.                         bool header_present,
  708.                         unsigned response_length)
  709. {
  710.    const struct brw_device_info *devinfo = p->devinfo;
  711.    unsigned sfid;
  712.  
  713.    if (devinfo->gen >= 7) {
  714.       sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
  715.    } else if (devinfo->gen == 6) {
  716.       if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
  717.          sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
  718.       else
  719.          sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
  720.    } else {
  721.       sfid = BRW_SFID_DATAPORT_READ;
  722.    }
  723.  
  724.    brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
  725.                               header_present, false);
  726.  
  727.    brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
  728.    brw_inst_set_dp_read_msg_type(devinfo, insn, msg_type);
  729.    brw_inst_set_dp_read_msg_control(devinfo, insn, msg_control);
  730.    if (devinfo->gen < 6)
  731.       brw_inst_set_dp_read_target_cache(devinfo, insn, target_cache);
  732. }
  733.  
  734. void
  735. brw_set_sampler_message(struct brw_codegen *p,
  736.                         brw_inst *inst,
  737.                         unsigned binding_table_index,
  738.                         unsigned sampler,
  739.                         unsigned msg_type,
  740.                         unsigned response_length,
  741.                         unsigned msg_length,
  742.                         unsigned header_present,
  743.                         unsigned simd_mode,
  744.                         unsigned return_format)
  745. {
  746.    const struct brw_device_info *devinfo = p->devinfo;
  747.  
  748.    brw_set_message_descriptor(p, inst, BRW_SFID_SAMPLER, msg_length,
  749.                               response_length, header_present, false);
  750.  
  751.    brw_inst_set_binding_table_index(devinfo, inst, binding_table_index);
  752.    brw_inst_set_sampler(devinfo, inst, sampler);
  753.    brw_inst_set_sampler_msg_type(devinfo, inst, msg_type);
  754.    if (devinfo->gen >= 5) {
  755.       brw_inst_set_sampler_simd_mode(devinfo, inst, simd_mode);
  756.    } else if (devinfo->gen == 4 && !devinfo->is_g4x) {
  757.       brw_inst_set_sampler_return_format(devinfo, inst, return_format);
  758.    }
  759. }
  760.  
/**
 * Set up a Gen7+ data-cache scratch block read/write message descriptor.
 *
 * \param write   true for a scratch write, false for a read
 * \param dword   selects the scratch access type bit (dword vs. OWord form)
 * \param num_regs number of registers transferred; must be a power of two
 *                (1, 2 or 4; additionally 8 on Gen8+) because the block
 *                size field stores log2(num_regs)
 * \param addr_offset offset within the scratch space
 * \param mlen/rlen   message / response length in registers
 */
static void
gen7_set_dp_scratch_message(struct brw_codegen *p,
                            brw_inst *inst,
                            bool write,
                            bool dword,
                            bool invalidate_after_read,
                            unsigned num_regs,
                            unsigned addr_offset,
                            unsigned mlen,
                            unsigned rlen,
                            bool header_present)
{
   const struct brw_device_info *devinfo = p->devinfo;
   assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
          (devinfo->gen >= 8 && num_regs == 8));
   brw_set_message_descriptor(p, inst, GEN7_SFID_DATAPORT_DATA_CACHE,
                              mlen, rlen, header_present, false);
   brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
   brw_inst_set_scratch_read_write(devinfo, inst, write);
   brw_inst_set_scratch_type(devinfo, inst, dword);
   brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
   /* Block size field is log2 of the register count (num_regs is a power
    * of two, so ffs(num_regs) - 1 == log2(num_regs)).
    */
   brw_inst_set_scratch_block_size(devinfo, inst, ffs(num_regs) - 1);
   brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
}
  785.  
/* Short alias used by the emitters in this file. */
#define next_insn brw_next_insn

/**
 * Reserve space for one new instruction in the program store, seed it from
 * the current default instruction state (p->current), set its opcode, and
 * return a pointer to it.
 *
 * The store is grown by doubling when full; callers must not hold pointers
 * into p->store across calls, since reralloc may move it.
 */
brw_inst *
brw_next_insn(struct brw_codegen *p, unsigned opcode)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   if (p->nr_insn + 1 > p->store_size) {
      p->store_size <<= 1;
      p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
   }

   /* Each native (uncompacted) instruction is 16 bytes. */
   p->next_insn_offset += 16;
   insn = &p->store[p->nr_insn++];
   /* Copy in the default instruction template. */
   memcpy(insn, p->current, sizeof(*insn));

   brw_inst_set_opcode(devinfo, insn, opcode);
   return insn;
}
  805.  
  806. static brw_inst *
  807. brw_alu1(struct brw_codegen *p, unsigned opcode,
  808.          struct brw_reg dest, struct brw_reg src)
  809. {
  810.    brw_inst *insn = next_insn(p, opcode);
  811.    brw_set_dest(p, insn, dest);
  812.    brw_set_src0(p, insn, src);
  813.    return insn;
  814. }
  815.  
  816. static brw_inst *
  817. brw_alu2(struct brw_codegen *p, unsigned opcode,
  818.          struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
  819. {
  820.    brw_inst *insn = next_insn(p, opcode);
  821.    brw_set_dest(p, insn, dest);
  822.    brw_set_src0(p, insn, src0);
  823.    brw_set_src1(p, insn, src1);
  824.    return insn;
  825. }
  826.  
  827. static int
  828. get_3src_subreg_nr(struct brw_reg reg)
  829. {
  830.    if (reg.vstride == BRW_VERTICAL_STRIDE_0) {
  831.       assert(brw_is_single_value_swizzle(reg.dw1.bits.swizzle));
  832.       return reg.subnr / 4 + BRW_GET_SWZ(reg.dw1.bits.swizzle, 0);
  833.    } else {
  834.       return reg.subnr / 4;
  835.    }
  836. }
  837.  
/**
 * Emit a three-source instruction (MAD, LRP, BFE, BFI2, ...).
 *
 * 3-src instructions use a restricted encoding: Align16 access mode only,
 * GRF sources (GRF or MRF destination), direct addressing, register
 * numbers below 128, and a single shared type field covering all three
 * sources.
 */
static brw_inst *
brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *inst = next_insn(p, opcode);

   gen7_convert_mrf_to_grf(p, &dest);

   assert(brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_16);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          dest.file == BRW_MESSAGE_REGISTER_FILE);
   assert(dest.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(dest.type == BRW_REGISTER_TYPE_F ||
          dest.type == BRW_REGISTER_TYPE_D ||
          dest.type == BRW_REGISTER_TYPE_UD);
   if (devinfo->gen == 6) {
      /* Only Gen6 has a destination register file bit (GRF vs. MRF). */
      brw_inst_set_3src_dst_reg_file(devinfo, inst,
                                     dest.file == BRW_MESSAGE_REGISTER_FILE);
   }
   brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
   /* Destination subregister is encoded in units of 16 bytes. */
   brw_inst_set_3src_dst_subreg_nr(devinfo, inst, dest.subnr / 16);
   brw_inst_set_3src_dst_writemask(devinfo, inst, dest.dw1.bits.writemask);

   /* rep_ctrl marks a source as a replicated scalar (vstride 0). */
   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.nr < 128);
   brw_inst_set_3src_src0_swizzle(devinfo, inst, src0.dw1.bits.swizzle);
   brw_inst_set_3src_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
   brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
   brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
   brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
   brw_inst_set_3src_src0_rep_ctrl(devinfo, inst,
                                   src0.vstride == BRW_VERTICAL_STRIDE_0);

   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.nr < 128);
   brw_inst_set_3src_src1_swizzle(devinfo, inst, src1.dw1.bits.swizzle);
   brw_inst_set_3src_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
   brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
   brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
   brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
   brw_inst_set_3src_src1_rep_ctrl(devinfo, inst,
                                   src1.vstride == BRW_VERTICAL_STRIDE_0);

   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.nr < 128);
   brw_inst_set_3src_src2_swizzle(devinfo, inst, src2.dw1.bits.swizzle);
   brw_inst_set_3src_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
   brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
   brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
   brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
   brw_inst_set_3src_src2_rep_ctrl(devinfo, inst,
                                   src2.vstride == BRW_VERTICAL_STRIDE_0);

   if (devinfo->gen >= 7) {
      /* Set both the source and destination types based on dest.type,
       * ignoring the source register types.  The MAD and LRP emitters ensure
       * that all four types are float.  The BFE and BFI2 emitters, however,
       * may send us mixed D and UD types and want us to ignore that and use
       * the destination type.
       */
      switch (dest.type) {
      case BRW_REGISTER_TYPE_F:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_F);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_F);
         break;
      case BRW_REGISTER_TYPE_D:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_D);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_D);
         break;
      case BRW_REGISTER_TYPE_UD:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_UD);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_UD);
         break;
      }
   }

   return inst;
}
  922.  
  923.  
  924. /***********************************************************************
  925.  * Convenience routines.
  926.  */
/* Define brw_<OP>() emitting a one-source ALU instruction. */
#define ALU1(OP)                                        \
brw_inst *brw_##OP(struct brw_codegen *p,               \
              struct brw_reg dest,                      \
              struct brw_reg src0)                      \
{                                                       \
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);     \
}

/* Define brw_<OP>() emitting a two-source ALU instruction. */
#define ALU2(OP)                                        \
brw_inst *brw_##OP(struct brw_codegen *p,               \
              struct brw_reg dest,                      \
              struct brw_reg src0,                      \
              struct brw_reg src1)                      \
{                                                       \
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);       \
}

/* Define brw_<OP>() emitting a three-source ALU instruction. */
#define ALU3(OP)                                        \
brw_inst *brw_##OP(struct brw_codegen *p,               \
              struct brw_reg dest,                      \
              struct brw_reg src0,                      \
              struct brw_reg src1,                      \
              struct brw_reg src2)                      \
{                                                       \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}

/* Like ALU3, but asserts that all four operands are float-typed. */
#define ALU3F(OP)                                               \
brw_inst *brw_##OP(struct brw_codegen *p,         \
                                 struct brw_reg dest,           \
                                 struct brw_reg src0,           \
                                 struct brw_reg src1,           \
                                 struct brw_reg src2)           \
{                                                               \
   assert(dest.type == BRW_REGISTER_TYPE_F);                    \
   assert(src0.type == BRW_REGISTER_TYPE_F);                    \
   assert(src1.type == BRW_REGISTER_TYPE_F);                    \
   assert(src2.type == BRW_REGISTER_TYPE_F);                    \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}
  967.  
  968. /* Rounding operations (other than RNDD) require two instructions - the first
  969.  * stores a rounded value (possibly the wrong way) in the dest register, but
  970.  * also sets a per-channel "increment bit" in the flag register.  A predicated
  971.  * add of 1.0 fixes dest to contain the desired result.
  972.  *
  973.  * Sandybridge and later appear to round correctly without an ADD.
  974.  */
/* Used only for RNDZ/RNDE below; RNDD rounds correctly on its own and is
 * defined via ALU1.  Pre-Gen6 this emits a second, predicated ADD.
 */
#define ROUND(OP)                                                             \
void brw_##OP(struct brw_codegen *p,                                          \
              struct brw_reg dest,                                            \
              struct brw_reg src)                                             \
{                                                                             \
   const struct brw_device_info *devinfo = p->devinfo;                                        \
   brw_inst *rnd, *add;                                                       \
   rnd = next_insn(p, BRW_OPCODE_##OP);                                       \
   brw_set_dest(p, rnd, dest);                                                \
   brw_set_src0(p, rnd, src);                                                 \
                                                                              \
   if (devinfo->gen < 6) {                                                            \
      /* turn on round-increments */                                          \
      brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R);            \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));                          \
      brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL);          \
   }                                                                          \
}
  993.  
  994.  
/* Moves, selects and bitwise logic. */
ALU1(MOV)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
/* Shifts. */
ALU2(SHR)
ALU2(SHL)
ALU2(ASR)
/* Arithmetic and rounding. */
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
/* Dot products and plane equations. */
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
/* Three-source ops; MAD/LRP require all-float operands. */
ALU3F(MAD)
ALU3F(LRP)
/* Bitfield manipulation (Gen7+ opcodes). */
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
/* Add/subtract with carry/borrow. */
ALU2(ADDC)
ALU2(SUBB)

/* Rounding ops needing the predicated-ADD fixup on pre-Gen6. */
ROUND(RNDZ)
ROUND(RNDE)
  1027.  
  1028.  
  1029. brw_inst *
  1030. brw_ADD(struct brw_codegen *p, struct brw_reg dest,
  1031.         struct brw_reg src0, struct brw_reg src1)
  1032. {
  1033.    /* 6.2.2: add */
  1034.    if (src0.type == BRW_REGISTER_TYPE_F ||
  1035.        (src0.file == BRW_IMMEDIATE_VALUE &&
  1036.         src0.type == BRW_REGISTER_TYPE_VF)) {
  1037.       assert(src1.type != BRW_REGISTER_TYPE_UD);
  1038.       assert(src1.type != BRW_REGISTER_TYPE_D);
  1039.    }
  1040.  
  1041.    if (src1.type == BRW_REGISTER_TYPE_F ||
  1042.        (src1.file == BRW_IMMEDIATE_VALUE &&
  1043.         src1.type == BRW_REGISTER_TYPE_VF)) {
  1044.       assert(src0.type != BRW_REGISTER_TYPE_UD);
  1045.       assert(src0.type != BRW_REGISTER_TYPE_D);
  1046.    }
  1047.  
  1048.    return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
  1049. }
  1050.  
  1051. brw_inst *
  1052. brw_AVG(struct brw_codegen *p, struct brw_reg dest,
  1053.         struct brw_reg src0, struct brw_reg src1)
  1054. {
  1055.    assert(dest.type == src0.type);
  1056.    assert(src0.type == src1.type);
  1057.    switch (src0.type) {
  1058.    case BRW_REGISTER_TYPE_B:
  1059.    case BRW_REGISTER_TYPE_UB:
  1060.    case BRW_REGISTER_TYPE_W:
  1061.    case BRW_REGISTER_TYPE_UW:
  1062.    case BRW_REGISTER_TYPE_D:
  1063.    case BRW_REGISTER_TYPE_UD:
  1064.       break;
  1065.    default:
  1066.       unreachable("Bad type for brw_AVG");
  1067.    }
  1068.  
  1069.    return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
  1070. }
  1071.  
  1072. brw_inst *
  1073. brw_MUL(struct brw_codegen *p, struct brw_reg dest,
  1074.         struct brw_reg src0, struct brw_reg src1)
  1075. {
  1076.    /* 6.32.38: mul */
  1077.    if (src0.type == BRW_REGISTER_TYPE_D ||
  1078.        src0.type == BRW_REGISTER_TYPE_UD ||
  1079.        src1.type == BRW_REGISTER_TYPE_D ||
  1080.        src1.type == BRW_REGISTER_TYPE_UD) {
  1081.       assert(dest.type != BRW_REGISTER_TYPE_F);
  1082.    }
  1083.  
  1084.    if (src0.type == BRW_REGISTER_TYPE_F ||
  1085.        (src0.file == BRW_IMMEDIATE_VALUE &&
  1086.         src0.type == BRW_REGISTER_TYPE_VF)) {
  1087.       assert(src1.type != BRW_REGISTER_TYPE_UD);
  1088.       assert(src1.type != BRW_REGISTER_TYPE_D);
  1089.    }
  1090.  
  1091.    if (src1.type == BRW_REGISTER_TYPE_F ||
  1092.        (src1.file == BRW_IMMEDIATE_VALUE &&
  1093.         src1.type == BRW_REGISTER_TYPE_VF)) {
  1094.       assert(src0.type != BRW_REGISTER_TYPE_UD);
  1095.       assert(src0.type != BRW_REGISTER_TYPE_D);
  1096.    }
  1097.  
  1098.    assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
  1099.           src0.nr != BRW_ARF_ACCUMULATOR);
  1100.    assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
  1101.           src1.nr != BRW_ARF_ACCUMULATOR);
  1102.  
  1103.    return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
  1104. }
  1105.  
/**
 * Emit a LINE instruction.  src0 is overridden to a <0;1,0> scalar region,
 * since LINE reads its first operand as a replicated scalar pair.
 */
brw_inst *
brw_LINE(struct brw_codegen *p, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1)
{
   src0.vstride = BRW_VERTICAL_STRIDE_0;
   src0.width = BRW_WIDTH_1;
   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
}
  1115.  
/**
 * Emit a PLN (plane) instruction.  The region descriptions are fixed by
 * the instruction: src0 is forced to a <0;1,0> scalar region and src1 to
 * a full <8;8,1> region.
 */
brw_inst *
brw_PLN(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   src0.vstride = BRW_VERTICAL_STRIDE_0;
   src0.width = BRW_WIDTH_1;
   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   src1.vstride = BRW_VERTICAL_STRIDE_8;
   src1.width = BRW_WIDTH_8;
   src1.hstride = BRW_HORIZONTAL_STRIDE_1;
   return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
}
  1128.  
/**
 * Emit a float-to-half conversion.  On Gen8+ this is a converting MOV to
 * an HF destination; Gen7 uses the dedicated F32TO16 opcode.  When the
 * destination is a 32-bit UD register whose high half the hardware won't
 * clear, a second MOV is emitted to zero-fill the upper 16 bits.
 */
brw_inst *
brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct brw_device_info *devinfo = p->devinfo;
   const bool align16 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16;
   /* The F32TO16 instruction doesn't support 32-bit destination types in
    * Align1 mode, and neither does the Gen8 implementation in terms of a
    * converting MOV.  Gen7 does zero out the high 16 bits in Align16 mode as
    * an undocumented feature.
    */
   const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
                                 (!align16 || devinfo->gen >= 8));
   brw_inst *inst;

   if (align16) {
      assert(dst.type == BRW_REGISTER_TYPE_UD);
   } else {
      assert(dst.type == BRW_REGISTER_TYPE_UD ||
             dst.type == BRW_REGISTER_TYPE_W ||
             dst.type == BRW_REGISTER_TYPE_UW ||
             dst.type == BRW_REGISTER_TYPE_HF);
   }

   brw_push_insn_state(p);

   if (needs_zero_fill) {
      /* Rewrite the UD destination as a strided W region so that the
       * conversion result and the zero-fill land in separate 16-bit
       * halves of each dword.
       */
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
   }

   if (devinfo->gen >= 8) {
      inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
   } else {
      assert(devinfo->gen == 7);
      inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
   }

   if (needs_zero_fill) {
      /* The two MOVs write disjoint halves of the same registers, so
       * suppress dependency control between them (no_dd_clear on the
       * first, no_dd_check on the second).
       */
      brw_inst_set_no_dd_clear(devinfo, inst, true);
      inst = brw_MOV(p, suboffset(dst, 1), brw_imm_ud(0u));
      brw_inst_set_no_dd_check(devinfo, inst, true);
   }

   brw_pop_insn_state(p);
   return inst;
}
  1175.  
/**
 * Emit a half-to-float conversion.  On Gen8+ this is a converting MOV
 * from an HF source; Gen7 uses the dedicated F16TO32 opcode.
 */
brw_inst *
brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct brw_device_info *devinfo = p->devinfo;
   bool align16 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16;

   if (align16) {
      assert(src.type == BRW_REGISTER_TYPE_UD);
   } else {
      /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
       *
       *   Because this instruction does not have a 16-bit floating-point
       *   type, the source data type must be Word (W). The destination type
       *   must be F (Float).
       */
      if (src.type == BRW_REGISTER_TYPE_UD)
         src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);

      assert(src.type == BRW_REGISTER_TYPE_W ||
             src.type == BRW_REGISTER_TYPE_UW ||
             src.type == BRW_REGISTER_TYPE_HF);
   }

   if (devinfo->gen >= 8) {
      return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
   } else {
      assert(devinfo->gen == 7);
      return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
   }
}
  1206.  
  1207.  
  1208. void brw_NOP(struct brw_codegen *p)
  1209. {
  1210.    brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
  1211.    brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
  1212.    brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
  1213.    brw_set_src1(p, insn, brw_imm_ud(0x0));
  1214. }
  1215.  
  1216.  
  1217.  
  1218.  
  1219.  
  1220. /***********************************************************************
  1221.  * Comparisons, if/else/endif
  1222.  */
  1223.  
  1224. brw_inst *
  1225. brw_JMPI(struct brw_codegen *p, struct brw_reg index,
  1226.          unsigned predicate_control)
  1227. {
  1228.    const struct brw_device_info *devinfo = p->devinfo;
  1229.    struct brw_reg ip = brw_ip_reg();
  1230.    brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);
  1231.  
  1232.    brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_2);
  1233.    brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
  1234.    brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
  1235.    brw_inst_set_pred_control(devinfo, inst, predicate_control);
  1236.  
  1237.    return inst;
  1238. }
  1239.  
/* Record the position of an IF instruction so the matching ELSE/ENDIF can
 * patch it later.  An offset into p->store is saved rather than a pointer
 * because brw_next_insn may reallocate the store.
 */
static void
push_if_stack(struct brw_codegen *p, brw_inst *inst)
{
   p->if_stack[p->if_stack_depth] = inst - p->store;

   p->if_stack_depth++;
   /* Grow after the push so the next push always has room. */
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
                             p->if_stack_array_size);
   }
}
  1252.  
  1253. static brw_inst *
  1254. pop_if_stack(struct brw_codegen *p)
  1255. {
  1256.    p->if_stack_depth--;
  1257.    return &p->store[p->if_stack[p->if_stack_depth]];
  1258. }
  1259.  
  1260. static void
  1261. push_loop_stack(struct brw_codegen *p, brw_inst *inst)
  1262. {
  1263.    if (p->loop_stack_array_size < p->loop_stack_depth) {
  1264.       p->loop_stack_array_size *= 2;
  1265.       p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
  1266.                                p->loop_stack_array_size);
  1267.       p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
  1268.                                      p->loop_stack_array_size);
  1269.    }
  1270.  
  1271.    p->loop_stack[p->loop_stack_depth] = inst - p->store;
  1272.    p->loop_stack_depth++;
  1273.    p->if_depth_in_loop[p->loop_stack_depth] = 0;
  1274. }
  1275.  
  1276. static brw_inst *
  1277. get_inner_do_insn(struct brw_codegen *p)
  1278. {
  1279.    return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
  1280. }
  1281.  
  1282. /* EU takes the value from the flag register and pushes it onto some
  1283.  * sort of a stack (presumably merging with any flag value already on
  1284.  * the stack).  Within an if block, the flags at the top of the stack
  1285.  * control execution on each channel of the unit, eg. on each of the
  1286.  * 16 pixel values in our wm programs.
  1287.  *
  1288.  * When the matching 'else' instruction is reached (presumably by
  1289.  * countdown of the instruction count patched in by our ELSE/ENDIF
  1290.  * functions), the relevant flags are inverted.
  1291.  *
  1292.  * When the matching 'endif' instruction is reached, the flags are
  1293.  * popped off.  If the stack is now empty, normal execution resumes.
  1294.  */
/**
 * Emit an IF instruction with gen-appropriate operands and zeroed jump
 * targets; the targets are patched later by the ELSE/ENDIF emitters via
 * the if stack.
 */
brw_inst *
brw_IF(struct brw_codegen *p, unsigned execute_size)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (devinfo->gen < 6) {
      /* Pre-Gen6: IP-relative form; jump count patched in later. */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      /* Gen6: jump count lives in the destination immediate. */
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else if (devinfo->gen == 7) {
      /* Gen7: JIP/UIP fields; src1 is an immediate placeholder. */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      /* Gen8+: JIP/UIP with a single immediate src0. */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_exec_size(devinfo, insn, execute_size);
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Remember this IF for later patching, and track if-nesting within
    * the current loop.
    */
   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
  1338.  
  1339. /* This function is only used for gen6-style IF instructions with an
  1340.  * embedded comparison (conditional modifier).  It is not used on gen7.
  1341.  */
/* This function is only used for gen6-style IF instructions with an
 * embedded comparison (conditional modifier).  It is not used on gen7.
 *
 * The jump count is left at zero here and patched later via the if stack.
 */
brw_inst *
gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
        struct brw_reg src0, struct brw_reg src1)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   brw_set_dest(p, insn, brw_imm_w(0));
   /* Execution size follows the compression state of the program. */
   brw_inst_set_exec_size(devinfo, insn, p->compressed ? BRW_EXECUTE_16
                                                   : BRW_EXECUTE_8);
   brw_inst_set_gen6_jump_count(devinfo, insn, 0);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
   /* The comparison is embedded in the IF itself. */
   brw_inst_set_cond_modifier(devinfo, insn, conditional);

   push_if_stack(p, insn);
   return insn;
}
  1365.  
  1366. /**
  1367.  * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
  1368.  */
/**
 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
 *
 * The immediates written below are byte offsets: instruction distance
 * times 16, the size of one native instruction.
 */
static void
convert_IF_ELSE_to_ADD(struct brw_codegen *p,
                       brw_inst *if_inst, brw_inst *else_inst)
{
   const struct brw_device_info *devinfo = p->devinfo;

   /* The next instruction (where the ENDIF would be, if it existed) */
   brw_inst *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
   assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
   brw_inst_set_pred_inv(devinfo, if_inst, true);

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD);

      brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
      brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
   } else {
      brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
   }
}
  1406.  
  1407. /**
  1408.  * Patch IF and ELSE instructions with appropriate jump targets.
  1409.  */
/**
 * Patch IF and ELSE instructions with appropriate jump targets.
 *
 * Called once the matching ENDIF has been emitted.  Branch offsets are
 * instruction-pointer deltas scaled by brw_jump_scale(), which converts an
 * instruction count into whatever units this generation's jump fields use.
 *
 * if_inst/endif_inst must be non-NULL; else_inst may be NULL when the IF
 * block had no ELSE.
 */
static void
patch_IF_ELSE(struct brw_codegen *p,
              brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
{
   const struct brw_device_info *devinfo = p->devinfo;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (devinfo->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);

   /* Scale factor converting instruction deltas into jump-field units. */
   unsigned br = brw_jump_scale(devinfo);

   assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
   /* The ENDIF must execute with the same width as its IF. */
   brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (devinfo->gen < 6) {
         /* Turn it into an IFF, which means no mask stack operations for
          * all-false and jumping past the ENDIF.
          */
         brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (endif_inst - if_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         /* As of gen6, there is no IFF and IF must point to the ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
      } else {
         /* Gen7+: both the "jump if all-false" (UIP) and the ordinary
          * fall-through jump (JIP) land on the ENDIF.
          */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
      }
   } else {
      brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));

      /* Patch IF -> ELSE */
      if (devinfo->gen < 6) {
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         brw_inst_set_gen6_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst + 1));
      }

      /* Patch ELSE -> ENDIF */
      if (devinfo->gen < 6) {
         /* BRW_OPCODE_ELSE pre-gen6 should point just past the
          * matching ENDIF.
          */
         brw_inst_set_gen4_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, else_inst, 1);
      } else if (devinfo->gen == 6) {
         /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst));
      } else {
         /* The IF instruction's JIP should point just past the ELSE */
         brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
         /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
         if (devinfo->gen >= 8) {
            /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
             * should point to ENDIF.
             */
            brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
         }
      }
   }
}
  1497.  
  1498. void
  1499. brw_ELSE(struct brw_codegen *p)
  1500. {
  1501.    const struct brw_device_info *devinfo = p->devinfo;
  1502.    brw_inst *insn;
  1503.  
  1504.    insn = next_insn(p, BRW_OPCODE_ELSE);
  1505.  
  1506.    if (devinfo->gen < 6) {
  1507.       brw_set_dest(p, insn, brw_ip_reg());
  1508.       brw_set_src0(p, insn, brw_ip_reg());
  1509.       brw_set_src1(p, insn, brw_imm_d(0x0));
  1510.    } else if (devinfo->gen == 6) {
  1511.       brw_set_dest(p, insn, brw_imm_w(0));
  1512.       brw_inst_set_gen6_jump_count(devinfo, insn, 0);
  1513.       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
  1514.       brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
  1515.    } else if (devinfo->gen == 7) {
  1516.       brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
  1517.       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
  1518.       brw_set_src1(p, insn, brw_imm_w(0));
  1519.       brw_inst_set_jip(devinfo, insn, 0);
  1520.       brw_inst_set_uip(devinfo, insn, 0);
  1521.    } else {
  1522.       brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
  1523.       brw_set_src0(p, insn, brw_imm_d(0));
  1524.       brw_inst_set_jip(devinfo, insn, 0);
  1525.       brw_inst_set_uip(devinfo, insn, 0);
  1526.    }
  1527.  
  1528.    brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
  1529.    brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
  1530.    if (!p->single_program_flow && devinfo->gen < 6)
  1531.       brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
  1532.  
  1533.    push_if_stack(p, insn);
  1534. }
  1535.  
/**
 * Close the innermost IF/ELSE block.
 *
 * Normally emits an ENDIF and then patches the jump targets of the matching
 * IF and (optional) ELSE via patch_IF_ELSE().  In single program flow mode
 * on Gen4/5 no ENDIF is emitted at all; the IF/ELSE are instead rewritten
 * into conditional ADDs on IP by convert_IF_ELSE_to_ADD().
 */
void
brw_ENDIF(struct brw_codegen *p)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn = NULL;
   brw_inst *else_inst = NULL;
   brw_inst *if_inst = NULL;
   brw_inst *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
    * Gen5.
    */
   if (devinfo->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   /* Operand layout for ENDIF differs per generation. */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
   } else {
      brw_set_src0(p, insn, brw_imm_d(0));
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Also pop item off the stack in the endif instruction: */
   if (devinfo->gen < 6) {
      brw_inst_set_gen4_jump_count(devinfo, insn, 0);
      brw_inst_set_gen4_pop_count(devinfo, insn, 1);
   } else if (devinfo->gen == 6) {
      brw_inst_set_gen6_jump_count(devinfo, insn, 2);
   } else {
      brw_inst_set_jip(devinfo, insn, 2);
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
  1616.  
  1617. brw_inst *
  1618. brw_BREAK(struct brw_codegen *p)
  1619. {
  1620.    const struct brw_device_info *devinfo = p->devinfo;
  1621.    brw_inst *insn;
  1622.  
  1623.    insn = next_insn(p, BRW_OPCODE_BREAK);
  1624.    if (devinfo->gen >= 8) {
  1625.       brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
  1626.       brw_set_src0(p, insn, brw_imm_d(0x0));
  1627.    } else if (devinfo->gen >= 6) {
  1628.       brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
  1629.       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
  1630.       brw_set_src1(p, insn, brw_imm_d(0x0));
  1631.    } else {
  1632.       brw_set_dest(p, insn, brw_ip_reg());
  1633.       brw_set_src0(p, insn, brw_ip_reg());
  1634.       brw_set_src1(p, insn, brw_imm_d(0x0));
  1635.       brw_inst_set_gen4_pop_count(devinfo, insn,
  1636.                                   p->if_depth_in_loop[p->loop_stack_depth]);
  1637.    }
  1638.    brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
  1639.    brw_inst_set_exec_size(devinfo, insn, p->compressed ? BRW_EXECUTE_16
  1640.                                                    : BRW_EXECUTE_8);
  1641.  
  1642.    return insn;
  1643. }
  1644.  
  1645. brw_inst *
  1646. brw_CONT(struct brw_codegen *p)
  1647. {
  1648.    const struct brw_device_info *devinfo = p->devinfo;
  1649.    brw_inst *insn;
  1650.  
  1651.    insn = next_insn(p, BRW_OPCODE_CONTINUE);
  1652.    brw_set_dest(p, insn, brw_ip_reg());
  1653.    if (devinfo->gen >= 8) {
  1654.       brw_set_src0(p, insn, brw_imm_d(0x0));
  1655.    } else {
  1656.       brw_set_src0(p, insn, brw_ip_reg());
  1657.       brw_set_src1(p, insn, brw_imm_d(0x0));
  1658.    }
  1659.  
  1660.    if (devinfo->gen < 6) {
  1661.       brw_inst_set_gen4_pop_count(devinfo, insn,
  1662.                                   p->if_depth_in_loop[p->loop_stack_depth]);
  1663.    }
  1664.    brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
  1665.    brw_inst_set_exec_size(devinfo, insn, p->compressed ? BRW_EXECUTE_16
  1666.                                                    : BRW_EXECUTE_8);
  1667.    return insn;
  1668. }
  1669.  
  1670. brw_inst *
  1671. gen6_HALT(struct brw_codegen *p)
  1672. {
  1673.    const struct brw_device_info *devinfo = p->devinfo;
  1674.    brw_inst *insn;
  1675.  
  1676.    insn = next_insn(p, BRW_OPCODE_HALT);
  1677.    brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
  1678.    if (devinfo->gen >= 8) {
  1679.       brw_set_src0(p, insn, brw_imm_d(0x0));
  1680.    } else {
  1681.       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
  1682.       brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
  1683.    }
  1684.  
  1685.    if (p->compressed) {
  1686.       brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_16);
  1687.    } else {
  1688.       brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
  1689.       brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_8);
  1690.    }
  1691.    return insn;
  1692. }
  1693.  
  1694. /* DO/WHILE loop:
  1695.  *
  1696.  * The DO/WHILE is just an unterminated loop -- break or continue are
  1697.  * used for control within the loop.  We have a few ways they can be
  1698.  * done.
  1699.  *
  1700.  * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
  1701.  * jip and no DO instruction.
  1702.  *
  1703.  * For non-uniform control flow pre-gen6, there's a DO instruction to
  1704.  * push the mask, and a WHILE to jump back, and BREAK to get out and
  1705.  * pop the mask.
  1706.  *
  1707.  * For gen6, there's no more mask stack, so no need for DO.  WHILE
  1708.  * just points back to the first instruction of the loop.
  1709.  */
  1710. brw_inst *
  1711. brw_DO(struct brw_codegen *p, unsigned execute_size)
  1712. {
  1713.    const struct brw_device_info *devinfo = p->devinfo;
  1714.  
  1715.    if (devinfo->gen >= 6 || p->single_program_flow) {
  1716.       push_loop_stack(p, &p->store[p->nr_insn]);
  1717.       return &p->store[p->nr_insn];
  1718.    } else {
  1719.       brw_inst *insn = next_insn(p, BRW_OPCODE_DO);
  1720.  
  1721.       push_loop_stack(p, insn);
  1722.  
  1723.       /* Override the defaults for this instruction:
  1724.        */
  1725.       brw_set_dest(p, insn, brw_null_reg());
  1726.       brw_set_src0(p, insn, brw_null_reg());
  1727.       brw_set_src1(p, insn, brw_null_reg());
  1728.  
  1729.       brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
  1730.       brw_inst_set_exec_size(devinfo, insn, execute_size);
  1731.       brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
  1732.  
  1733.       return insn;
  1734.    }
  1735. }
  1736.  
  1737. /**
  1738.  * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
  1739.  * instruction here.
  1740.  *
  1741.  * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
  1742.  * nesting, since it can always just point to the end of the block/current loop.
  1743.  */
static void
brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *do_inst = get_inner_do_insn(p);
   brw_inst *inst;
   /* Scale factor converting instruction deltas into jump-field units. */
   unsigned br = brw_jump_scale(devinfo);

   /* Gen6+ uses brw_set_uip_jip() instead; this patching is pre-gen6 only. */
   assert(devinfo->gen < 6);

   /* Walk backwards from the WHILE to the matching DO, patching every
    * BREAK/CONTINUE belonging to this loop.
    */
   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
       */
      if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
          brw_inst_gen4_jump_count(devinfo, inst) == 0) {
         /* BREAK jumps just past the WHILE (out of the loop). */
         brw_inst_set_gen4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
      } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
                 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
         /* CONTINUE jumps to the WHILE itself (next iteration test). */
         brw_inst_set_gen4_jump_count(devinfo, inst, br * (while_inst - inst));
      }
   }
}
  1768.  
/**
 * Emit the WHILE that closes the innermost DO/WHILE loop.
 *
 * Gen6+: emits a WHILE whose backward jump (gen6 jump count, or gen7+ JIP)
 * points at the recorded loop start.  Pre-gen6 in single program flow mode:
 * emits a plain ADD on IP instead (no channel masking needed).  Pre-gen6
 * otherwise: emits a real WHILE and patches all BREAK/CONTINUE instructions
 * inside the loop via brw_patch_break_cont().
 *
 * Pops one level off the loop stack before returning.
 */
brw_inst *
brw_WHILE(struct brw_codegen *p)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn, *do_insn;
   /* Scale factor converting instruction deltas into jump-field units. */
   unsigned br = brw_jump_scale(devinfo);

   if (devinfo->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      if (devinfo->gen >= 8) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, brw_imm_d(0));
         /* (do_insn - insn) is negative: a backward jump to the loop top. */
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else if (devinfo->gen == 7) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, brw_imm_w(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else {
         brw_set_dest(p, insn, brw_imm_w(0));
         brw_inst_set_gen6_jump_count(devinfo, insn, br * (do_insn - insn));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      brw_inst_set_exec_size(devinfo, insn, p->compressed ? BRW_EXECUTE_16
                                                      : BRW_EXECUTE_8);
   } else {
      if (p->single_program_flow) {
         /* SPF mode: the WHILE is just an IP-relative ADD (16 bytes per
          * instruction), executed on a single channel.
          */
         insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
         brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
      } else {
         insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

         assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d(0));

         /* The WHILE executes with the same width as its DO. */
         brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
         brw_inst_set_gen4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
         brw_inst_set_gen4_pop_count(devinfo, insn, 0);

         /* Now that the loop end is known, fix up BREAK/CONTINUE targets. */
         brw_patch_break_cont(p, insn);
      }
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);

   p->loop_stack_depth--;

   return insn;
}
  1830.  
  1831. /* FORWARD JUMPS:
  1832.  */
  1833. void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
  1834. {
  1835.    const struct brw_device_info *devinfo = p->devinfo;
  1836.    brw_inst *jmp_insn = &p->store[jmp_insn_idx];
  1837.    unsigned jmpi = 1;
  1838.  
  1839.    if (devinfo->gen >= 5)
  1840.       jmpi = 2;
  1841.  
  1842.    assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
  1843.    assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);
  1844.  
  1845.    brw_inst_set_gen4_jump_count(devinfo, jmp_insn,
  1846.                                 jmpi * (p->nr_insn - jmp_insn_idx - 1));
  1847. }
  1848.  
  1849. /* To integrate with the above, it makes sense that the comparison
  1850.  * instruction should populate the flag register.  It might be simpler
  1851.  * just to use the flag reg for most WM tasks?
  1852.  */
  1853. void brw_CMP(struct brw_codegen *p,
  1854.              struct brw_reg dest,
  1855.              unsigned conditional,
  1856.              struct brw_reg src0,
  1857.              struct brw_reg src1)
  1858. {
  1859.    const struct brw_device_info *devinfo = p->devinfo;
  1860.    brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);
  1861.  
  1862.    brw_inst_set_cond_modifier(devinfo, insn, conditional);
  1863.    brw_set_dest(p, insn, dest);
  1864.    brw_set_src0(p, insn, src0);
  1865.    brw_set_src1(p, insn, src1);
  1866.  
  1867.    /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
  1868.     * page says:
  1869.     *    "Any CMP instruction with a null destination must use a {switch}."
  1870.     *
  1871.     * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
  1872.     * mentioned on their work-arounds pages.
  1873.     */
  1874.    if (devinfo->gen == 7) {
  1875.       if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
  1876.           dest.nr == BRW_ARF_NULL) {
  1877.          brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
  1878.       }
  1879.    }
  1880. }
  1881.  
  1882. /***********************************************************************
  1883.  * Helpers for the various SEND message types:
  1884.  */
  1885.  
  1886. /** Extended math function, float[8].
  1887.  */
  1888. void gen4_math(struct brw_codegen *p,
  1889.                struct brw_reg dest,
  1890.                unsigned function,
  1891.                unsigned msg_reg_nr,
  1892.                struct brw_reg src,
  1893.                unsigned precision )
  1894. {
  1895.    const struct brw_device_info *devinfo = p->devinfo;
  1896.    brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
  1897.    unsigned data_type;
  1898.    if (has_scalar_region(src)) {
  1899.       data_type = BRW_MATH_DATA_SCALAR;
  1900.    } else {
  1901.       data_type = BRW_MATH_DATA_VECTOR;
  1902.    }
  1903.  
  1904.    assert(devinfo->gen < 6);
  1905.  
  1906.    /* Example code doesn't set predicate_control for send
  1907.     * instructions.
  1908.     */
  1909.    brw_inst_set_pred_control(devinfo, insn, 0);
  1910.    brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
  1911.  
  1912.    brw_set_dest(p, insn, dest);
  1913.    brw_set_src0(p, insn, src);
  1914.    brw_set_math_message(p,
  1915.                         insn,
  1916.                         function,
  1917.                         src.type == BRW_REGISTER_TYPE_D,
  1918.                         precision,
  1919.                         data_type);
  1920. }
  1921.  
  1922. void gen6_math(struct brw_codegen *p,
  1923.                struct brw_reg dest,
  1924.                unsigned function,
  1925.                struct brw_reg src0,
  1926.                struct brw_reg src1)
  1927. {
  1928.    const struct brw_device_info *devinfo = p->devinfo;
  1929.    brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);
  1930.  
  1931.    assert(devinfo->gen >= 6);
  1932.  
  1933.    assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
  1934.           (devinfo->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
  1935.    assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
  1936.           (devinfo->gen >= 8 && src0.file == BRW_IMMEDIATE_VALUE));
  1937.  
  1938.    assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
  1939.    if (devinfo->gen == 6) {
  1940.       assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
  1941.       assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
  1942.    }
  1943.  
  1944.    if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
  1945.        function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
  1946.        function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
  1947.       assert(src0.type != BRW_REGISTER_TYPE_F);
  1948.       assert(src1.type != BRW_REGISTER_TYPE_F);
  1949.       assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
  1950.              (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
  1951.    } else {
  1952.       assert(src0.type == BRW_REGISTER_TYPE_F);
  1953.       assert(src1.type == BRW_REGISTER_TYPE_F);
  1954.       if (function == BRW_MATH_FUNCTION_POW) {
  1955.          assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
  1956.                 (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
  1957.       } else {
  1958.          assert(src1.file == BRW_ARCHITECTURE_REGISTER_FILE &&
  1959.                 src1.nr == BRW_ARF_NULL);
  1960.       }
  1961.    }
  1962.  
  1963.    /* Source modifiers are ignored for extended math instructions on Gen6. */
  1964.    if (devinfo->gen == 6) {
  1965.       assert(!src0.negate);
  1966.       assert(!src0.abs);
  1967.       assert(!src1.negate);
  1968.       assert(!src1.abs);
  1969.    }
  1970.  
  1971.    brw_inst_set_math_function(devinfo, insn, function);
  1972.  
  1973.    brw_set_dest(p, insn, dest);
  1974.    brw_set_src0(p, insn, src0);
  1975.    brw_set_src1(p, insn, src1);
  1976. }
  1977.  
  1978.  
  1979. /**
  1980.  * Write a block of OWORDs (half a GRF each) from the scratch buffer,
  1981.  * using a constant offset per channel.
  1982.  *
  1983.  * The offset must be aligned to oword size (16 bytes).  Used for
  1984.  * register spilling.
  1985.  */
/**
 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
 * using a constant offset per channel.
 *
 * The offset must be aligned to oword size (16 bytes).  Used for
 * register spilling.
 *
 * \param mrf      message register region to build the payload in
 * \param num_regs number of registers to write (assumed 1 or 2 from the
 *                 two-way msg_control choice below — TODO confirm callers)
 * \param offset   byte offset into the scratch buffer (oword-aligned)
 */
void brw_oword_block_write_scratch(struct brw_codegen *p,
                                   struct brw_reg mrf,
                                   int num_regs,
                                   unsigned offset)
{
   const struct brw_device_info *devinfo = p->devinfo;
   uint32_t msg_control, msg_type;
   int mlen;

   /* Gen6+ expresses the scratch offset in owords rather than bytes. */
   if (devinfo->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* One GRF is two owords; mlen includes the header register. */
   if (num_regs == 1) {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
      mlen = 2;
   } else {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
      mlen = 3;
   }

   /* Set up the message header.  This is g0, with g0.2 filled with
    * the offset.  We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
    */
   {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p,
              retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                                  mrf.nr,
                                  2), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_reg dest;
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
      int send_commit_msg;
      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
                                         BRW_REGISTER_TYPE_UW);

      /* The SEND itself must not be compressed. */
      if (brw_inst_qtr_control(devinfo, insn) != BRW_COMPRESSION_NONE) {
         brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
         src_header = vec16(src_header);
      }
      assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
      if (devinfo->gen < 6)
         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);

      /* Until gen6, writes followed by reads from the same location
       * are not guaranteed to be ordered unless write_commit is set.
       * If set, then a no-op write is issued to the destination
       * register to set a dependency, and a read from the destination
       * can be used to ensure the ordering.
       *
       * For gen6, only writes between different threads need ordering
       * protection.  Our use of DP writes is all about register
       * spilling within a thread.
       */
      if (devinfo->gen >= 6) {
         dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
         send_commit_msg = 0;
      } else {
         dest = src_header;
         send_commit_msg = 1;
      }

      brw_set_dest(p, insn, dest);
      if (devinfo->gen >= 6) {
         brw_set_src0(p, insn, mrf);
      } else {
         brw_set_src0(p, insn, brw_null_reg());
      }

      if (devinfo->gen >= 6)
         msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
      else
         msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;

      brw_set_dp_write_message(p,
                               insn,
                               255, /* binding table index (255=stateless) */
                               msg_control,
                               msg_type,
                               mlen,
                               true, /* header_present */
                               0, /* not a render target */
                               send_commit_msg, /* response_length */
                               0, /* eot */
                               send_commit_msg);
   }
}
  2089.  
  2090.  
  2091. /**
  2092.  * Read a block of owords (half a GRF each) from the scratch buffer
  2093.  * using a constant index per channel.
  2094.  *
  2095.  * Offset must be aligned to oword size (16 bytes).  Used for register
  2096.  * spilling.
  2097.  */
  2098. void
  2099. brw_oword_block_read_scratch(struct brw_codegen *p,
  2100.                              struct brw_reg dest,
  2101.                              struct brw_reg mrf,
  2102.                              int num_regs,
  2103.                              unsigned offset)
  2104. {
  2105.    const struct brw_device_info *devinfo = p->devinfo;
  2106.    uint32_t msg_control;
  2107.    int rlen;
  2108.  
  2109.    if (devinfo->gen >= 6)
  2110.       offset /= 16;
  2111.  
  2112.    if (p->devinfo->gen >= 7) {
  2113.       /* On gen 7 and above, we no longer have message registers and we can
  2114.        * send from any register we want.  By using the destination register
  2115.        * for the message, we guarantee that the implied message write won't
  2116.        * accidentally overwrite anything.  This has been a problem because
  2117.        * the MRF registers and source for the final FB write are both fixed
  2118.        * and may overlap.
  2119.        */
  2120.       mrf = retype(dest, BRW_REGISTER_TYPE_UD);
  2121.    } else {
  2122.       mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
  2123.    }
  2124.    dest = retype(dest, BRW_REGISTER_TYPE_UW);
  2125.  
  2126.    if (num_regs == 1) {
  2127.       msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
  2128.       rlen = 1;
  2129.    } else {
  2130.       msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
  2131.       rlen = 2;
  2132.    }
  2133.  
  2134.    {
  2135.       brw_push_insn_state(p);
  2136.       brw_set_default_exec_size(p, BRW_EXECUTE_8);
  2137.       brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
  2138.       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
  2139.  
  2140.       brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
  2141.  
  2142.       /* set message header global offset field (reg 0, element 2) */
  2143.       brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));
  2144.  
  2145.       brw_pop_insn_state(p);
  2146.    }
  2147.  
  2148.    {
  2149.       brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
  2150.  
  2151.       assert(brw_inst_pred_control(devinfo, insn) == 0);
  2152.       brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
  2153.  
  2154.       brw_set_dest(p, insn, dest);      /* UW? */
  2155.       if (devinfo->gen >= 6) {
  2156.          brw_set_src0(p, insn, mrf);
  2157.       } else {
  2158.          brw_set_src0(p, insn, brw_null_reg());
  2159.          brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
  2160.       }
  2161.  
  2162.       brw_set_dp_read_message(p,
  2163.                               insn,
  2164.                               255, /* binding table index (255=stateless) */
  2165.                               msg_control,
  2166.                               BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
  2167.                               BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
  2168.                               1, /* msg_length */
  2169.                               true, /* header_present */
  2170.                               rlen);
  2171.    }
  2172. }
  2173.  
/**
 * Read num_regs full registers from the scratch (spill) buffer into dest
 * using the Gen7+ dataport scratch block-read message.
 *
 * offset is a byte offset into scratch space; it is converted to the
 * HWord (register-sized) units the message descriptor expects.
 */
void
gen7_block_read_scratch(struct brw_codegen *p,
                        struct brw_reg dest,
                        int num_regs,
                        unsigned offset)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
   /* A predicated scratch read would be incorrect; the caller must not
    * have predication enabled in the default state.
    */
   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   /* Destination is raw register data; UW retype avoids any conversion. */
   brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));

   /* The HW requires that the header is present; this is to get the g0.5
    * scratch offset.
    */
   brw_set_src0(p, insn, brw_vec8_grf(0, 0));

   /* According to the docs, offset is "A 12-bit HWord offset into the memory
    * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD
    * is 32 bytes, which happens to be the size of a register.
    */
   offset /= REG_SIZE;
   assert(offset < (1 << 12));

   gen7_set_dp_scratch_message(p, insn,
                               false, /* scratch read */
                               false, /* OWords */
                               false, /* invalidate after read */
                               num_regs,
                               offset,
                               1,        /* mlen: just g0 */
                               num_regs, /* rlen */
                               true);    /* header present */
}
  2209.  
  2210. /**
  2211.  * Read a float[4] vector from the data port Data Cache (const buffer).
  2212.  * Location (in buffer) should be a multiple of 16.
  2213.  * Used for fetching shader constants.
  2214.  */
void brw_oword_block_read(struct brw_codegen *p,
                          struct brw_reg dest,
                          struct brw_reg mrf,
                          uint32_t offset,
                          uint32_t bind_table_index)
{
   const struct brw_device_info *devinfo = p->devinfo;

   /* On newer hardware, offset is in units of owords. */
   if (devinfo->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* Build the message header unpredicated, uncompressed and with
    * channel masking disabled so it is written correctly even under
    * divergent control flow.
    */
   brw_push_insn_state(p);
   brw_set_default_exec_size(p, BRW_EXECUTE_8);
   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   /* Seed the header from g0. */
   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   /* set message header global offset field (reg 0, element 2) */
   brw_MOV(p,
           retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                               mrf.nr,
                               2), BRW_REGISTER_TYPE_UD),
           brw_imm_ud(offset));

   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

   /* cast dest to a uword[8] vector */
   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);

   brw_set_dest(p, insn, dest);
   if (devinfo->gen >= 6) {
      brw_set_src0(p, insn, mrf);
   } else {
      /* Pre-gen6 SEND takes a null src0 and the payload via an implied
       * base MRF.
       */
      brw_set_src0(p, insn, brw_null_reg());
      brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
   }

   brw_set_dp_read_message(p,
                           insn,
                           bind_table_index,
                           BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
                           BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                           BRW_DATAPORT_READ_TARGET_DATA_CACHE,
                           1, /* msg_length */
                           true, /* header_present */
                           1); /* response_length (1 reg, 2 owords!) */

   brw_pop_insn_state(p);
}
  2269.  
  2270.  
/**
 * Emit the SEND(C) instruction for a framebuffer (render target) write.
 *
 * On Gen6+ a SENDC opcode is used and the color payload is submitted
 * headerless in src0; pre-gen6 the payload lives in MRFs (base MRF set
 * from payload.nr) and src0 carries the implied header.
 */
void brw_fb_WRITE(struct brw_codegen *p,
                  int dispatch_width,
                  struct brw_reg payload,
                  struct brw_reg implied_header,
                  unsigned msg_control,
                  unsigned binding_table_index,
                  unsigned msg_length,
                  unsigned response_length,
                  bool eot,
                  bool last_render_target,
                  bool header_present)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;
   unsigned msg_type;
   struct brw_reg dest, src0;

   /* FB writes produce no value; destination is null, sized to match the
    * dispatch width.
    */
   if (dispatch_width == 16)
      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
   else
      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);

   if (devinfo->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_SENDC);
   } else {
      insn = next_insn(p, BRW_OPCODE_SEND);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);

   if (devinfo->gen >= 6) {
      /* headerless version, just submit color payload */
      src0 = payload;

      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   } else {
      assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
      brw_inst_set_base_mrf(devinfo, insn, payload.nr);
      src0 = implied_header;

      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   }

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_dp_write_message(p,
                            insn,
                            binding_table_index,
                            msg_control,
                            msg_type,
                            msg_length,
                            header_present,
                            last_render_target,
                            response_length,
                            eot,
                            0 /* send_commit_msg */);
}
  2327.  
  2328.  
  2329. /**
  2330.  * Texture sample instruction.
  2331.  * Note: the msg_type plus msg_length values determine exactly what kind
  2332.  * of sampling operation is performed.  See volume 4, page 161 of docs.
  2333.  */
void brw_SAMPLE(struct brw_codegen *p,
                struct brw_reg dest,
                unsigned msg_reg_nr,
                struct brw_reg src0,
                unsigned binding_table_index,
                unsigned sampler,
                unsigned msg_type,
                unsigned response_length,
                unsigned msg_length,
                unsigned header_present,
                unsigned simd_mode,
                unsigned return_format)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   /* msg_reg_nr is unsigned, so -1 compares as UINT_MAX here; callers
    * pass -1 as a sentinel meaning "no implied move".
    */
   if (msg_reg_nr != -1)
      gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */

   /* From the 965 PRM (volume 4, part 1, section 14.2.41):
    *
    *    "Instruction compression is not allowed for this instruction (that
    *     is, send). The hardware behavior is undefined if this instruction is
    *     set as compressed. However, compress control can be set to "SecHalf"
    *     to affect the EMask generation."
    *
    * No similar wording is found in later PRMs, but there are examples
    * utilizing send with SecHalf.  More importantly, SIMD8 sampler messages
    * are allowed in SIMD16 mode and they could not work without SecHalf.  For
    * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
    */
   if (brw_inst_qtr_control(devinfo, insn) != BRW_COMPRESSION_2NDHALF)
      brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);

   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_sampler_message(p, insn,
                           binding_table_index,
                           sampler,
                           msg_type,
                           response_length,
                           msg_length,
                           header_present,
                           simd_mode,
                           return_format);
}
  2386.  
  2387. /* Adjust the message header's sampler state pointer to
  2388.  * select the correct group of 16 samplers.
  2389.  */
void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
                                      struct brw_reg header,
                                      struct brw_reg sampler_index)
{
   /* The "Sampler Index" field can only store values between 0 and 15.
    * However, we can add an offset to the "Sampler State Pointer"
    * field, effectively selecting a different set of 16 samplers.
    *
    * The "Sampler State Pointer" needs to be aligned to a 32-byte
    * offset, and each sampler state is only 16-bytes, so we can't
    * exclusively use the offset - we have to use both.
    */

   const struct brw_device_info *devinfo = p->devinfo;

   if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
      const int sampler_state_size = 16; /* 16 bytes */
      uint32_t sampler = sampler_index.dw1.ud;

      if (sampler >= 16) {
         assert(devinfo->is_haswell || devinfo->gen >= 8);
         /* Offset = (sampler / 16) * 16 states * 16 bytes, i.e. 256 bytes
          * per group of 16 samplers, added to g0.3's state pointer.
          */
         brw_ADD(p,
                 get_element_ud(header, 3),
                 get_element_ud(brw_vec8_grf(0, 0), 3),
                 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
      }
   } else {
      /* Non-const sampler array indexing case */
      if (devinfo->gen < 8 && !devinfo->is_haswell) {
         return;
      }

      struct brw_reg temp = get_element_ud(header, 3);

      /* (index & 0xf0) << 4 == (index / 16) * 256: the same group offset
       * as the immediate path, computed at runtime.
       */
      brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
      brw_SHL(p, temp, temp, brw_imm_ud(4));
      brw_ADD(p,
              get_element_ud(header, 3),
              get_element_ud(brw_vec8_grf(0, 0), 3),
              temp);
   }
}
  2432.  
  2433. /* All these variables are pretty confusing - we might be better off
  2434.  * using bitmasks and macros for this, in the old style.  Or perhaps
  2435.  * just having the caller instantiate the fields in dword3 itself.
  2436.  */
void brw_urb_WRITE(struct brw_codegen *p,
                   struct brw_reg dest,
                   unsigned msg_reg_nr,
                   struct brw_reg src0,
                   enum brw_urb_write_flags flags,
                   unsigned msg_length,
                   unsigned response_length,
                   unsigned offset,
                   unsigned swizzle)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (devinfo->gen >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      /* OR 0xff00 into header DWord 5 (element 5 of the message register),
       * starting from the value in g0.5.
       */
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
                       BRW_REGISTER_TYPE_UD),
                retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
                brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Pre-gen6 the payload is addressed through the implied base MRF. */
   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_urb_message(p,
                       insn,
                       flags,
                       msg_length,
                       response_length,
                       offset,
                       swizzle);
}
  2483.  
/**
 * Emit a SEND whose message descriptor is either an immediate or taken
 * indirectly from a register (loaded into address register a0).
 *
 * Returns the "setup" instruction — the SEND itself for an immediate
 * descriptor, or the OR that loads a0 otherwise — so the caller can
 * attach additional descriptor bits via the brw_set_*_message() helpers.
 */
struct brw_inst *
brw_send_indirect_message(struct brw_codegen *p,
                          unsigned sfid,
                          struct brw_reg dst,
                          struct brw_reg payload,
                          struct brw_reg desc)
{
   const struct brw_device_info *devinfo = p->devinfo;
   struct brw_inst *send, *setup;

   assert(desc.type == BRW_REGISTER_TYPE_UD);

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      setup = send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src1(p, send, desc);

   } else {
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      /* The a0 load must execute unconditionally and unmasked, or the
       * descriptor would be garbage under divergent control flow.
       */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

      /* Load the indirect descriptor to an address register using OR so the
       * caller can specify additional descriptor bits with the usual
       * brw_set_*_message() helper functions.
       */
      setup = brw_OR(p, addr, desc, brw_imm_ud(0));

      brw_pop_insn_state(p);

      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src1(p, send, addr);
   }

   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
   brw_inst_set_sfid(devinfo, send, sfid);

   return setup;
}
  2526.  
/**
 * Emit an indirect-descriptor surface message: if the surface index is
 * not an immediate it is masked and loaded into a0 first, then the SEND
 * is emitted with the given payload/response lengths.
 *
 * Returns the setup instruction from brw_send_indirect_message() so the
 * caller can fill in message-type-specific descriptor bits.
 */
static struct brw_inst *
brw_send_indirect_surface_message(struct brw_codegen *p,
                                  unsigned sfid,
                                  struct brw_reg dst,
                                  struct brw_reg payload,
                                  struct brw_reg surface,
                                  unsigned message_len,
                                  unsigned response_len,
                                  bool header_present)
{
   const struct brw_device_info *devinfo = p->devinfo;
   struct brw_inst *insn;

   if (surface.file != BRW_IMMEDIATE_VALUE) {
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

      /* Mask out invalid bits from the surface index to avoid hangs e.g. when
       * some surface array is accessed out of bounds.
       */
      /* NOTE(review): this assignment to insn is overwritten by the SEND
       * below; the AND's result is only consumed through a0.
       */
      insn = brw_AND(p, addr,
                     suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
                               BRW_GET_SWZ(surface.dw1.bits.swizzle, 0)),
                     brw_imm_ud(0xff));

      brw_pop_insn_state(p);

      surface = addr;
   }

   insn = brw_send_indirect_message(p, sfid, dst, payload, surface);
   brw_inst_set_mlen(devinfo, insn, message_len);
   brw_inst_set_rlen(devinfo, insn, response_len);
   brw_inst_set_header_present(devinfo, insn, header_present);

   return insn;
}
  2568.  
  2569. static int
  2570. brw_find_next_block_end(struct brw_codegen *p, int start_offset)
  2571. {
  2572.    int offset;
  2573.    void *store = p->store;
  2574.    const struct brw_device_info *devinfo = p->devinfo;
  2575.  
  2576.    for (offset = next_offset(devinfo, store, start_offset);
  2577.         offset < p->next_insn_offset;
  2578.         offset = next_offset(devinfo, store, offset)) {
  2579.       brw_inst *insn = store + offset;
  2580.  
  2581.       switch (brw_inst_opcode(devinfo, insn)) {
  2582.       case BRW_OPCODE_ENDIF:
  2583.       case BRW_OPCODE_ELSE:
  2584.       case BRW_OPCODE_WHILE:
  2585.       case BRW_OPCODE_HALT:
  2586.          return offset;
  2587.       }
  2588.    }
  2589.  
  2590.    return 0;
  2591. }
  2592.  
  2593. /* There is no DO instruction on gen6, so to find the end of the loop
  2594.  * we have to see if the loop is jumping back before our start
  2595.  * instruction.
  2596.  */
  2597. static int
  2598. brw_find_loop_end(struct brw_codegen *p, int start_offset)
  2599. {
  2600.    const struct brw_device_info *devinfo = p->devinfo;
  2601.    int offset;
  2602.    int scale = 16 / brw_jump_scale(devinfo);
  2603.    void *store = p->store;
  2604.  
  2605.    assert(devinfo->gen >= 6);
  2606.  
  2607.    /* Always start after the instruction (such as a WHILE) we're trying to fix
  2608.     * up.
  2609.     */
  2610.    for (offset = next_offset(devinfo, store, start_offset);
  2611.         offset < p->next_insn_offset;
  2612.         offset = next_offset(devinfo, store, offset)) {
  2613.       brw_inst *insn = store + offset;
  2614.  
  2615.       if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
  2616.          int jip = devinfo->gen == 6 ? brw_inst_gen6_jump_count(devinfo, insn)
  2617.                                      : brw_inst_jip(devinfo, insn);
  2618.          if (offset + jip * scale <= start_offset)
  2619.             return offset;
  2620.       }
  2621.    }
  2622.    assert(!"not reached");
  2623.    return start_offset;
  2624. }
  2625.  
  2626. /* After program generation, go back and update the UIP and JIP of
  2627.  * BREAK, CONT, and HALT instructions to their correct locations.
  2628.  */
void
brw_set_uip_jip(struct brw_codegen *p)
{
   const struct brw_device_info *devinfo = p->devinfo;
   int offset;
   int br = brw_jump_scale(devinfo);
   int scale = 16 / br;
   void *store = p->store;

   /* Pre-gen6 flow control uses different mechanisms; nothing to patch. */
   if (devinfo->gen < 6)
      return;

   for (offset = 0; offset < p->next_insn_offset;
        offset = next_offset(devinfo, store, offset)) {
      brw_inst *insn = store + offset;

      if (brw_inst_cmpt_control(devinfo, insn)) {
         /* Fixups for compacted BREAK/CONTINUE not supported yet. */
         assert(brw_inst_opcode(devinfo, insn) != BRW_OPCODE_BREAK &&
                brw_inst_opcode(devinfo, insn) != BRW_OPCODE_CONTINUE &&
                brw_inst_opcode(devinfo, insn) != BRW_OPCODE_HALT);
         continue;
      }

      int block_end_offset = brw_find_next_block_end(p, offset);
      switch (brw_inst_opcode(devinfo, insn)) {
      case BRW_OPCODE_BREAK:
         assert(block_end_offset != 0);
         /* JIP: jump past the innermost enclosing block end. */
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         /* Gen7 UIP points to WHILE; Gen6 points just after it */
         brw_inst_set_uip(devinfo, insn,
            (brw_find_loop_end(p, offset) - offset +
             (devinfo->gen == 6 ? 16 : 0)) / scale);
         break;
      case BRW_OPCODE_CONTINUE:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         /* UIP: jump to the WHILE closing the containing loop. */
         brw_inst_set_uip(devinfo, insn,
            (brw_find_loop_end(p, offset) - offset) / scale);

         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
         break;

      case BRW_OPCODE_ENDIF: {
         /* With no following block end, jump to the next instruction. */
         int32_t jump = (block_end_offset == 0) ?
                        1 * br : (block_end_offset - offset) / scale;
         if (devinfo->gen >= 7)
            brw_inst_set_jip(devinfo, insn, jump);
         else
            brw_inst_set_gen6_jump_count(devinfo, insn, jump);
         break;
      }

      case BRW_OPCODE_HALT:
         /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
          *
          *    "In case of the halt instruction not inside any conditional
          *     code block, the value of <JIP> and <UIP> should be the
          *     same. In case of the halt instruction inside conditional code
          *     block, the <UIP> should be the end of the program, and the
          *     <JIP> should be end of the most inner conditional code block."
          *
          * The uip will have already been set by whoever set up the
          * instruction.
          */
         if (block_end_offset == 0) {
            brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
         } else {
            brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         }
         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
         break;
      }
   }
}
  2706.  
/**
 * Emit an FF_SYNC message (URB unit handshake used by pre-gen6 geometry
 * stages), optionally allocating a URB handle and/or terminating the
 * thread (eot).
 */
void brw_ff_sync(struct brw_codegen *p,
                   struct brw_reg dest,
                   unsigned msg_reg_nr,
                   struct brw_reg src0,
                   bool allocate,
                   unsigned response_length,
                   bool eot)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Pre-gen6 the payload is addressed through the implied base MRF. */
   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_ff_sync_message(p,
                           insn,
                           allocate,
                           response_length,
                           eot);
}
  2734.  
  2735. /**
  2736.  * Emit the SEND instruction necessary to generate stream output data on Gen6
  2737.  * (for transform feedback).
  2738.  *
  2739.  * If send_commit_msg is true, this is the last piece of stream output data
  2740.  * from this thread, so send the data as a committed write.  According to the
  2741.  * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
  2742.  *
  2743.  *   "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
  2744.  *   writes are complete by sending the final write as a committed write."
  2745.  */
void
brw_svb_write(struct brw_codegen *p,
              struct brw_reg dest,
              unsigned msg_reg_nr,
              struct brw_reg src0,
              unsigned binding_table_index,
              bool   send_commit_msg)
{
   brw_inst *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));
   /* A committed write expects a one-register commit reply, so
    * send_commit_msg doubles as the response length (0 or 1).
    */
   brw_set_dp_write_message(p, insn,
                            binding_table_index,
                            0, /* msg_control: ignored */
                            GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
                            1, /* msg_length */
                            true, /* header_present */
                            0, /* last_render_target: ignored */
                            send_commit_msg, /* response_length */
                            0, /* end_of_thread */
                            send_commit_msg); /* send_commit_msg */
}
  2773.  
  2774. static unsigned
  2775. brw_surface_payload_size(struct brw_codegen *p,
  2776.                          unsigned num_channels,
  2777.                          bool has_simd4x2,
  2778.                          bool has_simd16)
  2779. {
  2780.    if (has_simd4x2 && brw_inst_access_mode(p->devinfo, p->current) == BRW_ALIGN_16)
  2781.       return 1;
  2782.    else if (has_simd16 && p->compressed)
  2783.       return 2 * num_channels;
  2784.    else
  2785.       return num_channels;
  2786. }
  2787.  
  2788. static void
  2789. brw_set_dp_untyped_atomic_message(struct brw_codegen *p,
  2790.                                   brw_inst *insn,
  2791.                                   unsigned atomic_op,
  2792.                                   bool response_expected)
  2793. {
  2794.    const struct brw_device_info *devinfo = p->devinfo;
  2795.    unsigned msg_control =
  2796.       atomic_op | /* Atomic Operation Type: BRW_AOP_* */
  2797.       (response_expected ? 1 << 5 : 0); /* Return data expected */
  2798.  
  2799.    if (devinfo->gen >= 8 || devinfo->is_haswell) {
  2800.       if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
  2801.          if (!p->compressed)
  2802.             msg_control |= 1 << 4; /* SIMD8 mode */
  2803.  
  2804.          brw_inst_set_dp_msg_type(devinfo, insn,
  2805.                                   HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP);
  2806.       } else {
  2807.          brw_inst_set_dp_msg_type(devinfo, insn,
  2808.             HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2);
  2809.       }
  2810.    } else {
  2811.       brw_inst_set_dp_msg_type(devinfo, insn,
  2812.                                GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP);
  2813.  
  2814.       if (!p->compressed)
  2815.          msg_control |= 1 << 4; /* SIMD8 mode */
  2816.    }
  2817.  
  2818.    brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
  2819. }
  2820.  
/**
 * Emit an untyped atomic dataport operation (BRW_AOP_*) on the given
 * surface, optionally returning the pre-op value into dst.
 */
void
brw_untyped_atomic(struct brw_codegen *p,
                   struct brw_reg dst,
                   struct brw_reg payload,
                   struct brw_reg surface,
                   unsigned atomic_op,
                   unsigned msg_length,
                   bool response_expected)
{
   const struct brw_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
   /* Mask out unused components -- This is especially important in Align16
    * mode on generations that don't have native support for SIMD4x2 atomics,
    * because unused but enabled components will cause the dataport to perform
    * additional atomic operations on the addresses that happen to be in the
    * uninitialized Y, Z and W coordinates of the payload.
    */
   const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
   /* response_expected (a bool, 0 or 1) is deliberately passed as the
    * channel count: an atomic returns at most one value per lane.
    * The last argument of the surface-message helper is header_present,
    * reused here to mean "Align1" -- NOTE(review): verify that headerless
    * Align16 is intended for all callers.
    */
   struct brw_inst *insn = brw_send_indirect_surface_message(
      p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
      brw_surface_payload_size(p, response_expected,
                               devinfo->gen >= 8 || devinfo->is_haswell, true),
      align1);

   brw_set_dp_untyped_atomic_message(
      p, insn, atomic_op, response_expected);
}
  2851.  
  2852. static void
  2853. brw_set_dp_untyped_surface_read_message(struct brw_codegen *p,
  2854.                                         struct brw_inst *insn,
  2855.                                         unsigned num_channels)
  2856. {
  2857.    const struct brw_device_info *devinfo = p->devinfo;
  2858.    /* Set mask of 32-bit channels to drop. */
  2859.    unsigned msg_control = 0xf & (0xf << num_channels);
  2860.  
  2861.    if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
  2862.       if (p->compressed)
  2863.          msg_control |= 1 << 4; /* SIMD16 mode */
  2864.       else
  2865.          msg_control |= 2 << 4; /* SIMD8 mode */
  2866.    }
  2867.  
  2868.    brw_inst_set_dp_msg_type(devinfo, insn,
  2869.                             (devinfo->gen >= 8 || devinfo->is_haswell ?
  2870.                              HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ :
  2871.                              GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ));
  2872.    brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
  2873. }
  2874.  
/**
 * Emit an untyped surface read of num_channels 32-bit components per
 * lane from the given surface into dst.
 */
void
brw_untyped_surface_read(struct brw_codegen *p,
                         struct brw_reg dst,
                         struct brw_reg payload,
                         struct brw_reg surface,
                         unsigned msg_length,
                         unsigned num_channels)
{
   const struct brw_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
   /* The final argument is header_present, reused to mean "Align1" here
    * as in brw_untyped_atomic().
    */
   struct brw_inst *insn = brw_send_indirect_surface_message(
      p, sfid, dst, payload, surface, msg_length,
      brw_surface_payload_size(p, num_channels, true, true),
      align1);

   brw_set_dp_untyped_surface_read_message(
      p, insn, num_channels);
}
  2896.  
  2897. static void
  2898. brw_set_dp_untyped_surface_write_message(struct brw_codegen *p,
  2899.                                          struct brw_inst *insn,
  2900.                                          unsigned num_channels)
  2901. {
  2902.    const struct brw_device_info *devinfo = p->devinfo;
  2903.    /* Set mask of 32-bit channels to drop. */
  2904.    unsigned msg_control = 0xf & (0xf << num_channels);
  2905.  
  2906.    if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
  2907.       if (p->compressed)
  2908.          msg_control |= 1 << 4; /* SIMD16 mode */
  2909.       else
  2910.          msg_control |= 2 << 4; /* SIMD8 mode */
  2911.    } else {
  2912.       if (devinfo->gen >= 8 || devinfo->is_haswell)
  2913.          msg_control |= 0 << 4; /* SIMD4x2 mode */
  2914.       else
  2915.          msg_control |= 2 << 4; /* SIMD8 mode */
  2916.    }
  2917.  
  2918.    brw_inst_set_dp_msg_type(devinfo, insn,
  2919.                             devinfo->gen >= 8 || devinfo->is_haswell ?
  2920.                              HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE :
  2921.                              GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE);
  2922.    brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
  2923. }
  2924.  
  2925. void
  2926. brw_untyped_surface_write(struct brw_codegen *p,
  2927.                           struct brw_reg payload,
  2928.                           struct brw_reg surface,
  2929.                           unsigned msg_length,
  2930.                           unsigned num_channels)
  2931. {
  2932.    const struct brw_device_info *devinfo = p->devinfo;
  2933.    const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
  2934.                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
  2935.                           GEN7_SFID_DATAPORT_DATA_CACHE);
  2936.    const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
  2937.    /* Mask out unused components -- See comment in brw_untyped_atomic(). */
  2938.    const unsigned mask = devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
  2939.                           WRITEMASK_X : WRITEMASK_XYZW;
  2940.    struct brw_inst *insn = brw_send_indirect_surface_message(
  2941.       p, sfid, brw_writemask(brw_null_reg(), mask),
  2942.       payload, surface, msg_length, 0, align1);
  2943.  
  2944.    brw_set_dp_untyped_surface_write_message(
  2945.       p, insn, num_channels);
  2946. }
  2947.  
  2948. static void
  2949. brw_set_dp_typed_atomic_message(struct brw_codegen *p,
  2950.                                 struct brw_inst *insn,
  2951.                                 unsigned atomic_op,
  2952.                                 bool response_expected)
  2953. {
  2954.    const struct brw_device_info *devinfo = p->devinfo;
  2955.    unsigned msg_control =
  2956.       atomic_op | /* Atomic Operation Type: BRW_AOP_* */
  2957.       (response_expected ? 1 << 5 : 0); /* Return data expected */
  2958.  
  2959.    if (devinfo->gen >= 8 || devinfo->is_haswell) {
  2960.       if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
  2961.          if (brw_inst_qtr_control(devinfo, p->current) == GEN6_COMPRESSION_2Q)
  2962.             msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
  2963.  
  2964.          brw_inst_set_dp_msg_type(devinfo, insn,
  2965.                                   HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP);
  2966.       } else {
  2967.          brw_inst_set_dp_msg_type(devinfo, insn,
  2968.                                   HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2);
  2969.       }
  2970.  
  2971.    } else {
  2972.       brw_inst_set_dp_msg_type(devinfo, insn,
  2973.                                GEN7_DATAPORT_RC_TYPED_ATOMIC_OP);
  2974.  
  2975.       if (brw_inst_qtr_control(devinfo, p->current) == GEN6_COMPRESSION_2Q)
  2976.          msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
  2977.    }
  2978.  
  2979.    brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
  2980. }
  2981.  
  2982. void
  2983. brw_typed_atomic(struct brw_codegen *p,
  2984.                  struct brw_reg dst,
  2985.                  struct brw_reg payload,
  2986.                  struct brw_reg surface,
  2987.                  unsigned atomic_op,
  2988.                  unsigned msg_length,
  2989.                  bool response_expected) {
  2990.    const struct brw_device_info *devinfo = p->devinfo;
  2991.    const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
  2992.                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
  2993.                           GEN6_SFID_DATAPORT_RENDER_CACHE);
  2994.    const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
  2995.    /* Mask out unused components -- See comment in brw_untyped_atomic(). */
  2996.    const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
  2997.    struct brw_inst *insn = brw_send_indirect_surface_message(
  2998.       p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
  2999.       brw_surface_payload_size(p, response_expected,
  3000.                                devinfo->gen >= 8 || devinfo->is_haswell, false),
  3001.       true);
  3002.  
  3003.    brw_set_dp_typed_atomic_message(
  3004.       p, insn, atomic_op, response_expected);
  3005. }
  3006.  
  3007. static void
  3008. brw_set_dp_typed_surface_read_message(struct brw_codegen *p,
  3009.                                       struct brw_inst *insn,
  3010.                                       unsigned num_channels)
  3011. {
  3012.    const struct brw_device_info *devinfo = p->devinfo;
  3013.    /* Set mask of unused channels. */
  3014.    unsigned msg_control = 0xf & (0xf << num_channels);
  3015.  
  3016.    if (devinfo->gen >= 8 || devinfo->is_haswell) {
  3017.       if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
  3018.          if (brw_inst_qtr_control(devinfo, p->current) == GEN6_COMPRESSION_2Q)
  3019.             msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
  3020.          else
  3021.             msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
  3022.       }
  3023.  
  3024.       brw_inst_set_dp_msg_type(devinfo, insn,
  3025.                                HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ);
  3026.    } else {
  3027.       if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
  3028.          if (brw_inst_qtr_control(devinfo, p->current) == GEN6_COMPRESSION_2Q)
  3029.             msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
  3030.       }
  3031.  
  3032.       brw_inst_set_dp_msg_type(devinfo, insn,
  3033.                                GEN7_DATAPORT_RC_TYPED_SURFACE_READ);
  3034.    }
  3035.  
  3036.    brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
  3037. }
  3038.  
  3039. void
  3040. brw_typed_surface_read(struct brw_codegen *p,
  3041.                        struct brw_reg dst,
  3042.                        struct brw_reg payload,
  3043.                        struct brw_reg surface,
  3044.                        unsigned msg_length,
  3045.                        unsigned num_channels)
  3046. {
  3047.    const struct brw_device_info *devinfo = p->devinfo;
  3048.    const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
  3049.                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
  3050.                           GEN6_SFID_DATAPORT_RENDER_CACHE);
  3051.    struct brw_inst *insn = brw_send_indirect_surface_message(
  3052.       p, sfid, dst, payload, surface, msg_length,
  3053.       brw_surface_payload_size(p, num_channels,
  3054.                                devinfo->gen >= 8 || devinfo->is_haswell, false),
  3055.       true);
  3056.  
  3057.    brw_set_dp_typed_surface_read_message(
  3058.       p, insn, num_channels);
  3059. }
  3060.  
  3061. static void
  3062. brw_set_dp_typed_surface_write_message(struct brw_codegen *p,
  3063.                                        struct brw_inst *insn,
  3064.                                        unsigned num_channels)
  3065. {
  3066.    const struct brw_device_info *devinfo = p->devinfo;
  3067.    /* Set mask of unused channels. */
  3068.    unsigned msg_control = 0xf & (0xf << num_channels);
  3069.  
  3070.    if (devinfo->gen >= 8 || devinfo->is_haswell) {
  3071.       if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
  3072.          if (brw_inst_qtr_control(devinfo, p->current) == GEN6_COMPRESSION_2Q)
  3073.             msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
  3074.          else
  3075.             msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
  3076.       }
  3077.  
  3078.       brw_inst_set_dp_msg_type(devinfo, insn,
  3079.                                HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE);
  3080.  
  3081.    } else {
  3082.       if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
  3083.          if (brw_inst_qtr_control(devinfo, p->current) == GEN6_COMPRESSION_2Q)
  3084.             msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
  3085.       }
  3086.  
  3087.       brw_inst_set_dp_msg_type(devinfo, insn,
  3088.                                GEN7_DATAPORT_RC_TYPED_SURFACE_WRITE);
  3089.    }
  3090.  
  3091.    brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
  3092. }
  3093.  
  3094. void
  3095. brw_typed_surface_write(struct brw_codegen *p,
  3096.                         struct brw_reg payload,
  3097.                         struct brw_reg surface,
  3098.                         unsigned msg_length,
  3099.                         unsigned num_channels)
  3100. {
  3101.    const struct brw_device_info *devinfo = p->devinfo;
  3102.    const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
  3103.                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
  3104.                           GEN6_SFID_DATAPORT_RENDER_CACHE);
  3105.    const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
  3106.    /* Mask out unused components -- See comment in brw_untyped_atomic(). */
  3107.    const unsigned mask = (devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
  3108.                           WRITEMASK_X : WRITEMASK_XYZW);
  3109.    struct brw_inst *insn = brw_send_indirect_surface_message(
  3110.       p, sfid, brw_writemask(brw_null_reg(), mask),
  3111.       payload, surface, msg_length, 0, true);
  3112.  
  3113.    brw_set_dp_typed_surface_write_message(
  3114.       p, insn, num_channels);
  3115. }
  3116.  
  3117. static void
  3118. brw_set_memory_fence_message(struct brw_codegen *p,
  3119.                              struct brw_inst *insn,
  3120.                              enum brw_message_target sfid,
  3121.                              bool commit_enable)
  3122. {
  3123.    const struct brw_device_info *devinfo = p->devinfo;
  3124.  
  3125.    brw_set_message_descriptor(p, insn, sfid,
  3126.                               1 /* message length */,
  3127.                               (commit_enable ? 1 : 0) /* response length */,
  3128.                               true /* header present */,
  3129.                               false);
  3130.  
  3131.    switch (sfid) {
  3132.    case GEN6_SFID_DATAPORT_RENDER_CACHE:
  3133.       brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_RC_MEMORY_FENCE);
  3134.       break;
  3135.    case GEN7_SFID_DATAPORT_DATA_CACHE:
  3136.       brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_DC_MEMORY_FENCE);
  3137.       break;
  3138.    default:
  3139.       unreachable("Not reached");
  3140.    }
  3141.  
  3142.    if (commit_enable)
  3143.       brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
  3144. }
  3145.  
/**
 * Emit a memory fence to order previous memory accesses.  \p dst is a
 * scratch register used only for dependency tracking (the fence message
 * itself writes nothing useful back).  On IVB (gen7 non-Haswell) two fences
 * are emitted -- data cache and render cache -- followed by a MOV that
 * stalls until both have completed.
 */
void
brw_memory_fence(struct brw_codegen *p,
                 struct brw_reg dst)
{
   const struct brw_device_info *devinfo = p->devinfo;
   /* Only IVB requests a commit write-back; it is needed below to order the
    * two fences against each other.
    */
   const bool commit_enable = devinfo->gen == 7 && !devinfo->is_haswell;
   struct brw_inst *insn;

   /* Set dst as destination for dependency tracking, the MEMORY_FENCE
    * message doesn't write anything back.
    */
   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, dst);
   brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
                                commit_enable);

   if (devinfo->gen == 7 && !devinfo->is_haswell) {
      /* IVB does typed surface access through the render cache, so we need to
       * flush it too.  Use a different register so both flushes can be
       * pipelined by the hardware.
       */
      insn = next_insn(p, BRW_OPCODE_SEND);
      brw_set_dest(p, insn, offset(dst, 1));
      brw_set_src0(p, insn, offset(dst, 1));
      brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
                                   commit_enable);

      /* Now write the response of the second message into the response of the
       * first to trigger a pipeline stall -- This way future render and data
       * cache messages will be properly ordered with respect to past data and
       * render cache messages.
       */
      brw_push_insn_state(p);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_MOV(p, dst, offset(dst, 1));
      brw_pop_insn_state(p);
   }
}
  3186.  
  3187. void
  3188. brw_pixel_interpolator_query(struct brw_codegen *p,
  3189.                              struct brw_reg dest,
  3190.                              struct brw_reg mrf,
  3191.                              bool noperspective,
  3192.                              unsigned mode,
  3193.                              unsigned data,
  3194.                              unsigned msg_length,
  3195.                              unsigned response_length)
  3196. {
  3197.    const struct brw_device_info *devinfo = p->devinfo;
  3198.    struct brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
  3199.  
  3200.    brw_set_dest(p, insn, dest);
  3201.    brw_set_src0(p, insn, mrf);
  3202.    brw_set_message_descriptor(p, insn, GEN7_SFID_PIXEL_INTERPOLATOR,
  3203.                               msg_length, response_length,
  3204.                               false /* header is never present for PI */,
  3205.                               false);
  3206.  
  3207.    brw_inst_set_pi_simd_mode(
  3208.          devinfo, insn, brw_inst_exec_size(devinfo, insn) == BRW_EXECUTE_16);
  3209.    brw_inst_set_pi_slot_group(devinfo, insn, 0); /* zero unless 32/64px dispatch */
  3210.    brw_inst_set_pi_nopersp(devinfo, insn, noperspective);
  3211.    brw_inst_set_pi_message_type(devinfo, insn, mode);
  3212.    brw_inst_set_pi_message_data(devinfo, insn, data);
  3213. }
  3214.  
/**
 * Emit code that stores the index of the first enabled execution channel
 * into the first component of \p dst.  Gen7+ only (asserted).  The exact
 * strategy depends on the access mode and hardware generation, see the
 * per-branch comments below.
 */
void
brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *inst;

   assert(devinfo->gen >= 7);

   brw_push_insn_state(p);

   if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8) {
         /* Getting the first active channel index is easy on Gen8: Just find
          * the first bit set in the mask register.  The same register exists
          * on HSW already but it reads back as all ones when the current
          * instruction has execution masking disabled, so it's kind of
          * useless.
          */
         inst = brw_FBL(p, vec1(dst),
                        retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD));

         /* Quarter control has the effect of magically shifting the value of
          * this register.  Make sure it's set to zero.
          */
         brw_inst_set_qtr_control(devinfo, inst, GEN6_COMPRESSION_1Q);
      } else {
         /* Gen7: the execution mask is recovered indirectly through flag
          * register f1.0, see below.
          */
         const struct brw_reg flag = retype(brw_flag_reg(1, 0),
                                            BRW_REGISTER_TYPE_UD);

         /* Clear the flag so channels beyond the execution size read zero. */
         brw_MOV(p, flag, brw_imm_ud(0));

         /* Run a 16-wide instruction returning zero with execution masking
          * and a conditional modifier enabled in order to get the current
          * execution mask in f1.0.
          */
         inst = brw_MOV(p, brw_null_reg(), brw_imm_ud(0));
         brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_16);
         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);

         /* First set bit of the recovered mask is the live channel. */
         brw_FBL(p, vec1(dst), flag);
      }
   } else {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8) {
         /* In SIMD4x2 mode the first active channel index is just the
          * negation of the first bit of the mask register.
          */
         inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
                        negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
                        brw_imm_ud(1));

      } else {
         /* Overwrite the destination without and with execution masking to
          * find out which of the channels is active.
          */
         brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                 brw_imm_ud(1));

         inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                        brw_imm_ud(0));
         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
      }
   }

   brw_pop_insn_state(p);
}
  3286.  
/**
 * Emit code copying the value of channel \p idx of \p src into all channels
 * of \p dst.  \p src must be a directly-addressed GRF (asserted).  In align1
 * mode a dynamic index is handled with register-indirect addressing; in
 * align16 (SIMD4x2) mode the index can only be 0 or 1 and is handled with a
 * predicated SEL.
 */
void
brw_broadcast(struct brw_codegen *p,
              struct brw_reg dst,
              struct brw_reg src,
              struct brw_reg idx)
{
   const struct brw_device_info *devinfo = p->devinfo;
   const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
   brw_inst *inst;

   assert(src.file == BRW_GENERAL_REGISTER_FILE &&
          src.address_mode == BRW_ADDRESS_DIRECT);

   if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
       idx.file == BRW_IMMEDIATE_VALUE) {
      /* Trivial, the source is already uniform or the index is a constant.
       * We will typically not get here if the optimizer is doing its job, but
       * asserting would be mean.
       */
      const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.dw1.ud : 0;
      brw_MOV(p, dst,
              (align1 ? stride(suboffset(src, i), 0, 1, 0) :
               stride(suboffset(src, 4 * i), 0, 4, 1)));
   } else {
      if (align1) {
         /* Dynamic index in align1: compute a byte offset into the file in
          * the address register and read the component indirectly.
          */
         const struct brw_reg addr =
            retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
         const unsigned offset = src.nr * REG_SIZE + src.subnr;
         /* Limit in bytes of the signed indirect addressing immediate. */
         const unsigned limit = 512;

         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

         /* Take into account the component size and horizontal stride. */
         assert(src.vstride == src.hstride + src.width);
         brw_SHL(p, addr, vec1(idx),
                 brw_imm_ud(_mesa_logbase2(type_sz(src.type)) +
                            src.hstride - 1));

         /* We can only address up to limit bytes using the indirect
          * addressing immediate, account for the difference if the source
          * register is above this limit.
          */
         if (offset >= limit)
            brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));

         brw_pop_insn_state(p);

         /* Use indirect addressing to fetch the specified component. */
         brw_MOV(p, dst,
                 retype(brw_vec1_indirect(addr.subnr, offset % limit),
                        src.type));
      } else {
         /* In SIMD4x2 mode the index can be either zero or one, replicate it
          * to all bits of a flag register,
          */
         inst = brw_MOV(p,
                        brw_null_reg(),
                        stride(brw_swizzle1(idx, 0), 0, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);

         /* and use predicated SEL to pick the right channel. */
         inst = brw_SEL(p, dst,
                        stride(suboffset(src, 4), 0, 4, 1),
                        stride(src, 0, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
      }
   }
}
  3361.  
  3362. /**
  3363.  * This instruction is generated as a single-channel align1 instruction by
  3364.  * both the VS and FS stages when using INTEL_DEBUG=shader_time.
  3365.  *
  3366.  * We can't use the typed atomic op in the FS because that has the execution
  3367.  * mask ANDed with the pixel mask, but we just want to write the one dword for
  3368.  * all the pixels.
  3369.  *
  3370.  * We don't use the SIMD4x2 atomic ops in the VS because want to just write
  3371.  * one u32.  So we use the same untyped atomic write message as the pixel
  3372.  * shader.
  3373.  *
  3374.  * The untyped atomic operation requires a BUFFER surface type with RAW
  3375.  * format, and is only accessible through the legacy DATA_CACHE dataport
  3376.  * messages.
  3377.  */
void brw_shader_time_add(struct brw_codegen *p,
                         struct brw_reg payload,
                         uint32_t surf_index)
{
   /* HSW+ moved the untyped atomic messages to data cache data port 1. */
   const unsigned sfid = (p->devinfo->gen >= 8 || p->devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   assert(p->devinfo->gen >= 7);

   /* Default state must be set up before the SEND is allocated, since
    * brw_next_insn() snapshots the current defaults into the instruction.
    */
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);

   /* We use brw_vec1_reg and unmasked because we want to increment the given
    * offset only once.
    */
   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                      BRW_ARF_NULL, 0));
   brw_set_src0(p, send, brw_vec1_reg(payload.file,
                                      payload.nr, 0));
   brw_set_src1(p, send, brw_imm_ud(0));
   /* 2 message registers (offset + value), no response, no header. */
   brw_set_message_descriptor(p, send, sfid, 2, 0, false, false);
   brw_inst_set_binding_table_index(p->devinfo, send, surf_index);
   /* Atomic ADD with no return data performs the accumulation. */
   brw_set_dp_untyped_atomic_message(p, send, BRW_AOP_ADD, false);

   brw_pop_insn_state(p);
}
  3407.