Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Blame | Last modification | View Log | RSS feed

  1. /*
  2.  * Copyright © 2012 Intel Corporation
  3.  *
  4.  * Permission is hereby granted, free of charge, to any person obtaining a
  5.  * copy of this software and associated documentation files (the "Software"),
  6.  * to deal in the Software without restriction, including without limitation
  7.  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  8.  * and/or sell copies of the Software, and to permit persons to whom the
  9.  * Software is furnished to do so, subject to the following conditions:
  10.  *
  11.  * The above copyright notice and this permission notice (including the next
  12.  * paragraph) shall be included in all copies or substantial portions of the
  13.  * Software.
  14.  *
  15.  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16.  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17.  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18.  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19.  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20.  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  21.  * DEALINGS IN THE SOFTWARE.
  22.  */
  23.  
  24. #include "ir.h"
  25. #include "ir_builder.h"
  26. #include "ir_optimization.h"
  27. #include "ir_rvalue_visitor.h"
  28.  
  29. namespace {
  30.  
  31. using namespace ir_builder;
  32.  
  33. /**
  34.  * A visitor that lowers built-in floating-point pack/unpack expressions
  35.  * such as packSnorm2x16.
  36.  */
  37. class lower_packing_builtins_visitor : public ir_rvalue_visitor {
  38. public:
  39.    /**
  40.     * \param op_mask is a bitmask of `enum lower_packing_builtins_op`
  41.     */
  42.    explicit lower_packing_builtins_visitor(int op_mask)
  43.       : op_mask(op_mask),
  44.         progress(false)
  45.    {
  46.       /* Mutually exclusive options. */
  47.       assert(!((op_mask & LOWER_PACK_HALF_2x16) &&
  48.                (op_mask & LOWER_PACK_HALF_2x16_TO_SPLIT)));
  49.  
  50.       assert(!((op_mask & LOWER_UNPACK_HALF_2x16) &&
  51.                (op_mask & LOWER_UNPACK_HALF_2x16_TO_SPLIT)));
  52.  
  53.       factory.instructions = &factory_instructions;
  54.    }
  55.  
  56.    virtual ~lower_packing_builtins_visitor()
  57.    {
  58.       assert(factory_instructions.is_empty());
  59.    }
  60.  
  61.    bool get_progress() { return progress; }
  62.  
  63.    void handle_rvalue(ir_rvalue **rvalue)
  64.    {
  65.       if (!*rvalue)
  66.          return;
  67.  
  68.       ir_expression *expr = (*rvalue)->as_expression();
  69.       if (!expr)
  70.          return;
  71.  
  72.       enum lower_packing_builtins_op lowering_op =
  73.          choose_lowering_op(expr->operation);
  74.  
  75.       if (lowering_op == LOWER_PACK_UNPACK_NONE)
  76.          return;
  77.  
  78.       setup_factory(ralloc_parent(expr));
  79.  
  80.       ir_rvalue *op0 = expr->operands[0];
  81.       ralloc_steal(factory.mem_ctx, op0);
  82.  
  83.       switch (lowering_op) {
  84.       case LOWER_PACK_SNORM_2x16:
  85.          *rvalue = lower_pack_snorm_2x16(op0);
  86.          break;
  87.       case LOWER_PACK_SNORM_4x8:
  88.          *rvalue = lower_pack_snorm_4x8(op0);
  89.          break;
  90.       case LOWER_PACK_UNORM_2x16:
  91.          *rvalue = lower_pack_unorm_2x16(op0);
  92.          break;
  93.       case LOWER_PACK_UNORM_4x8:
  94.          *rvalue = lower_pack_unorm_4x8(op0);
  95.          break;
  96.       case LOWER_PACK_HALF_2x16:
  97.          *rvalue = lower_pack_half_2x16(op0);
  98.          break;
  99.       case LOWER_PACK_HALF_2x16_TO_SPLIT:
  100.          *rvalue = split_pack_half_2x16(op0);
  101.          break;
  102.       case LOWER_UNPACK_SNORM_2x16:
  103.          *rvalue = lower_unpack_snorm_2x16(op0);
  104.          break;
  105.       case LOWER_UNPACK_SNORM_4x8:
  106.          *rvalue = lower_unpack_snorm_4x8(op0);
  107.          break;
  108.       case LOWER_UNPACK_UNORM_2x16:
  109.          *rvalue = lower_unpack_unorm_2x16(op0);
  110.          break;
  111.       case LOWER_UNPACK_UNORM_4x8:
  112.          *rvalue = lower_unpack_unorm_4x8(op0);
  113.          break;
  114.       case LOWER_UNPACK_HALF_2x16:
  115.          *rvalue = lower_unpack_half_2x16(op0);
  116.          break;
  117.       case LOWER_UNPACK_HALF_2x16_TO_SPLIT:
  118.          *rvalue = split_unpack_half_2x16(op0);
  119.          break;
  120.       case LOWER_PACK_UNPACK_NONE:
  121.          assert(!"not reached");
  122.          break;
  123.       }
  124.  
  125.       teardown_factory();
  126.       progress = true;
  127.    }
  128.  
  129. private:
  130.    const int op_mask;
  131.    bool progress;
  132.    ir_factory factory;
  133.    exec_list factory_instructions;
  134.  
  135.    /**
  136.     * Determine the needed lowering operation by filtering \a expr_op
  137.     * through \ref op_mask.
  138.     */
  139.    enum lower_packing_builtins_op
  140.    choose_lowering_op(ir_expression_operation expr_op)
  141.    {
  142.       /* C++ regards int and enum as fundamentally different types.
  143.        * So, we can't simply return from each case; we must cast the return
  144.        * value.
  145.        */
  146.       int result;
  147.  
  148.       switch (expr_op) {
  149.       case ir_unop_pack_snorm_2x16:
  150.          result = op_mask & LOWER_PACK_SNORM_2x16;
  151.          break;
  152.       case ir_unop_pack_snorm_4x8:
  153.          result = op_mask & LOWER_PACK_SNORM_4x8;
  154.          break;
  155.       case ir_unop_pack_unorm_2x16:
  156.          result = op_mask & LOWER_PACK_UNORM_2x16;
  157.          break;
  158.       case ir_unop_pack_unorm_4x8:
  159.          result = op_mask & LOWER_PACK_UNORM_4x8;
  160.          break;
  161.       case ir_unop_pack_half_2x16:
  162.          result = op_mask & (LOWER_PACK_HALF_2x16 | LOWER_PACK_HALF_2x16_TO_SPLIT);
  163.          break;
  164.       case ir_unop_unpack_snorm_2x16:
  165.          result = op_mask & LOWER_UNPACK_SNORM_2x16;
  166.          break;
  167.       case ir_unop_unpack_snorm_4x8:
  168.          result = op_mask & LOWER_UNPACK_SNORM_4x8;
  169.          break;
  170.       case ir_unop_unpack_unorm_2x16:
  171.          result = op_mask & LOWER_UNPACK_UNORM_2x16;
  172.          break;
  173.       case ir_unop_unpack_unorm_4x8:
  174.          result = op_mask & LOWER_UNPACK_UNORM_4x8;
  175.          break;
  176.       case ir_unop_unpack_half_2x16:
  177.          result = op_mask & (LOWER_UNPACK_HALF_2x16 | LOWER_UNPACK_HALF_2x16_TO_SPLIT);
  178.          break;
  179.       default:
  180.          result = LOWER_PACK_UNPACK_NONE;
  181.          break;
  182.       }
  183.  
  184.       return static_cast<enum lower_packing_builtins_op>(result);
  185.    }
  186.  
  187.    void
  188.    setup_factory(void *mem_ctx)
  189.    {
  190.       assert(factory.mem_ctx == NULL);
  191.       assert(factory.instructions->is_empty());
  192.  
  193.       factory.mem_ctx = mem_ctx;
  194.    }
  195.  
  196.    void
  197.    teardown_factory()
  198.    {
  199.       base_ir->insert_before(factory.instructions);
  200.       assert(factory.instructions->is_empty());
  201.       factory.mem_ctx = NULL;
  202.    }
  203.  
  204.    template <typename T>
  205.    ir_constant*
  206.    constant(T x)
  207.    {
  208.       return factory.constant(x);
  209.    }
  210.  
  211.    /**
  212.     * \brief Pack two uint16's into a single uint32.
  213.     *
  214.     * Interpret the given uvec2 as a uint16 pair. Pack the pair into a uint32
  215.     * where the least significant bits specify the first element of the pair.
  216.     * Return the uint32.
  217.     */
  218.    ir_rvalue*
  219.    pack_uvec2_to_uint(ir_rvalue *uvec2_rval)
  220.    {
  221.       assert(uvec2_rval->type == glsl_type::uvec2_type);
  222.  
  223.       /* uvec2 u = UVEC2_RVAL; */
  224.       ir_variable *u = factory.make_temp(glsl_type::uvec2_type,
  225.                                           "tmp_pack_uvec2_to_uint");
  226.       factory.emit(assign(u, uvec2_rval));
  227.  
  228.       /* return (u.y << 16) | (u.x & 0xffff); */
  229.       return bit_or(lshift(swizzle_y(u), constant(16u)),
  230.                     bit_and(swizzle_x(u), constant(0xffffu)));
  231.    }
  232.  
  233.    /**
  234.     * \brief Pack four uint8's into a single uint32.
  235.     *
  236.     * Interpret the given uvec4 as a uint32 4-typle. Pack the 4-tuple into a
  237.     * uint32 where the least significant bits specify the first element of the
  238.     * 4-tuple. Return the uint32.
  239.     */
  240.    ir_rvalue*
  241.    pack_uvec4_to_uint(ir_rvalue *uvec4_rval)
  242.    {
  243.       assert(uvec4_rval->type == glsl_type::uvec4_type);
  244.  
  245.       /* uvec4 u = UVEC4_RVAL; */
  246.       ir_variable *u = factory.make_temp(glsl_type::uvec4_type,
  247.                                           "tmp_pack_uvec4_to_uint");
  248.       factory.emit(assign(u, bit_and(uvec4_rval, constant(0xffu))));
  249.  
  250.       /* return (u.w << 24) | (u.z << 16) | (u.y << 8) | u.x; */
  251.       return bit_or(bit_or(lshift(swizzle_w(u), constant(24u)),
  252.                            lshift(swizzle_z(u), constant(16u))),
  253.                     bit_or(lshift(swizzle_y(u), constant(8u)),
  254.                            swizzle_x(u)));
  255.    }
  256.  
  257.    /**
  258.     * \brief Unpack a uint32 into two uint16's.
  259.     *
  260.     * Interpret the given uint32 as a uint16 pair where the uint32's least
  261.     * significant bits specify the pair's first element. Return the uint16
  262.     * pair as a uvec2.
  263.     */
  264.    ir_rvalue*
  265.    unpack_uint_to_uvec2(ir_rvalue *uint_rval)
  266.    {
  267.       assert(uint_rval->type == glsl_type::uint_type);
  268.  
  269.       /* uint u = UINT_RVAL; */
  270.       ir_variable *u = factory.make_temp(glsl_type::uint_type,
  271.                                           "tmp_unpack_uint_to_uvec2_u");
  272.       factory.emit(assign(u, uint_rval));
  273.  
  274.       /* uvec2 u2; */
  275.       ir_variable *u2 = factory.make_temp(glsl_type::uvec2_type,
  276.                                            "tmp_unpack_uint_to_uvec2_u2");
  277.  
  278.       /* u2.x = u & 0xffffu; */
  279.       factory.emit(assign(u2, bit_and(u, constant(0xffffu)), WRITEMASK_X));
  280.  
  281.       /* u2.y = u >> 16u; */
  282.       factory.emit(assign(u2, rshift(u, constant(16u)), WRITEMASK_Y));
  283.  
  284.       return deref(u2).val;
  285.    }
  286.  
  287.    /**
  288.     * \brief Unpack a uint32 into four uint8's.
  289.     *
  290.     * Interpret the given uint32 as a uint8 4-tuple where the uint32's least
  291.     * significant bits specify the 4-tuple's first element. Return the uint8
  292.     * 4-tuple as a uvec4.
  293.     */
  294.    ir_rvalue*
  295.    unpack_uint_to_uvec4(ir_rvalue *uint_rval)
  296.    {
  297.       assert(uint_rval->type == glsl_type::uint_type);
  298.  
  299.       /* uint u = UINT_RVAL; */
  300.       ir_variable *u = factory.make_temp(glsl_type::uint_type,
  301.                                           "tmp_unpack_uint_to_uvec4_u");
  302.       factory.emit(assign(u, uint_rval));
  303.  
  304.       /* uvec4 u4; */
  305.       ir_variable *u4 = factory.make_temp(glsl_type::uvec4_type,
  306.                                            "tmp_unpack_uint_to_uvec4_u4");
  307.  
  308.       /* u4.x = u & 0xffu; */
  309.       factory.emit(assign(u4, bit_and(u, constant(0xffu)), WRITEMASK_X));
  310.  
  311.       /* u4.y = (u >> 8u) & 0xffu; */
  312.       factory.emit(assign(u4, bit_and(rshift(u, constant(8u)),
  313.                                       constant(0xffu)), WRITEMASK_Y));
  314.  
  315.       /* u4.z = (u >> 16u) & 0xffu; */
  316.       factory.emit(assign(u4, bit_and(rshift(u, constant(16u)),
  317.                                       constant(0xffu)), WRITEMASK_Z));
  318.  
  319.       /* u4.w = (u >> 24u) */
  320.       factory.emit(assign(u4, rshift(u, constant(24u)), WRITEMASK_W));
  321.  
  322.       return deref(u4).val;
  323.    }
  324.  
  325.    /**
  326.     * \brief Lower a packSnorm2x16 expression.
  327.     *
  328.     * \param vec2_rval is packSnorm2x16's input
  329.     * \return packSnorm2x16's output as a uint rvalue
  330.     */
  331.    ir_rvalue*
  332.    lower_pack_snorm_2x16(ir_rvalue *vec2_rval)
  333.    {
  334.       /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
  335.        *
  336.        *    highp uint packSnorm2x16(vec2 v)
  337.        *    --------------------------------
  338.        *    First, converts each component of the normalized floating-point value
  339.        *    v into 16-bit integer values. Then, the results are packed into the
  340.        *    returned 32-bit unsigned integer.
  341.        *
  342.        *    The conversion for component c of v to fixed point is done as
  343.        *    follows:
  344.        *
  345.        *       packSnorm2x16: round(clamp(c, -1, +1) * 32767.0)
  346.        *
  347.        *    The first component of the vector will be written to the least
  348.        *    significant bits of the output; the last component will be written to
  349.        *    the most significant bits.
  350.        *
  351.        * This function generates IR that approximates the following pseudo-GLSL:
  352.        *
  353.        *     return pack_uvec2_to_uint(
  354.        *         uvec2(ivec2(
  355.        *           round(clamp(VEC2_RVALUE, -1.0f, 1.0f) * 32767.0f))));
  356.        *
  357.        * It is necessary to first convert the vec2 to ivec2 rather than directly
  358.        * converting vec2 to uvec2 because the latter conversion is undefined.
  359.        * From page 56 (62 of pdf) of the GLSL ES 3.00 spec: "It is undefined to
  360.        * convert a negative floating point value to an uint".
  361.        */
  362.       assert(vec2_rval->type == glsl_type::vec2_type);
  363.  
  364.       ir_rvalue *result = pack_uvec2_to_uint(
  365.             i2u(f2i(round_even(mul(clamp(vec2_rval,
  366.                                          constant(-1.0f),
  367.                                          constant(1.0f)),
  368.                                    constant(32767.0f))))));
  369.  
  370.       assert(result->type == glsl_type::uint_type);
  371.       return result;
  372.    }
  373.  
  374.    /**
  375.     * \brief Lower a packSnorm4x8 expression.
  376.     *
  377.     * \param vec4_rval is packSnorm4x8's input
  378.     * \return packSnorm4x8's output as a uint rvalue
  379.     */
  380.    ir_rvalue*
  381.    lower_pack_snorm_4x8(ir_rvalue *vec4_rval)
  382.    {
  383.       /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
  384.        *
  385.        *    highp uint packSnorm4x8(vec4 v)
  386.        *    -------------------------------
  387.        *    First, converts each component of the normalized floating-point value
  388.        *    v into 8-bit integer values. Then, the results are packed into the
  389.        *    returned 32-bit unsigned integer.
  390.        *
  391.        *    The conversion for component c of v to fixed point is done as
  392.        *    follows:
  393.        *
  394.        *       packSnorm4x8: round(clamp(c, -1, +1) * 127.0)
  395.        *
  396.        *    The first component of the vector will be written to the least
  397.        *    significant bits of the output; the last component will be written to
  398.        *    the most significant bits.
  399.        *
  400.        * This function generates IR that approximates the following pseudo-GLSL:
  401.        *
  402.        *     return pack_uvec4_to_uint(
  403.        *         uvec4(ivec4(
  404.        *           round(clamp(VEC4_RVALUE, -1.0f, 1.0f) * 127.0f))));
  405.        *
  406.        * It is necessary to first convert the vec4 to ivec4 rather than directly
  407.        * converting vec4 to uvec4 because the latter conversion is undefined.
  408.        * From page 87 (93 of pdf) of the GLSL 4.30 spec: "It is undefined to
  409.        * convert a negative floating point value to an uint".
  410.        */
  411.       assert(vec4_rval->type == glsl_type::vec4_type);
  412.  
  413.       ir_rvalue *result = pack_uvec4_to_uint(
  414.             i2u(f2i(round_even(mul(clamp(vec4_rval,
  415.                                          constant(-1.0f),
  416.                                          constant(1.0f)),
  417.                                    constant(127.0f))))));
  418.  
  419.       assert(result->type == glsl_type::uint_type);
  420.       return result;
  421.    }
  422.  
  423.    /**
  424.     * \brief Lower an unpackSnorm2x16 expression.
  425.     *
  426.     * \param uint_rval is unpackSnorm2x16's input
  427.     * \return unpackSnorm2x16's output as a vec2 rvalue
  428.     */
  429.    ir_rvalue*
  430.    lower_unpack_snorm_2x16(ir_rvalue *uint_rval)
  431.    {
  432.       /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
  433.        *
  434.        *    highp vec2 unpackSnorm2x16 (highp uint p)
  435.        *    -----------------------------------------
  436.        *    First, unpacks a single 32-bit unsigned integer p into a pair of
  437.        *    16-bit unsigned integers. Then, each component is converted to
  438.        *    a normalized floating-point value to generate the returned
  439.        *    two-component vector.
  440.        *
  441.        *    The conversion for unpacked fixed-point value f to floating point is
  442.        *    done as follows:
  443.        *
  444.        *       unpackSnorm2x16: clamp(f / 32767.0, -1,+1)
  445.        *
  446.        *    The first component of the returned vector will be extracted from the
  447.        *    least significant bits of the input; the last component will be
  448.        *    extracted from the most significant bits.
  449.        *
  450.        * This function generates IR that approximates the following pseudo-GLSL:
  451.        *
  452.        *    return clamp(
  453.        *       ((ivec2(unpack_uint_to_uvec2(UINT_RVALUE)) << 16) >> 16) / 32767.0f,
  454.        *       -1.0f, 1.0f);
  455.        *
  456.        * The above IR may appear unnecessarily complex, but the intermediate
  457.        * conversion to ivec2 and the bit shifts are necessary to correctly unpack
  458.        * negative floats.
  459.        *
  460.        * To see why, consider packing and then unpacking vec2(-1.0, 0.0).
  461.        * packSnorm2x16 encodes -1.0 as the int16 0xffff. During unpacking, we
  462.        * place that int16 into an int32, which results in the *positive* integer
  463.        * 0x0000ffff.  The int16's sign bit becomes, in the int32, the rather
  464.        * unimportant bit 16. We must now extend the int16's sign bit into bits
  465.        * 17-32, which is accomplished by left-shifting then right-shifting.
  466.        */
  467.  
  468.       assert(uint_rval->type == glsl_type::uint_type);
  469.  
  470.       ir_rvalue *result =
  471.         clamp(div(i2f(rshift(lshift(u2i(unpack_uint_to_uvec2(uint_rval)),
  472.                                     constant(16)),
  473.                              constant(16u))),
  474.                   constant(32767.0f)),
  475.               constant(-1.0f),
  476.               constant(1.0f));
  477.  
  478.       assert(result->type == glsl_type::vec2_type);
  479.       return result;
  480.    }
  481.  
  482.    /**
  483.     * \brief Lower an unpackSnorm4x8 expression.
  484.     *
  485.     * \param uint_rval is unpackSnorm4x8's input
  486.     * \return unpackSnorm4x8's output as a vec4 rvalue
  487.     */
  488.    ir_rvalue*
  489.    lower_unpack_snorm_4x8(ir_rvalue *uint_rval)
  490.    {
  491.       /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
  492.        *
  493.        *    highp vec4 unpackSnorm4x8 (highp uint p)
  494.        *    ----------------------------------------
  495.        *    First, unpacks a single 32-bit unsigned integer p into four
  496.        *    8-bit unsigned integers. Then, each component is converted to
  497.        *    a normalized floating-point value to generate the returned
  498.        *    four-component vector.
  499.        *
  500.        *    The conversion for unpacked fixed-point value f to floating point is
  501.        *    done as follows:
  502.        *
  503.        *       unpackSnorm4x8: clamp(f / 127.0, -1, +1)
  504.        *
  505.        *    The first component of the returned vector will be extracted from the
  506.        *    least significant bits of the input; the last component will be
  507.        *    extracted from the most significant bits.
  508.        *
  509.        * This function generates IR that approximates the following pseudo-GLSL:
  510.        *
  511.        *    return clamp(
  512.        *       ((ivec4(unpack_uint_to_uvec4(UINT_RVALUE)) << 24) >> 24) / 127.0f,
  513.        *       -1.0f, 1.0f);
  514.        *
  515.        * The above IR may appear unnecessarily complex, but the intermediate
  516.        * conversion to ivec4 and the bit shifts are necessary to correctly unpack
  517.        * negative floats.
  518.        *
  519.        * To see why, consider packing and then unpacking vec4(-1.0, 0.0, 0.0,
  520.        * 0.0). packSnorm4x8 encodes -1.0 as the int8 0xff. During unpacking, we
  521.        * place that int8 into an int32, which results in the *positive* integer
  522.        * 0x000000ff.  The int8's sign bit becomes, in the int32, the rather
  523.        * unimportant bit 8. We must now extend the int8's sign bit into bits
  524.        * 9-32, which is accomplished by left-shifting then right-shifting.
  525.        */
  526.  
  527.       assert(uint_rval->type == glsl_type::uint_type);
  528.  
  529.       ir_rvalue *result =
  530.         clamp(div(i2f(rshift(lshift(u2i(unpack_uint_to_uvec4(uint_rval)),
  531.                                     constant(24u)),
  532.                              constant(24u))),
  533.                   constant(127.0f)),
  534.               constant(-1.0f),
  535.               constant(1.0f));
  536.  
  537.       assert(result->type == glsl_type::vec4_type);
  538.       return result;
  539.    }
  540.  
  541.    /**
  542.     * \brief Lower a packUnorm2x16 expression.
  543.     *
  544.     * \param vec2_rval is packUnorm2x16's input
  545.     * \return packUnorm2x16's output as a uint rvalue
  546.     */
  547.    ir_rvalue*
  548.    lower_pack_unorm_2x16(ir_rvalue *vec2_rval)
  549.    {
  550.       /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
  551.        *
  552.        *    highp uint packUnorm2x16 (vec2 v)
  553.        *    ---------------------------------
  554.        *    First, converts each component of the normalized floating-point value
  555.        *    v into 16-bit integer values. Then, the results are packed into the
  556.        *    returned 32-bit unsigned integer.
  557.        *
  558.        *    The conversion for component c of v to fixed point is done as
  559.        *    follows:
  560.        *
  561.        *       packUnorm2x16: round(clamp(c, 0, +1) * 65535.0)
  562.        *
  563.        *    The first component of the vector will be written to the least
  564.        *    significant bits of the output; the last component will be written to
  565.        *    the most significant bits.
  566.        *
  567.        * This function generates IR that approximates the following pseudo-GLSL:
  568.        *
  569.        *     return pack_uvec2_to_uint(uvec2(
  570.        *                round(clamp(VEC2_RVALUE, 0.0f, 1.0f) * 65535.0f)));
  571.        *
  572.        * Here it is safe to directly convert the vec2 to uvec2 because the the
  573.        * vec2 has been clamped to a non-negative range.
  574.        */
  575.  
  576.       assert(vec2_rval->type == glsl_type::vec2_type);
  577.  
  578.       ir_rvalue *result = pack_uvec2_to_uint(
  579.          f2u(round_even(mul(saturate(vec2_rval), constant(65535.0f)))));
  580.  
  581.       assert(result->type == glsl_type::uint_type);
  582.       return result;
  583.    }
  584.  
  585.    /**
  586.     * \brief Lower a packUnorm4x8 expression.
  587.     *
  588.     * \param vec4_rval is packUnorm4x8's input
  589.     * \return packUnorm4x8's output as a uint rvalue
  590.     */
  591.    ir_rvalue*
  592.    lower_pack_unorm_4x8(ir_rvalue *vec4_rval)
  593.    {
  594.       /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
  595.        *
  596.        *    highp uint packUnorm4x8 (vec4 v)
  597.        *    --------------------------------
  598.        *    First, converts each component of the normalized floating-point value
  599.        *    v into 8-bit integer values. Then, the results are packed into the
  600.        *    returned 32-bit unsigned integer.
  601.        *
  602.        *    The conversion for component c of v to fixed point is done as
  603.        *    follows:
  604.        *
  605.        *       packUnorm4x8: round(clamp(c, 0, +1) * 255.0)
  606.        *
  607.        *    The first component of the vector will be written to the least
  608.        *    significant bits of the output; the last component will be written to
  609.        *    the most significant bits.
  610.        *
  611.        * This function generates IR that approximates the following pseudo-GLSL:
  612.        *
  613.        *     return pack_uvec4_to_uint(uvec4(
  614.        *                round(clamp(VEC2_RVALUE, 0.0f, 1.0f) * 255.0f)));
  615.        *
  616.        * Here it is safe to directly convert the vec4 to uvec4 because the the
  617.        * vec4 has been clamped to a non-negative range.
  618.        */
  619.  
  620.       assert(vec4_rval->type == glsl_type::vec4_type);
  621.  
  622.       ir_rvalue *result = pack_uvec4_to_uint(
  623.          f2u(round_even(mul(saturate(vec4_rval), constant(255.0f)))));
  624.  
  625.       assert(result->type == glsl_type::uint_type);
  626.       return result;
  627.    }
  628.  
  629.    /**
  630.     * \brief Lower an unpackUnorm2x16 expression.
  631.     *
  632.     * \param uint_rval is unpackUnorm2x16's input
  633.     * \return unpackUnorm2x16's output as a vec2 rvalue
  634.     */
  635.    ir_rvalue*
  636.    lower_unpack_unorm_2x16(ir_rvalue *uint_rval)
  637.    {
  638.       /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
  639.        *
  640.        *    highp vec2 unpackUnorm2x16 (highp uint p)
  641.        *    -----------------------------------------
  642.        *    First, unpacks a single 32-bit unsigned integer p into a pair of
  643.        *    16-bit unsigned integers. Then, each component is converted to
  644.        *    a normalized floating-point value to generate the returned
  645.        *    two-component vector.
  646.        *
  647.        *    The conversion for unpacked fixed-point value f to floating point is
  648.        *    done as follows:
  649.        *
  650.        *       unpackUnorm2x16: f / 65535.0
  651.        *
  652.        *    The first component of the returned vector will be extracted from the
  653.        *    least significant bits of the input; the last component will be
  654.        *    extracted from the most significant bits.
  655.        *
  656.        * This function generates IR that approximates the following pseudo-GLSL:
  657.        *
  658.        *     return vec2(unpack_uint_to_uvec2(UINT_RVALUE)) / 65535.0;
  659.        */
  660.  
  661.       assert(uint_rval->type == glsl_type::uint_type);
  662.  
  663.       ir_rvalue *result = div(u2f(unpack_uint_to_uvec2(uint_rval)),
  664.                               constant(65535.0f));
  665.  
  666.       assert(result->type == glsl_type::vec2_type);
  667.       return result;
  668.    }
  669.  
  670.    /**
  671.     * \brief Lower an unpackUnorm4x8 expression.
  672.     *
  673.     * \param uint_rval is unpackUnorm4x8's input
  674.     * \return unpackUnorm4x8's output as a vec4 rvalue
  675.     */
  676.    ir_rvalue*
  677.    lower_unpack_unorm_4x8(ir_rvalue *uint_rval)
  678.    {
  679.       /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
  680.        *
  681.        *    highp vec4 unpackUnorm4x8 (highp uint p)
  682.        *    ----------------------------------------
  683.        *    First, unpacks a single 32-bit unsigned integer p into four
  684.        *    8-bit unsigned integers. Then, each component is converted to
  685.        *    a normalized floating-point value to generate the returned
  686.        *    two-component vector.
  687.        *
  688.        *    The conversion for unpacked fixed-point value f to floating point is
  689.        *    done as follows:
  690.        *
  691.        *       unpackUnorm4x8: f / 255.0
  692.        *
  693.        *    The first component of the returned vector will be extracted from the
  694.        *    least significant bits of the input; the last component will be
  695.        *    extracted from the most significant bits.
  696.        *
  697.        * This function generates IR that approximates the following pseudo-GLSL:
  698.        *
  699.        *     return vec4(unpack_uint_to_uvec4(UINT_RVALUE)) / 255.0;
  700.        */
  701.  
  702.       assert(uint_rval->type == glsl_type::uint_type);
  703.  
  704.       ir_rvalue *result = div(u2f(unpack_uint_to_uvec4(uint_rval)),
  705.                               constant(255.0f));
  706.  
  707.       assert(result->type == glsl_type::vec4_type);
  708.       return result;
  709.    }
  710.  
  711.    /**
  712.     * \brief Lower the component-wise calculation of packHalf2x16.
  713.     *
  714.     * \param f_rval is one component of packHalf2x16's input
  715.     * \param e_rval is the unshifted exponent bits of f_rval
  716.     * \param m_rval is the unshifted mantissa bits of f_rval
  717.     *
  718.     * \return a uint rvalue that encodes a float16 in its lower 16 bits
  719.     */
  720.    ir_rvalue*
  721.    pack_half_1x16_nosign(ir_rvalue *f_rval,
  722.                          ir_rvalue *e_rval,
  723.                          ir_rvalue *m_rval)
  724.    {
  725.       assert(e_rval->type == glsl_type::uint_type);
  726.       assert(m_rval->type == glsl_type::uint_type);
  727.  
  728.       /* uint u16; */
  729.       ir_variable *u16 = factory.make_temp(glsl_type::uint_type,
  730.                                            "tmp_pack_half_1x16_u16");
  731.  
  732.       /* float f = FLOAT_RVAL; */
  733.       ir_variable *f = factory.make_temp(glsl_type::float_type,
  734.                                           "tmp_pack_half_1x16_f");
  735.       factory.emit(assign(f, f_rval));
  736.  
  737.       /* uint e = E_RVAL; */
  738.       ir_variable *e = factory.make_temp(glsl_type::uint_type,
  739.                                           "tmp_pack_half_1x16_e");
  740.       factory.emit(assign(e, e_rval));
  741.  
  742.       /* uint m = M_RVAL; */
  743.       ir_variable *m = factory.make_temp(glsl_type::uint_type,
  744.                                           "tmp_pack_half_1x16_m");
  745.       factory.emit(assign(m, m_rval));
  746.  
  747.       /* Preliminaries
  748.        * -------------
  749.        *
  750.        * For a float16, the bit layout is:
  751.        *
  752.        *   sign:     15
  753.        *   exponent: 10:14
  754.        *   mantissa: 0:9
  755.        *
  756.        * Let f16 be a float16 value. The sign, exponent, and mantissa
  757.        * determine its value thus:
  758.        *
  759.        *   if e16 = 0 and m16 = 0, then zero:       (-1)^s16 * 0                               (1)
  760.        *   if e16 = 0 and m16 != 0, then subnormal: (-1)^s16 * 2^(e16 - 14) * (m16 / 2^10)     (2)
  761.        *   if 0 < e16 < 31, then normal:            (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3)
  762.        *   if e16 = 31 and m16 = 0, then infinite:  (-1)^s16 * inf                             (4)
  763.        *   if e16 = 31 and m16 != 0, then           NaN                                        (5)
  764.        *
  765.        * where 0 <= m16 < 2^10.
  766.        *
  767.        * For a float32, the bit layout is:
  768.        *
  769.        *   sign:     31
  770.        *   exponent: 23:30
  771.        *   mantissa: 0:22
  772.        *
  773.        * Let f32 be a float32 value. The sign, exponent, and mantissa
  774.        * determine its value thus:
  775.        *
  776.        *   if e32 = 0 and m32 = 0, then zero:        (-1)^s * 0                                (10)
  777.        *   if e32 = 0 and m32 != 0, then subnormal:  (-1)^s * 2^(e32 - 126) * (m32 / 2^23)     (11)
  778.        *   if 0 < e32 < 255, then normal:            (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12)
  779.        *   if e32 = 255 and m32 = 0, then infinite:  (-1)^s * inf                              (13)
  780.        *   if e32 = 255 and m32 != 0, then           NaN                                       (14)
  781.        *
  782.        * where 0 <= m32 < 2^23.
  783.        *
  784.        * The minimum and maximum normal float16 values are
  785.        *
  786.        *   min_norm16 = 2^(1 - 15) * (1 + 0 / 2^10) = 2^(-14)   (20)
  787.        *   max_norm16 = 2^(30 - 15) * (1 + 1023 / 2^10)         (21)
  788.        *
  789.        * The step at max_norm16 is
  790.        *
  791.        *   max_step16 = 2^5                                     (22)
  792.        *
  793.        * Observe that the float16 boundary values in equations 20-21 lie in the
  794.        * range of normal float32 values.
  795.        *
  796.        *
  797.        * Rounding Behavior
  798.        * -----------------
  799.        * Not all float32 values can be exactly represented as a float16. We
  800.        * round all such intermediate float32 values to the nearest float16; if
  801.        * the float32 is exactly between two float16 values, we round to the one
  802.        * with an even mantissa. This rounding behavior has several benefits:
  803.        *
  804.        *   - It has no sign bias.
  805.        *
  806.        *   - It reproduces the behavior of real hardware: opcode F32TO16 in Intel's
  807.        *     GPU ISA.
  808.        *
  809.        *   - By reproducing the behavior of the GPU (at least on Intel hardware),
  810.        *     compile-time evaluation of constant packHalf2x16 GLSL expressions will
  811.        *     result in the same value as if the expression were executed on the
  812.        *     GPU.
  813.        *
  814.        * Calculation
  815.        * -----------
  816.        * Our task is to compute s16, e16, m16 given f32.  Since this function
  817.        * ignores the sign bit, assume that s32 = s16 = 0.  There are several
  818.        * cases to consider.
  819.        */
  820.  
  821.       factory.emit(
  822.  
  823.          /* Case 1) f32 is NaN
  824.           *
  825.           *   The resultant f16 will also be NaN.
  826.           */
  827.  
  828.          /* if (e32 == 255 && m32 != 0) { */
  829.          if_tree(logic_and(equal(e, constant(0xffu << 23u)),
  830.                            logic_not(equal(m, constant(0u)))),
  831.  
  832.             assign(u16, constant(0x7fffu)),
  833.  
  834.          /* Case 2) f32 lies in the range [0, min_norm16).
  835.           *
  836.           *   The resultant float16 will be either zero, subnormal, or normal.
  837.           *
  838.           *   Solving
  839.           *
  840.           *     f32 = min_norm16       (30)
  841.           *
  842.           *   gives
  843.           *
  844.           *     e32 = 113 and m32 = 0  (31)
  845.           *
  846.           *   Therefore this case occurs if and only if
  847.           *
  848.           *     e32 < 113              (32)
  849.           */
  850.  
  851.          /* } else if (e32 < 113) { */
  852.          if_tree(less(e, constant(113u << 23u)),
  853.  
  854.             /* u16 = uint(round_to_even(abs(f32) * float(1u << 24u))); */
  855.             assign(u16, f2u(round_even(mul(expr(ir_unop_abs, f),
  856.                                            constant((float) (1 << 24)))))),
  857.  
  858.          /* Case 3) f32 lies in the range
  859.           *         [min_norm16, max_norm16 + max_step16).
  860.           *
  861.           *   The resultant float16 will be either normal or infinite.
  862.           *
  863.           *   Solving
  864.           *
  865.           *     f32 = max_norm16 + max_step16           (40)
  866.           *         = 2^15 * (1 + 1023 / 2^10) + 2^5    (41)
  867.           *         = 2^16                              (42)
  868.           *   gives
  869.           *
  870.           *     e32 = 143 and m32 = 0                   (43)
  871.           *
  872.           *   We already solved the boundary condition f32 = min_norm16 above
  873.           *   in equation 31. Therefore this case occurs if and only if
  874.           *
  875.           *     113 <= e32 and e32 < 143
  876.           */
  877.  
  878.          /* } else if (e32 < 143) { */
  879.          if_tree(less(e, constant(143u << 23u)),
  880.  
  881.             /* The addition below handles the case where the mantissa rounds
  882.              * up to 1024 and bumps the exponent.
  883.              *
  884.              * u16 = ((e - (112u << 23u)) >> 13u)
  885.              *     + uint(round_to_even(float(m) / float(1u << 13u)));
  886.              */
  887.             assign(u16, add(rshift(sub(e, constant(112u << 23u)),
  888.                                    constant(13u)),
  889.                             f2u(round_even(
  890.                                   div(u2f(m), constant((float) (1 << 13))))))),
  891.  
  892.          /* Case 4) f32 lies in the range [max_norm16 + max_step16, inf].
  893.           *
  894.           *   The resultant float16 will be infinite.
  895.           *
  896.           *   The cases above caught all float32 values in the range
  897.           *   [0, max_norm16 + max_step16), so this is the fall-through case.
  898.           */
  899.  
  900.          /* } else { */
  901.  
  902.             assign(u16, constant(31u << 10u))))));
  903.  
  904.          /* } */
  905.  
  906.        return deref(u16).val;
  907.    }
  908.  
  909.    /**
  910.     * \brief Lower a packHalf2x16 expression.
  911.     *
  912.     * \param vec2_rval is packHalf2x16's input
  913.     * \return packHalf2x16's output as a uint rvalue
  914.     */
  915.    ir_rvalue*
  916.    lower_pack_half_2x16(ir_rvalue *vec2_rval)
  917.    {
  918.       /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
  919.        *
  920.        *    highp uint packHalf2x16 (mediump vec2 v)
  921.        *    ----------------------------------------
  922.        *    Returns an unsigned integer obtained by converting the components of
  923.        *    a two-component floating-point vector to the 16-bit floating-point
  924.        *    representation found in the OpenGL ES Specification, and then packing
  925.        *    these two 16-bit integers into a 32-bit unsigned integer.
  926.        *
  927.        *    The first vector component specifies the 16 least-significant bits
  928.        *    of the result; the second component specifies the 16 most-significant
  929.        *    bits.
  930.        */
  931.  
  932.       assert(vec2_rval->type == glsl_type::vec2_type);
  933.  
  934.       /* vec2 f = VEC2_RVAL; */
  935.       ir_variable *f = factory.make_temp(glsl_type::vec2_type,
  936.                                          "tmp_pack_half_2x16_f");
  937.       factory.emit(assign(f, vec2_rval));
  938.  
  939.       /* uvec2 f32 = bitcast_f2u(f); */
  940.       ir_variable *f32 = factory.make_temp(glsl_type::uvec2_type,
  941.                                             "tmp_pack_half_2x16_f32");
  942.       factory.emit(assign(f32, expr(ir_unop_bitcast_f2u, f)));
  943.  
  944.       /* uvec2 f16; */
  945.       ir_variable *f16 = factory.make_temp(glsl_type::uvec2_type,
  946.                                         "tmp_pack_half_2x16_f16");
  947.  
  948.       /* Get f32's unshifted exponent bits.
  949.        *
  950.        *   uvec2 e = f32 & 0x7f800000u;
  951.        */
  952.       ir_variable *e = factory.make_temp(glsl_type::uvec2_type,
  953.                                           "tmp_pack_half_2x16_e");
  954.       factory.emit(assign(e, bit_and(f32, constant(0x7f800000u))));
  955.  
  956.       /* Get f32's unshifted mantissa bits.
  957.        *
  958.        *   uvec2 m = f32 & 0x007fffffu;
  959.        */
  960.       ir_variable *m = factory.make_temp(glsl_type::uvec2_type,
  961.                                           "tmp_pack_half_2x16_m");
  962.       factory.emit(assign(m, bit_and(f32, constant(0x007fffffu))));
  963.  
  964.       /* Set f16's exponent and mantissa bits.
  965.        *
  966.        *   f16.x = pack_half_1x16_nosign(f.x, e.x, m.x);
  967.        *   f16.y = pack_half_1x16_nosign(f.y, e.y, m.y);
  968.        */
  969.       factory.emit(assign(f16, pack_half_1x16_nosign(swizzle_x(f),
  970.                                                      swizzle_x(e),
  971.                                                      swizzle_x(m)),
  972.                            WRITEMASK_X));
  973.       factory.emit(assign(f16, pack_half_1x16_nosign(swizzle_y(f),
  974.                                                      swizzle_y(e),
  975.                                                      swizzle_y(m)),
  976.                            WRITEMASK_Y));
  977.  
  978.       /* Set f16's sign bits.
  979.        *
  980.        *   f16 |= (f32 & (1u << 31u)) >> 16u;
  981.        */
  982.       factory.emit(
  983.          assign(f16, bit_or(f16,
  984.                             rshift(bit_and(f32, constant(1u << 31u)),
  985.                                    constant(16u)))));
  986.  
  987.  
  988.       /* return (f16.y << 16u) | f16.x; */
  989.       ir_rvalue *result = bit_or(lshift(swizzle_y(f16),
  990.                                         constant(16u)),
  991.                                  swizzle_x(f16));
  992.  
  993.       assert(result->type == glsl_type::uint_type);
  994.       return result;
  995.    }
  996.  
  997.    /**
  998.     * \brief Split packHalf2x16's vec2 operand into two floats.
  999.     *
  1000.     * \param vec2_rval is packHalf2x16's input
  1001.     * \return a uint rvalue
  1002.     *
  1003.     * Some code generators, such as the i965 fragment shader, require that all
  1004.     * vector expressions be lowered to a sequence of scalar expressions.
  1005.     * However, packHalf2x16 cannot be scalarized by the same mechanism as
  1006.     * a true vector operation because its input and output have a differing
  1007.     * number of vector components.
  1008.     *
  1009.     * This method scalarizes packHalf2x16 by transforming it from a unary
  1010.     * operation having vector input to a binary operation having scalar input.
  1011.     * That is, it transforms
  1012.     *
  1013.     *    packHalf2x16(VEC2_RVAL);
  1014.     *
  1015.     * into
  1016.     *
  1017.     *    vec2 v = VEC2_RVAL;
  1018.     *    return packHalf2x16_split(v.x, v.y);
  1019.     */
  1020.    ir_rvalue*
  1021.    split_pack_half_2x16(ir_rvalue *vec2_rval)
  1022.    {
  1023.       assert(vec2_rval->type == glsl_type::vec2_type);
  1024.       /* vec2 src = VEC2_RVAL; */
  1025.       ir_variable *src =
  1026.          factory.make_temp(glsl_type::vec2_type, "tmp_split_pack_half_2x16_v");
  1027.       factory.emit(assign(src, vec2_rval));
  1028.       return expr(ir_binop_pack_half_2x16_split,  /* split(src.x, src.y) */
  1029.                   swizzle_x(src), swizzle_y(src));
  1030.    }
  1031.  
  1032.    /**
  1033.     * \brief Lower the component-wise calculation of unpackHalf2x16.
  1034.     *
  1035.     * Given a uint that encodes a float16 in its lower 16 bits, this function
  1036.     * returns a uint that encodes a float32 with the same value. The sign bit
  1037.     * of the float16 is ignored.
  1038.     *
  1039.     * \param e_rval is the unshifted exponent bits of a float16
  1040.     * \param m_rval is the unshifted mantissa bits of a float16
  1041.     * \return a uint rvalue that encodes a float32
  1042.     */
  1043.    ir_rvalue*
  1044.    unpack_half_1x16_nosign(ir_rvalue *e_rval, ir_rvalue *m_rval)
  1045.    {
  1046.       assert(e_rval->type == glsl_type::uint_type);
  1047.       assert(m_rval->type == glsl_type::uint_type);
  1048.  
  1049.       /* uint u32; */
  1050.       ir_variable *u32 = factory.make_temp(glsl_type::uint_type,
  1051.                                            "tmp_unpack_half_1x16_u32");
  1052.  
  1053.       /* uint e = E_RVAL; */
  1054.       ir_variable *e = factory.make_temp(glsl_type::uint_type,
  1055.                                           "tmp_unpack_half_1x16_e");
  1056.       factory.emit(assign(e, e_rval));
  1057.  
  1058.       /* uint m = M_RVAL; */
  1059.       ir_variable *m = factory.make_temp(glsl_type::uint_type,
  1060.                                           "tmp_unpack_half_1x16_m");
  1061.       factory.emit(assign(m, m_rval));
  1062.  
  1063.       /* Preliminaries
  1064.        * -------------
  1065.        *
  1066.        * For a float16, the bit layout is:
  1067.        *
  1068.        *   sign:     15
  1069.        *   exponent: 10:14
  1070.        *   mantissa: 0:9
  1071.        *
  1072.        * Let f16 be a float16 value. The sign, exponent, and mantissa
  1073.        * determine its value thus:
  1074.        *
  1075.        *   if e16 = 0 and m16 = 0, then zero:       (-1)^s16 * 0                               (1)
  1076.        *   if e16 = 0 and m16 != 0, then subnormal: (-1)^s16 * 2^(e16 - 14) * (m16 / 2^10)     (2)
  1077.        *   if 0 < e16 < 31, then normal:            (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3)
  1078.        *   if e16 = 31 and m16 = 0, then infinite:  (-1)^s16 * inf                             (4)
  1079.        *   if e16 = 31 and m16 != 0, then           NaN                                        (5)
  1080.        *
  1081.        * where 0 <= m16 < 2^10.
  1082.        *
  1083.        * For a float32, the bit layout is:
  1084.        *
  1085.        *   sign:     31
  1086.        *   exponent: 23:30
  1087.        *   mantissa: 0:22
  1088.        *
  1089.        * Let f32 be a float32 value. The sign, exponent, and mantissa
  1090.        * determine its value thus:
  1091.        *
  1092.        *   if e32 = 0 and m32 = 0, then zero:        (-1)^s * 0                                (10)
  1093.        *   if e32 = 0 and m32 != 0, then subnormal:  (-1)^s * 2^(e32 - 126) * (m32 / 2^23)     (11)
  1094.        *   if 0 < e32 < 255, then normal:            (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12)
  1095.        *   if e32 = 255 and m32 = 0, then infinite:  (-1)^s * inf                              (13)
  1096.        *   if e32 = 255 and m32 != 0, then           NaN                                       (14)
  1097.        *
  1098.        * where 0 <= m32 < 2^23.
  1099.        *
  1100.        * Calculation
  1101.        * -----------
  1102.        * Our task is to compute s32, e32, m32 given f16.  Since this function
  1103.        * ignores the sign bit, assume that s32 = s16 = 0.  There are several
  1104.        * cases to consider.
  1105.        */
  1106.  
  1107.       factory.emit(
  1108.  
  1109.          /* Case 1) f16 is zero or subnormal.
  1110.           *
  1111.           *   The simplest method of calculating f32 in this case is
  1112.           *
  1113.           *     f32 = f16                       (20)
  1114.           *         = 2^(-14) * (m16 / 2^10)    (21)
  1115.           *         = m16 / 2^24                (22)
  1116.           */
  1117.  
  1118.          /* if (e16 == 0) { */
  1119.          if_tree(equal(e, constant(0u)),
  1120.  
  1121.             /* u32 = bitcast_f2u(float(m) / float(1 << 24)); */
  1122.             assign(u32, expr(ir_unop_bitcast_f2u,
  1123.                                 div(u2f(m), constant((float)(1 << 24))))),
  1124.  
  1125.          /* Case 2) f16 is normal.
  1126.           *
  1127.           *   The equation
  1128.           *
  1129.           *     f32 = f16                              (30)
  1130.           *     2^(e32 - 127) * (1 + m32 / 2^23) =     (31)
  1131.           *       2^(e16 - 15) * (1 + m16 / 2^10)
  1132.           *
  1133.           *   can be decomposed into two
  1134.           *
  1135.           *     2^(e32 - 127) = 2^(e16 - 15)           (32)
  1136.           *     1 + m32 / 2^23 = 1 + m16 / 2^10        (33)
  1137.           *
  1138.           *   which solve to
  1139.           *
  1140.           *     e32 = e16 + 112                        (34)
  1141.           *     m32 = m16 * 2^13                       (35)
  1142.           */
  1143.  
  1144.          /* } else if (e16 < 31) { */
  1145.          if_tree(less(e, constant(31u << 10u)),
  1146.  
  1147.               /* u32 = ((e + (112 << 10)) | m) << 13;
  1148.                */
  1149.               assign(u32, lshift(bit_or(add(e, constant(112u << 10u)), m),
  1150.                                  constant(13u))),
  1151.  
  1152.  
  1153.          /* Case 3) f16 is infinite: } else if (m16 == 0) { */
  1154.          if_tree(equal(m, constant(0u)),
  1155.  
  1156.                  assign(u32, constant(255u << 23u)),
  1157.  
  1158.          /* Case 4) f16 is NaN. */
  1159.          /* } else { */
  1160.  
  1161.             assign(u32, constant(0x7fffffffu))))));
  1162.  
  1163.          /* } */
  1164.  
  1165.       return deref(u32).val;
  1166.    }
  1167.  
  1168.    /**
  1169.     * \brief Lower an unpackHalf2x16 expression.
  1170.     *
  1171.     * \param uint_rval is unpackHalf2x16's input
  1172.     * \return unpackHalf2x16's output as a vec2 rvalue
  1173.     */
  1174.    ir_rvalue*
  1175.    lower_unpack_half_2x16(ir_rvalue *uint_rval)
  1176.    {
  1177.       /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
  1178.        *
  1179.        *    mediump vec2 unpackHalf2x16 (highp uint v)
  1180.        *    ------------------------------------------
  1181.        *    Returns a two-component floating-point vector with components
  1182.        *    obtained by unpacking a 32-bit unsigned integer into a pair of 16-bit
  1183.        *    values, interpreting those values as 16-bit floating-point numbers
  1184.        *    according to the OpenGL ES Specification, and converting them to
  1185.        *    32-bit floating-point values.
  1186.        *
  1187.        *    The first component of the vector is obtained from the
  1188.        *    16 least-significant bits of v; the second component is obtained
  1189.        *    from the 16 most-significant bits of v.
  1190.        */
  1191.       assert(uint_rval->type == glsl_type::uint_type);
  1192.  
  1193.       /* uint u = RVALUE;
  1194.        * uvec2 f16 = uvec2(u & 0xffffu, u >> 16u);
  1195.        */
  1196.       ir_variable *f16 = factory.make_temp(glsl_type::uvec2_type,
  1197.                                             "tmp_unpack_half_2x16_f16");
  1198.       factory.emit(assign(f16, unpack_uint_to_uvec2(uint_rval)));
  1199.  
  1200.       /* uvec2 f32; */
  1201.       ir_variable *f32 = factory.make_temp(glsl_type::uvec2_type,
  1202.                                             "tmp_unpack_half_2x16_f32");
  1203.  
  1204.       /* Get f16's unshifted exponent bits.
  1205.        *
  1206.        *    uvec2 e = f16 & 0x7c00u;
  1207.        */
  1208.       ir_variable *e = factory.make_temp(glsl_type::uvec2_type,
  1209.                                           "tmp_unpack_half_2x16_e");
  1210.       factory.emit(assign(e, bit_and(f16, constant(0x7c00u))));
  1211.  
  1212.       /* Get f16's unshifted mantissa bits.
  1213.        *
  1214.        *    uvec2 m = f16 & 0x03ffu;
  1215.        */
  1216.       ir_variable *m = factory.make_temp(glsl_type::uvec2_type,
  1217.                                           "tmp_unpack_half_2x16_m");
  1218.       factory.emit(assign(m, bit_and(f16, constant(0x03ffu))));
  1219.  
  1220.       /* Set f32's exponent and mantissa bits.
  1221.        *
  1222.        *   f32.x = unpack_half_1x16_nosign(e.x, m.x);
  1223.        *   f32.y = unpack_half_1x16_nosign(e.y, m.y);
  1224.        */
  1225.       factory.emit(assign(f32, unpack_half_1x16_nosign(swizzle_x(e),
  1226.                                                        swizzle_x(m)),
  1227.                            WRITEMASK_X));
  1228.       factory.emit(assign(f32, unpack_half_1x16_nosign(swizzle_y(e),
  1229.                                                        swizzle_y(m)),
  1230.                            WRITEMASK_Y));
  1231.  
  1232.       /* Set f32's sign bit.
  1233.        *
  1234.        *    f32 |= (f16 & 0x8000u) << 16u;
  1235.        */
  1236.       factory.emit(assign(f32, bit_or(f32,
  1237.                                        lshift(bit_and(f16,
  1238.                                                       constant(0x8000u)),
  1239.                                               constant(16u)))));
  1240.  
  1241.       /* return bitcast_u2f(f32); */
  1242.       ir_rvalue *result = expr(ir_unop_bitcast_u2f, f32);
  1243.       assert(result->type == glsl_type::vec2_type);
  1244.       return result;
  1245.    }
  1246.  
  1247.    /**
  1248.     * \brief Split unpackHalf2x16 into two operations.
  1249.     *
  1250.     * \param uint_rval is unpackHalf2x16's input
  1251.     * \return a vec2 rvalue
  1252.     *
  1253.     * Some code generators, such as the i965 fragment shader, require that all
  1254.     * vector expressions be lowered to a sequence of scalar expressions.
  1255.     * However, unpackHalf2x16 cannot be scalarized by the same method as
  1256.     * a true vector operation because the number of components of its input
  1257.     * and output differ.
  1258.     *
  1259.     * This method scalarizes unpackHalf2x16 by transforming it from a single
  1260.     * operation having vec2 output to a pair of operations each having float
  1261.     * output. That is, it transforms
  1262.     *
  1263.     *   unpackHalf2x16(UINT_RVAL)
  1264.     *
  1265.     * into
  1266.     *
  1267.     *   uint u = UINT_RVAL;
  1268.     *   vec2 v;
  1269.     *
  1270.     *   v.x = unpackHalf2x16_split_x(u);
  1271.     *   v.y = unpackHalf2x16_split_y(u);
  1272.     *
  1273.     *   return v;
  1274.     */
  1275.    ir_rvalue*
  1276.    split_unpack_half_2x16(ir_rvalue *uint_rval)
  1277.    {
  1278.       assert(uint_rval->type == glsl_type::uint_type);
  1279.  
  1280.       /* uint u = uint_rval; */
  1281.       ir_variable *u = factory.make_temp(glsl_type::uint_type,
  1282.                                           "tmp_split_unpack_half_2x16_u");
  1283.       factory.emit(assign(u, uint_rval));
  1284.  
  1285.       /* vec2 v; */
  1286.       ir_variable *v = factory.make_temp(glsl_type::vec2_type,
  1287.                                           "tmp_split_unpack_half_2x16_v");
  1288.  
  1289.       /* v.x = unpack_half_2x16_split_x(u); */
  1290.       factory.emit(assign(v, expr(ir_unop_unpack_half_2x16_split_x, u),
  1291.                            WRITEMASK_X));
  1292.  
  1293.       /* v.y = unpack_half_2x16_split_y(u); */
  1294.       factory.emit(assign(v, expr(ir_unop_unpack_half_2x16_split_y, u),
  1295.                            WRITEMASK_Y));
  1296.  
  1297.       return deref(v).val;
  1298.    }
  1299. };
  1300.  
  1301. } // namespace anonymous
  1302.  
  1303. /**
  1304.  * \brief Lower the builtin packing functions.
  1305.  *
  1306.  * \param op_mask is a bitmask of `enum lower_packing_builtins_op`.
  1307.  */
  1308. bool
  1309. lower_packing_builtins(exec_list *instructions, int op_mask)
  1310. {
  1311.    lower_packing_builtins_visitor lowering(op_mask);  /* ops chosen by mask */
  1312.    visit_list_elements(&lowering, instructions, true);
  1313.    return lowering.get_progress();  /* whatever progress the visitor reports */
  1314. }
  1315.