  1. /**************************************************************************
  2.  *
  3.  * Copyright 2009 VMware, Inc.
  4.  * All Rights Reserved.
  5.  *
  6.  * Permission is hereby granted, free of charge, to any person obtaining a
  7.  * copy of this software and associated documentation files (the
  8.  * "Software"), to deal in the Software without restriction, including
  9.  * without limitation the rights to use, copy, modify, merge, publish,
  10.  * distribute, sub license, and/or sell copies of the Software, and to
  11.  * permit persons to whom the Software is furnished to do so, subject to
  12.  * the following conditions:
  13.  *
  14.  * The above copyright notice and this permission notice (including the
  15.  * next paragraph) shall be included in all copies or substantial portions
  16.  * of the Software.
  17.  *
  18.  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19.  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20.  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21.  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  22.  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23.  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24.  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25.  *
  26.  **************************************************************************/
  27.  
  28.  
  29. /**
  30.  * @file
  31.  * Helper functions for packing/unpacking.
  32.  *
  33.  * Packing/unpacking is necessary for conversion between types of different
  34.  * bit width.
  35.  *
  36.  * They are also commonly used when a computation needs higher
  37.  * precision for the intermediate values. For example, if one needs the
  38.  * function:
  39.  *
  40.  *   c = compute(a, b);
  41.  *
  42.  * to use more precision for intermediate results, then one should implement it
  43.  * as:
  44.  *
  45.  *   LLVMValueRef
  46.  *   compute(struct gallivm_state *gallivm, struct lp_type type, LLVMValueRef a, LLVMValueRef b)
  47.  *   {
  48.  *      struct lp_type wide_type = lp_wider_type(type);
  49.  *      LLVMValueRef al, ah, bl, bh, cl, ch, c;
  50.  *
  51.  *      lp_build_unpack2(gallivm, type, wide_type, a, &al, &ah);
  52.  *      lp_build_unpack2(gallivm, type, wide_type, b, &bl, &bh);
  53.  *
  54.  *      cl = compute_half(al, bl);
  55.  *      ch = compute_half(ah, bh);
  56.  *
  57.  *      c = lp_build_pack2(gallivm, wide_type, type, cl, ch);
  58.  *
  59.  *      return c;
  60.  *   }
  61.  *
  62.  * where compute_half() would do the computation for half the elements with
  63.  * twice the precision.
  64.  *
  65.  * @author Jose Fonseca <jfonseca@vmware.com>
  66.  */
  67.  
  68.  
  69. #include "util/u_debug.h"
  70. #include "util/u_math.h"
  71. #include "util/u_cpu_detect.h"
  72. #include "util/u_memory.h"
  73.  
  74. #include "lp_bld_type.h"
  75. #include "lp_bld_const.h"
  76. #include "lp_bld_init.h"
  77. #include "lp_bld_intr.h"
  78. #include "lp_bld_arit.h"
  79. #include "lp_bld_pack.h"
  80. #include "lp_bld_swizzle.h"
  81.  
  82.  
  83. /**
  84.  * Build shuffle vectors that match PUNPCKLxx and PUNPCKHxx instructions.
  85.  */
  86. static LLVMValueRef
  87. lp_build_const_unpack_shuffle(struct gallivm_state *gallivm,
  88.                               unsigned n, unsigned lo_hi)
  89. {
  90.    LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
  91.    unsigned i, j;
  92.  
  93.    assert(n <= LP_MAX_VECTOR_LENGTH);
  94.    assert(lo_hi < 2);
  95.  
  96.    /* TODO: cache results in a static table */
  97.  
  98.    for(i = 0, j = lo_hi*n/2; i < n; i += 2, ++j) {
  99.       elems[i + 0] = lp_build_const_int32(gallivm, 0 + j);
  100.       elems[i + 1] = lp_build_const_int32(gallivm, n + j);
  101.    }
  102.  
  103.    return LLVMConstVector(elems, n);
  104. }
  105.  
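/*
 * Worked example, traced from the loop above: for n = 8, lo_hi = 0 gives the
 * shuffle <0, 8, 1, 9, 2, 10, 3, 11> and lo_hi = 1 gives
 * <4, 12, 5, 13, 6, 14, 7, 15>, i.e. the element selection performed by
 * PUNPCKLxx and PUNPCKHxx respectively.
 */
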
  106. /**
  107.  * Similar to lp_build_const_unpack_shuffle but for the special AVX 256-bit unpack.
  108.  * See comment above lp_build_interleave2_half for more details.
  109.  */
  110. static LLVMValueRef
  111. lp_build_const_unpack_shuffle_half(struct gallivm_state *gallivm,
  112.                                    unsigned n, unsigned lo_hi)
  113. {
  114.    LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
  115.    unsigned i, j;
  116.  
  117.    assert(n <= LP_MAX_VECTOR_LENGTH);
  118.    assert(lo_hi < 2);
  119.  
  120.    for (i = 0, j = lo_hi*(n/4); i < n; i += 2, ++j) {
  121.       if (i == (n / 2))
  122.          j += n / 4;
  123.  
  124.       elems[i + 0] = lp_build_const_int32(gallivm, 0 + j);
  125.       elems[i + 1] = lp_build_const_int32(gallivm, n + j);
  126.    }
  127.  
  128.    return LLVMConstVector(elems, n);
  129. }
  130.  
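/*
 * Worked example, traced from the loop above: for n = 8, lo_hi = 0 gives
 * <0, 8, 1, 9, 4, 12, 5, 13> and lo_hi = 1 gives <2, 10, 3, 11, 6, 14, 7, 15>,
 * matching the per-128-bit-lane interleave described above
 * lp_build_interleave2_half.
 */
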
  131. /**
  132.  * Build shuffle vectors that match PACKxx (SSE) instructions or
  133.  * VPERM (Altivec).
  134.  */
  135. static LLVMValueRef
  136. lp_build_const_pack_shuffle(struct gallivm_state *gallivm, unsigned n)
  137. {
  138.    LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
  139.    unsigned i;
  140.  
  141.    assert(n <= LP_MAX_VECTOR_LENGTH);
  142.  
  143.    for(i = 0; i < n; ++i)
  144. #ifdef PIPE_ARCH_LITTLE_ENDIAN
  145.       elems[i] = lp_build_const_int32(gallivm, 2*i);
  146. #else
  147.       elems[i] = lp_build_const_int32(gallivm, 2*i+1);
  148. #endif
  149.  
  150.    return LLVMConstVector(elems, n);
  151. }
  152.  
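/*
 * Worked example, traced from the loop above: for n = 8 on a little-endian
 * target this builds <0, 2, 4, 6, 8, 10, 12, 14>, i.e. it selects the low
 * (narrow) half of every wide element from the two concatenated sources.
 */
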
  153. /**
  154.  * Return a vector with elements src[start:start+size].
  155.  * Most useful for getting half the values out of a 256-bit sized vector;
  156.  * otherwise it may cause data rearrangement to happen.
  157.  */
  158. LLVMValueRef
  159. lp_build_extract_range(struct gallivm_state *gallivm,
  160.                        LLVMValueRef src,
  161.                        unsigned start,
  162.                        unsigned size)
  163. {
  164.    LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
  165.    unsigned i;
  166.  
  167.    assert(size <= Elements(elems));
  168.  
  169.    for (i = 0; i < size; ++i)
  170.       elems[i] = lp_build_const_int32(gallivm, i + start);
  171.  
  172.    if (size == 1) {
  173.       return LLVMBuildExtractElement(gallivm->builder, src, elems[0], "");
  174.    }
  175.    else {
  176.       return LLVMBuildShuffleVector(gallivm->builder, src, src,
  177.                                     LLVMConstVector(elems, size), "");
  178.    }
  179. }
  180.  
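/*
 * Usage sketch (illustrative): taking the upper half of an 8-element vector,
 * e.g. one 128-bit half of a 256-bit vector:
 *
 *    LLVMValueRef upper = lp_build_extract_range(gallivm, src, 4, 4);
 */
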
  181. /**
  182.  * Concatenates several vectors (which must be a power of two in number and
  183.  * of the same type) into a larger one.
  184.  * Most useful for building up a 256-bit sized vector out of two 128-bit ones.
  185.  */
  186. LLVMValueRef
  187. lp_build_concat(struct gallivm_state *gallivm,
  188.                 LLVMValueRef src[],
  189.                 struct lp_type src_type,
  190.                 unsigned num_vectors)
  191. {
  192.    unsigned new_length, i;
  193.    LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH/2];
  194.    LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
  195.  
  196.    assert(src_type.length * num_vectors <= Elements(shuffles));
  197.    assert(util_is_power_of_two(num_vectors));
  198.  
  199.    new_length = src_type.length;
  200.  
  201.    for (i = 0; i < num_vectors; i++)
  202.       tmp[i] = src[i];
  203.  
  204.    while (num_vectors > 1) {
  205.       num_vectors >>= 1;
  206.       new_length <<= 1;
  207.       for (i = 0; i < new_length; i++) {
  208.          shuffles[i] = lp_build_const_int32(gallivm, i);
  209.       }
  210.       for (i = 0; i < num_vectors; i++) {
  211.          tmp[i] = LLVMBuildShuffleVector(gallivm->builder, tmp[i*2], tmp[i*2 + 1],
  212.                                          LLVMConstVector(shuffles, new_length), "");
  213.       }
  214.    }
  215.  
  216.    return tmp[0];
  217. }
  218.  
  219.  
  220. /**
  221.  * Combines vectors to reduce from num_srcs to num_dsts.
  222.  * Returns the number of src vectors concatenated in a single dst.
  223.  *
  224.  * num_srcs must be exactly divisible by num_dsts.
  225.  *
  226.  * e.g. For num_srcs = 4 and src = [x, y, z, w]
  227.  *          num_dsts = 1  dst = [xyzw]    return = 4
  228.  *          num_dsts = 2  dst = [xy, zw]  return = 2
  229.  */
  230. int
  231. lp_build_concat_n(struct gallivm_state *gallivm,
  232.                   struct lp_type src_type,
  233.                   LLVMValueRef *src,
  234.                   unsigned num_srcs,
  235.                   LLVMValueRef *dst,
  236.                   unsigned num_dsts)
  237. {
  238.    int size = num_srcs / num_dsts;
  239.    int i;
  240.  
  241.    assert(num_srcs >= num_dsts);
  242.    assert((num_srcs % size) == 0);
  243.  
  244.    if (num_srcs == num_dsts) {
  245.       for (i = 0; i < num_dsts; ++i) {
  246.          dst[i] = src[i];
  247.       }
  248.       return 1;
  249.    }
  250.  
  251.    for (i = 0; i < num_dsts; ++i) {
  252.       dst[i] = lp_build_concat(gallivm, &src[i * size], src_type, size);
  253.    }
  254.  
  255.    return size;
  256. }
  257.  
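/*
 * Usage sketch (hypothetical names, for illustration): reducing four 4 x float
 * vectors to two 8 x float vectors, where 'type4' is an lp_type describing
 * 4 x 32-bit float:
 *
 *    LLVMValueRef quarters[4];
 *    LLVMValueRef halves[2];
 *    int n = lp_build_concat_n(gallivm, type4, quarters, 4, halves, 2);
 *
 * Here n == 2, halves[0] is quarters[0..1] concatenated and halves[1] is
 * quarters[2..3] concatenated.
 */
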
  258.  
  259. /**
  260.  * Interleave vector elements.
  261.  *
  262.  * Matches the PUNPCKLxx and PUNPCKHxx SSE instructions
  263.  * (but not for 256bit AVX vectors).
  264.  */
  265. LLVMValueRef
  266. lp_build_interleave2(struct gallivm_state *gallivm,
  267.                      struct lp_type type,
  268.                      LLVMValueRef a,
  269.                      LLVMValueRef b,
  270.                      unsigned lo_hi)
  271. {
  272.    LLVMValueRef shuffle;
  273.  
  274.    if (type.length == 2 && type.width == 128 && util_cpu_caps.has_avx) {
  275.       /*
  276.        * XXX: This is a workaround for llvm code generation deficiency. Strangely
  277.        * enough, while this needs vinsertf128/vextractf128 instructions (hence
  278.        * a natural match when using 2x128bit vectors) the "normal" unpack shuffle
  279.        * generates code ranging from atrocious (llvm 3.1) to terrible (llvm 3.2, 3.3).
  280.        * So use some different shuffles instead (the exact shuffles don't seem to
  281.        * matter, as long as they avoid 128-bit wide elements; 8x32 or 4x64 both work).
  282.        */
  283.       struct lp_type tmp_type = type;
  284.       LLVMValueRef srchalf[2], tmpdst;
  285.       tmp_type.length = 4;
  286.       tmp_type.width = 64;
  287.       a = LLVMBuildBitCast(gallivm->builder, a, lp_build_vec_type(gallivm, tmp_type), "");
  288.       b = LLVMBuildBitCast(gallivm->builder, b, lp_build_vec_type(gallivm, tmp_type), "");
  289.       srchalf[0] = lp_build_extract_range(gallivm, a, lo_hi * 2, 2);
  290.       srchalf[1] = lp_build_extract_range(gallivm, b, lo_hi * 2, 2);
  291.       tmp_type.length = 2;
  292.       tmpdst = lp_build_concat(gallivm, srchalf, tmp_type, 2);
  293.       return LLVMBuildBitCast(gallivm->builder, tmpdst, lp_build_vec_type(gallivm, type), "");
  294.    }
  295.  
  296.    shuffle = lp_build_const_unpack_shuffle(gallivm, type.length, lo_hi);
  297.  
  298.    return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
  299. }
  300.  
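/*
 * Usage sketch (illustrative): for a 4-element type with
 * a = <a0 a1 a2 a3> and b = <b0 b1 b2 b3>,
 *
 *    lo = lp_build_interleave2(gallivm, type, a, b, 0);
 *    hi = lp_build_interleave2(gallivm, type, a, b, 1);
 *
 * gives lo = <a0 b0 a1 b1> and hi = <a2 b2 a3 b3>.
 */
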
  301. /**
  302.  * Interleave vector elements; for 256-bit vectors this treats the operands
  303.  * as two concatenated 128-bit vectors and interleaves each half separately.
  304.  *
  305.  * This differs from lp_build_interleave2, which would produce (for lo)
  306.  * a0 b0 a1 b1 a2 b2 a3 b3, a pattern that does not compile into an AVX unpack instruction.
  307.  *
  308.  *
  309.  * An example: interleaving 8 x float with 8 x float via the AVX 256-bit unpack:
  310.  *   a0 a1 a2 a3 a4 a5 a6 a7 <-> b0 b1 b2 b3 b4 b5 b6 b7
  311.  *
  312.  * Equivalent to interleaving 2x 128 bit vectors
  313.  *   a0 a1 a2 a3 <-> b0 b1 b2 b3 concatenated with a4 a5 a6 a7 <-> b4 b5 b6 b7
  314.  *
  315.  * So interleave-lo would result in:
  316.  *   a0 b0 a1 b1 a4 b4 a5 b5
  317.  *
  318.  * And interleave-hi would result in:
  319.  *   a2 b2 a3 b3 a6 b6 a7 b7
  320.  */
  321. LLVMValueRef
  322. lp_build_interleave2_half(struct gallivm_state *gallivm,
  323.                      struct lp_type type,
  324.                      LLVMValueRef a,
  325.                      LLVMValueRef b,
  326.                      unsigned lo_hi)
  327. {
  328.    if (type.length * type.width == 256) {
  329.       LLVMValueRef shuffle = lp_build_const_unpack_shuffle_half(gallivm, type.length, lo_hi);
  330.       return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
  331.    } else {
  332.       return lp_build_interleave2(gallivm, type, a, b, lo_hi);
  333.    }
  334. }
  335.  
  336. /**
  337.  * Double the bit width.
  338.  *
  339.  * This will only change the number of bits the values are represented with,
  340.  * not the values themselves.
  341.  */
  342. void
  343. lp_build_unpack2(struct gallivm_state *gallivm,
  344.                  struct lp_type src_type,
  345.                  struct lp_type dst_type,
  346.                  LLVMValueRef src,
  347.                  LLVMValueRef *dst_lo,
  348.                  LLVMValueRef *dst_hi)
  349. {
  350.    LLVMBuilderRef builder = gallivm->builder;
  351.    LLVMValueRef msb;
  352.    LLVMTypeRef dst_vec_type;
  353.  
  354.    assert(!src_type.floating);
  355.    assert(!dst_type.floating);
  356.    assert(dst_type.width == src_type.width * 2);
  357.    assert(dst_type.length * 2 == src_type.length);
  358.  
  359.    if(dst_type.sign && src_type.sign) {
  360.       /* Replicate the sign bit in the most significant bits */
  361.       msb = LLVMBuildAShr(builder, src, lp_build_const_int_vec(gallivm, src_type, src_type.width - 1), "");
  362.    }
  363.    else
  364.       /* Most significant bits always zero */
  365.       msb = lp_build_zero(gallivm, src_type);
  366.  
  367.    /* Interleave bits */
  368. #ifdef PIPE_ARCH_LITTLE_ENDIAN
  369.    *dst_lo = lp_build_interleave2(gallivm, src_type, src, msb, 0);
  370.    *dst_hi = lp_build_interleave2(gallivm, src_type, src, msb, 1);
  371. #else
  372.    *dst_lo = lp_build_interleave2(gallivm, src_type, msb, src, 0);
  373.    *dst_hi = lp_build_interleave2(gallivm, src_type, msb, src, 1);
  374. #endif
  375.  
  376.    /* Cast the result into the new type (twice as wide) */
  377.  
  378.    dst_vec_type = lp_build_vec_type(gallivm, dst_type);
  379.  
  380.    *dst_lo = LLVMBuildBitCast(builder, *dst_lo, dst_vec_type, "");
  381.    *dst_hi = LLVMBuildBitCast(builder, *dst_hi, dst_vec_type, "");
  382. }
  383.  
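/*
 * Usage sketch (hypothetical lp_type values: 'type16' = 8 x uint16,
 * 'type32' = 4 x uint32; little-endian case):
 *
 *    LLVMValueRef lo, hi;
 *    lp_build_unpack2(gallivm, type16, type32, src, &lo, &hi);
 *
 * lo holds src[0..3] and hi holds src[4..7], each zero-extended to 32 bits.
 */
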
  384.  
  385. /**
  386.  * Expand the bit width.
  387.  *
  388.  * This will only change the number of bits the values are represented with,
  389.  * not the values themselves.
  390.  */
  391. void
  392. lp_build_unpack(struct gallivm_state *gallivm,
  393.                 struct lp_type src_type,
  394.                 struct lp_type dst_type,
  395.                 LLVMValueRef src,
  396.                 LLVMValueRef *dst, unsigned num_dsts)
  397. {
  398.    unsigned num_tmps;
  399.    unsigned i;
  400.  
  401.    /* Register width must remain constant */
  402.    assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
  403.  
  404.    /* We must not lose or gain channels, only precision. */
  405.    assert(src_type.length == dst_type.length * num_dsts);
  406.  
  407.    num_tmps = 1;
  408.    dst[0] = src;
  409.  
  410.    while(src_type.width < dst_type.width) {
  411.       struct lp_type tmp_type = src_type;
  412.  
  413.       tmp_type.width *= 2;
  414.       tmp_type.length /= 2;
  415.  
  416.       for(i = num_tmps; i--; ) {
  417.          lp_build_unpack2(gallivm, src_type, tmp_type, dst[i], &dst[2*i + 0], &dst[2*i + 1]);
  418.       }
  419.  
  420.       src_type = tmp_type;
  421.  
  422.       num_tmps *= 2;
  423.    }
  424.  
  425.    assert(num_tmps == num_dsts);
  426. }
  427.  
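/*
 * Usage sketch (hypothetical lp_type values: 'type8' = 16 x uint8,
 * 'type32' = 4 x uint32; num_dsts must equal the width ratio, here 4):
 *
 *    LLVMValueRef wide[4];
 *    lp_build_unpack(gallivm, type8, type32, packed, wide, 4);
 *
 * wide[0..3] then hold the 16 original values widened to 32 bits each.
 */
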
  428.  
  429. /**
  430.  * Non-interleaved pack.
  431.  *
  432.  * This will move values as
  433.  *         (LSB)                     (MSB)
  434.  *   lo =   l0 __ l1 __ l2 __..  __ ln __
  435.  *   hi =   h0 __ h1 __ h2 __..  __ hn __
  436.  *   res =  l0 l1 l2 .. ln h0 h1 h2 .. hn
  437.  *
  438.  * This will only change the number of bits the values are represented with,
  439.  * not the values themselves.
  440.  *
  441.  * It is assumed the values are already clamped into the destination type range.
  442.  * Values outside that range will produce undefined results; use
  443.  * lp_build_packs2 instead when the inputs may not already be clamped.
  444.  */
  445. LLVMValueRef
  446. lp_build_pack2(struct gallivm_state *gallivm,
  447.                struct lp_type src_type,
  448.                struct lp_type dst_type,
  449.                LLVMValueRef lo,
  450.                LLVMValueRef hi)
  451. {
  452.    LLVMBuilderRef builder = gallivm->builder;
  453.    LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, dst_type);
  454.    LLVMValueRef shuffle;
  455.    LLVMValueRef res = NULL;
  456.    struct lp_type intr_type = dst_type;
  457.  
  458. #if HAVE_LLVM < 0x0207
  459.    intr_type = src_type;
  460. #endif
  461.  
  462.    assert(!src_type.floating);
  463.    assert(!dst_type.floating);
  464.    assert(src_type.width == dst_type.width * 2);
  465.    assert(src_type.length * 2 == dst_type.length);
  466.  
  467.    /* Check for special cases first */
  468.    if((util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec) &&
  469.        src_type.width * src_type.length >= 128) {
  470.       const char *intrinsic = NULL;
  471.  
  472.       switch(src_type.width) {
  473.       case 32:
  474.          if (util_cpu_caps.has_sse2) {
  475.            if(dst_type.sign) {
  476.               intrinsic = "llvm.x86.sse2.packssdw.128";
  477.            }
  478.            else {
  479.               if (util_cpu_caps.has_sse4_1) {
  480.                  intrinsic = "llvm.x86.sse41.packusdw";
  481. #if HAVE_LLVM < 0x0207
  482.                  /* llvm < 2.7 has inconsistent signatures except for packusdw */
  483.                  intr_type = dst_type;
  484. #endif
  485.               }
  486.            }
  487.          } else if (util_cpu_caps.has_altivec) {
  488.             if (dst_type.sign) {
  489.               intrinsic = "llvm.ppc.altivec.vpkswus";
  490.            } else {
  491.               intrinsic = "llvm.ppc.altivec.vpkuwus";
  492.            }
  493.          }
  494.          break;
  495.       case 16:
  496.          if (dst_type.sign) {
  497.             if (util_cpu_caps.has_sse2) {
  498.               intrinsic = "llvm.x86.sse2.packsswb.128";
  499.             } else if (util_cpu_caps.has_altivec) {
  500.               intrinsic = "llvm.ppc.altivec.vpkshss";
  501.             }
  502.          } else {
  503.             if (util_cpu_caps.has_sse2) {
  504.               intrinsic = "llvm.x86.sse2.packuswb.128";
  505.             } else if (util_cpu_caps.has_altivec) {
  506.               intrinsic = "llvm.ppc.altivec.vpkshus";
  507.             }
  508.          }
  509.          break;
  510.       /* default uses generic shuffle below */
  511.       }
  512.       if (intrinsic) {
  513.          if (src_type.width * src_type.length == 128) {
  514.             LLVMTypeRef intr_vec_type = lp_build_vec_type(gallivm, intr_type);
  515.             res = lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type, lo, hi);
  516.             if (dst_vec_type != intr_vec_type) {
  517.                res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
  518.             }
  519.          }
  520.          else {
  521.             int num_split = src_type.width * src_type.length / 128;
  522.             int i;
  523.             int nlen = 128 / src_type.width;
  524.             struct lp_type ndst_type = lp_type_unorm(dst_type.width, 128);
  525.             struct lp_type nintr_type = lp_type_unorm(intr_type.width, 128);
  526.             LLVMValueRef tmpres[LP_MAX_VECTOR_WIDTH / 128];
  527.             LLVMValueRef tmplo, tmphi;
  528.             LLVMTypeRef ndst_vec_type = lp_build_vec_type(gallivm, ndst_type);
  529.             LLVMTypeRef nintr_vec_type = lp_build_vec_type(gallivm, nintr_type);
  530.  
  531.             assert(num_split <= LP_MAX_VECTOR_WIDTH / 128);
  532.  
  533.             for (i = 0; i < num_split / 2; i++) {
  534.                tmplo = lp_build_extract_range(gallivm,
  535.                                               lo, i*nlen*2, nlen);
  536.                tmphi = lp_build_extract_range(gallivm,
  537.                                               lo, i*nlen*2 + nlen, nlen);
  538.                tmpres[i] = lp_build_intrinsic_binary(builder, intrinsic,
  539.                                                      nintr_vec_type, tmplo, tmphi);
  540.                if (ndst_vec_type != nintr_vec_type) {
  541.                   tmpres[i] = LLVMBuildBitCast(builder, tmpres[i], ndst_vec_type, "");
  542.                }
  543.             }
  544.             for (i = 0; i < num_split / 2; i++) {
  545.                tmplo = lp_build_extract_range(gallivm,
  546.                                               hi, i*nlen*2, nlen);
  547.                tmphi = lp_build_extract_range(gallivm,
  548.                                               hi, i*nlen*2 + nlen, nlen);
  549.                tmpres[i+num_split/2] = lp_build_intrinsic_binary(builder, intrinsic,
  550.                                                                  nintr_vec_type,
  551.                                                                  tmplo, tmphi);
  552.                if (ndst_vec_type != nintr_vec_type) {
  553.                   tmpres[i+num_split/2] = LLVMBuildBitCast(builder, tmpres[i+num_split/2],
  554.                                                            ndst_vec_type, "");
  555.                }
  556.             }
  557.             res = lp_build_concat(gallivm, tmpres, ndst_type, num_split);
  558.          }
  559.          return res;
  560.       }
  561.    }
  562.  
  563.    /* generic shuffle */
  564.    lo = LLVMBuildBitCast(builder, lo, dst_vec_type, "");
  565.    hi = LLVMBuildBitCast(builder, hi, dst_vec_type, "");
  566.  
  567.    shuffle = lp_build_const_pack_shuffle(gallivm, dst_type.length);
  568.  
  569.    res = LLVMBuildShuffleVector(builder, lo, hi, shuffle, "");
  570.  
  571.    return res;
  572. }
  573.  
  574.  
  575.  
  576. /**
  577.  * Non-interleaved pack and saturate.
  578.  *
  579.  * Same as lp_build_pack2 but will saturate values so that they fit into the
  580.  * destination type.
  581.  */
  582. LLVMValueRef
  583. lp_build_packs2(struct gallivm_state *gallivm,
  584.                 struct lp_type src_type,
  585.                 struct lp_type dst_type,
  586.                 LLVMValueRef lo,
  587.                 LLVMValueRef hi)
  588. {
  589.    boolean clamp;
  590.  
  591.    assert(!src_type.floating);
  592.    assert(!dst_type.floating);
  593.    assert(src_type.sign == dst_type.sign);
  594.    assert(src_type.width == dst_type.width * 2);
  595.    assert(src_type.length * 2 == dst_type.length);
  596.  
  597.    clamp = TRUE;
  598.  
  599.    /* All X86 SSE non-interleaved pack instructions take signed inputs and
  600.     * saturate them, so no need to clamp for those cases. */
  601.    if(util_cpu_caps.has_sse2 &&
  602.       src_type.width * src_type.length >= 128 &&
  603.       src_type.sign &&
  604.       (src_type.width == 32 || src_type.width == 16))
  605.       clamp = FALSE;
  606.  
  607.    if(clamp) {
  608.       struct lp_build_context bld;
  609.       unsigned dst_bits = dst_type.sign ? dst_type.width - 1 : dst_type.width;
  610.       LLVMValueRef dst_max = lp_build_const_int_vec(gallivm, src_type, ((unsigned long long)1 << dst_bits) - 1);
  611.       lp_build_context_init(&bld, gallivm, src_type);
  612.       lo = lp_build_min(&bld, lo, dst_max);
  613.       hi = lp_build_min(&bld, hi, dst_max);
  614.       /* FIXME: What about lower bound? */
  615.    }
  616.  
  617.    return lp_build_pack2(gallivm, src_type, dst_type, lo, hi);
  618. }
  619.  
  620.  
  621. /**
  622.  * Truncate the bit width.
  623.  *
  624.  * TODO: Handle saturation consistently.
  625.  */
  626. LLVMValueRef
  627. lp_build_pack(struct gallivm_state *gallivm,
  628.               struct lp_type src_type,
  629.               struct lp_type dst_type,
  630.               boolean clamped,
  631.               const LLVMValueRef *src, unsigned num_srcs)
  632. {
  633.    LLVMValueRef (*pack2)(struct gallivm_state *gallivm,
  634.                          struct lp_type src_type,
  635.                          struct lp_type dst_type,
  636.                          LLVMValueRef lo,
  637.                          LLVMValueRef hi);
  638.    LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
  639.    unsigned i;
  640.  
  641.    /* Register width must remain constant */
  642.    assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
  643.  
  644.    /* We must not lose or gain channels, only precision. */
  645.    assert(src_type.length * num_srcs == dst_type.length);
  646.  
  647.    if(clamped)
  648.       pack2 = &lp_build_pack2;
  649.    else
  650.       pack2 = &lp_build_packs2;
  651.  
  652.    for(i = 0; i < num_srcs; ++i)
  653.       tmp[i] = src[i];
  654.  
  655.    while(src_type.width > dst_type.width) {
  656.       struct lp_type tmp_type = src_type;
  657.  
  658.       tmp_type.width /= 2;
  659.       tmp_type.length *= 2;
  660.  
  661.       /* Take into consideration the sign changes only in the last step */
  662.       if(tmp_type.width == dst_type.width)
  663.          tmp_type.sign = dst_type.sign;
  664.  
  665.       num_srcs /= 2;
  666.  
  667.       for(i = 0; i < num_srcs; ++i)
  668.          tmp[i] = pack2(gallivm, src_type, tmp_type,
  669.                         tmp[2*i + 0], tmp[2*i + 1]);
  670.  
  671.       src_type = tmp_type;
  672.    }
  673.  
  674.    assert(num_srcs == 1);
  675.  
  676.    return tmp[0];
  677. }
  678.  
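/*
 * Usage sketch (hypothetical lp_type values: 'type32' = 4 x int32,
 * 'type8' = 16 x int8):
 *
 *    LLVMValueRef narrow = lp_build_pack(gallivm, type32, type8, FALSE, wide, 4);
 *
 * This packs the four 4 x int32 vectors in 'wide' into a single 16 x int8
 * vector, saturating each value since clamped is FALSE.
 */
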
  679.  
  680. /**
  681.  * Truncate or expand the bitwidth.
  682.  *
  683.  * NOTE: Getting the right sign flags is crucial here, as we employ some
  684.  * intrinsics that do saturation.
  685.  */
  686. void
  687. lp_build_resize(struct gallivm_state *gallivm,
  688.                 struct lp_type src_type,
  689.                 struct lp_type dst_type,
  690.                 const LLVMValueRef *src, unsigned num_srcs,
  691.                 LLVMValueRef *dst, unsigned num_dsts)
  692. {
  693.    LLVMBuilderRef builder = gallivm->builder;
  694.    LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
  695.    unsigned i;
  696.  
  697.    /*
  698.     * We don't support float <-> int conversion here. That must be done
  699.     * before/after calling this function.
  700.     */
  701.    assert(src_type.floating == dst_type.floating);
  702.  
  703.    /*
  704.     * We don't support double <-> float conversion yet, although it could be
  705.     * added with little effort.
  706.     */
  707.    assert((!src_type.floating && !dst_type.floating) ||
  708.           src_type.width == dst_type.width);
  709.  
  710.    /* We must not lose or gain channels, only precision. */
  711.    assert(src_type.length * num_srcs == dst_type.length * num_dsts);
  712.  
  713.    /* We don't support M:N conversion, only 1:N, M:1, or 1:1 */
  714.    assert(num_srcs == 1 || num_dsts == 1);
  715.  
  716.    assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
  717.    assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
  718.    assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
  719.    assert(num_dsts <= LP_MAX_VECTOR_LENGTH);
  720.  
  721.    if (src_type.width > dst_type.width) {
  722.       /*
  723.        * Truncate bit width.
  724.        */
  725.  
  726.       assert(num_dsts == 1);
  727.  
  728.       if (src_type.width * src_type.length == dst_type.width * dst_type.length) {
  729.         /*
  730.          * Register width remains constant -- use vector packing intrinsics
  731.          */
  732.          tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, src, num_srcs);
  733.       }
  734.       else {
  735.          if (src_type.width / dst_type.width > num_srcs) {
  736.             /*
  737.             * First change src vectors size (with shuffle) so they have the
  738.             * same size as the destination vector, then pack normally.
  739.             * Note: cannot use cast/extract because llvm generates atrocious code.
  740.             */
  741.             unsigned size_ratio = (src_type.width * src_type.length) /
  742.                                   (dst_type.length * dst_type.width);
  743.             unsigned new_length = src_type.length / size_ratio;
  744.  
  745.             for (i = 0; i < size_ratio * num_srcs; i++) {
  746.                unsigned start_index = (i % size_ratio) * new_length;
  747.                tmp[i] = lp_build_extract_range(gallivm, src[i / size_ratio],
  748.                                                start_index, new_length);
  749.             }
  750.             num_srcs *= size_ratio;
  751.             src_type.length = new_length;
  752.             tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, tmp, num_srcs);
  753.          }
  754.          else {
  755.             /*
  756.              * Truncate bit width but expand vector size - first pack
  757.              * then expand simply because this should be more AVX-friendly
  758.              * for the cases we probably hit.
  759.              */
  760.             unsigned size_ratio = (dst_type.width * dst_type.length) /
  761.                                   (src_type.length * src_type.width);
  762.             unsigned num_pack_srcs = num_srcs / size_ratio;
  763.             dst_type.length = dst_type.length / size_ratio;
  764.  
  765.             for (i = 0; i < size_ratio; i++) {
  766.                tmp[i] = lp_build_pack(gallivm, src_type, dst_type, TRUE,
  767.                                       &src[i*num_pack_srcs], num_pack_srcs);
  768.             }
  769.             tmp[0] = lp_build_concat(gallivm, tmp, dst_type, size_ratio);
  770.          }
  771.       }
  772.    }
  773.    else if (src_type.width < dst_type.width) {
  774.       /*
  775.        * Expand bit width.
  776.        */
  777.  
  778.       assert(num_srcs == 1);
  779.  
  780.       if (src_type.width * src_type.length == dst_type.width * dst_type.length) {
  781.          /*
  782.           * Register width remains constant -- use vector unpack intrinsics
  783.           */
  784.          lp_build_unpack(gallivm, src_type, dst_type, src[0], tmp, num_dsts);
  785.       }
  786.       else {
  787.          /*
  788.           * Do it element-wise.
  789.           */
  790.          assert(src_type.length * num_srcs == dst_type.length * num_dsts);
  791.  
  792.          for (i = 0; i < num_dsts; i++) {
  793.             tmp[i] = lp_build_undef(gallivm, dst_type);
  794.          }
  795.  
  796.          for (i = 0; i < src_type.length; ++i) {
  797.             unsigned j = i / dst_type.length;
  798.             LLVMValueRef srcindex = lp_build_const_int32(gallivm, i);
  799.             LLVMValueRef dstindex = lp_build_const_int32(gallivm, i % dst_type.length);
  800.             LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], srcindex, "");
  801.  
  802.             if (src_type.sign && dst_type.sign) {
  803.                val = LLVMBuildSExt(builder, val, lp_build_elem_type(gallivm, dst_type), "");
  804.             } else {
  805.                val = LLVMBuildZExt(builder, val, lp_build_elem_type(gallivm, dst_type), "");
  806.             }
  807.             tmp[j] = LLVMBuildInsertElement(builder, tmp[j], val, dstindex, "");
  808.          }
  809.       }
  810.    }
  811.    else {
  812.       /*
  813.        * No-op
  814.        */
  815.  
  816.       assert(num_srcs == 1);
  817.       assert(num_dsts == 1);
  818.  
  819.       tmp[0] = src[0];
  820.    }
  821.  
  822.    for(i = 0; i < num_dsts; ++i)
  823.       dst[i] = tmp[i];
  824. }
  825.  
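/*
 * Usage sketch (hypothetical lp_type values: 'type32x4' = 4 x int32,
 * 'type16x4' = 4 x int16):
 *
 *    LLVMValueRef dst;
 *    lp_build_resize(gallivm, type32x4, type16x4, &src, 1, &dst, 1);
 *
 * This narrows the four 32-bit values in src to 16 bits, yielding a single
 * 4 x int16 vector in dst.
 */
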
  826.  
  827. /**
  828.  * Expands src vector from src.length to dst_length
  829.  */
  830. LLVMValueRef
  831. lp_build_pad_vector(struct gallivm_state *gallivm,
  832.                     LLVMValueRef src,
  833.                     unsigned dst_length)
  834. {
  835.    LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
  836.    LLVMValueRef undef;
  837.    LLVMTypeRef type;
  838.    unsigned i, src_length;
  839.  
  840.    type = LLVMTypeOf(src);
  841.  
  842.    if (LLVMGetTypeKind(type) != LLVMVectorTypeKind) {
  843.       /* Can't use ShuffleVector on non-vector type */
  844.       undef = LLVMGetUndef(LLVMVectorType(type, dst_length));
  845.       return LLVMBuildInsertElement(gallivm->builder, undef, src, lp_build_const_int32(gallivm, 0), "");
  846.    }
  847.  
  848.    undef      = LLVMGetUndef(type);
  849.    src_length = LLVMGetVectorSize(type);
  850.  
  851.    assert(dst_length <= Elements(elems));
  852.    assert(dst_length >= src_length);
  853.  
  854.    if (src_length == dst_length)
  855.       return src;
  856.  
  857.    /* All elements from src vector */
  858.    for (i = 0; i < src_length; ++i)
  859.       elems[i] = lp_build_const_int32(gallivm, i);
  860.  
  861.    /* Fill the remaining space with undef */
  862.    for (i = src_length; i < dst_length; ++i)
  863.       elems[i] = lp_build_const_int32(gallivm, src_length);
  864.  
  865.    /* Combine the two vectors */
  866.    return LLVMBuildShuffleVector(gallivm->builder, src, undef, LLVMConstVector(elems, dst_length), "");
  867. }
  868.
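/*
 * Usage sketch (illustrative, 'vec4' standing for any 4-element vector value):
 * padding a 4-element vector to 8 elements; the four added elements are
 * undefined:
 *
 *    LLVMValueRef padded = lp_build_pad_vector(gallivm, vec4, 8);
 */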