
  1. /**************************************************************************
  2.  *
  3.  * Copyright 2009 VMware, Inc.
  4.  * All Rights Reserved.
  5.  *
  6.  * Permission is hereby granted, free of charge, to any person obtaining a
  7.  * copy of this software and associated documentation files (the
  8.  * "Software"), to deal in the Software without restriction, including
  9.  * without limitation the rights to use, copy, modify, merge, publish,
  10.  * distribute, sub license, and/or sell copies of the Software, and to
  11.  * permit persons to whom the Software is furnished to do so, subject to
  12.  * the following conditions:
  13.  *
  14.  * The above copyright notice and this permission notice (including the
  15.  * next paragraph) shall be included in all copies or substantial portions
  16.  * of the Software.
  17.  *
  18.  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19.  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20.  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21.  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  22.  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23.  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24.  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25.  *
  26.  **************************************************************************/
  27.  
  28.  
  29. /**
  30.  * @file
  31.  * Helper functions for packing/unpacking.
  32.  *
  33.  * Pack/unpacking is necessary for conversion between types of different
  34.  * bit width.
  35.  *
  36.  * They are also commonly used when a computation needs higher
  37.  * precision for the intermediate values. For example, if one needs the
  38.  * function:
  39.  *
  40.  *   c = compute(a, b);
  41.  *
  42.  * to use more precision for intermediate results then one should implement it
  43.  * as:
  44.  *
  45.  *   LLVMValueRef
  46.  *   compute(struct gallivm_state *gallivm, struct lp_type type, LLVMValueRef a, LLVMValueRef b)
  47.  *   {
  48.  *      struct lp_type wide_type = lp_wider_type(type);
  49.  *      LLVMValueRef al, ah, bl, bh, cl, ch, c;
  50.  *
  51.  *      lp_build_unpack2(gallivm, type, wide_type, a, &al, &ah);
  52.  *      lp_build_unpack2(gallivm, type, wide_type, b, &bl, &bh);
  53.  *
  54.  *      cl = compute_half(al, bl);
  55.  *      ch = compute_half(ah, bh);
  56.  *
  57.  *      c = lp_build_pack2(gallivm, wide_type, type, cl, ch);
  58.  *
  59.  *      return c;
  60.  *   }
  61.  *
  62.  * where compute_half() would do the computation for half the elements with
  63.  * twice the precision.
  64.  *
  65.  * @author Jose Fonseca <jfonseca@vmware.com>
  66.  */
  67.  
  68.  
  69. #include "util/u_debug.h"
  70. #include "util/u_math.h"
  71. #include "util/u_cpu_detect.h"
  72. #include "util/u_memory.h"
  73.  
  74. #include "lp_bld_type.h"
  75. #include "lp_bld_const.h"
  76. #include "lp_bld_init.h"
  77. #include "lp_bld_intr.h"
  78. #include "lp_bld_arit.h"
  79. #include "lp_bld_pack.h"
  80. #include "lp_bld_swizzle.h"
  81.  
  82.  
  83. /**
  84.  * Build shuffle vectors that match PUNPCKLxx and PUNPCKHxx instructions.
  85.  */
  86. static LLVMValueRef
  87. lp_build_const_unpack_shuffle(struct gallivm_state *gallivm,
  88.                               unsigned n, unsigned lo_hi)
  89. {
  90.    LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
  91.    unsigned i, j;
  92.  
  93.    assert(n <= LP_MAX_VECTOR_LENGTH);
  94.    assert(lo_hi < 2);
  95.  
  96.    /* TODO: cache results in a static table */
  97.  
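   /*
    * lo_hi = 0 interleaves the low halves of the two source vectors,
    * lo_hi = 1 the high halves, e.g. for n = 4:
    *   lo -> <0, 4, 1, 5>,  hi -> <2, 6, 3, 7>
    */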
  98.    for(i = 0, j = lo_hi*n/2; i < n; i += 2, ++j) {
  99.       elems[i + 0] = lp_build_const_int32(gallivm, 0 + j);
  100.       elems[i + 1] = lp_build_const_int32(gallivm, n + j);
  101.    }
  102.  
  103.    return LLVMConstVector(elems, n);
  104. }
  105.  
  106. /**
  107.  * Similar to lp_build_const_unpack_shuffle but for special AVX 256bit unpack.
  108.  * See comment above lp_build_interleave2_half for more details.
  109.  */
  110. static LLVMValueRef
  111. lp_build_const_unpack_shuffle_half(struct gallivm_state *gallivm,
  112.                                    unsigned n, unsigned lo_hi)
  113. {
  114.    LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
  115.    unsigned i, j;
  116.  
  117.    assert(n <= LP_MAX_VECTOR_LENGTH);
  118.    assert(lo_hi < 2);
  119.  
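   /*
    * Same pattern as lp_build_const_unpack_shuffle, but computed
    * independently per 128-bit half, e.g. for n = 8:
    *   lo -> <0, 8, 1, 9, 4, 12, 5, 13>,  hi -> <2, 10, 3, 11, 6, 14, 7, 15>
    */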
  120.    for (i = 0, j = lo_hi*(n/4); i < n; i += 2, ++j) {
  121.       if (i == (n / 2))
  122.          j += n / 4;
  123.  
  124.       elems[i + 0] = lp_build_const_int32(gallivm, 0 + j);
  125.       elems[i + 1] = lp_build_const_int32(gallivm, n + j);
  126.    }
  127.  
  128.    return LLVMConstVector(elems, n);
  129. }
  130.  
  131. /**
  132.  * Build shuffle vectors that match PACKxx (SSE) instructions or
  133.  * VPERM (Altivec).
  134.  */
  135. static LLVMValueRef
  136. lp_build_const_pack_shuffle(struct gallivm_state *gallivm, unsigned n)
  137. {
  138.    LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
  139.    unsigned i;
  140.  
  141.    assert(n <= LP_MAX_VECTOR_LENGTH);
  142.  
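   /*
    * Select every other narrow element from the concatenation of the two
    * (bitcast) source vectors: the low half of each wide source element on
    * little endian, the high half on big endian.
    */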
  143.    for(i = 0; i < n; ++i)
  144. #ifdef PIPE_ARCH_LITTLE_ENDIAN
  145.       elems[i] = lp_build_const_int32(gallivm, 2*i);
  146. #else
  147.       elems[i] = lp_build_const_int32(gallivm, 2*i+1);
  148. #endif
  149.  
  150.    return LLVMConstVector(elems, n);
  151. }
  152.  
  153. /**
  154.  * Return a vector with elements src[start:start+size].
  155.  * Most useful for getting half the values out of a 256-bit sized vector;
  156.  * otherwise it may cause data rearrangement to happen.
  157.  */
  158. LLVMValueRef
  159. lp_build_extract_range(struct gallivm_state *gallivm,
  160.                        LLVMValueRef src,
  161.                        unsigned start,
  162.                        unsigned size)
  163. {
  164.    LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
  165.    unsigned i;
  166.  
  167.    assert(size <= Elements(elems));
  168.  
  169.    for (i = 0; i < size; ++i)
  170.       elems[i] = lp_build_const_int32(gallivm, i + start);
  171.  
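   /* A single element is returned as a scalar (extractelement) rather than
    * as a one-element vector.
    */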
  172.    if (size == 1) {
  173.       return LLVMBuildExtractElement(gallivm->builder, src, elems[0], "");
  174.    }
  175.    else {
  176.       return LLVMBuildShuffleVector(gallivm->builder, src, src,
  177.                                     LLVMConstVector(elems, size), "");
  178.    }
  179. }
  180.  
  181. /**
  182.  * Concatenates several (the number must be a power of 2) vectors (of the
  183.  * same type) into a larger one.
  184.  * Most useful for building up a 256-bit sized vector out of two 128-bit ones.
  185.  */
  186. LLVMValueRef
  187. lp_build_concat(struct gallivm_state *gallivm,
  188.                 LLVMValueRef src[],
  189.                 struct lp_type src_type,
  190.                 unsigned num_vectors)
  191. {
  192.    unsigned new_length, i;
  193.    LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH/2];
  194.    LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
  195.  
  196.    assert(src_type.length * num_vectors <= Elements(shuffles));
  197.    assert(util_is_power_of_two(num_vectors));
  198.  
  199.    new_length = src_type.length;
  200.  
  201.    for (i = 0; i < num_vectors; i++)
  202.       tmp[i] = src[i];
  203.  
  204.    while (num_vectors > 1) {
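      /*
       * Concatenate pairwise: each pass halves the number of vectors and
       * doubles their length, shuffling each pair into one vector that
       * keeps all elements in order.
       */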
  205.       num_vectors >>= 1;
  206.       new_length <<= 1;
  207.       for (i = 0; i < new_length; i++) {
  208.          shuffles[i] = lp_build_const_int32(gallivm, i);
  209.       }
  210.       for (i = 0; i < num_vectors; i++) {
  211.          tmp[i] = LLVMBuildShuffleVector(gallivm->builder, tmp[i*2], tmp[i*2 + 1],
  212.                                          LLVMConstVector(shuffles, new_length), "");
  213.       }
  214.    }
  215.  
  216.    return tmp[0];
  217. }
  218.  
  219.  
  220. /**
  221.  * Combines vectors to reduce from num_srcs to num_dsts.
  222.  * Returns the number of src vectors concatenated in a single dst.
  223.  *
  224.  * num_srcs must be exactly divisible by num_dsts.
  225.  *
  226.  * e.g. For num_srcs = 4 and src = [x, y, z, w]
  227.  *          num_dsts = 1  dst = [xyzw]    return = 4
  228.  *          num_dsts = 2  dst = [xy, zw]  return = 2
  229.  */
  230. int
  231. lp_build_concat_n(struct gallivm_state *gallivm,
  232.                   struct lp_type src_type,
  233.                   LLVMValueRef *src,
  234.                   unsigned num_srcs,
  235.                   LLVMValueRef *dst,
  236.                   unsigned num_dsts)
  237. {
  238.    int size = num_srcs / num_dsts;
  239.    int i;
  240.  
  241.    assert(num_srcs >= num_dsts);
  242.    assert((num_srcs % size) == 0);
  243.  
  244.    if (num_srcs == num_dsts) {
  245.       for (i = 0; i < num_dsts; ++i) {
  246.          dst[i] = src[i];
  247.       }
  248.       return 1;
  249.    }
  250.  
  251.    for (i = 0; i < num_dsts; ++i) {
  252.       dst[i] = lp_build_concat(gallivm, &src[i * size], src_type, size);
  253.    }
  254.  
  255.    return size;
  256. }
  257.  
  258.  
  259. /**
  260.  * Interleave vector elements.
  261.  *
  262.  * Matches the PUNPCKLxx and PUNPCKHxx SSE instructions
  263.  * (but not for 256bit AVX vectors).
  264.  */
  265. LLVMValueRef
  266. lp_build_interleave2(struct gallivm_state *gallivm,
  267.                      struct lp_type type,
  268.                      LLVMValueRef a,
  269.                      LLVMValueRef b,
  270.                      unsigned lo_hi)
  271. {
  272.    LLVMValueRef shuffle;
  273.  
  274.    if (type.length == 2 && type.width == 128 && util_cpu_caps.has_avx) {
  275.       /*
  276.        * XXX: This is a workaround for llvm code generation deficiency. Strangely
  277.        * enough, while this needs vinsertf128/vextractf128 instructions (hence
  278.        * a natural match when using 2x128bit vectors) the "normal" unpack shuffle
  279.        * generates code ranging from atrocious (llvm 3.1) to terrible (llvm 3.2, 3.3).
  280.        * So use some different shuffles instead (the exact shuffles don't seem to
  281.        * matter, as long as they avoid 128-bit wide elements; 8x32 or 4x64 both work).
  282.        */
  283.       struct lp_type tmp_type = type;
  284.       LLVMValueRef srchalf[2], tmpdst;
  285.       tmp_type.length = 4;
  286.       tmp_type.width = 64;
  287.       a = LLVMBuildBitCast(gallivm->builder, a, lp_build_vec_type(gallivm, tmp_type), "");
  288.       b = LLVMBuildBitCast(gallivm->builder, b, lp_build_vec_type(gallivm, tmp_type), "");
  289.       srchalf[0] = lp_build_extract_range(gallivm, a, lo_hi * 2, 2);
  290.       srchalf[1] = lp_build_extract_range(gallivm, b, lo_hi * 2, 2);
  291.       tmp_type.length = 2;
  292.       tmpdst = lp_build_concat(gallivm, srchalf, tmp_type, 2);
  293.       return LLVMBuildBitCast(gallivm->builder, tmpdst, lp_build_vec_type(gallivm, type), "");
  294.    }
  295.  
  296.    shuffle = lp_build_const_unpack_shuffle(gallivm, type.length, lo_hi);
  297.  
  298.    return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
  299. }
  300.  
  301. /**
  302.  * Interleave vector elements, but for 256-bit vectors treat them as two
  303.  * concatenated 128-bit vectors which are interleaved separately.
  304.  *
  305.  * This differs from lp_build_interleave2, which would produce the following (for lo):
  306.  * a0 b0 a1 b1 a2 b2 a3 b3, and that does not compile into an AVX unpack instruction.
  307.  *
  308.  *
  309.  * An example interleave 8x float with 8x float on AVX 256bit unpack:
  310.  *   a0 a1 a2 a3 a4 a5 a6 a7 <-> b0 b1 b2 b3 b4 b5 b6 b7
  311.  *
  312.  * Equivalent to interleaving 2x 128 bit vectors
  313.  *   a0 a1 a2 a3 <-> b0 b1 b2 b3 concatenated with a4 a5 a6 a7 <-> b4 b5 b6 b7
  314.  *
  315.  * So interleave-lo would result in:
  316.  *   a0 b0 a1 b1 a4 b4 a5 b5
  317.  *
  318.  * And interleave-hi would result in:
  319.  *   a2 b2 a3 b3 a6 b6 a7 b7
  320.  */
  321. LLVMValueRef
  322. lp_build_interleave2_half(struct gallivm_state *gallivm,
  323.                      struct lp_type type,
  324.                      LLVMValueRef a,
  325.                      LLVMValueRef b,
  326.                      unsigned lo_hi)
  327. {
  328.    if (type.length * type.width == 256) {
  329.       LLVMValueRef shuffle = lp_build_const_unpack_shuffle_half(gallivm, type.length, lo_hi);
  330.       return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
  331.    } else {
  332.       return lp_build_interleave2(gallivm, type, a, b, lo_hi);
  333.    }
  334. }
  335.  
  336. /**
  337.  * Double the bit width.
  338.  *
  339.  * This will only change the number of bits in which the values are
  340.  * represented, not the values themselves.
  341.  */
  342. void
  343. lp_build_unpack2(struct gallivm_state *gallivm,
  344.                  struct lp_type src_type,
  345.                  struct lp_type dst_type,
  346.                  LLVMValueRef src,
  347.                  LLVMValueRef *dst_lo,
  348.                  LLVMValueRef *dst_hi)
  349. {
  350.    LLVMBuilderRef builder = gallivm->builder;
  351.    LLVMValueRef msb;
  352.    LLVMTypeRef dst_vec_type;
  353.  
  354.    assert(!src_type.floating);
  355.    assert(!dst_type.floating);
  356.    assert(dst_type.width == src_type.width * 2);
  357.    assert(dst_type.length * 2 == src_type.length);
  358.  
  359.    if(dst_type.sign && src_type.sign) {
  360.       /* Replicate the sign bit in the most significant bits */
  361.       msb = LLVMBuildAShr(builder, src, lp_build_const_int_vec(gallivm, src_type, src_type.width - 1), "");
  362.    }
  363.    else
  364.       /* Most significant bits always zero */
  365.       msb = lp_build_zero(gallivm, src_type);
  366.  
  367.    /* Interleave each source element with its sign/zero extension bits */
  368. #ifdef PIPE_ARCH_LITTLE_ENDIAN
  369.    *dst_lo = lp_build_interleave2(gallivm, src_type, src, msb, 0);
  370.    *dst_hi = lp_build_interleave2(gallivm, src_type, src, msb, 1);
  371. #else
  372.    *dst_lo = lp_build_interleave2(gallivm, src_type, msb, src, 0);
  373.    *dst_hi = lp_build_interleave2(gallivm, src_type, msb, src, 1);
  374. #endif
  375.  
  376.    /* Cast the result into the new type (twice as wide) */
  377.  
  378.    dst_vec_type = lp_build_vec_type(gallivm, dst_type);
  379.  
  380.    *dst_lo = LLVMBuildBitCast(builder, *dst_lo, dst_vec_type, "");
  381.    *dst_hi = LLVMBuildBitCast(builder, *dst_hi, dst_vec_type, "");
  382. }
  383.  
  384.  
  385. /**
  386.  * Expand the bit width.
  387.  *
  388.  * This will only change the number of bits in which the values are
  389.  * represented, not the values themselves.
  390.  */
  391. void
  392. lp_build_unpack(struct gallivm_state *gallivm,
  393.                 struct lp_type src_type,
  394.                 struct lp_type dst_type,
  395.                 LLVMValueRef src,
  396.                 LLVMValueRef *dst, unsigned num_dsts)
  397. {
  398.    unsigned num_tmps;
  399.    unsigned i;
  400.  
  401.    /* Register width must remain constant */
  402.    assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
  403.  
  404.    /* We must not lose or gain channels, only precision */
  405.    assert(src_type.length == dst_type.length * num_dsts);
  406.  
  407.    num_tmps = 1;
  408.    dst[0] = src;
  409.  
  410.    while(src_type.width < dst_type.width) {
  411.       struct lp_type tmp_type = src_type;
  412.  
  413.       tmp_type.width *= 2;
  414.       tmp_type.length /= 2;
  415.  
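      /* Iterate in reverse so each dst[i] is read before it can be
       * overwritten; a forward loop would clobber inputs not yet processed.
       */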
  416.       for(i = num_tmps; i--; ) {
  417.          lp_build_unpack2(gallivm, src_type, tmp_type, dst[i], &dst[2*i + 0], &dst[2*i + 1]);
  418.       }
  419.  
  420.       src_type = tmp_type;
  421.  
  422.       num_tmps *= 2;
  423.    }
  424.  
  425.    assert(num_tmps == num_dsts);
  426. }
  427.  
  428.  
  429. /**
  430.  * Non-interleaved pack.
  431.  *
  432.  * This will move values as
  433.  *         (LSB)                     (MSB)
  434.  *   lo =   l0 __ l1 __ l2 __..  __ ln __
  435.  *   hi =   h0 __ h1 __ h2 __..  __ hn __
  436.  *   res =  l0 l1 l2 .. ln h0 h1 h2 .. hn
  437.  *
  438.  * This will only change the number of bits in which the values are
  439.  * represented, not the values themselves.
  440.  *
  441.  * It is assumed the values are already clamped into the destination type range.
  442.  * Values outside that range will produce undefined results; use
  443.  * lp_build_packs2 instead if that cannot be guaranteed.
  444.  */
  445. LLVMValueRef
  446. lp_build_pack2(struct gallivm_state *gallivm,
  447.                struct lp_type src_type,
  448.                struct lp_type dst_type,
  449.                LLVMValueRef lo,
  450.                LLVMValueRef hi)
  451. {
  452.    LLVMBuilderRef builder = gallivm->builder;
  453.    LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, dst_type);
  454.    LLVMValueRef shuffle;
  455.    LLVMValueRef res = NULL;
  456.    struct lp_type intr_type = dst_type;
  457.  
  458.    assert(!src_type.floating);
  459.    assert(!dst_type.floating);
  460.    assert(src_type.width == dst_type.width * 2);
  461.    assert(src_type.length * 2 == dst_type.length);
  462.  
  463.    /* Check for special cases first */
  464.    if((util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec) &&
  465.        src_type.width * src_type.length >= 128) {
  466.       const char *intrinsic = NULL;
  467.       boolean swap_intrinsic_operands = FALSE;
  468.  
  469.       switch(src_type.width) {
  470.       case 32:
  471.          if (util_cpu_caps.has_sse2) {
  472.            if(dst_type.sign) {
  473.               intrinsic = "llvm.x86.sse2.packssdw.128";
  474.            }
  475.            else {
  476.               if (util_cpu_caps.has_sse4_1) {
  477.                  intrinsic = "llvm.x86.sse41.packusdw";
  478.               }
  479.            }
  480.          } else if (util_cpu_caps.has_altivec) {
  481.             if (dst_type.sign) {
  482.               intrinsic = "llvm.ppc.altivec.vpkswus";
  483.            } else {
  484.               intrinsic = "llvm.ppc.altivec.vpkuwus";
  485.            }
  486. #ifdef PIPE_ARCH_LITTLE_ENDIAN
  487.            swap_intrinsic_operands = TRUE;
  488. #endif
  489.          }
  490.          break;
  491.       case 16:
  492.          if (dst_type.sign) {
  493.             if (util_cpu_caps.has_sse2) {
  494.               intrinsic = "llvm.x86.sse2.packsswb.128";
  495.             } else if (util_cpu_caps.has_altivec) {
  496.               intrinsic = "llvm.ppc.altivec.vpkshss";
  497. #ifdef PIPE_ARCH_LITTLE_ENDIAN
  498.               swap_intrinsic_operands = TRUE;
  499. #endif
  500.             }
  501.          } else {
  502.             if (util_cpu_caps.has_sse2) {
  503.               intrinsic = "llvm.x86.sse2.packuswb.128";
  504.             } else if (util_cpu_caps.has_altivec) {
  505.               intrinsic = "llvm.ppc.altivec.vpkshus";
  506. #ifdef PIPE_ARCH_LITTLE_ENDIAN
  507.               swap_intrinsic_operands = TRUE;
  508. #endif
  509.             }
  510.          }
  511.          break;
  512.       /* default uses generic shuffle below */
  513.       }
  514.       if (intrinsic) {
  515.          if (src_type.width * src_type.length == 128) {
  516.             LLVMTypeRef intr_vec_type = lp_build_vec_type(gallivm, intr_type);
  517.             if (swap_intrinsic_operands) {
  518.                res = lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type, hi, lo);
  519.             } else {
  520.                res = lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type, lo, hi);
  521.             }
  522.             if (dst_vec_type != intr_vec_type) {
  523.                res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
  524.             }
  525.          }
  526.          else {
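            /*
             * The input is wider than 128 bits: split lo and hi into 128-bit
             * pieces, pack each pair with the 128-bit intrinsic and
             * concatenate the partial results again at the end.
             */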
  527.             int num_split = src_type.width * src_type.length / 128;
  528.             int i;
  529.             int nlen = 128 / src_type.width;
  530.             int lo_off = swap_intrinsic_operands ? nlen : 0;
  531.             int hi_off = swap_intrinsic_operands ? 0 : nlen;
  532.             struct lp_type ndst_type = lp_type_unorm(dst_type.width, 128);
  533.             struct lp_type nintr_type = lp_type_unorm(intr_type.width, 128);
  534.             LLVMValueRef tmpres[LP_MAX_VECTOR_WIDTH / 128];
  535.             LLVMValueRef tmplo, tmphi;
  536.             LLVMTypeRef ndst_vec_type = lp_build_vec_type(gallivm, ndst_type);
  537.             LLVMTypeRef nintr_vec_type = lp_build_vec_type(gallivm, nintr_type);
  538.  
  539.             assert(num_split <= LP_MAX_VECTOR_WIDTH / 128);
  540.  
  541.             for (i = 0; i < num_split / 2; i++) {
  542.                tmplo = lp_build_extract_range(gallivm,
  543.                                               lo, i*nlen*2 + lo_off, nlen);
  544.                tmphi = lp_build_extract_range(gallivm,
  545.                                               lo, i*nlen*2 + hi_off, nlen);
  546.                tmpres[i] = lp_build_intrinsic_binary(builder, intrinsic,
  547.                                                      nintr_vec_type, tmplo, tmphi);
  548.                if (ndst_vec_type != nintr_vec_type) {
  549.                   tmpres[i] = LLVMBuildBitCast(builder, tmpres[i], ndst_vec_type, "");
  550.                }
  551.             }
  552.             for (i = 0; i < num_split / 2; i++) {
  553.                tmplo = lp_build_extract_range(gallivm,
  554.                                               hi, i*nlen*2 + lo_off, nlen);
  555.                tmphi = lp_build_extract_range(gallivm,
  556.                                               hi, i*nlen*2 + hi_off, nlen);
  557.                tmpres[i+num_split/2] = lp_build_intrinsic_binary(builder, intrinsic,
  558.                                                                  nintr_vec_type,
  559.                                                                  tmplo, tmphi);
  560.                if (ndst_vec_type != nintr_vec_type) {
  561.                   tmpres[i+num_split/2] = LLVMBuildBitCast(builder, tmpres[i+num_split/2],
  562.                                                            ndst_vec_type, "");
  563.                }
  564.             }
  565.             res = lp_build_concat(gallivm, tmpres, ndst_type, num_split);
  566.          }
  567.          return res;
  568.       }
  569.    }
  570.  
  571.    /* generic shuffle */
  572.    lo = LLVMBuildBitCast(builder, lo, dst_vec_type, "");
  573.    hi = LLVMBuildBitCast(builder, hi, dst_vec_type, "");
  574.  
  575.    shuffle = lp_build_const_pack_shuffle(gallivm, dst_type.length);
  576.  
  577.    res = LLVMBuildShuffleVector(builder, lo, hi, shuffle, "");
  578.  
  579.    return res;
  580. }
  581.  
  582.  
  583.  
  584. /**
  585.  * Non-interleaved pack and saturate.
  586.  *
  587.  * Same as lp_build_pack2 but will saturate values so that they fit into the
  588.  * destination type.
  589.  */
  590. LLVMValueRef
  591. lp_build_packs2(struct gallivm_state *gallivm,
  592.                 struct lp_type src_type,
  593.                 struct lp_type dst_type,
  594.                 LLVMValueRef lo,
  595.                 LLVMValueRef hi)
  596. {
  597.    boolean clamp;
  598.  
  599.    assert(!src_type.floating);
  600.    assert(!dst_type.floating);
  601.    assert(src_type.sign == dst_type.sign);
  602.    assert(src_type.width == dst_type.width * 2);
  603.    assert(src_type.length * 2 == dst_type.length);
  604.  
  605.    clamp = TRUE;
  606.  
  607.    /* All X86 SSE non-interleaved pack instructions take signed inputs and
  608.     * saturate them, so no need to clamp for those cases. */
  609.    if(util_cpu_caps.has_sse2 &&
  610.       src_type.width * src_type.length >= 128 &&
  611.       src_type.sign &&
  612.       (src_type.width == 32 || src_type.width == 16))
  613.       clamp = FALSE;
  614.  
  615.    if(clamp) {
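      /* Clamp to the largest value representable in the destination type
       * (the sign bit is excluded from dst_bits for signed types).
       */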
  616.       struct lp_build_context bld;
  617.       unsigned dst_bits = dst_type.sign ? dst_type.width - 1 : dst_type.width;
  618.       LLVMValueRef dst_max = lp_build_const_int_vec(gallivm, src_type, ((unsigned long long)1 << dst_bits) - 1);
  619.       lp_build_context_init(&bld, gallivm, src_type);
  620.       lo = lp_build_min(&bld, lo, dst_max);
  621.       hi = lp_build_min(&bld, hi, dst_max);
  622.       /* FIXME: What about lower bound? */
  623.    }
  624.  
  625.    return lp_build_pack2(gallivm, src_type, dst_type, lo, hi);
  626. }
  627.  
  628.  
  629. /**
  630.  * Truncate the bit width.
  631.  *
  632.  * TODO: Handle saturation consistently.
  633.  */
  634. LLVMValueRef
  635. lp_build_pack(struct gallivm_state *gallivm,
  636.               struct lp_type src_type,
  637.               struct lp_type dst_type,
  638.               boolean clamped,
  639.               const LLVMValueRef *src, unsigned num_srcs)
  640. {
  641.    LLVMValueRef (*pack2)(struct gallivm_state *gallivm,
  642.                          struct lp_type src_type,
  643.                          struct lp_type dst_type,
  644.                          LLVMValueRef lo,
  645.                          LLVMValueRef hi);
  646.    LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
  647.    unsigned i;
  648.  
  649.    /* Register width must remain constant */
  650.    assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
  651.  
  652.    /* We must not lose or gain channels, only precision */
  653.    assert(src_type.length * num_srcs == dst_type.length);
  654.  
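   /* If the caller already clamped the values the plain pack is sufficient,
    * otherwise use the saturating variant.
    */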
  655.    if(clamped)
  656.       pack2 = &lp_build_pack2;
  657.    else
  658.       pack2 = &lp_build_packs2;
  659.  
  660.    for(i = 0; i < num_srcs; ++i)
  661.       tmp[i] = src[i];
  662.  
  663.    while(src_type.width > dst_type.width) {
  664.       struct lp_type tmp_type = src_type;
  665.  
  666.       tmp_type.width /= 2;
  667.       tmp_type.length *= 2;
  668.  
  669.       /* Take the sign change into consideration only in the last step */
  670.       if(tmp_type.width == dst_type.width)
  671.          tmp_type.sign = dst_type.sign;
  672.  
  673.       num_srcs /= 2;
  674.  
  675.       for(i = 0; i < num_srcs; ++i)
  676.          tmp[i] = pack2(gallivm, src_type, tmp_type,
  677.                         tmp[2*i + 0], tmp[2*i + 1]);
  678.  
  679.       src_type = tmp_type;
  680.    }
  681.  
  682.    assert(num_srcs == 1);
  683.  
  684.    return tmp[0];
  685. }
  686.  
  687.  
  688. /**
  689.  * Truncate or expand the bitwidth.
  690.  *
  691.  * NOTE: Getting the right sign flags is crucial here, as we employ some
  692.  * intrinsics that do saturation.
  693.  */
  694. void
  695. lp_build_resize(struct gallivm_state *gallivm,
  696.                 struct lp_type src_type,
  697.                 struct lp_type dst_type,
  698.                 const LLVMValueRef *src, unsigned num_srcs,
  699.                 LLVMValueRef *dst, unsigned num_dsts)
  700. {
  701.    LLVMBuilderRef builder = gallivm->builder;
  702.    LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
  703.    unsigned i;
  704.  
  705.    /*
  706.     * We don't support float <-> int conversion here. That must be done
  707.     * before/after calling this function.
  708.     */
  709.    assert(src_type.floating == dst_type.floating);
  710.  
  711.    /*
  712.     * We don't support double <-> float conversion yet, although it could be
  713.     * added with little effort.
  714.     */
  715.    assert((!src_type.floating && !dst_type.floating) ||
  716.           src_type.width == dst_type.width);
  717.  
  718.    /* We must not lose or gain channels, only precision */
  719.    assert(src_type.length * num_srcs == dst_type.length * num_dsts);
  720.  
  721.    assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
  722.    assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
  723.    assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
  724.    assert(num_dsts <= LP_MAX_VECTOR_LENGTH);
  725.  
  726.    if (src_type.width > dst_type.width) {
  727.       /*
  728.        * Truncate bit width.
  729.        */
  730.  
  731.       /* Conversion must be M:1 */
  732.       assert(num_dsts == 1);
  733.  
  734.       if (src_type.width * src_type.length == dst_type.width * dst_type.length) {
  735.         /*
  736.          * Register width remains constant -- use vector packing intrinsics
  737.          */
  738.          tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, src, num_srcs);
  739.       }
  740.       else {
  741.          if (src_type.width / dst_type.width > num_srcs) {
  742.             /*
  743.             * First change src vectors size (with shuffle) so they have the
  744.             * same size as the destination vector, then pack normally.
  745.             * Note: cannot use cast/extract because llvm generates atrocious code.
  746.             */
  747.             unsigned size_ratio = (src_type.width * src_type.length) /
  748.                                   (dst_type.length * dst_type.width);
  749.             unsigned new_length = src_type.length / size_ratio;
  750.  
  751.             for (i = 0; i < size_ratio * num_srcs; i++) {
  752.                unsigned start_index = (i % size_ratio) * new_length;
  753.                tmp[i] = lp_build_extract_range(gallivm, src[i / size_ratio],
  754.                                                start_index, new_length);
  755.             }
  756.             num_srcs *= size_ratio;
  757.             src_type.length = new_length;
  758.             tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, tmp, num_srcs);
  759.          }
  760.          else {
  761.             /*
  762.              * Truncate bit width but expand vector size - first pack
  763.              * then expand simply because this should be more AVX-friendly
  764.              * for the cases we probably hit.
  765.              */
  766.             unsigned size_ratio = (dst_type.width * dst_type.length) /
  767.                                   (src_type.length * src_type.width);
  768.             unsigned num_pack_srcs = num_srcs / size_ratio;
  769.             dst_type.length = dst_type.length / size_ratio;
  770.  
  771.             for (i = 0; i < size_ratio; i++) {
  772.                tmp[i] = lp_build_pack(gallivm, src_type, dst_type, TRUE,
  773.                                       &src[i*num_pack_srcs], num_pack_srcs);
  774.             }
  775.             tmp[0] = lp_build_concat(gallivm, tmp, dst_type, size_ratio);
  776.          }
  777.       }
  778.    }
  779.    else if (src_type.width < dst_type.width) {
  780.       /*
  781.        * Expand bit width.
  782.        */
  783.  
  784.       /* Conversion must be 1:N */
  785.       assert(num_srcs == 1);
  786.  
  787.       if (src_type.width * src_type.length == dst_type.width * dst_type.length) {
  788.          /*
  789.           * Register width remains constant -- use vector unpack intrinsics
  790.           */
  791.          lp_build_unpack(gallivm, src_type, dst_type, src[0], tmp, num_dsts);
  792.       }
  793.       else {
  794.          /*
  795.           * Do it element-wise.
  796.           */
  797.          assert(src_type.length * num_srcs == dst_type.length * num_dsts);
  798.  
  799.          for (i = 0; i < num_dsts; i++) {
  800.             tmp[i] = lp_build_undef(gallivm, dst_type);
  801.          }
  802.  
  803.          for (i = 0; i < src_type.length; ++i) {
  804.             unsigned j = i / dst_type.length;
  805.             LLVMValueRef srcindex = lp_build_const_int32(gallivm, i);
  806.             LLVMValueRef dstindex = lp_build_const_int32(gallivm, i % dst_type.length);
  807.             LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], srcindex, "");
  808.  
  809.             if (src_type.sign && dst_type.sign) {
  810.                val = LLVMBuildSExt(builder, val, lp_build_elem_type(gallivm, dst_type), "");
  811.             } else {
  812.                val = LLVMBuildZExt(builder, val, lp_build_elem_type(gallivm, dst_type), "");
  813.             }
  814.             tmp[j] = LLVMBuildInsertElement(builder, tmp[j], val, dstindex, "");
  815.          }
  816.       }
  817.    }
  818.    else {
  819.       /*
  820.        * No-op
  821.        */
  822.  
  823.       /* "Conversion" must be N:N */
  824.       assert(num_srcs == num_dsts);
  825.  
  826.       for(i = 0; i < num_dsts; ++i)
  827.          tmp[i] = src[i];
  828.    }
  829.  
  830.    for(i = 0; i < num_dsts; ++i)
  831.       dst[i] = tmp[i];
  832. }
  833.  
  834.  
  835. /**
  836.  * Expands src vector from src.length to dst_length
  837.  */
  838. LLVMValueRef
  839. lp_build_pad_vector(struct gallivm_state *gallivm,
  840.                     LLVMValueRef src,
  841.                     unsigned dst_length)
  842. {
  843.    LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
  844.    LLVMValueRef undef;
  845.    LLVMTypeRef type;
  846.    unsigned i, src_length;
  847.  
  848.    type = LLVMTypeOf(src);
  849.  
  850.    if (LLVMGetTypeKind(type) != LLVMVectorTypeKind) {
  851.       /* Can't use ShuffleVector on non-vector type */
  852.       undef = LLVMGetUndef(LLVMVectorType(type, dst_length));
  853.       return LLVMBuildInsertElement(gallivm->builder, undef, src, lp_build_const_int32(gallivm, 0), "");
  854.    }
  855.  
  856.    undef      = LLVMGetUndef(type);
  857.    src_length = LLVMGetVectorSize(type);
  858.  
  859.    assert(dst_length <= Elements(elems));
  860.    assert(dst_length >= src_length);
  861.  
  862.    if (src_length == dst_length)
  863.       return src;
  864.  
  865.    /* All elements from src vector */
  866.    for (i = 0; i < src_length; ++i)
  867.       elems[i] = lp_build_const_int32(gallivm, i);
  868.  
  869.    /* Fill the remaining space with elements from the undef operand */
  870.    for (i = src_length; i < dst_length; ++i)
  871.       elems[i] = lp_build_const_int32(gallivm, src_length);
  872.  
  873.    /* Combine the two vectors */
  874.    return LLVMBuildShuffleVector(gallivm->builder, src, undef, LLVMConstVector(elems, dst_length), "");
  875. }
  876.