
/**************************************************************************
 *
 * Copyright 2009-2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper
 *
 * LLVM IR doesn't support all basic arithmetic operations we care about (most
 * notably min/max and saturated operations), and it is often necessary to
 * resort to machine-specific intrinsics directly. The functions here hide all
 * these implementation details from the other modules.
 *
 * We also do simple expression simplification here. Reasons are:
 * - it is very easy given we have all necessary information readily available
 * - LLVM optimization passes fail to simplify several vector expressions
 * - we often know value constraints which the optimization passes have no way
 *   of knowing, such as when source arguments are known to be in [0, 1] range.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */


#include <float.h>

#include "util/u_memory.h"
#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_string.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
#include "lp_bld_logic.h"
#include "lp_bld_pack.h"
#include "lp_bld_debug.h"
#include "lp_bld_bitarit.h"
#include "lp_bld_arit.h"
#include "lp_bld_flow.h"


#define EXP_POLY_DEGREE 5

#define LOG_POLY_DEGREE 4


/**
 * Generate min(a, b)
 * No checks for the special-case values 0 or 1 of a or b are done.
 */
static LLVMValueRef
lp_build_min_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_cpu_caps.has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.min.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse.min.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_cpu_caps.has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.min.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse2.min.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (type.floating && util_cpu_caps.has_altivec) {
      if (type.width == 32 && type.length == 4) {
         intrinsic = "llvm.ppc.altivec.vminfp";
         intr_size = 128;
      }
   } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
      intr_size = 128;
      if ((type.width == 8 || type.width == 16) &&
          (type.width * type.length <= 64) &&
          (gallivm_debug & GALLIVM_DEBUG_PERF)) {
         debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
                      __FUNCTION__);
      }
      if (type.width == 8 && !type.sign) {
         intrinsic = "llvm.x86.sse2.pminu.b";
      }
      else if (type.width == 16 && type.sign) {
         intrinsic = "llvm.x86.sse2.pmins.w";
      }
      if (util_cpu_caps.has_sse4_1) {
         if (type.width == 8 && type.sign) {
            intrinsic = "llvm.x86.sse41.pminsb";
         }
         if (type.width == 16 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pminuw";
         }
         if (type.width == 32 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pminud";
         }
         if (type.width == 32 && type.sign) {
            intrinsic = "llvm.x86.sse41.pminsd";
         }
      }
   } else if (util_cpu_caps.has_altivec) {
      intr_size = 128;
      if (type.width == 8) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminub";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsb";
         }
      } else if (type.width == 16) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminuh";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsh";
         }
      } else if (type.width == 32) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminuw";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsw";
         }
      }
   }

   if(intrinsic) {
      return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                 type,
                                                 intr_size, a, b);
   }

   cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
   return lp_build_select(bld, cond, a, b);
}


/**
 * Generate max(a, b)
 * No checks for the special-case values 0 or 1 of a or b are done.
 */
static LLVMValueRef
lp_build_max_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_cpu_caps.has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.max.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse.max.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_cpu_caps.has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.max.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse2.max.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (type.floating && util_cpu_caps.has_altivec) {
      if (type.width == 32 && type.length == 4) {
         intrinsic = "llvm.ppc.altivec.vmaxfp";
         intr_size = 128;
      }
   } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
      intr_size = 128;
      if ((type.width == 8 || type.width == 16) &&
          (type.width * type.length <= 64) &&
          (gallivm_debug & GALLIVM_DEBUG_PERF)) {
         debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
                      __FUNCTION__);
      }
      if (type.width == 8 && !type.sign) {
         intrinsic = "llvm.x86.sse2.pmaxu.b";
         intr_size = 128;
      }
      else if (type.width == 16 && type.sign) {
         intrinsic = "llvm.x86.sse2.pmaxs.w";
      }
      if (util_cpu_caps.has_sse4_1) {
         if (type.width == 8 && type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxsb";
         }
         if (type.width == 16 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxuw";
         }
         if (type.width == 32 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxud";
         }
         if (type.width == 32 && type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxsd";
         }
      }
   } else if (util_cpu_caps.has_altivec) {
      intr_size = 128;
      if (type.width == 8) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxub";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsb";
         }
      } else if (type.width == 16) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxuh";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsh";
         }
      } else if (type.width == 32) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxuw";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsw";
         }
      }
   }

   if(intrinsic) {
      return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                 type,
                                                 intr_size, a, b);
   }

   cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
   return lp_build_select(bld, cond, a, b);
}


/**
 * Generate 1 - a, or ~a depending on bld->type.
 * (For unsigned normalized integer types "one" is all ones, so the two
 * coincide and a bitwise not suffices.)
 */
LLVMValueRef
lp_build_comp(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if(a == bld->one)
      return bld->zero;
   if(a == bld->zero)
      return bld->one;

   if(type.norm && !type.floating && !type.fixed && !type.sign) {
      if(LLVMIsConstant(a))
         return LLVMConstNot(a);
      else
         return LLVMBuildNot(builder, a, "");
   }

   if(LLVMIsConstant(a))
      if (type.floating)
          return LLVMConstFSub(bld->one, a);
      else
          return LLVMConstSub(bld->one, a);
   else
      if (type.floating)
         return LLVMBuildFSub(builder, bld->one, a, "");
      else
         return LLVMBuildSub(builder, bld->one, a, "");
}


/**
 * Generate a + b
 */
LLVMValueRef
lp_build_add(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return b;
   if(b == bld->zero)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(bld->type.norm) {
      const char *intrinsic = NULL;

      if(a == bld->one || b == bld->one)
        return bld->one;

      if (type.width * type.length == 128 &&
          !type.floating && !type.fixed) {
         if(util_cpu_caps.has_sse2) {
           if(type.width == 8)
             intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
           if(type.width == 16)
             intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
         } else if (util_cpu_caps.has_altivec) {
           if(type.width == 8)
              intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
           if(type.width == 16)
              intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
         }
      }

      if(intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   /* TODO: handle signed case */
   if(type.norm && !type.floating && !type.fixed && !type.sign)
      a = lp_build_min_simple(bld, a, lp_build_comp(bld, b)); /* clamp a to 1 - b so the sum cannot overflow */

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFAdd(a, b);
      else
         res = LLVMConstAdd(a, b);
   else
      if (type.floating)
         res = LLVMBuildFAdd(builder, a, b, "");
      else
         res = LLVMBuildAdd(builder, a, b, "");

   /* clamp to ceiling of 1.0 */
   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_min_simple(bld, res, bld->one);

   /* XXX clamp to floor of -1 or 0??? */

   return res;
}


/** Return the scalar sum of the elements of a.
 * This operation should be avoided whenever possible.
 */
LLVMValueRef
lp_build_horizontal_add(struct lp_build_context *bld,
                        LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef index, res;
   unsigned i, length;
   LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef vecres, elem2;

   assert(lp_check_value(type, a));

   if (type.length == 1) {
      return a;
   }

   assert(!bld->type.norm);

   /*
    * For byte vectors we could do much better with psadbw.
    * Using repeated shuffle/adds here. Note with multiple vectors
    * this can be done more efficiently as outlined in the Intel
    * optimization manual.
    * Note: could cause data rearrangement if used with smaller element
    * sizes.
    */

   vecres = a;
   length = type.length / 2;
   while (length > 1) {
      LLVMValueRef vec1, vec2;
      for (i = 0; i < length; i++) {
         shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
         shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
      }
      vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles1, length), "");
      vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles2, length), "");
      if (type.floating) {
         vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
      }
      else {
         vecres = LLVMBuildAdd(builder, vec1, vec2, "");
      }
      length = length >> 1;
   }

   /* always have a vector of size 2 here */
   assert(length == 1);

   index = lp_build_const_int32(bld->gallivm, 0);
   res = LLVMBuildExtractElement(builder, vecres, index, "");
   index = lp_build_const_int32(bld->gallivm, 1);
   elem2 = LLVMBuildExtractElement(builder, vecres, index, "");

   if (type.floating)
      res = LLVMBuildFAdd(builder, res, elem2, "");
   else
      res = LLVMBuildAdd(builder, res, elem2, "");

   return res;
}

/**
 * Return the horizontal sums of 4 float vectors as a float4 vector.
 * This uses the technique outlined in the Intel Optimization Manual.
 */
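/*
 * Data-flow sketch (an illustration added for clarity, not in the original
 * source), writing the four inputs as x0..x3, y0..y3, z0..z3, w0..w3:
 *
 *    tmp[0] = x0 x1 y0 y1      tmp[1] = x2 x3 y2 y3
 *    tmp[2] = z0 z1 w0 w1      tmp[3] = z2 z3 w2 w3
 *
 *    sumtmp[0] = x0+x2 x1+x3 y0+y2 y1+y3
 *    sumtmp[1] = z0+z2 z1+z3 w0+w2 w1+w3
 *
 *    shuftmp[0] = x0+x2 y0+y2 z0+z2 w0+w2
 *    shuftmp[1] = x1+x3 y1+y3 z1+z3 w1+w3
 *
 * so the final add returns (sum(x), sum(y), sum(z), sum(w)).
 */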
static LLVMValueRef
lp_build_horizontal_add4x4f(struct lp_build_context *bld,
                            LLVMValueRef src[4])
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef shuffles[4];
   LLVMValueRef tmp[4];
   LLVMValueRef sumtmp[2], shuftmp[2];

   /* lower half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 1);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 5);
   tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   /* upper half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 2);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 6);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
   sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");

   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 2);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 6);
   shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   shuffles[0] = lp_build_const_int32(gallivm, 1);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 5);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
}


/*
 * Partially horizontally add 2-4 float vectors with length nx4,
 * i.e. only four adjacent values in each vector will be added,
 * assuming values are really grouped in 4 which also determines
 * output order.
 *
 * Return a vector of the same length as the initial vectors,
 * with the excess elements (if any) being undefined.
 * The element order is independent of number of input vectors.
 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
 * the output order thus will be
 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
 */
LLVMValueRef
lp_build_hadd_partial4(struct lp_build_context *bld,
                       LLVMValueRef vectors[],
                       unsigned num_vecs)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef ret_vec;
   LLVMValueRef tmp[4];
   const char *intrinsic = NULL;

   assert(num_vecs >= 2 && num_vecs <= 4);
   assert(bld->type.floating);

   /* only use this with at least 2 vectors, as it is sort of expensive
    * (depending on cpu) and we always need two horizontal adds anyway,
    * so a shuffle/add approach might be better.
    */

   tmp[0] = vectors[0];
   tmp[1] = vectors[1];

   tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
   tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];

   if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
       bld->type.length == 4) {
      intrinsic = "llvm.x86.sse3.hadd.ps";
   }
   else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
            bld->type.length == 8) {
      intrinsic = "llvm.x86.avx.hadd.ps.256";
   }
   if (intrinsic) {
      tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                         lp_build_vec_type(gallivm, bld->type),
                                         tmp[0], tmp[1]);
      if (num_vecs > 2) {
         tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                            lp_build_vec_type(gallivm, bld->type),
                                            tmp[2], tmp[3]);
      }
      else {
         tmp[1] = tmp[0];
      }
      return lp_build_intrinsic_binary(builder, intrinsic,
                                       lp_build_vec_type(gallivm, bld->type),
                                       tmp[0], tmp[1]);
   }

   if (bld->type.length == 4) {
      ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
   }
   else {
      LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
      unsigned j;
      unsigned num_iter = bld->type.length / 4;
      struct lp_type parttype = bld->type;
      parttype.length = 4;
      for (j = 0; j < num_iter; j++) {
         LLVMValueRef partsrc[4];
         unsigned i;
         for (i = 0; i < 4; i++) {
            partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
         }
         partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
      }
      ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
   }
   return ret_vec;
}

/**
 * Generate a - b
 */
LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(b == bld->zero)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;
   if(a == b)
      return bld->zero;

   if(bld->type.norm) {
      const char *intrinsic = NULL;

      if(b == bld->one)
        return bld->zero;

      if (type.width * type.length == 128 &&
          !type.floating && !type.fixed) {
         if (util_cpu_caps.has_sse2) {
           if(type.width == 8)
              intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
           if(type.width == 16)
              intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
         } else if (util_cpu_caps.has_altivec) {
           if(type.width == 8)
              intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
           if(type.width == 16)
              intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
         }
      }

      if(intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   /* TODO: handle signed case */
   if(type.norm && !type.floating && !type.fixed && !type.sign)
      a = lp_build_max_simple(bld, a, b); /* clamp a to b or above so the difference cannot underflow */

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFSub(a, b);
      else
         res = LLVMConstSub(a, b);
   else
      if (type.floating)
         res = LLVMBuildFSub(builder, a, b, "");
      else
         res = LLVMBuildSub(builder, a, b, "");

   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_max_simple(bld, res, bld->zero);

   return res;
}



/**
 * Normalized multiplication.
 *
 * There are several approaches (using 8-bit normalized multiplication as
 * an example):
 *
 * - alpha plus one
 *
 *     makes the following approximation to the division (Sree)
 *
 *       a*b/255 ~= (a*(b + 1)) >> 8
 *
 *     which is the fastest method that satisfies the following OpenGL
 *     criteria:
 *
 *       0*0 = 0 and 255*255 = 255
 *
 * - geometric series
 *
 *     takes the geometric series approximation to the division
 *
 *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 *
 *     in this case just the first two terms to fit in 16-bit arithmetic
 *
 *       t/255 ~= (t + (t >> 8)) >> 8
 *
 *     note that by itself this doesn't satisfy the OpenGL criteria, as it
 *     yields 255*255 = 254, so either the special case b = 255 must be
 *     accounted for or rounding must be used.
 *
 * - geometric series plus rounding
 *
 *     when using a geometric series division instead of truncating the
 *     result use rounding in the approximation (Jim Blinn)
 *
 *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 *
 *     achieving exact results.
 *
 *
 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
 * @sa Michael Herf, The "double blend trick", May 2000,
 *     http://www.stereopsis.com/doubleblend.html
 */
static LLVMValueRef
lp_build_mul_norm(struct gallivm_state *gallivm,
                  struct lp_type wide_type,
                  LLVMValueRef a, LLVMValueRef b)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_build_context bld;
   unsigned n;
   LLVMValueRef half;
   LLVMValueRef ab;

   assert(!wide_type.floating);
   assert(lp_check_value(wide_type, a));
   assert(lp_check_value(wide_type, b));

   lp_build_context_init(&bld, gallivm, wide_type);

   n = wide_type.width / 2;
   if (wide_type.sign) {
      --n;
   }

   /*
    * TODO: for 16-bit normalized SSE2 vectors we could consider using PMULHUW
    * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
    */

   /*
    * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
    */

   ab = LLVMBuildMul(builder, a, b, "");
   ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");

   /*
    * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
    */

   half = lp_build_const_int_vec(gallivm, wide_type, 1 << (n - 1));
   if (wide_type.sign) {
      LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
      LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
      half = lp_build_select(&bld, sign, minus_half, half);
   }
   ab = LLVMBuildAdd(builder, ab, half, "");

   /* Final division */
   ab = lp_build_shr_imm(&bld, ab, n);

   return ab;
}

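/*
 * Illustration (added for clarity; not part of the original file): a scalar
 * model of the rounded geometric-series division generated above, assuming
 * 8-bit unsigned normalized values with a 16-bit intermediate. It meets the
 * OpenGL criteria discussed above: mul_unorm8(0, 0) == 0 and
 * mul_unorm8(255, 255) == 255.
 */
#if 0
static inline unsigned
mul_unorm8(unsigned a, unsigned b)
{
   unsigned t = a * b;   /* 16-bit intermediate product */
   t = t + (t >> 8);     /* geometric series: t/255 ~= (t + (t >> 8)) >> 8 */
   t = t + 0x80;         /* round instead of truncate */
   return t >> 8;        /* final division by 2**8 */
}
#endif
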
/**
 * Generate a * b
 */
LLVMValueRef
lp_build_mul(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef shift;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return b;
   if(b == bld->zero)
      return bld->zero;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if (!type.floating && !type.fixed && type.norm) {
      struct lp_type wide_type = lp_wider_type(type);
      LLVMValueRef al, ah, bl, bh, abl, abh, ab;

      lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
      lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);

      /* PMULLW, PSRLW, PADDW */
      abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
      abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);

      ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);

      return ab;
   }

   if(type.fixed)
      shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
   else
      shift = NULL;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         res = LLVMConstFMul(a, b);
      else
         res = LLVMConstMul(a, b);
      if(shift) {
         if(type.sign)
            res = LLVMConstAShr(res, shift);
         else
            res = LLVMConstLShr(res, shift);
      }
   }
   else {
      if (type.floating)
         res = LLVMBuildFMul(builder, a, b, "");
      else
         res = LLVMBuildMul(builder, a, b, "");
      if(shift) {
         if(type.sign)
            res = LLVMBuildAShr(builder, res, shift, "");
         else
            res = LLVMBuildLShr(builder, res, shift, "");
      }
   }

   return res;
}


/**
 * Optimized multiplication of a vector by a small integer constant.
 */
LLVMValueRef
lp_build_mul_imm(struct lp_build_context *bld,
                 LLVMValueRef a,
                 int b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef factor;

   assert(lp_check_value(bld->type, a));

   if(b == 0)
      return bld->zero;

   if(b == 1)
      return a;

   if(b == -1)
      return lp_build_negate(bld, a);

   if(b == 2 && bld->type.floating)
      return lp_build_add(bld, a, a);

   if(util_is_power_of_two(b)) {
      unsigned shift = ffs(b) - 1;

      if(bld->type.floating) {
#if 0
         /*
          * Power of two multiplication by directly manipulating the exponent.
          *
          * XXX: This might not always be faster; it will introduce a small
          * error for multiplication by zero, and it will produce wrong results
          * for Inf and NaN.
          */
         unsigned mantissa = lp_mantissa(bld->type);
         factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
         a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
         a = LLVMBuildAdd(builder, a, factor, "");
         a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
         return a;
#endif
      }
      else {
         factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
         return LLVMBuildShl(builder, a, factor, "");
      }
   }

   factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
   return lp_build_mul(bld, a, factor);
}


/**
 * Generate a / b
 */
LLVMValueRef
lp_build_div(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return lp_build_rcp(bld, b);
   if(b == bld->zero)
      return bld->undef;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         return LLVMConstFDiv(a, b);
      else if (type.sign)
         return LLVMConstSDiv(a, b);
      else
         return LLVMConstUDiv(a, b);
   }

   if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
      type.floating)
      return lp_build_mul(bld, a, lp_build_rcp(bld, b));

   if (type.floating)
      return LLVMBuildFDiv(builder, a, b, "");
   else if (type.sign)
      return LLVMBuildSDiv(builder, a, b, "");
   else
      return LLVMBuildUDiv(builder, a, b, "");
}


/**
 * Linear interpolation helper.
 *
 * @param flags  interpolation flags. With LP_BLD_LERP_WIDE_NORMALIZED we are
 *        interpolating normalized values, encoded in integers twice as wide
 *        as the original type.
 *
 * @sa http://www.stereopsis.com/doubleblend.html
 */
static INLINE LLVMValueRef
lp_build_lerp_simple(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef v0,
                     LLVMValueRef v1,
                     unsigned flags)
{
   unsigned half_width = bld->type.width/2;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef delta;
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));
   assert(lp_check_value(bld->type, v0));
   assert(lp_check_value(bld->type, v1));

   delta = lp_build_sub(bld, v1, v0);

   if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
      if (!bld->type.sign) {
         if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
            /*
             * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
             * most significant bit to the least significant bit, so that
             * later we can just divide by 2**n instead of 2**n - 1.
             * For example, with n = 8 this maps 255 to 256, so a weight
             * of 1.0 selects v1 exactly.
             */

            x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
         }

         /* (x * delta) >> n */
         res = lp_build_mul(bld, x, delta);
         res = lp_build_shr_imm(bld, res, half_width);
      } else {
         /*
          * The rescaling trick above doesn't work for signed numbers, so
          * use the 2**n - 1 division approximation in lp_build_mul_norm
          * instead.
          */
         assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
         res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
      }
   } else {
      assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
      res = lp_build_mul(bld, x, delta);
   }

   res = lp_build_add(bld, v0, res);

   if (((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) ||
       bld->type.fixed) {
      /* XXX: Masking out the high order bits is necessary when lerping
       * 8-bit normalized colors stored on 16 bits, but it will be wrong
       * for true fixed point use cases. Basically we need a more powerful
       * lp_type, capable of further distinguishing the values
       * interpretation from the value storage. */
      res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1), "");
   }

   return res;
}


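/*
 * Illustration (added for clarity; not part of the original file): a scalar
 * model of the unsigned normalized path of lp_build_lerp_simple() above,
 * assuming 8-bit values lerped in 16-bit lanes (half_width == 8). x == 0
 * yields v0 and x == 255 yields v1 exactly, because 255 + (255 >> 7) == 256
 * turns the final shift into an exact division.
 */
#if 0
static inline unsigned char
lerp_unorm8(unsigned short x, unsigned short v0, unsigned short v1)
{
   unsigned short delta = v1 - v0;   /* may wrap around; the final mask fixes it */
   x = x + (x >> 7);                 /* rescale [0, 255] to [0, 256] */
   return (v0 + ((x * delta) >> 8)) & 0xff;   /* mask out the wrapped high bits */
}
#endif
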
/**
 * Linear interpolation.
 */
LLVMValueRef
lp_build_lerp(struct lp_build_context *bld,
              LLVMValueRef x,
              LLVMValueRef v0,
              LLVMValueRef v1,
              unsigned flags)
{
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, v0));
   assert(lp_check_value(type, v1));

   assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));

   if (type.norm) {
      struct lp_type wide_type;
      struct lp_build_context wide_bld;
      LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;

      assert(type.length >= 2);

      /*
       * Create a wider integer type, enough to hold the
       * intermediate result of the multiplication.
       */
      memset(&wide_type, 0, sizeof wide_type);
      wide_type.sign   = type.sign;
      wide_type.width  = type.width*2;
      wide_type.length = type.length/2;

      lp_build_context_init(&wide_bld, bld->gallivm, wide_type);

      lp_build_unpack2(bld->gallivm, type, wide_type, x,  &xl,  &xh);
      lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
      lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);

      /*
       * Lerp both halves.
       */

      flags |= LP_BLD_LERP_WIDE_NORMALIZED;

      resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
      resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);

      res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
   } else {
      res = lp_build_lerp_simple(bld, x, v0, v1, flags);
   }

   return res;
}


/**
 * Bilinear interpolation.
 *
 * Value indices are in v_{yx}, so e.g. v01 is the value at (x=1, y=0).
 */
LLVMValueRef
lp_build_lerp_2d(struct lp_build_context *bld,
                 LLVMValueRef x,
                 LLVMValueRef y,
                 LLVMValueRef v00,
                 LLVMValueRef v01,
                 LLVMValueRef v10,
                 LLVMValueRef v11,
                 unsigned flags)
{
   LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
   LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
   return lp_build_lerp(bld, y, v0, v1, flags);
}


LLVMValueRef
lp_build_lerp_3d(struct lp_build_context *bld,
                 LLVMValueRef x,
                 LLVMValueRef y,
                 LLVMValueRef z,
                 LLVMValueRef v000,
                 LLVMValueRef v001,
                 LLVMValueRef v010,
                 LLVMValueRef v011,
                 LLVMValueRef v100,
                 LLVMValueRef v101,
                 LLVMValueRef v110,
                 LLVMValueRef v111,
                 unsigned flags)
{
   LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
   LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
   return lp_build_lerp(bld, z, v0, v1, flags);
}


/**
 * Generate min(a, b)
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_min(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if (bld->type.norm) {
      if (!bld->type.sign) {
         if (a == bld->zero || b == bld->zero) {
            return bld->zero;
         }
      }
      if(a == bld->one)
         return b;
      if(b == bld->one)
         return a;
   }

   return lp_build_min_simple(bld, a, b);
}


/**
 * Generate max(a, b)
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_max(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if(bld->type.norm) {
      if(a == bld->one || b == bld->one)
         return bld->one;
      if (!bld->type.sign) {
         if (a == bld->zero) {
            return b;
         }
         if (b == bld->zero) {
            return a;
         }
      }
   }

   return lp_build_max_simple(bld, a, b);
}


/**
 * Generate clamp(a, min, max)
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_clamp(struct lp_build_context *bld,
               LLVMValueRef a,
               LLVMValueRef min,
               LLVMValueRef max)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, min));
   assert(lp_check_value(bld->type, max));

   a = lp_build_min(bld, a, max);
   a = lp_build_max(bld, a, min);
   return a;
}


/**
 * Generate abs(a)
 */
LLVMValueRef
lp_build_abs(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);

   assert(lp_check_value(type, a));

   if(!type.sign)
      return a;

   if(type.floating) {
      /* Mask out the sign bit */
      LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
      unsigned long long absMask = ~(1ULL << (type.width - 1));
      LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
      a = LLVMBuildBitCast(builder, a, int_vec_type, "");
      a = LLVMBuildAnd(builder, a, mask, "");
      a = LLVMBuildBitCast(builder, a, vec_type, "");
      return a;
   }

   if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
      switch(type.width) {
      case 8:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
      case 16:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
      case 32:
         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
      }
   }
   else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
            (gallivm_debug & GALLIVM_DEBUG_PERF) &&
            (type.width == 8 || type.width == 16 || type.width == 32)) {
      debug_printf("%s: inefficient code, should split vectors manually\n",
                   __FUNCTION__);
   }

   return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
}


LLVMValueRef
lp_build_negate(struct lp_build_context *bld,
                LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;

   assert(lp_check_value(bld->type, a));

#if HAVE_LLVM >= 0x0207
   if (bld->type.floating)
      a = LLVMBuildFNeg(builder, a, "");
   else
#endif
      a = LLVMBuildNeg(builder, a, "");

   return a;
}


/** Return -1, 0 or +1 depending on the sign of a */
LLVMValueRef
lp_build_sgn(struct lp_build_context *bld,
             LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef cond;
   LLVMValueRef res;

   assert(lp_check_value(type, a));

   /* Handle non-zero case */
   if(!type.sign) {
      /* if not zero then sign must be positive */
      res = bld->one;
   }
   else if(type.floating) {
      LLVMTypeRef vec_type;
      LLVMTypeRef int_type;
      LLVMValueRef mask;
      LLVMValueRef sign;
      LLVMValueRef one;
      unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);

      int_type = lp_build_int_vec_type(bld->gallivm, type);
      vec_type = lp_build_vec_type(bld->gallivm, type);
      mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);

      /* Take the sign bit and OR it into the constant 1.0 */
      sign = LLVMBuildBitCast(builder, a, int_type, "");
      sign = LLVMBuildAnd(builder, sign, mask, "");
      one = LLVMConstBitCast(bld->one, int_type);
      res = LLVMBuildOr(builder, sign, one, "");
      res = LLVMBuildBitCast(builder, res, vec_type, "");
   }
   else
   {
      /* signed int/norm/fixed point */
      /* could use psign with ssse3 and appropriate vectors here */
      LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
      res = lp_build_select(bld, cond, bld->one, minus_one);
   }

   /* Handle zero */
   cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
   res = lp_build_select(bld, cond, bld->zero, res);

   return res;
}


/**
 * Set the sign of float vector 'a' according to 'sign'.
 * If sign==0, return abs(a).
 * If sign==1, return -abs(a).
 * Other values for sign produce undefined results.
 */
LLVMValueRef
lp_build_set_sign(struct lp_build_context *bld,
                  LLVMValueRef a, LLVMValueRef sign)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
   LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
                             ~((unsigned long long) 1 << (type.width - 1)));
   LLVMValueRef val, res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   /* val = reinterpret_cast<int>(a) */
   val = LLVMBuildBitCast(builder, a, int_vec_type, "");
   /* val = val & mask */
   val = LLVMBuildAnd(builder, val, mask, "");
   /* sign = sign << shift */
   sign = LLVMBuildShl(builder, sign, shift, "");
   /* res = val | sign */
   res = LLVMBuildOr(builder, val, sign, "");
   /* res = reinterpret_cast<float>(res) */
   res = LLVMBuildBitCast(builder, res, vec_type, "");

   return res;
}


/**
 * Convert vector of (or scalar) int to vector of (or scalar) float.
 */
LLVMValueRef
lp_build_int_to_float(struct lp_build_context *bld,
                      LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);

   assert(type.floating);

   return LLVMBuildSIToFP(builder, a, vec_type, "");
}

static boolean
arch_rounding_available(const struct lp_type type)
{
   if ((util_cpu_caps.has_sse4_1 &&
       (type.length == 1 || type.width*type.length == 128)) ||
       (util_cpu_caps.has_avx && type.width*type.length == 256))
      return TRUE;
   else if ((util_cpu_caps.has_altivec &&
            (type.width == 32 && type.length == 4)))
      return TRUE;

   return FALSE;
}

enum lp_build_round_mode
{
   LP_BUILD_ROUND_NEAREST = 0,
   LP_BUILD_ROUND_FLOOR = 1,
   LP_BUILD_ROUND_CEIL = 2,
   LP_BUILD_ROUND_TRUNCATE = 3
};
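
/*
 * Note (added for clarity; an assumption not stated in the original source):
 * these values are chosen to match the rounding-control field of the SSE4.1
 * ROUNDPS/ROUNDPD immediate operand, which is why lp_build_round_sse41()
 * below can pass `mode` through to the intrinsic unchanged.
 */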

/**
 * Helper for SSE4.1's ROUNDxx instructions.
 *
 * NOTE: In SSE4.1's nearest mode, if two values are equally close, the
 * result is the even value.  That is, rounding 2.5 will be 2.0, and not 3.0.
 */
static INLINE LLVMValueRef
lp_build_round_sse41(struct lp_build_context *bld,
                     LLVMValueRef a,
                     enum lp_build_round_mode mode)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   const char *intrinsic;
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_sse4_1);

   if (type.length == 1) {
      LLVMTypeRef vec_type;
      LLVMValueRef undef;
      LLVMValueRef args[3];
      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);

      switch(type.width) {
      case 32:
         intrinsic = "llvm.x86.sse41.round.ss";
         break;
      case 64:
         intrinsic = "llvm.x86.sse41.round.sd";
         break;
      default:
         assert(0);
         return bld->undef;
      }

      vec_type = LLVMVectorType(bld->elem_type, 4);

      undef = LLVMGetUndef(vec_type);

      args[0] = undef;
      args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
      args[2] = LLVMConstInt(i32t, mode, 0);

      res = lp_build_intrinsic(builder, intrinsic,
                               vec_type, args, Elements(args));

      res = LLVMBuildExtractElement(builder, res, index0, "");
   }
   else {
      if (type.width * type.length == 128) {
         switch(type.width) {
         case 32:
            intrinsic = "llvm.x86.sse41.round.ps";
            break;
         case 64:
            intrinsic = "llvm.x86.sse41.round.pd";
            break;
         default:
            assert(0);
            return bld->undef;
         }
      }
      else {
         assert(type.width * type.length == 256);
         assert(util_cpu_caps.has_avx);

         switch(type.width) {
         case 32:
            intrinsic = "llvm.x86.avx.round.ps.256";
            break;
         case 64:
            intrinsic = "llvm.x86.avx.round.pd.256";
            break;
         default:
            assert(0);
            return bld->undef;
         }
      }

      res = lp_build_intrinsic_binary(builder, intrinsic,
                                      bld->vec_type, a,
                                      LLVMConstInt(i32t, mode, 0));
   }

   return res;
}


  1522. static INLINE LLVMValueRef
  1523. lp_build_iround_nearest_sse2(struct lp_build_context *bld,
  1524.                              LLVMValueRef a)
  1525. {
  1526.    LLVMBuilderRef builder = bld->gallivm->builder;
  1527.    const struct lp_type type = bld->type;
  1528.    LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
  1529.    LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
  1530.    const char *intrinsic;
  1531.    LLVMValueRef res;
  1532.  
  1533.    assert(type.floating);
  1534.    /* using the double precision conversions is a bit more complicated */
  1535.    assert(type.width == 32);
  1536.  
  1537.    assert(lp_check_value(type, a));
  1538.    assert(util_cpu_caps.has_sse2);
  1539.  
   1540.    /* This relies on the MXCSR rounding mode, which should always be nearest. */
  1541.    if (type.length == 1) {
  1542.       LLVMTypeRef vec_type;
  1543.       LLVMValueRef undef;
  1544.       LLVMValueRef arg;
  1545.       LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
  1546.  
  1547.       vec_type = LLVMVectorType(bld->elem_type, 4);
  1548.  
  1549.       intrinsic = "llvm.x86.sse.cvtss2si";
  1550.  
  1551.       undef = LLVMGetUndef(vec_type);
  1552.  
  1553.       arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
  1554.  
  1555.       res = lp_build_intrinsic_unary(builder, intrinsic,
  1556.                                      ret_type, arg);
  1557.    }
  1558.    else {
   1559.       if (type.width * type.length == 128) {
  1560.          intrinsic = "llvm.x86.sse2.cvtps2dq";
  1561.       }
  1562.       else {
   1563.          assert(type.width * type.length == 256);
  1564.          assert(util_cpu_caps.has_avx);
  1565.  
  1566.          intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
  1567.       }
  1568.       res = lp_build_intrinsic_unary(builder, intrinsic,
  1569.                                      ret_type, a);
  1570.    }
  1571.  
  1572.    return res;
  1573. }
  1574.  
  1575.  
   1576. /** Round to integral value according to the given mode, using AltiVec intrinsics. */
  1578. static INLINE LLVMValueRef
  1579. lp_build_round_altivec(struct lp_build_context *bld,
  1580.                        LLVMValueRef a,
  1581.                        enum lp_build_round_mode mode)
  1582. {
  1583.    LLVMBuilderRef builder = bld->gallivm->builder;
  1584.    const struct lp_type type = bld->type;
  1585.    const char *intrinsic = NULL;
  1586.  
  1587.    assert(type.floating);
  1588.  
  1589.    assert(lp_check_value(type, a));
  1590.    assert(util_cpu_caps.has_altivec);
  1591.  
  1592.    switch (mode) {
  1593.    case LP_BUILD_ROUND_NEAREST:
  1594.       intrinsic = "llvm.ppc.altivec.vrfin";
  1595.       break;
  1596.    case LP_BUILD_ROUND_FLOOR:
  1597.       intrinsic = "llvm.ppc.altivec.vrfim";
  1598.       break;
  1599.    case LP_BUILD_ROUND_CEIL:
  1600.       intrinsic = "llvm.ppc.altivec.vrfip";
  1601.       break;
  1602.    case LP_BUILD_ROUND_TRUNCATE:
  1603.       intrinsic = "llvm.ppc.altivec.vrfiz";
  1604.       break;
  1605.    }
  1606.  
  1607.    return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
  1608. }
  1609.  
  1610. static INLINE LLVMValueRef
  1611. lp_build_round_arch(struct lp_build_context *bld,
  1612.                     LLVMValueRef a,
  1613.                     enum lp_build_round_mode mode)
  1614. {
  1615.    if (util_cpu_caps.has_sse4_1)
  1616.      return lp_build_round_sse41(bld, a, mode);
  1617.    else /* (util_cpu_caps.has_altivec) */
  1618.      return lp_build_round_altivec(bld, a, mode);
  1619. }
  1620.  
  1621. /**
  1622.  * Return the integer part of a float (vector) value (== round toward zero).
  1623.  * The returned value is a float (vector).
  1624.  * Ex: trunc(-1.5) = -1.0
  1625.  */
  1626. LLVMValueRef
  1627. lp_build_trunc(struct lp_build_context *bld,
  1628.                LLVMValueRef a)
  1629. {
  1630.    LLVMBuilderRef builder = bld->gallivm->builder;
  1631.    const struct lp_type type = bld->type;
  1632.  
  1633.    assert(type.floating);
  1634.    assert(lp_check_value(type, a));
  1635.  
  1636.    if (arch_rounding_available(type)) {
  1637.       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
  1638.    }
  1639.    else {
  1640.       const struct lp_type type = bld->type;
  1641.       struct lp_type inttype;
  1642.       struct lp_build_context intbld;
   1643.       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
  1644.       LLVMValueRef trunc, res, anosign, mask;
  1645.       LLVMTypeRef int_vec_type = bld->int_vec_type;
  1646.       LLVMTypeRef vec_type = bld->vec_type;
  1647.  
  1648.       assert(type.width == 32); /* might want to handle doubles at some point */
  1649.  
  1650.       inttype = type;
  1651.       inttype.floating = 0;
  1652.       lp_build_context_init(&intbld, bld->gallivm, inttype);
  1653.  
  1654.       /* round by truncation */
  1655.       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
  1656.       res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
  1657.  
  1658.       /* mask out sign bit */
  1659.       anosign = lp_build_abs(bld, a);
  1660.       /*
  1661.        * mask out all values if anosign > 2^24
  1662.        * This should work both for large ints (all rounding is no-op for them
  1663.        * because such floats are always exact) as well as special cases like
  1664.        * NaNs, Infs (taking advantage of the fact they use max exponent).
   1665.        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
  1666.        */
  1667.       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
  1668.       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
  1669.       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
  1670.       return lp_build_select(bld, mask, a, res);
  1671.    }
  1672. }
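
/*
 * Editor's sketch (not part of the build) of the scalar logic behind the
 * fallback above.  Note the comparison is done on the integer bit pattern,
 * so NaNs and Infs (max exponent) also take the "keep input" path:
 *
 *    union { float f; int32_t i; } u;
 *    u.f = fabsf(a);                     // mask out sign bit
 *    if (u.i > 0x4B800000)               // bit pattern of 2^24
 *       return a;                        // already integral (or NaN/Inf)
 *    return (float)(int32_t)a;           // round by truncation
 */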
  1673.  
  1674.  
  1675. /**
  1676.  * Return float (vector) rounded to nearest integer (vector).  The returned
  1677.  * value is a float (vector).
  1678.  * Ex: round(0.9) = 1.0
  1679.  * Ex: round(-1.5) = -2.0
  1680.  */
  1681. LLVMValueRef
  1682. lp_build_round(struct lp_build_context *bld,
  1683.                LLVMValueRef a)
  1684. {
  1685.    LLVMBuilderRef builder = bld->gallivm->builder;
  1686.    const struct lp_type type = bld->type;
  1687.  
  1688.    assert(type.floating);
  1689.    assert(lp_check_value(type, a));
  1690.  
  1691.    if (arch_rounding_available(type)) {
  1692.       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
  1693.    }
  1694.    else {
  1695.       const struct lp_type type = bld->type;
  1696.       struct lp_type inttype;
  1697.       struct lp_build_context intbld;
   1698.       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
  1699.       LLVMValueRef res, anosign, mask;
  1700.       LLVMTypeRef int_vec_type = bld->int_vec_type;
  1701.       LLVMTypeRef vec_type = bld->vec_type;
  1702.  
  1703.       assert(type.width == 32); /* might want to handle doubles at some point */
  1704.  
  1705.       inttype = type;
  1706.       inttype.floating = 0;
  1707.       lp_build_context_init(&intbld, bld->gallivm, inttype);
  1708.  
  1709.       res = lp_build_iround(bld, a);
  1710.       res = LLVMBuildSIToFP(builder, res, vec_type, "");
  1711.  
  1712.       /* mask out sign bit */
  1713.       anosign = lp_build_abs(bld, a);
  1714.       /*
  1715.        * mask out all values if anosign > 2^24
  1716.        * This should work both for large ints (all rounding is no-op for them
  1717.        * because such floats are always exact) as well as special cases like
  1718.        * NaNs, Infs (taking advantage of the fact they use max exponent).
   1719.        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
  1720.        */
  1721.       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
  1722.       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
  1723.       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
  1724.       return lp_build_select(bld, mask, a, res);
  1725.    }
  1726. }
  1727.  
  1728.  
  1729. /**
  1730.  * Return floor of float (vector), result is a float (vector)
  1731.  * Ex: floor(1.1) = 1.0
  1732.  * Ex: floor(-1.1) = -2.0
  1733.  */
  1734. LLVMValueRef
  1735. lp_build_floor(struct lp_build_context *bld,
  1736.                LLVMValueRef a)
  1737. {
  1738.    LLVMBuilderRef builder = bld->gallivm->builder;
  1739.    const struct lp_type type = bld->type;
  1740.  
  1741.    assert(type.floating);
  1742.    assert(lp_check_value(type, a));
  1743.  
  1744.    if (arch_rounding_available(type)) {
  1745.       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
  1746.    }
  1747.    else {
  1748.       const struct lp_type type = bld->type;
  1749.       struct lp_type inttype;
  1750.       struct lp_build_context intbld;
   1751.       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
  1752.       LLVMValueRef trunc, res, anosign, mask;
  1753.       LLVMTypeRef int_vec_type = bld->int_vec_type;
  1754.       LLVMTypeRef vec_type = bld->vec_type;
  1755.  
  1756.       assert(type.width == 32); /* might want to handle doubles at some point */
  1757.  
  1758.       inttype = type;
  1759.       inttype.floating = 0;
  1760.       lp_build_context_init(&intbld, bld->gallivm, inttype);
  1761.  
  1762.       /* round by truncation */
  1763.       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
  1764.       res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
  1765.  
  1766.       if (type.sign) {
  1767.          LLVMValueRef tmp;
  1768.  
  1769.          /*
  1770.           * fix values if rounding is wrong (for non-special cases)
  1771.           * - this is the case if trunc > a
  1772.           */
  1773.          mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
  1774.          /* tmp = trunc > a ? 1.0 : 0.0 */
  1775.          tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
  1776.          tmp = lp_build_and(&intbld, mask, tmp);
  1777.          tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
  1778.          res = lp_build_sub(bld, res, tmp);
  1779.       }
  1780.  
  1781.       /* mask out sign bit */
  1782.       anosign = lp_build_abs(bld, a);
  1783.       /*
  1784.        * mask out all values if anosign > 2^24
  1785.        * This should work both for large ints (all rounding is no-op for them
  1786.        * because such floats are always exact) as well as special cases like
  1787.        * NaNs, Infs (taking advantage of the fact they use max exponent).
   1788.        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
  1789.        */
  1790.       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
  1791.       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
  1792.       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
  1793.       return lp_build_select(bld, mask, a, res);
  1794.    }
  1795. }
  1796.  
  1797.  
  1798. /**
  1799.  * Return ceiling of float (vector), returning float (vector).
  1800.  * Ex: ceil( 1.1) = 2.0
  1801.  * Ex: ceil(-1.1) = -1.0
  1802.  */
  1803. LLVMValueRef
  1804. lp_build_ceil(struct lp_build_context *bld,
  1805.               LLVMValueRef a)
  1806. {
  1807.    LLVMBuilderRef builder = bld->gallivm->builder;
  1808.    const struct lp_type type = bld->type;
  1809.  
  1810.    assert(type.floating);
  1811.    assert(lp_check_value(type, a));
  1812.  
  1813.    if (arch_rounding_available(type)) {
  1814.       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
  1815.    }
  1816.    else {
  1817.       const struct lp_type type = bld->type;
  1818.       struct lp_type inttype;
  1819.       struct lp_build_context intbld;
   1820.       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1 << 24);
  1821.       LLVMValueRef trunc, res, anosign, mask, tmp;
  1822.       LLVMTypeRef int_vec_type = bld->int_vec_type;
  1823.       LLVMTypeRef vec_type = bld->vec_type;
  1824.  
  1825.       assert(type.width == 32); /* might want to handle doubles at some point */
  1826.  
  1827.       inttype = type;
  1828.       inttype.floating = 0;
  1829.       lp_build_context_init(&intbld, bld->gallivm, inttype);
  1830.  
  1831.       /* round by truncation */
  1832.       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
  1833.       trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
  1834.  
  1835.       /*
  1836.        * fix values if rounding is wrong (for non-special cases)
  1837.        * - this is the case if trunc < a
  1838.        */
  1839.       mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
  1840.       /* tmp = trunc < a ? 1.0 : 0.0 */
  1841.       tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
  1842.       tmp = lp_build_and(&intbld, mask, tmp);
  1843.       tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
  1844.       res = lp_build_add(bld, trunc, tmp);
  1845.  
  1846.       /* mask out sign bit */
  1847.       anosign = lp_build_abs(bld, a);
  1848.       /*
  1849.        * mask out all values if anosign > 2^24
  1850.        * This should work both for large ints (all rounding is no-op for them
  1851.        * because such floats are always exact) as well as special cases like
  1852.        * NaNs, Infs (taking advantage of the fact they use max exponent).
   1853.        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
  1854.        */
  1855.       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
  1856.       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
  1857.       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
  1858.       return lp_build_select(bld, mask, a, res);
  1859.    }
  1860. }
  1861.  
  1862.  
  1863. /**
  1864.  * Return fractional part of 'a' computed as a - floor(a)
  1865.  * Typically used in texture coord arithmetic.
  1866.  */
  1867. LLVMValueRef
  1868. lp_build_fract(struct lp_build_context *bld,
  1869.                LLVMValueRef a)
  1870. {
  1871.    assert(bld->type.floating);
  1872.    return lp_build_sub(bld, a, lp_build_floor(bld, a));
  1873. }
  1874.  
  1875.  
  1876. /**
  1877.  * Prevent returning a fractional part of 1.0 for very small negative values of
  1878.  * 'a' by clamping against 0.99999(9).
  1879.  */
  1880. static inline LLVMValueRef
  1881. clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
  1882. {
  1883.    LLVMValueRef max;
  1884.  
  1885.    /* this is the largest number smaller than 1.0 representable as float */
  1886.    max = lp_build_const_vec(bld->gallivm, bld->type,
  1887.                             1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
  1888.    return lp_build_min(bld, fract, max);
  1889. }
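
/*
 * Editor's note: for 32-bit floats the clamp value above works out to
 * 1.0 - 2^-24 = 0x1.fffffep-1f (about 0.99999994), the largest float
 * strictly below 1.0.
 */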
  1890.  
  1891.  
  1892. /**
  1893.  * Same as lp_build_fract, but guarantees that the result is always smaller
  1894.  * than one.
  1895.  */
  1896. LLVMValueRef
  1897. lp_build_fract_safe(struct lp_build_context *bld,
  1898.                     LLVMValueRef a)
  1899. {
  1900.    return clamp_fract(bld, lp_build_fract(bld, a));
  1901. }
  1902.  
  1903.  
  1904. /**
  1905.  * Return the integer part of a float (vector) value (== round toward zero).
  1906.  * The returned value is an integer (vector).
  1907.  * Ex: itrunc(-1.5) = -1
  1908.  */
  1909. LLVMValueRef
  1910. lp_build_itrunc(struct lp_build_context *bld,
  1911.                 LLVMValueRef a)
  1912. {
  1913.    LLVMBuilderRef builder = bld->gallivm->builder;
  1914.    const struct lp_type type = bld->type;
  1915.    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
  1916.  
  1917.    assert(type.floating);
  1918.    assert(lp_check_value(type, a));
  1919.  
  1920.    return LLVMBuildFPToSI(builder, a, int_vec_type, "");
  1921. }
  1922.  
  1923.  
  1924. /**
  1925.  * Return float (vector) rounded to nearest integer (vector).  The returned
  1926.  * value is an integer (vector).
  1927.  * Ex: iround(0.9) = 1
  1928.  * Ex: iround(-1.5) = -2
  1929.  */
  1930. LLVMValueRef
  1931. lp_build_iround(struct lp_build_context *bld,
  1932.                 LLVMValueRef a)
  1933. {
  1934.    LLVMBuilderRef builder = bld->gallivm->builder;
  1935.    const struct lp_type type = bld->type;
  1936.    LLVMTypeRef int_vec_type = bld->int_vec_type;
  1937.    LLVMValueRef res;
  1938.  
  1939.    assert(type.floating);
  1940.  
  1941.    assert(lp_check_value(type, a));
  1942.  
  1943.    if ((util_cpu_caps.has_sse2 &&
  1944.        ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
  1945.        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
  1946.       return lp_build_iround_nearest_sse2(bld, a);
  1947.    }
  1948.    if (arch_rounding_available(type)) {
  1949.       res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
  1950.    }
  1951.    else {
  1952.       LLVMValueRef half;
  1953.  
  1954.       half = lp_build_const_vec(bld->gallivm, type, 0.5);
  1955.  
  1956.       if (type.sign) {
  1957.          LLVMTypeRef vec_type = bld->vec_type;
  1958.          LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
  1959.                                     (unsigned long long)1 << (type.width - 1));
  1960.          LLVMValueRef sign;
  1961.  
  1962.          /* get sign bit */
  1963.          sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
  1964.          sign = LLVMBuildAnd(builder, sign, mask, "");
  1965.  
  1966.          /* sign * 0.5 */
  1967.          half = LLVMBuildBitCast(builder, half, int_vec_type, "");
  1968.          half = LLVMBuildOr(builder, sign, half, "");
  1969.          half = LLVMBuildBitCast(builder, half, vec_type, "");
  1970.       }
  1971.  
  1972.       res = LLVMBuildFAdd(builder, a, half, "");
  1973.    }
  1974.  
  1975.    res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
  1976.  
  1977.    return res;
  1978. }
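
/*
 * Editor's sketch (not part of the build) of the scalar fallback above:
 * OR-ing the sign bit of 'a' into 0.5 is copysignf(0.5f, a), so the add
 * pushes the value away from zero before the truncating conversion:
 *
 *    int32_t iround_sketch(float a)
 *    {
 *       return (int32_t)(a + copysignf(0.5f, a));  // round half away from zero
 *    }
 */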
  1979.  
  1980.  
  1981. /**
  1982.  * Return floor of float (vector), result is an int (vector)
   1983.  * Ex: ifloor(1.1) = 1
   1984.  * Ex: ifloor(-1.1) = -2
  1985.  */
  1986. LLVMValueRef
  1987. lp_build_ifloor(struct lp_build_context *bld,
  1988.                 LLVMValueRef a)
  1989. {
  1990.    LLVMBuilderRef builder = bld->gallivm->builder;
  1991.    const struct lp_type type = bld->type;
  1992.    LLVMTypeRef int_vec_type = bld->int_vec_type;
  1993.    LLVMValueRef res;
  1994.  
  1995.    assert(type.floating);
  1996.    assert(lp_check_value(type, a));
  1997.  
  1998.    res = a;
  1999.    if (type.sign) {
  2000.       if (arch_rounding_available(type)) {
  2001.          res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
  2002.       }
  2003.       else {
  2004.          struct lp_type inttype;
  2005.          struct lp_build_context intbld;
  2006.          LLVMValueRef trunc, itrunc, mask;
  2007.  
  2008.          assert(type.floating);
  2009.          assert(lp_check_value(type, a));
  2010.  
  2011.          inttype = type;
  2012.          inttype.floating = 0;
  2013.          lp_build_context_init(&intbld, bld->gallivm, inttype);
  2014.  
  2015.          /* round by truncation */
  2016.          itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
  2017.          trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
  2018.  
  2019.          /*
  2020.           * fix values if rounding is wrong (for non-special cases)
  2021.           * - this is the case if trunc > a
  2022.           * The results of doing this with NaNs, very large values etc.
  2023.           * are undefined but this seems to be the case anyway.
  2024.           */
  2025.          mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
   2026.          /* subtract one via the mask, since the mask is minus one / zero */
  2027.          return lp_build_add(&intbld, itrunc, mask);
  2028.       }
  2029.    }
  2030.  
   2031.    /* convert to int, truncating toward zero (res is integral or non-negative here) */
  2032.    res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
  2033.  
  2034.    return res;
  2035. }
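
/*
 * Editor's sketch (not part of the build) of the signed fallback above;
 * the comparison mask is ~0 (i.e. -1) where true, so adding it subtracts 1:
 *
 *    int32_t ifloor_sketch(float a)
 *    {
 *       int32_t i = (int32_t)a;            // truncate toward zero
 *       return ((float)i > a) ? i - 1 : i;
 *    }
 */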
  2036.  
  2037.  
  2038. /**
  2039.  * Return ceiling of float (vector), returning int (vector).
  2040.  * Ex: iceil( 1.1) = 2
  2041.  * Ex: iceil(-1.1) = -1
  2042.  */
  2043. LLVMValueRef
  2044. lp_build_iceil(struct lp_build_context *bld,
  2045.                LLVMValueRef a)
  2046. {
  2047.    LLVMBuilderRef builder = bld->gallivm->builder;
  2048.    const struct lp_type type = bld->type;
  2049.    LLVMTypeRef int_vec_type = bld->int_vec_type;
  2050.    LLVMValueRef res;
  2051.  
  2052.    assert(type.floating);
  2053.    assert(lp_check_value(type, a));
  2054.  
  2055.    if (arch_rounding_available(type)) {
  2056.       res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
  2057.    }
  2058.    else {
  2059.       struct lp_type inttype;
  2060.       struct lp_build_context intbld;
  2061.       LLVMValueRef trunc, itrunc, mask;
  2062.  
  2063.       assert(type.floating);
  2064.       assert(lp_check_value(type, a));
  2065.  
  2066.       inttype = type;
  2067.       inttype.floating = 0;
  2068.       lp_build_context_init(&intbld, bld->gallivm, inttype);
  2069.  
  2070.       /* round by truncation */
  2071.       itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
  2072.       trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
  2073.  
  2074.       /*
  2075.        * fix values if rounding is wrong (for non-special cases)
  2076.        * - this is the case if trunc < a
  2077.        * The results of doing this with NaNs, very large values etc.
  2078.        * are undefined but this seems to be the case anyway.
  2079.        */
  2080.       mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
   2081.       /* add one by subtracting the mask, since the mask is minus one / zero */
  2082.       return lp_build_sub(&intbld, itrunc, mask);
  2083.    }
  2084.  
   2085.    /* convert to int, truncating toward zero (res is already integral here) */
  2086.    res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
  2087.  
  2088.    return res;
  2089. }
  2090.  
  2091.  
  2092. /**
  2093.  * Combined ifloor() & fract().
  2094.  *
  2095.  * Preferred to calling the functions separately, as it will ensure that the
  2096.  * strategy (floor() vs ifloor()) that results in less redundant work is used.
  2097.  */
  2098. void
  2099. lp_build_ifloor_fract(struct lp_build_context *bld,
  2100.                       LLVMValueRef a,
  2101.                       LLVMValueRef *out_ipart,
  2102.                       LLVMValueRef *out_fpart)
  2103. {
  2104.    LLVMBuilderRef builder = bld->gallivm->builder;
  2105.    const struct lp_type type = bld->type;
  2106.    LLVMValueRef ipart;
  2107.  
  2108.    assert(type.floating);
  2109.    assert(lp_check_value(type, a));
  2110.  
  2111.    if (arch_rounding_available(type)) {
  2112.       /*
  2113.        * floor() is easier.
  2114.        */
  2115.  
  2116.       ipart = lp_build_floor(bld, a);
  2117.       *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
  2118.       *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
  2119.    }
  2120.    else {
  2121.       /*
  2122.        * ifloor() is easier.
  2123.        */
  2124.  
  2125.       *out_ipart = lp_build_ifloor(bld, a);
  2126.       ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
  2127.       *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
  2128.    }
  2129. }
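
/*
 * Editor's note, a hypothetical usage sketch (names bld/coord are
 * placeholders): texture coordinate code typically wants both parts at once,
 *
 *    LLVMValueRef ipart, fpart;
 *    lp_build_ifloor_fract(&bld, coord, &ipart, &fpart);
 *    // ipart selects the texel, fpart is the linear filtering weight
 */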
  2130.  
  2131.  
  2132. /**
  2133.  * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
  2134.  * always smaller than one.
  2135.  */
  2136. void
  2137. lp_build_ifloor_fract_safe(struct lp_build_context *bld,
  2138.                            LLVMValueRef a,
  2139.                            LLVMValueRef *out_ipart,
  2140.                            LLVMValueRef *out_fpart)
  2141. {
  2142.    lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
  2143.    *out_fpart = clamp_fract(bld, *out_fpart);
  2144. }
  2145.  
  2146.  
  2147. LLVMValueRef
  2148. lp_build_sqrt(struct lp_build_context *bld,
  2149.               LLVMValueRef a)
  2150. {
  2151.    LLVMBuilderRef builder = bld->gallivm->builder;
  2152.    const struct lp_type type = bld->type;
  2153.    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
  2154.    char intrinsic[32];
  2155.  
  2156.    assert(lp_check_value(type, a));
  2157.  
  2158.    /* TODO: optimize the constant case */
  2159.  
  2160.    assert(type.floating);
  2161.    if (type.length == 1) {
  2162.       util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width);
  2163.    }
  2164.    else {
  2165.       util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
  2166.    }
  2167.  
  2168.    return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
  2169. }
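
/*
 * Editor's note: the snprintf above builds the standard LLVM intrinsic
 * names, e.g. "llvm.sqrt.f32" for a scalar float and "llvm.sqrt.v4f32"
 * for a 4 x float32 vector.
 */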
  2170.  
  2171.  
  2172. /**
   2173.  * Do one Newton-Raphson step to improve reciprocal precision:
  2174.  *
  2175.  *   x_{i+1} = x_i * (2 - a * x_i)
  2176.  *
  2177.  * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
  2178.  * +/-Inf, giving NaN instead.  Certain applications rely on this behavior,
   2179.  * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
  2180.  * halo. It would be necessary to clamp the argument to prevent this.
  2181.  *
  2182.  * See also:
  2183.  * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
  2184.  * - http://softwarecommunity.intel.com/articles/eng/1818.htm
  2185.  */
  2186. static INLINE LLVMValueRef
  2187. lp_build_rcp_refine(struct lp_build_context *bld,
  2188.                     LLVMValueRef a,
  2189.                     LLVMValueRef rcp_a)
  2190. {
  2191.    LLVMBuilderRef builder = bld->gallivm->builder;
  2192.    LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
  2193.    LLVMValueRef res;
  2194.  
  2195.    res = LLVMBuildFMul(builder, a, rcp_a, "");
  2196.    res = LLVMBuildFSub(builder, two, res, "");
  2197.    res = LLVMBuildFMul(builder, rcp_a, res, "");
  2198.  
  2199.    return res;
  2200. }
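
/*
 * Editor's sketch (not part of the build), the same step in scalar C;
 * each iteration roughly doubles the number of correct bits of the
 * initial estimate:
 *
 *    float rcp_refine_sketch(float a, float x)
 *    {
 *       return x * (2.0f - a * x);   // x_{i+1} = x_i * (2 - a * x_i)
 *    }
 */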
  2201.  
  2202.  
  2203. LLVMValueRef
  2204. lp_build_rcp(struct lp_build_context *bld,
  2205.              LLVMValueRef a)
  2206. {
  2207.    LLVMBuilderRef builder = bld->gallivm->builder;
  2208.    const struct lp_type type = bld->type;
  2209.  
  2210.    assert(lp_check_value(type, a));
  2211.  
  2212.    if(a == bld->zero)
  2213.       return bld->undef;
  2214.    if(a == bld->one)
  2215.       return bld->one;
  2216.    if(a == bld->undef)
  2217.       return bld->undef;
  2218.  
  2219.    assert(type.floating);
  2220.  
  2221.    if(LLVMIsConstant(a))
  2222.       return LLVMConstFDiv(bld->one, a);
  2223.  
  2224.    /*
  2225.     * We don't use RCPPS because:
   2226.     * - it only has 10 bits of precision
   2227.     * - it doesn't even get the reciprocal of 1.0 exactly
   2228.     * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
   2229.     * - for recent processors the benefit over DIVPS is marginal, and case
   2230.     *   dependent
   2231.     *
   2232.     * We could still use it on certain processors if benchmarks show that
   2233.     * RCPPS plus the necessary workarounds is still preferable to DIVPS; or
   2234.     * for particular uses that require fewer workarounds.
  2235.     */
  2236.  
  2237.    if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
  2238.          (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
  2239.       const unsigned num_iterations = 0;
  2240.       LLVMValueRef res;
  2241.       unsigned i;
  2242.       const char *intrinsic = NULL;
  2243.  
  2244.       if (type.length == 4) {
  2245.          intrinsic = "llvm.x86.sse.rcp.ps";
  2246.       }
  2247.       else {
  2248.          intrinsic = "llvm.x86.avx.rcp.ps.256";
  2249.       }
  2250.  
  2251.       res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
  2252.  
  2253.       for (i = 0; i < num_iterations; ++i) {
  2254.          res = lp_build_rcp_refine(bld, a, res);
  2255.       }
  2256.  
  2257.       return res;
  2258.    }
  2259.  
  2260.    return LLVMBuildFDiv(builder, bld->one, a, "");
  2261. }
  2262.  
  2263.  
  2264. /**
  2265.  * Do one Newton-Raphson step to improve rsqrt precision:
  2266.  *
  2267.  *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
  2268.  *
  2269.  * See also Intel 64 and IA-32 Architectures Optimization Manual.
  2270.  */
  2271. static INLINE LLVMValueRef
  2272. lp_build_rsqrt_refine(struct lp_build_context *bld,
  2273.                       LLVMValueRef a,
  2274.                       LLVMValueRef rsqrt_a)
  2275. {
  2276.    LLVMBuilderRef builder = bld->gallivm->builder;
  2277.    LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
  2278.    LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
  2279.    LLVMValueRef res;
  2280.  
  2281.    res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
  2282.    res = LLVMBuildFMul(builder, a, res, "");
  2283.    res = LLVMBuildFSub(builder, three, res, "");
  2284.    res = LLVMBuildFMul(builder, rsqrt_a, res, "");
  2285.    res = LLVMBuildFMul(builder, half, res, "");
  2286.  
  2287.    return res;
  2288. }
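
/*
 * Editor's sketch (not part of the build), the same step in scalar C:
 *
 *    float rsqrt_refine_sketch(float a, float x)
 *    {
 *       return 0.5f * x * (3.0f - a * x * x);   // x_{i+1} = 0.5 x_i (3 - a x_i x_i)
 *    }
 */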
  2289.  
  2290.  
  2291. /**
  2292.  * Generate 1/sqrt(a).
  2293.  * Result is undefined for values < 0, infinity for +0.
  2294.  */
  2295. LLVMValueRef
  2296. lp_build_rsqrt(struct lp_build_context *bld,
  2297.                LLVMValueRef a)
  2298. {
  2299.    LLVMBuilderRef builder = bld->gallivm->builder;
  2300.    const struct lp_type type = bld->type;
  2301.  
  2302.    assert(lp_check_value(type, a));
  2303.  
  2304.    assert(type.floating);
  2305.  
  2306.    /*
  2307.     * This should be faster but all denormals will end up as infinity.
  2308.     */
  2309.    if (0 && lp_build_fast_rsqrt_available(type)) {
  2310.       const unsigned num_iterations = 1;
  2311.       LLVMValueRef res;
  2312.       unsigned i;
  2313.  
  2314.       /* rsqrt(1.0) != 1.0 here */
  2315.       res = lp_build_fast_rsqrt(bld, a);
  2316.  
  2317.       if (num_iterations) {
  2318.          /*
  2319.           * Newton-Raphson will result in NaN instead of infinity for zero,
  2320.           * and NaN instead of zero for infinity.
  2321.           * Also, need to ensure rsqrt(1.0) == 1.0.
  2322.           * All numbers smaller than FLT_MIN will result in +infinity
  2323.           * (rsqrtps treats all denormals as zero).
  2324.           */
  2325.          /*
  2326.           * Certain non-c99 compilers don't know INFINITY and might not support
   2327.           * hacks to evaluate it at compile time either.
  2328.           */
  2329.          const unsigned posinf_int = 0x7F800000;
  2330.          LLVMValueRef cmp;
  2331.          LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
  2332.          LLVMValueRef inf = lp_build_const_int_vec(bld->gallivm, type, posinf_int);
  2333.  
  2334.          inf = LLVMBuildBitCast(builder, inf, lp_build_vec_type(bld->gallivm, type), "");
  2335.  
  2336.          for (i = 0; i < num_iterations; ++i) {
  2337.             res = lp_build_rsqrt_refine(bld, a, res);
  2338.          }
  2339.          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
  2340.          res = lp_build_select(bld, cmp, inf, res);
  2341.          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
  2342.          res = lp_build_select(bld, cmp, bld->zero, res);
  2343.          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
  2344.          res = lp_build_select(bld, cmp, bld->one, res);
  2345.       }
  2346.  
  2347.       return res;
  2348.    }
  2349.  
  2350.    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
  2351. }
  2352.  
  2353. /**
   2354.  * Returns whether a fast (but inaccurate) rsqrt instruction is available.
   2355.  * Callers may want to check this before calling rsqrt_fast: e.g. x^0.5
   2356.  * can be computed as rsqrt_fast(x) * x, but if no fast rsqrt exists that
   2357.  * expands to sqrt/div/mul, in which case it is better to just call sqrt
   2358.  * directly, skipping both the div and the mul.
  2359.  */
  2360. boolean
  2361. lp_build_fast_rsqrt_available(struct lp_type type)
  2362. {
  2363.    assert(type.floating);
  2364.  
  2365.    if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
  2366.        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
  2367.       return true;
  2368.    }
  2369.    return false;
  2370. }
  2371.  
  2372.  
  2373. /**
  2374.  * Generate 1/sqrt(a).
  2375.  * Result is undefined for values < 0, infinity for +0.
   2376.  * Precision is limited: only ~10 bits are guaranteed
   2377.  * (rsqrt(1.0) may not be 1.0; denormals may be flushed to 0).
  2378.  */
  2379. LLVMValueRef
  2380. lp_build_fast_rsqrt(struct lp_build_context *bld,
  2381.                     LLVMValueRef a)
  2382. {
  2383.    LLVMBuilderRef builder = bld->gallivm->builder;
  2384.    const struct lp_type type = bld->type;
  2385.  
  2386.    assert(lp_check_value(type, a));
  2387.  
  2388.    if (lp_build_fast_rsqrt_available(type)) {
  2389.       const char *intrinsic = NULL;
  2390.  
  2391.       if (type.length == 4) {
  2392.          intrinsic = "llvm.x86.sse.rsqrt.ps";
  2393.       }
  2394.       else {
  2395.          intrinsic = "llvm.x86.avx.rsqrt.ps.256";
  2396.       }
  2397.       return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
  2398.    }
  2399.    else {
  2400.       debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
  2401.    }
  2402.    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
  2403. }
  2404.  
  2405.  
  2406. /**
  2407.  * Generate sin(a) using SSE2
  2408.  */
  2409. LLVMValueRef
  2410. lp_build_sin(struct lp_build_context *bld,
  2411.              LLVMValueRef a)
  2412. {
  2413.    struct gallivm_state *gallivm = bld->gallivm;
  2414.    LLVMBuilderRef builder = gallivm->builder;
  2415.    struct lp_type int_type = lp_int_type(bld->type);
  2416.    LLVMBuilderRef b = builder;
  2417.  
  2418.    /*
  2419.     *  take the absolute value,
  2420.     *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  2421.     */
  2422.  
  2423.    LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
  2424.    LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
  2425.  
  2426.    LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
  2427.    LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
  2428.  
  2429.    /*
  2430.     * extract the sign bit (upper one)
  2431.     * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
  2432.     */
  2433.    LLVMValueRef sig_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
  2434.    LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");
  2435.  
  2436.    /*
  2437.     * scale by 4/Pi
  2438.     * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
  2439.     */
  2440.    
  2441.    LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
  2442.    LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
  2443.  
  2444.    /*
  2445.     * store the integer part of y in mm0
  2446.     * emm2 = _mm_cvttps_epi32(y);
  2447.     */
  2448.    
  2449.    LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
  2450.  
  2451.    /*
  2452.     * j=(j+1) & (~1) (see the cephes sources)
  2453.     * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  2454.     */
  2455.  
  2456.    LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
  2457.    LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
  2458.    /*
  2459.     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  2460.     */
  2461.    LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
  2462.    LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
  2463.  
  2464.    /*
  2465.     * y = _mm_cvtepi32_ps(emm2);
  2466.     */
  2467.    LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
  2468.  
  2469.    /* get the swap sign flag
  2470.     * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
  2471.     */
  2472.    LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
  2473.    LLVMValueRef emm0_and =  LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");
  2474.    
  2475.    /*
  2476.     * emm2 = _mm_slli_epi32(emm0, 29);
  2477.     */  
  2478.    LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
  2479.    LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");
  2480.  
  2481.    /*
   2482.     * get the polynomial selection mask
   2483.     * there is one polynomial for 0 <= x <= Pi/4
   2484.     * and another one for Pi/4 < x <= Pi/2
  2485.     * Both branches will be computed.
  2486.     *  
  2487.     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  2488.     * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
  2489.     */
  2490.  
  2491.    LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
  2492.    LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
  2493.    LLVMValueRef poly_mask = lp_build_compare(gallivm,
  2494.                                              int_type, PIPE_FUNC_EQUAL,
  2495.                                              emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
  2496.    /*
  2497.     *   sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
  2498.     */
  2499.    LLVMValueRef sign_bit_1 =  LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");
  2500.  
  2501.    /*
  2502.     * _PS_CONST(minus_cephes_DP1, -0.78515625);
  2503.     * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
  2504.     * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
  2505.     */
  2506.    LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
  2507.    LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
  2508.    LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
  2509.  
  2510.    /*
  2511.     * The magic pass: "Extended precision modular arithmetic"
  2512.     * x = ((x - y * DP1) - y * DP2) - y * DP3;
  2513.     * xmm1 = _mm_mul_ps(y, xmm1);
  2514.     * xmm2 = _mm_mul_ps(y, xmm2);
  2515.     * xmm3 = _mm_mul_ps(y, xmm3);
  2516.     */
  2517.    LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
  2518.    LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
  2519.    LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
  2520.  
  2521.    /*
  2522.     * x = _mm_add_ps(x, xmm1);
  2523.     * x = _mm_add_ps(x, xmm2);
  2524.     * x = _mm_add_ps(x, xmm3);
  2525.     */
  2526.  
  2527.    LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
  2528.    LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
  2529.    LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
  2530.  
  2531.    /*
   2532.     * Evaluate the first polynomial  (0 <= x <= Pi/4)
  2533.     *
  2534.     * z = _mm_mul_ps(x,x);
  2535.     */
  2536.    LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
  2537.  
  2538.    /*
  2539.     * _PS_CONST(coscof_p0,  2.443315711809948E-005);
  2540.     * _PS_CONST(coscof_p1, -1.388731625493765E-003);
  2541.     * _PS_CONST(coscof_p2,  4.166664568298827E-002);
  2542.     */
  2543.    LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
  2544.    LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
  2545.    LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
  2546.  
  2547.    /*
  2548.     * y = *(v4sf*)_ps_coscof_p0;
  2549.     * y = _mm_mul_ps(y, z);
  2550.     */
  2551.    LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
  2552.    LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
  2553.    LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
  2554.    LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
  2555.    LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
  2556.    LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
  2557.  
  2558.  
  2559.    /*
  2560.     * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  2561.     * y = _mm_sub_ps(y, tmp);
  2562.     * y = _mm_add_ps(y, *(v4sf*)_ps_1);
  2563.     */
  2564.    LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
  2565.    LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
  2566.    LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
  2567.    LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
  2568.    LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
  2569.  
  2570.    /*
  2571.     * _PS_CONST(sincof_p0, -1.9515295891E-4);
  2572.     * _PS_CONST(sincof_p1,  8.3321608736E-3);
  2573.     * _PS_CONST(sincof_p2, -1.6666654611E-1);
  2574.     */
  2575.    LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
  2576.    LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
  2577.    LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
  2578.  
  2579.    /*
   2580.     * Evaluate the second polynomial  (Pi/4 < x <= Pi/2)
  2581.     *
  2582.     * y2 = *(v4sf*)_ps_sincof_p0;
  2583.     * y2 = _mm_mul_ps(y2, z);
  2584.     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  2585.     * y2 = _mm_mul_ps(y2, z);
  2586.     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  2587.     * y2 = _mm_mul_ps(y2, z);
  2588.     * y2 = _mm_mul_ps(y2, x);
  2589.     * y2 = _mm_add_ps(y2, x);
  2590.     */
  2591.  
  2592.    LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
  2593.    LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
  2594.    LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
  2595.    LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
  2596.    LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
  2597.    LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
  2598.    LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
  2599.  
  2600.    /*
   2601.     * select the correct result from the two polynomials
  2602.     * xmm3 = poly_mask;
  2603.     * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
  2604.     * y = _mm_andnot_ps(xmm3, y);
  2605.     * y = _mm_or_ps(y,y2);
  2606.     */
  2607.    LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
  2608.    LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
  2609.    LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
  2610.    LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
  2611.    LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
  2612.    LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
  2613.  
  2614.    /*
  2615.     * update the sign
  2616.     * y = _mm_xor_ps(y, sign_bit);
  2617.     */
  2618.    LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
  2619.    LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
  2620.    return y_result;
  2621. }
  2622.  
  2623.  
  2624. /**
  2625.  * Generate cos(a) using SSE2
  2626.  */
  2627. LLVMValueRef
  2628. lp_build_cos(struct lp_build_context *bld,
  2629.              LLVMValueRef a)
  2630. {
  2631.    struct gallivm_state *gallivm = bld->gallivm;
  2632.    LLVMBuilderRef builder = gallivm->builder;
  2633.    struct lp_type int_type = lp_int_type(bld->type);
  2634.    LLVMBuilderRef b = builder;
  2635.  
  2636.    /*
  2637.     *  take the absolute value,
  2638.     *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  2639.     */
  2640.  
  2641.    LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
  2642.    LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
  2643.  
  2644.    LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
  2645.    LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
  2646.  
  2647.    /*
  2648.     * scale by 4/Pi
  2649.     * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
  2650.     */
  2651.    
  2652.    LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
  2653.    LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
  2654.  
  2655.    /*
  2656.     * store the integer part of y in mm0
  2657.     * emm2 = _mm_cvttps_epi32(y);
  2658.     */
  2659.    
  2660.    LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
  2661.  
  2662.    /*
  2663.     * j=(j+1) & (~1) (see the cephes sources)
  2664.     * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  2665.     */
  2666.  
  2667.    LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
  2668.    LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
  2669.    /*
  2670.     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  2671.     */
  2672.    LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
  2673.    LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
  2674.  
  2675.    /*
  2676.     * y = _mm_cvtepi32_ps(emm2);
  2677.     */
  2678.    LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
  2679.  
  2680.  
  2681.    /*
  2682.     * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
  2683.     */
  2684.    LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
  2685.    LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
  2686.  
  2687.  
  2688.    /* get the swap sign flag
  2689.     * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
  2690.     */
  2691.    LLVMValueRef inv = lp_build_const_int_vec(gallivm, bld->type, ~0);
  2692.    LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
  2693.    LLVMValueRef pi32_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
  2694.    LLVMValueRef emm0_and =  LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
  2695.    
  2696.    /*
  2697.     * emm2 = _mm_slli_epi32(emm0, 29);
  2698.     */  
  2699.    LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
  2700.    LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
  2701.  
  2702.    /*
   2703.     * get the polynomial selection mask
   2704.     * there is one polynomial for 0 <= x <= Pi/4
   2705.     * and another one for Pi/4 < x <= Pi/2
  2706.     * Both branches will be computed.
  2707.     *  
  2708.     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  2709.     * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
  2710.     */
  2711.  
  2712.    LLVMValueRef pi32_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
  2713.    LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
  2714.    LLVMValueRef poly_mask = lp_build_compare(gallivm,
  2715.                                              int_type, PIPE_FUNC_EQUAL,
  2716.                                              emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
  2717.  
  2718.    /*
  2719.     * _PS_CONST(minus_cephes_DP1, -0.78515625);
  2720.     * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
  2721.     * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
  2722.     */
  2723.    LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
  2724.    LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
  2725.    LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
  2726.  
  2727.    /*
  2728.     * The magic pass: "Extended precision modular arithmetic"
  2729.     * x = ((x - y * DP1) - y * DP2) - y * DP3;
  2730.     * xmm1 = _mm_mul_ps(y, xmm1);
  2731.     * xmm2 = _mm_mul_ps(y, xmm2);
  2732.     * xmm3 = _mm_mul_ps(y, xmm3);
  2733.     */
  2734.    LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
  2735.    LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
  2736.    LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
  2737.  
  2738.    /*
  2739.     * x = _mm_add_ps(x, xmm1);
  2740.     * x = _mm_add_ps(x, xmm2);
  2741.     * x = _mm_add_ps(x, xmm3);
  2742.     */
  2743.  
  2744.    LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
  2745.    LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
  2746.    LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
  2747.  
  2748.    /*
   2749.     * Evaluate the first polynomial  (0 <= x <= Pi/4)
  2750.     *
  2751.     * z = _mm_mul_ps(x,x);
  2752.     */
  2753.    LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
  2754.  
  2755.    /*
  2756.     * _PS_CONST(coscof_p0,  2.443315711809948E-005);
  2757.     * _PS_CONST(coscof_p1, -1.388731625493765E-003);
  2758.     * _PS_CONST(coscof_p2,  4.166664568298827E-002);
  2759.     */
  2760.    LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
  2761.    LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
  2762.    LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
  2763.  
  2764.    /*
  2765.     * y = *(v4sf*)_ps_coscof_p0;
  2766.     * y = _mm_mul_ps(y, z);
  2767.     */
  2768.    LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
  2769.    LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
  2770.    LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
  2771.    LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
  2772.    LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
  2773.    LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
  2774.  
  2775.  
  2776.    /*
  2777.     * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  2778.     * y = _mm_sub_ps(y, tmp);
  2779.     * y = _mm_add_ps(y, *(v4sf*)_ps_1);
  2780.     */
  2781.    LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
  2782.    LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
  2783.    LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
  2784.    LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
  2785.    LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
  2786.  
  2787.    /*
  2788.     * _PS_CONST(sincof_p0, -1.9515295891E-4);
  2789.     * _PS_CONST(sincof_p1,  8.3321608736E-3);
  2790.     * _PS_CONST(sincof_p2, -1.6666654611E-1);
  2791.     */
  2792.    LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
  2793.    LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
  2794.    LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
  2795.  
  2796.    /*
   2797.     * Evaluate the second polynomial  (Pi/4 < x <= Pi/2)
  2798.     *
  2799.     * y2 = *(v4sf*)_ps_sincof_p0;
  2800.     * y2 = _mm_mul_ps(y2, z);
  2801.     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  2802.     * y2 = _mm_mul_ps(y2, z);
  2803.     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  2804.     * y2 = _mm_mul_ps(y2, z);
  2805.     * y2 = _mm_mul_ps(y2, x);
  2806.     * y2 = _mm_add_ps(y2, x);
  2807.     */
  2808.  
  2809.    LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
  2810.    LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
  2811.    LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
  2812.    LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
  2813.    LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
  2814.    LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
  2815.    LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
  2816.  
  2817.    /*
   2818.     * select the correct result from the two polynomials
  2819.     * xmm3 = poly_mask;
  2820.     * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
  2821.     * y = _mm_andnot_ps(xmm3, y);
  2822.     * y = _mm_or_ps(y,y2);
  2823.     */
  2824.    LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
  2825.    LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
  2826.    LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
  2827.    LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
  2828.    LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
  2829.    LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
  2830.  
  2831.    /*
  2832.     * update the sign
  2833.     * y = _mm_xor_ps(y, sign_bit);
  2834.     */
   2835.    LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_cos");
  2836.    LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
  2837.    return y_result;
  2838. }
  2839.  
  2840.  
  2841. /**
   2842.  * Generate pow(x, y) as exp2(log2(x) * y).
  2843.  */
  2844. LLVMValueRef
  2845. lp_build_pow(struct lp_build_context *bld,
  2846.              LLVMValueRef x,
  2847.              LLVMValueRef y)
  2848. {
  2849.    /* TODO: optimize the constant case */
  2850.    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
  2851.        LLVMIsConstant(x) && LLVMIsConstant(y)) {
  2852.       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
  2853.                    __FUNCTION__);
  2854.    }
  2855.  
  2856.    return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
  2857. }
  2858.  
  2859.  
  2860. /**
  2861.  * Generate exp(x)
  2862.  */
  2863. LLVMValueRef
  2864. lp_build_exp(struct lp_build_context *bld,
  2865.              LLVMValueRef x)
  2866. {
  2867.    /* log2(e) = 1/log(2) */
  2868.    LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
  2869.                                            1.4426950408889634);
  2870.  
  2871.    assert(lp_check_value(bld->type, x));
  2872.  
  2873.    return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
  2874. }
  2875.  
  2876.  
  2877. /**
  2878.  * Generate log(x)
  2879.  */
  2880. LLVMValueRef
  2881. lp_build_log(struct lp_build_context *bld,
  2882.              LLVMValueRef x)
  2883. {
  2884.    /* log(2) */
  2885.    LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
  2886.                                           0.69314718055994529);
  2887.  
  2888.    assert(lp_check_value(bld->type, x));
  2889.  
  2890.    return lp_build_mul(bld, log2, lp_build_log2(bld, x));
  2891. }
  2892.  
  2893.  
  2894. /**
  2895.  * Generate polynomial.
  2896.  * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
  2897.  */
  2898. LLVMValueRef
  2899. lp_build_polynomial(struct lp_build_context *bld,
  2900.                     LLVMValueRef x,
  2901.                     const double *coeffs,
  2902.                     unsigned num_coeffs)
  2903. {
  2904.    const struct lp_type type = bld->type;
  2905.    LLVMValueRef even = NULL, odd = NULL;
  2906.    LLVMValueRef x2;
  2907.    unsigned i;
  2908.  
  2909.    assert(lp_check_value(bld->type, x));
  2910.  
  2911.    /* TODO: optimize the constant case */
  2912.    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
  2913.        LLVMIsConstant(x)) {
  2914.       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
  2915.                    __FUNCTION__);
  2916.    }
  2917.  
  2918.    /*
   2919.     * Calculate odd and even terms separately to decrease data dependency
  2920.     * Ex:
  2921.     *     c[0] + x^2 * c[2] + x^4 * c[4] ...
  2922.     *     + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
  2923.     */
  2924.    x2 = lp_build_mul(bld, x, x);
  2925.  
  2926.    for (i = num_coeffs; i--; ) {
  2927.       LLVMValueRef coeff;
  2928.  
  2929.       coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
  2930.  
  2931.       if (i % 2 == 0) {
  2932.          if (even)
  2933.             even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
  2934.          else
  2935.             even = coeff;
  2936.       } else {
  2937.          if (odd)
  2938.             odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
  2939.          else
  2940.             odd = coeff;
  2941.       }
  2942.    }
  2943.  
  2944.    if (odd)
  2945.       return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
  2946.    else if (even)
  2947.       return even;
  2948.    else
  2949.       return bld->undef;
  2950. }
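
/*
 * Editor's sketch (not part of the build): for four coefficients the loop
 * above computes, in scalar form,
 *
 *    x2   = x * x;
 *    even = c0 + x2 * c2;        // Horner over the even-degree terms
 *    odd  = c1 + x2 * c3;        // Horner over the odd-degree terms
 *    res  = even + x * odd;
 *
 * The even and odd chains have no data dependency on each other, so they
 * can be evaluated in parallel.
 */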


/**
 * Minimax polynomial fit of 2**x, in range [0, 1[
 */
const double lp_build_exp2_polynomial[] = {
#if EXP_POLY_DEGREE == 5
   0.999999925063526176901,
   0.693153073200168932794,
   0.240153617044375388211,
   0.0558263180532956664775,
   0.00898934009049466391101,
   0.00187757667519147912699
#elif EXP_POLY_DEGREE == 4
   1.00000259337069434683,
   0.693003834469974940458,
   0.24144275689150793076,
   0.0520114606103070150235,
   0.0135341679161270268764
#elif EXP_POLY_DEGREE == 3
   0.999925218562710312959,
   0.695833540494823811697,
   0.226067155427249155588,
   0.0780245226406372992967
#elif EXP_POLY_DEGREE == 2
   1.00172476321474503578,
   0.657636275736077639316,
   0.33718943461968720704
#else
#error
#endif
};


void
lp_build_exp2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp2_int_part,
                     LLVMValueRef *p_frac_part,
                     LLVMValueRef *p_exp2)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMValueRef ipart = NULL;
   LLVMValueRef fpart = NULL;
   LLVMValueRef expipart = NULL;
   LLVMValueRef expfpart = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   if(p_exp2_int_part || p_frac_part || p_exp2) {
      /* TODO: optimize the constant case */
      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
          LLVMIsConstant(x)) {
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);
      }

      assert(type.floating && type.width == 32);

      /* Clamp so the integer part stays within the biased exponent range */
      x = lp_build_min(bld, x, lp_build_const_vec(bld->gallivm, type,  129.0));
      x = lp_build_max(bld, x, lp_build_const_vec(bld->gallivm, type, -126.99999));

      /* ipart = floor(x) */
      /* fpart = x - ipart */
      lp_build_ifloor_fract(bld, x, &ipart, &fpart);
   }

   if(p_exp2_int_part || p_exp2) {
      /* expipart = (float) (1 << ipart) */
      expipart = LLVMBuildAdd(builder, ipart,
                              lp_build_const_int_vec(bld->gallivm, type, 127), "");
      expipart = LLVMBuildShl(builder, expipart,
                              lp_build_const_int_vec(bld->gallivm, type, 23), "");
      expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
   }

   if(p_exp2) {
      expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
                                     Elements(lp_build_exp2_polynomial));

      res = LLVMBuildFMul(builder, expipart, expfpart, "");
   }

   if(p_exp2_int_part)
      *p_exp2_int_part = expipart;

   if(p_frac_part)
      *p_frac_part = fpart;

   if(p_exp2)
      *p_exp2 = res;
}
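

/*
 * A minimal scalar model of the reduction above, for illustration only: it
 * assumes IEEE-754 binary32 plus <math.h>/<stdint.h>, and the name
 * scalar_exp2_sketch is hypothetical, not part of gallivm.  2**ipart is
 * assembled directly in the exponent field; 2**fpart comes from the minimax
 * polynomial over [0, 1[.
 */
static float
scalar_exp2_sketch(float x)
{
   union { float f; uint32_t i; } expipart;
   float ipart, fpart, expfpart;
   unsigned i;

   /* same clamp as above, keeping ipart within the exponent range */
   if (x >  129.0f)
      x =  129.0f;
   if (x < -126.99999f)
      x = -126.99999f;

   ipart = floorf(x);
   fpart = x - ipart;

   /* expipart = (float) 2**ipart, built in the biased exponent field */
   expipart.i = (uint32_t)((int32_t)ipart + 127) << 23;

   /* expfpart = 2**fpart on [0, 1[ (plain Horner here; the vector code
    * splits even/odd terms via lp_build_polynomial) */
   expfpart = 0.0f;
   for (i = Elements(lp_build_exp2_polynomial); i--; )
      expfpart = (float)lp_build_exp2_polynomial[i] + fpart * expfpart;

   return expipart.f * expfpart;
}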


LLVMValueRef
lp_build_exp2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_exp2_approx(bld, x, NULL, NULL, &res);
   return res;
}


/**
 * Extract the exponent of an IEEE-754 floating point value.
 *
 * Optionally apply an integer bias.
 *
 * Result is an integer value with
 *
 *   ifloor(log2(x)) + bias
 */
LLVMValueRef
lp_build_extract_exponent(struct lp_build_context *bld,
                          LLVMValueRef x,
                          int bias)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   unsigned mantissa = lp_mantissa(type);
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(bld->type, x));

   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");

   /* shift out the mantissa, mask the 8 exponent bits, remove the 127 bias */
   res = LLVMBuildLShr(builder, x,
                       lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
   res = LLVMBuildAnd(builder, res,
                      lp_build_const_int_vec(bld->gallivm, type, 255), "");
   res = LLVMBuildSub(builder, res,
                      lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");

   return res;
}
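

/*
 * Scalar equivalent of the bit manipulation above, assuming binary32 and
 * <stdint.h>; the helper name is hypothetical.
 */
static int32_t
scalar_extract_exponent_sketch(float x, int bias)
{
   union { float f; uint32_t i; } u;
   u.f = x;
   return (int32_t)((u.i >> 23) & 0xff) - (127 - bias);
}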


/**
 * Extract the mantissa of a floating point value.
 *
 * Result is a floating point value with
 *
 *   x / 2**floor(log2(x))
 */
LLVMValueRef
lp_build_extract_mantissa(struct lp_build_context *bld,
                          LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   unsigned mantissa = lp_mantissa(type);
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
                                                  (1ULL << mantissa) - 1);
   LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));

   assert(type.floating);

   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");

   /* res = x / 2**floor(log2(x)) */
   res = LLVMBuildAnd(builder, x, mantmask, "");
   res = LLVMBuildOr(builder, res, one, "");
   res = LLVMBuildBitCast(builder, res, bld->vec_type, "");

   return res;
}
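

/*
 * Scalar equivalent for binary32 (hypothetical helper, assuming <stdint.h>):
 * keep the mantissa bits and overwrite the exponent with that of 1.0f,
 * yielding x / 2**floor(log2(x)) in [1, 2[.
 */
static float
scalar_extract_mantissa_sketch(float x)
{
   union { float f; uint32_t i; } u;
   u.f = x;
   u.i = (u.i & 0x007fffff) | 0x3f800000;   /* 0x3f800000 == 1.0f */
   return u.f;
}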



/**
 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
 * These coefficients can be generated with
 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
 */
const double lp_build_log2_polynomial[] = {
#if LOG_POLY_DEGREE == 5
   2.88539008148777786488L,
   0.961796878841293367824L,
   0.577058946784739859012L,
   0.412914355135828735411L,
   0.308591899232910175289L,
   0.352376952300281371868L,
#elif LOG_POLY_DEGREE == 4
   2.88539009343309178325L,
   0.961791550404184197881L,
   0.577440339438736392009L,
   0.403343858251329912514L,
   0.406718052498846252698L,
#elif LOG_POLY_DEGREE == 3
   2.88538959748872753838L,
   0.961932915889597772928L,
   0.571118517972136195241L,
   0.493997535084709500285L,
#else
#error
#endif
};

/**
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 * http://en.wikipedia.org/wiki/Logarithm#Calculation
 * http://www.nezumi.demon.co.uk/consult/logx.htm
 */
void
lp_build_log2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp,
                     LLVMValueRef *p_floor_log2,
                     LLVMValueRef *p_log2)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);

   LLVMValueRef i = NULL;
   LLVMValueRef y = NULL;
   LLVMValueRef z = NULL;
   LLVMValueRef exp = NULL;
   LLVMValueRef mant = NULL;
   LLVMValueRef logexp = NULL;
   LLVMValueRef logmant = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   if(p_exp || p_floor_log2 || p_log2) {
      /* TODO: optimize the constant case */
      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
          LLVMIsConstant(x)) {
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);
      }

      assert(type.floating && type.width == 32);

      /*
       * We don't explicitly handle denormalized numbers. They will yield a
       * result in the neighbourhood of -127, which appears to be adequate.
       */

      i = LLVMBuildBitCast(builder, x, int_vec_type, "");

      /* exp = (float) exponent(x) */
      exp = LLVMBuildAnd(builder, i, expmask, "");
   }

   if(p_floor_log2 || p_log2) {
      logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
      logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
      logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
   }

   if(p_log2) {
      /* mant = 1 + (float) mantissa(x) */
      mant = LLVMBuildAnd(builder, i, mantmask, "");
      mant = LLVMBuildOr(builder, mant, one, "");
      mant = LLVMBuildBitCast(builder, mant, vec_type, "");

      /* y = (mant - 1) / (mant + 1) */
      y = lp_build_div(bld,
         lp_build_sub(bld, mant, bld->one),
         lp_build_add(bld, mant, bld->one)
      );

      /* z = y^2 */
      z = lp_build_mul(bld, y, y);

      /* compute P(z) */
      logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
                                    Elements(lp_build_log2_polynomial));

      /* logmant = y * P(z) */
      logmant = lp_build_mul(bld, y, logmant);

      res = lp_build_add(bld, logmant, logexp);
   }

   if(p_exp) {
      exp = LLVMBuildBitCast(builder, exp, vec_type, "");
      *p_exp = exp;
   }

   if(p_floor_log2)
      *p_floor_log2 = logexp;

   if(p_log2)
      *p_log2 = res;
}
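

/*
 * Scalar model of the above, assuming binary32 and <stdint.h>; the helper
 * name is hypothetical.  The exponent field gives floor(log2(x)); the
 * normalized mantissa m in [1, 2[ is mapped through y = (m - 1)/(m + 1), so
 * the atanh-style series y * P(y*y) only has to cover z = y*y in [0, 1/9[.
 */
static float
scalar_log2_sketch(float x)
{
   union { float f; uint32_t i; } u;
   float m, y, z, p, logexp;
   unsigned i;

   u.f = x;
   logexp = (float)((int32_t)((u.i >> 23) & 0xff) - 127);

   u.i = (u.i & 0x007fffff) | 0x3f800000;   /* m in [1, 2[ */
   m = u.f;

   y = (m - 1.0f) / (m + 1.0f);
   z = y * y;

   /* P(z), plain Horner for brevity */
   p = 0.0f;
   for (i = Elements(lp_build_log2_polynomial); i--; )
      p = (float)lp_build_log2_polynomial[i] + z * p;

   return y * p + logexp;
}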


LLVMValueRef
lp_build_log2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_log2_approx(bld, x, NULL, NULL, &res);
   return res;
}


/**
 * Faster (and less accurate) log2.
 *
 *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
 *
 * Piece-wise linear approximation, with exact results when x is a
 * power of two.
 *
 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
 */
LLVMValueRef
lp_build_fast_log2(struct lp_build_context *bld,
                   LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef ipart;
   LLVMValueRef fpart;

   assert(lp_check_value(bld->type, x));

   assert(bld->type.floating);

   /* ipart = floor(log2(x)) - 1 */
   ipart = lp_build_extract_exponent(bld, x, -1);
   ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");

   /* fpart = x / 2**floor(log2(x)), in [1, 2[ */
   fpart = lp_build_extract_mantissa(bld, x);

   /* ipart + fpart */
   return LLVMBuildFAdd(builder, ipart, fpart, "");
}
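

/*
 * Scalar model of the piece-wise linear approximation, assuming binary32 and
 * <stdint.h>; the helper name is hypothetical.  When x is a power of two the
 * fractional term is exactly 1.0, cancelling the -1 in the exponent term.
 */
static float
scalar_fast_log2_sketch(float x)
{
   union { float f; uint32_t i; } u;
   float ipart, fpart;

   u.f = x;
   /* floor(log2(x)) - 1, i.e. extract_exponent with bias -1 */
   ipart = (float)((int32_t)((u.i >> 23) & 0xff) - 128);

   /* x / 2**floor(log2(x)), in [1, 2[ */
   u.i = (u.i & 0x007fffff) | 0x3f800000;
   fpart = u.f;

   return ipart + fpart;
}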


/**
 * Fast implementation of iround(log2(x)).
 *
 * Not an approximation -- it should give accurate results all the time.
 */
LLVMValueRef
lp_build_ilog2(struct lp_build_context *bld,
               LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
   LLVMValueRef ipart;

   assert(bld->type.floating);

   assert(lp_check_value(bld->type, x));

   /* x * 2^(0.5)   i.e., add 0.5 to the log2(x) */
   x = LLVMBuildFMul(builder, x, sqrt2, "");

   /* ipart = floor(log2(x) + 0.5)  */
   ipart = lp_build_extract_exponent(bld, x, 0);

   return ipart;
}
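

/*
 * Scalar model of the rounding trick, assuming binary32; hypothetical name.
 * Multiplying by sqrt(2) adds 0.5 to log2(x), so flooring via the exponent
 * afterwards rounds to nearest instead of flooring:
 * floor(log2(x) + 0.5) == iround(log2(x)).
 */
static int32_t
scalar_ilog2_sketch(float x)
{
   union { float f; uint32_t i; } u;
   u.f = x * (float)M_SQRT2;
   return (int32_t)((u.i >> 23) & 0xff) - 127;
}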


LLVMValueRef
lp_build_mod(struct lp_build_context *bld,
             LLVMValueRef x,
             LLVMValueRef y)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef res;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, y));

   if (type.floating)
      res = LLVMBuildFRem(builder, x, y, "");
   else if (type.sign)
      res = LLVMBuildSRem(builder, x, y, "");
   else
      res = LLVMBuildURem(builder, x, y, "");
   return res;
}
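

/*
 * The three LLVM remainder instructions chosen above follow C's semantics:
 * the result takes the sign of the dividend.  A scalar sketch (hypothetical
 * names, assuming <math.h>):
 */
static float
mod_float_sketch(float x, float y)
{
   return fmodf(x, y);   /* FRem: fmodf(-7.0f, 3.0f) == -1.0f */
}

static int
mod_sint_sketch(int x, int y)
{
   return x % y;         /* SRem: -7 % 3 == -1 */
}

static unsigned
mod_uint_sketch(unsigned x, unsigned y)
{
   return x % y;         /* URem */
}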