/**************************************************************************
 *
 * Copyright 2009-2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Arithmetic helper functions.
 *
 * LLVM IR doesn't support all basic arithmetic operations we care about (most
 * notably min/max and saturated operations), and it is often necessary to
 * resort to machine-specific intrinsics directly. The functions here hide all
 * these implementation details from the other modules.
 *
 * We also do simple expression simplification here. The reasons are:
 * - it is very easy given we have all necessary information readily available
 * - LLVM optimization passes fail to simplify several vector expressions
 * - we often know value constraints which the optimization passes have no way
 *   of knowing, such as when source arguments are known to be in the [0, 1]
 *   range.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */


#include <float.h>

#include "util/u_memory.h"
#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_string.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
#include "lp_bld_logic.h"
#include "lp_bld_pack.h"
#include "lp_bld_debug.h"
#include "lp_bld_bitarit.h"
#include "lp_bld_arit.h"
#include "lp_bld_flow.h"

#if defined(PIPE_ARCH_SSE)
#include <xmmintrin.h>
#endif

#ifndef _MM_DENORMALS_ZERO_MASK
#define _MM_DENORMALS_ZERO_MASK 0x0040
#endif

#ifndef _MM_FLUSH_ZERO_MASK
#define _MM_FLUSH_ZERO_MASK 0x8000
#endif

#define EXP_POLY_DEGREE 5

#define LOG_POLY_DEGREE 4


/**
 * Generate min(a, b)
 * No checks are done for the special-case values a or b being 0 or 1.
 * NaNs are handled according to the behavior specified by the
 * nan_behavior argument.
 */
static LLVMValueRef
lp_build_min_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b,
                    enum gallivm_nan_behavior nan_behavior)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_cpu_caps.has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.min.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse.min.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_cpu_caps.has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.min.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse2.min.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (type.floating && util_cpu_caps.has_altivec) {
      if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
          nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
         debug_printf("%s: altivec doesn't support nan return nan behavior\n",
                      __FUNCTION__);
      }
      if (type.width == 32 && type.length == 4) {
         intrinsic = "llvm.ppc.altivec.vminfp";
         intr_size = 128;
      }
   } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
      intr_size = 128;
      if ((type.width == 8 || type.width == 16) &&
          (type.width * type.length <= 64) &&
          (gallivm_debug & GALLIVM_DEBUG_PERF)) {
         debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
                      __FUNCTION__);
      }
      if (type.width == 8 && !type.sign) {
         intrinsic = "llvm.x86.sse2.pminu.b";
      }
      else if (type.width == 16 && type.sign) {
         intrinsic = "llvm.x86.sse2.pmins.w";
      }
      if (util_cpu_caps.has_sse4_1) {
         if (type.width == 8 && type.sign) {
            intrinsic = "llvm.x86.sse41.pminsb";
         }
         if (type.width == 16 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pminuw";
         }
         if (type.width == 32 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pminud";
         }
         if (type.width == 32 && type.sign) {
            intrinsic = "llvm.x86.sse41.pminsd";
         }
      }
   } else if (util_cpu_caps.has_altivec) {
      intr_size = 128;
      if (type.width == 8) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminub";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsb";
         }
      } else if (type.width == 16) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminuh";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsh";
         }
      } else if (type.width == 32) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminuw";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsw";
         }
      }
   }

   if(intrinsic) {
      /* We need to handle NaNs for floating point numbers. If one of the
       * inputs is a NaN the other should be returned (required by both
       * D3D10+ and OpenCL).
       * The SSE intrinsics return the second operand if either operand is
       * a NaN, so we need special code to handle those cases.
       */
      if (util_cpu_caps.has_sse && type.floating &&
          nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
          nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
          nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
         LLVMValueRef isnan, min;
         min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                   type,
                                                   intr_size, a, b);
         if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
            isnan = lp_build_isnan(bld, b);
            return lp_build_select(bld, isnan, a, min);
         } else {
            assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
            isnan = lp_build_isnan(bld, a);
            return lp_build_select(bld, isnan, a, min);
         }
      } else {
         return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                    type,
                                                    intr_size, a, b);
      }
   }

   if (type.floating) {
      switch (nan_behavior) {
      case GALLIVM_NAN_RETURN_NAN: {
         LLVMValueRef isnan = lp_build_isnan(bld, b);
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
      case GALLIVM_NAN_RETURN_OTHER: {
         LLVMValueRef isnan = lp_build_isnan(bld, a);
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
      case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
         cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
      case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
         return lp_build_select(bld, cond, b, a);
      case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
      default:
         assert(0);
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
      }
   } else {
      cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
      return lp_build_select(bld, cond, a, b);
   }
}
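

/*
 * Editor's note: a minimal scalar sketch (not part of gallivm) of the NaN
 * semantics selected by the nan_behavior argument above, following the
 * comment in lp_build_min_simple. Uses the self-contained a != a NaN test.
 */
static float
min_nan_ref(float a, float b, enum gallivm_nan_behavior nan_behavior)
{
   if (a != a || b != b) {                   /* at least one input is NaN */
      if (nan_behavior == GALLIVM_NAN_RETURN_NAN)
         return a != a ? a : b;              /* propagate the NaN */
      if (nan_behavior == GALLIVM_NAN_RETURN_OTHER)
         return a != a ? b : a;              /* return the non-NaN input */
      /* the remaining behaviors are ordering-dependent or undefined */
   }
   return a < b ? a : b;
}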


/**
 * Generate max(a, b)
 * No checks are done for the special-case values a or b being 0 or 1.
 * NaNs are handled according to the behavior specified by the
 * nan_behavior argument.
 */
static LLVMValueRef
lp_build_max_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b,
                    enum gallivm_nan_behavior nan_behavior)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_cpu_caps.has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.max.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse.max.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_cpu_caps.has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.max.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse2.max.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (type.floating && util_cpu_caps.has_altivec) {
      if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
          nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
         debug_printf("%s: altivec doesn't support nan return nan behavior\n",
                      __FUNCTION__);
      }
      if (type.width == 32 && type.length == 4) {
         intrinsic = "llvm.ppc.altivec.vmaxfp";
         intr_size = 128;
      }
   } else if (util_cpu_caps.has_sse2 && type.length >= 2) {
      intr_size = 128;
      if ((type.width == 8 || type.width == 16) &&
          (type.width * type.length <= 64) &&
          (gallivm_debug & GALLIVM_DEBUG_PERF)) {
         debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
                      __FUNCTION__);
      }
      if (type.width == 8 && !type.sign) {
         intrinsic = "llvm.x86.sse2.pmaxu.b";
      }
      else if (type.width == 16 && type.sign) {
         intrinsic = "llvm.x86.sse2.pmaxs.w";
      }
      if (util_cpu_caps.has_sse4_1) {
         if (type.width == 8 && type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxsb";
         }
         if (type.width == 16 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxuw";
         }
         if (type.width == 32 && !type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxud";
         }
         if (type.width == 32 && type.sign) {
            intrinsic = "llvm.x86.sse41.pmaxsd";
         }
      }
   } else if (util_cpu_caps.has_altivec) {
      intr_size = 128;
      if (type.width == 8) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxub";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsb";
         }
      } else if (type.width == 16) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxuh";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsh";
         }
      } else if (type.width == 32) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxuw";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsw";
         }
      }
   }

   if(intrinsic) {
      if (util_cpu_caps.has_sse && type.floating &&
          nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
          nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
          nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
         LLVMValueRef isnan, max;
         max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                   type,
                                                   intr_size, a, b);
         if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
            isnan = lp_build_isnan(bld, b);
            return lp_build_select(bld, isnan, a, max);
         } else {
            assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
            isnan = lp_build_isnan(bld, a);
            return lp_build_select(bld, isnan, a, max);
         }
      } else {
         return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                    type,
                                                    intr_size, a, b);
      }
   }

   if (type.floating) {
      switch (nan_behavior) {
      case GALLIVM_NAN_RETURN_NAN: {
         LLVMValueRef isnan = lp_build_isnan(bld, b);
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
      case GALLIVM_NAN_RETURN_OTHER: {
         LLVMValueRef isnan = lp_build_isnan(bld, a);
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
      case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
         cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
         return lp_build_select(bld, cond, a, b);
      case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
         return lp_build_select(bld, cond, b, a);
      case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         return lp_build_select(bld, cond, a, b);
      default:
         assert(0);
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         return lp_build_select(bld, cond, a, b);
      }
   } else {
      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
      return lp_build_select(bld, cond, a, b);
   }
}


/**
 * Generate 1 - a, or ~a depending on bld->type.
 */
LLVMValueRef
lp_build_comp(struct lp_build_context *bld,
              LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if(a == bld->one)
      return bld->zero;
   if(a == bld->zero)
      return bld->one;

   if(type.norm && !type.floating && !type.fixed && !type.sign) {
      if(LLVMIsConstant(a))
         return LLVMConstNot(a);
      else
         return LLVMBuildNot(builder, a, "");
   }

   if(LLVMIsConstant(a))
      if (type.floating)
         return LLVMConstFSub(bld->one, a);
      else
         return LLVMConstSub(bld->one, a);
   else
      if (type.floating)
         return LLVMBuildFSub(builder, bld->one, a, "");
      else
         return LLVMBuildSub(builder, bld->one, a, "");
}
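

/*
 * Editor's sketch (not part of gallivm): why lp_build_comp can use a plain
 * bitwise NOT for unsigned normalized types. In n-bit unorm encoding, 1.0
 * is represented by 2^n - 1, so 1.0 - a is (2^n - 1) - a, which is exactly
 * ~a modulo 2^n. Shown for 8-bit values.
 */
static unsigned
comp_unorm8_ref(unsigned a)
{
   /* 255 encodes 1.0, so 255 - a == ~a when restricted to 8 bits */
   return ~a & 0xff;
}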


/**
 * Generate a + b
 */
LLVMValueRef
lp_build_add(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return b;
   if(b == bld->zero)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(bld->type.norm) {
      const char *intrinsic = NULL;

      if(a == bld->one || b == bld->one)
        return bld->one;

      if (type.width * type.length == 128 &&
          !type.floating && !type.fixed) {
         if (util_cpu_caps.has_sse2) {
            if(type.width == 8)
               intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
            if(type.width == 16)
               intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
         } else if (util_cpu_caps.has_altivec) {
            if(type.width == 8)
               intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
            if(type.width == 16)
               intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
         }
      }

      if(intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   if(type.norm && !type.floating && !type.fixed) {
      if (type.sign) {
         uint64_t sign = (uint64_t)1 << (type.width - 1);
         LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
         LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
         /* a_clamp_max is the maximum a for positive b,
            a_clamp_min is the minimum a for negative b. */
         LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
      } else {
         a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
      }
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFAdd(a, b);
      else
         res = LLVMConstAdd(a, b);
   else
      if (type.floating)
         res = LLVMBuildFAdd(builder, a, b, "");
      else
         res = LLVMBuildAdd(builder, a, b, "");

   /* clamp to ceiling of 1.0 */
   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);

   /* XXX clamp to floor of -1 or 0??? */

   return res;
}
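

/*
 * Editor's sketch (illustrative only, not part of gallivm): the clamping
 * strategy lp_build_add uses for signed saturated addition, in scalar form
 * for 8-bit values. The generated code computes both clamps and selects
 * between them branchlessly; branches are used here for clarity.
 */
static int
sadd8_ref(int a, int b)
{
   /* clamp a so that a + b cannot leave [-128, 127] */
   if (b > 0 && a > 127 - b)
      a = 127 - b;
   if (b < 0 && a < -128 - b)
      a = -128 - b;
   return a + b;
}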


/** Return the scalar sum of the elements of a.
 * Callers should avoid this operation whenever possible.
 */
LLVMValueRef
lp_build_horizontal_add(struct lp_build_context *bld,
                        LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef index, res;
   unsigned i, length;
   LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
   LLVMValueRef vecres, elem2;

   assert(lp_check_value(type, a));

   if (type.length == 1) {
      return a;
   }

   assert(!bld->type.norm);

   /*
    * For byte vectors we could do much better with psadbw; we use repeated
    * shuffles/adds here instead. Note that with multiple vectors this can
    * be done more efficiently, as outlined in the Intel optimization manual.
    * Note: this could cause data rearrangement if used with smaller element
    * sizes.
    */

   vecres = a;
   length = type.length / 2;
   while (length > 1) {
      LLVMValueRef vec1, vec2;
      for (i = 0; i < length; i++) {
         shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
         shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
      }
      vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles1, length), "");
      vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
                                    LLVMConstVector(shuffles2, length), "");
      if (type.floating) {
         vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
      }
      else {
         vecres = LLVMBuildAdd(builder, vec1, vec2, "");
      }
      length = length >> 1;
   }

   /* always have vector of size 2 here */
   assert(length == 1);

   index = lp_build_const_int32(bld->gallivm, 0);
   res = LLVMBuildExtractElement(builder, vecres, index, "");
   index = lp_build_const_int32(bld->gallivm, 1);
   elem2 = LLVMBuildExtractElement(builder, vecres, index, "");

   if (type.floating)
      res = LLVMBuildFAdd(builder, res, elem2, "");
   else
      res = LLVMBuildAdd(builder, res, elem2, "");

   return res;
}
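
/*
 * Editor's sketch (not part of gallivm): the scalar equivalent of the
 * halve-and-add reduction built by lp_build_horizontal_add, assuming a
 * power-of-two length.
 */
static float
horizontal_add_ref(float v[], unsigned length)
{
   unsigned half, i;
   for (half = length / 2; half >= 1; half /= 2) {
      /* fold the upper half of the vector onto the lower half */
      for (i = 0; i < half; i++)
         v[i] += v[i + half];
   }
   return v[0];
}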

/**
 * Return the horizontal sums of 4 float vectors as a float4 vector.
 * This uses the technique outlined in the Intel Optimization Manual.
 */
static LLVMValueRef
lp_build_horizontal_add4x4f(struct lp_build_context *bld,
                            LLVMValueRef src[4])
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef shuffles[4];
   LLVMValueRef tmp[4];
   LLVMValueRef sumtmp[2], shuftmp[2];

   /* lower half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 1);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 5);
   tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   /* upper half of regs */
   shuffles[0] = lp_build_const_int32(gallivm, 2);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 6);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
                                   LLVMConstVector(shuffles, 4), "");
   tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
                                   LLVMConstVector(shuffles, 4), "");

   sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
   sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");

   shuffles[0] = lp_build_const_int32(gallivm, 0);
   shuffles[1] = lp_build_const_int32(gallivm, 2);
   shuffles[2] = lp_build_const_int32(gallivm, 4);
   shuffles[3] = lp_build_const_int32(gallivm, 6);
   shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   shuffles[0] = lp_build_const_int32(gallivm, 1);
   shuffles[1] = lp_build_const_int32(gallivm, 3);
   shuffles[2] = lp_build_const_int32(gallivm, 5);
   shuffles[3] = lp_build_const_int32(gallivm, 7);
   shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
                                       LLVMConstVector(shuffles, 4), "");

   return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
}


/*
 * Partially horizontally add 2-4 float vectors of length nx4, i.e. only
 * four adjacent values in each vector will be added, assuming the values
 * are really grouped in fours, which also determines the output order.
 *
 * Return a vector of the same length as the initial vectors, with the
 * excess elements (if any) being undefined. The element order is
 * independent of the number of input vectors.
 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
 * the output order thus will be
 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
 */
LLVMValueRef
lp_build_hadd_partial4(struct lp_build_context *bld,
                       LLVMValueRef vectors[],
                       unsigned num_vecs)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef ret_vec;
   LLVMValueRef tmp[4];
   const char *intrinsic = NULL;

   assert(num_vecs >= 2 && num_vecs <= 4);
   assert(bld->type.floating);

   /* Only use this with at least 2 vectors, as it is somewhat expensive
    * (depending on the cpu) and we always need two horizontal adds anyway,
    * so a shuffle/add approach might be better for fewer.
    */

   tmp[0] = vectors[0];
   tmp[1] = vectors[1];

   tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
   tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];

   if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
       bld->type.length == 4) {
      intrinsic = "llvm.x86.sse3.hadd.ps";
   }
   else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
            bld->type.length == 8) {
      intrinsic = "llvm.x86.avx.hadd.ps.256";
   }
   if (intrinsic) {
      tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
                                       lp_build_vec_type(gallivm, bld->type),
                                       tmp[0], tmp[1]);
      if (num_vecs > 2) {
         tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
                                          lp_build_vec_type(gallivm, bld->type),
                                          tmp[2], tmp[3]);
      }
      else {
         tmp[1] = tmp[0];
      }
      return lp_build_intrinsic_binary(builder, intrinsic,
                                       lp_build_vec_type(gallivm, bld->type),
                                       tmp[0], tmp[1]);
   }

   if (bld->type.length == 4) {
      ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
   }
   else {
      LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
      unsigned j;
      unsigned num_iter = bld->type.length / 4;
      struct lp_type parttype = bld->type;
      parttype.length = 4;
      for (j = 0; j < num_iter; j++) {
         LLVMValueRef partsrc[4];
         unsigned i;
         for (i = 0; i < 4; i++) {
            partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
         }
         partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
      }
      ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
   }
   return ret_vec;
}

/**
 * Generate a - b
 */
LLVMValueRef
lp_build_sub(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(b == bld->zero)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;
   if(a == b)
      return bld->zero;

   if(bld->type.norm) {
      const char *intrinsic = NULL;

      if(b == bld->one)
        return bld->zero;

      if (type.width * type.length == 128 &&
          !type.floating && !type.fixed) {
         if (util_cpu_caps.has_sse2) {
            if(type.width == 8)
               intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
            if(type.width == 16)
               intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
         } else if (util_cpu_caps.has_altivec) {
            if(type.width == 8)
               intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
            if(type.width == 16)
               intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
         }
      }

      if(intrinsic)
         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
   }

   if(type.norm && !type.floating && !type.fixed) {
      if (type.sign) {
         uint64_t sign = (uint64_t)1 << (type.width - 1);
         LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
         LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
         /* a_clamp_max is the maximum a for negative b,
            a_clamp_min is the minimum a for positive b. */
         LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
         a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
      } else {
         a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
      }
   }

   if(LLVMIsConstant(a) && LLVMIsConstant(b))
      if (type.floating)
         res = LLVMConstFSub(a, b);
      else
         res = LLVMConstSub(a, b);
   else
      if (type.floating)
         res = LLVMBuildFSub(builder, a, b, "");
      else
         res = LLVMBuildSub(builder, a, b, "");

   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
      res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);

   return res;
}


/**
 * Normalized multiplication.
 *
 * There are several approaches (using 8-bit normalized multiplication as
 * an example):
 *
 * - alpha plus one
 *
 *     makes the following approximation to the division (Sree)
 *
 *       a*b/255 ~= (a*(b + 1)) >> 8
 *
 *     which is the fastest method that satisfies the following OpenGL
 *     criteria:
 *
 *       0*0 = 0 and 255*255 = 255
 *
 * - geometric series
 *
 *     takes the geometric series approximation to the division
 *
 *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 *
 *     in this case just the first two terms, so it fits in 16-bit
 *     arithmetic
 *
 *       t/255 ~= (t + (t >> 8)) >> 8
 *
 *     note that by itself this doesn't satisfy the OpenGL criteria, as it
 *     yields 255*255 = 254, so either the special case b = 255 must be
 *     accounted for, or rounding must be used.
 *
 * - geometric series plus rounding
 *
 *     when using the geometric series division, instead of truncating the
 *     result, use rounding in the approximation (Jim Blinn)
 *
 *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 *
 *     achieving exact results.
 *
 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
 * @sa Michael Herf, The "double blend trick", May 2000,
 *     http://www.stereopsis.com/doubleblend.html
 */
static LLVMValueRef
lp_build_mul_norm(struct gallivm_state *gallivm,
                  struct lp_type wide_type,
                  LLVMValueRef a, LLVMValueRef b)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_build_context bld;
   unsigned n;
   LLVMValueRef half;
   LLVMValueRef ab;

   assert(!wide_type.floating);
   assert(lp_check_value(wide_type, a));
   assert(lp_check_value(wide_type, b));

   lp_build_context_init(&bld, gallivm, wide_type);

   n = wide_type.width / 2;
   if (wide_type.sign) {
      --n;
   }

   /*
    * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
    * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
    */

   /*
    * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
    */

   ab = LLVMBuildMul(builder, a, b, "");
   ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");

   /*
    * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
    */

   half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
   if (wide_type.sign) {
      LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
      LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
      half = lp_build_select(&bld, sign, minus_half, half);
   }
   ab = LLVMBuildAdd(builder, ab, half, "");

   /* Final division */
   ab = lp_build_shr_imm(&bld, ab, n);

   return ab;
}
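

/*
 * Editor's sketch (not part of gallivm): the unsigned 8-bit instance of the
 * rounding trick implemented above. Worked check of the exactness claim:
 * 255*255 = 65025, 65025 + (65025 >> 8) + 0x80 = 65407, and
 * 65407 >> 8 = 255, as required.
 */
static unsigned
mul_norm_u8_ref(unsigned a, unsigned b)
{
   unsigned ab = a * b;                    /* a, b in [0, 255] */
   return (ab + (ab >> 8) + 0x80) >> 8;    /* ~= ab / 255, exact */
}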

/**
 * Generate a * b
 */
LLVMValueRef
lp_build_mul(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef shift;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return b;
   if(b == bld->zero)
      return bld->zero;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if (!type.floating && !type.fixed && type.norm) {
      struct lp_type wide_type = lp_wider_type(type);
      LLVMValueRef al, ah, bl, bh, abl, abh, ab;

      lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
      lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);

      /* PMULLW, PSRLW, PADDW */
      abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
      abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);

      ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);

      return ab;
   }

   if(type.fixed)
      shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
   else
      shift = NULL;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         res = LLVMConstFMul(a, b);
      else
         res = LLVMConstMul(a, b);
      if(shift) {
         if(type.sign)
            res = LLVMConstAShr(res, shift);
         else
            res = LLVMConstLShr(res, shift);
      }
   }
   else {
      if (type.floating)
         res = LLVMBuildFMul(builder, a, b, "");
      else
         res = LLVMBuildMul(builder, a, b, "");
      if(shift) {
         if(type.sign)
            res = LLVMBuildAShr(builder, res, shift, "");
         else
            res = LLVMBuildLShr(builder, res, shift, "");
      }
   }

   return res;
}
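

/*
 * Editor's sketch (not part of gallivm): the fixed-point path of
 * lp_build_mul in scalar form. With w/2 fractional bits, the product of two
 * fixed-point values carries w fractional bits and must be shifted right by
 * w/2 to renormalize. Shown for 16.16 fixed point; this sketch widens the
 * intermediate, whereas the generated code multiplies at the native width.
 */
static int
mul_fixed16_16_ref(int a, int b)
{
   long long ab = (long long)a * b;   /* product has 32 fractional bits */
   return (int)(ab >> 16);            /* renormalize back to 16.16 */
}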


/**
 * Small vector-by-scalar multiplication optimization.
 */
LLVMValueRef
lp_build_mul_imm(struct lp_build_context *bld,
                 LLVMValueRef a,
                 int b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef factor;

   assert(lp_check_value(bld->type, a));

   if(b == 0)
      return bld->zero;

   if(b == 1)
      return a;

   if(b == -1)
      return lp_build_negate(bld, a);

   if(b == 2 && bld->type.floating)
      return lp_build_add(bld, a, a);

   if(util_is_power_of_two(b)) {
      unsigned shift = ffs(b) - 1;

      if(bld->type.floating) {
#if 0
         /*
          * Power of two multiplication by directly manipulating the exponent.
          *
          * XXX: This might not always be faster; it will introduce a small
          * error for multiplication by zero, and it will produce wrong results
          * for Inf and NaN.
          */
         unsigned mantissa = lp_mantissa(bld->type);
         factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
         a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
         a = LLVMBuildAdd(builder, a, factor, "");
         a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
         return a;
#endif
      }
      else {
         factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
         return LLVMBuildShl(builder, a, factor, "");
      }
   }

   factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
   return lp_build_mul(bld, a, factor);
}
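

/*
 * Editor's sketch (illustrative only): the disabled exponent-manipulation
 * trick in lp_build_mul_imm above, in scalar form. Adding k to the biased
 * exponent multiplies a normal, finite float by 2^k; as the XXX comment
 * warns, it misbehaves for zero, Inf and NaN. Assumes a 32-bit float and a
 * 32-bit unsigned int.
 */
static float
mul_pow2_ref(float x, int k)
{
   union { float f; unsigned u; } v;
   v.f = x;
   v.u += (unsigned)k << 23;   /* 23 = number of float32 mantissa bits */
   return v.f;
}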


/**
 * Generate a / b
 */
LLVMValueRef
lp_build_div(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one && type.floating)
      return lp_build_rcp(bld, b);
   if(b == bld->zero)
      return bld->undef;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
      if (type.floating)
         return LLVMConstFDiv(a, b);
      else if (type.sign)
         return LLVMConstSDiv(a, b);
      else
         return LLVMConstUDiv(a, b);
   }

   if(((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
      type.floating)
      return lp_build_mul(bld, a, lp_build_rcp(bld, b));

   if (type.floating)
      return LLVMBuildFDiv(builder, a, b, "");
   else if (type.sign)
      return LLVMBuildSDiv(builder, a, b, "");
   else
      return LLVMBuildUDiv(builder, a, b, "");
}


/**
 * Linear interpolation helper.
 *
 * @param flags  LP_BLD_LERP_* flags. With LP_BLD_LERP_WIDE_NORMALIZED the
 *        values being interpolated are normalized, encoded in integers
 *        twice as wide as the nominal type.
 *
 * @sa http://www.stereopsis.com/doubleblend.html
 */
static INLINE LLVMValueRef
lp_build_lerp_simple(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef v0,
                     LLVMValueRef v1,
                     unsigned flags)
{
   unsigned half_width = bld->type.width/2;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef delta;
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));
   assert(lp_check_value(bld->type, v0));
   assert(lp_check_value(bld->type, v1));

   delta = lp_build_sub(bld, v1, v0);

   if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
      if (!bld->type.sign) {
         if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
            /*
             * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
             * most significant bit to the least significant bit, so that
             * later we can just divide by 2**n instead of 2**n - 1.
             */

            x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
         }

         /* (x * delta) >> n */
         res = lp_build_mul(bld, x, delta);
         res = lp_build_shr_imm(bld, res, half_width);
      } else {
         /*
          * The rescaling trick above doesn't work for signed numbers, so
          * use the 2**n - 1 division approximation in lp_build_mul_norm
          * instead.
          */
         assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
         res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
      }
   } else {
      assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
      res = lp_build_mul(bld, x, delta);
   }

   res = lp_build_add(bld, v0, res);

   if (((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) ||
       bld->type.fixed) {
      /*
       * We need to mask out the high order bits when lerping 8-bit
       * normalized colors stored in 16 bits. XXX: this step is necessary
       * for lerping 8-bit colors stored in 16 bits, but it will be wrong
       * for true fixed point use cases. Basically we need a more powerful
       * lp_type, capable of further distinguishing the value's
       * interpretation from the value's storage.
       */
      res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1), "");
   }

   return res;
}
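

/*
 * Editor's sketch (not part of gallivm): the unsigned normalized path of
 * lp_build_lerp_simple in scalar form, for 8-bit values lerped in a 16-bit
 * wide type (half_width = 8). All arithmetic wraps modulo 2^16, just as in
 * the wide unsigned vector type, so a negative delta still comes out right
 * after the final mask.
 */
static unsigned
lerp_unorm8_ref(unsigned x, unsigned v0, unsigned v1)
{
   unsigned w = (x + (x >> 7)) & 0xffff;        /* [0,255] -> [0,256] */
   unsigned delta = (v1 - v0) & 0xffff;         /* wraps when v1 < v0 */
   unsigned res = (v0 + (((w * delta) & 0xffff) >> 8)) & 0xffff;
   return res & 0xff;                           /* mask out high bits */
}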


/**
 * Linear interpolation.
 */
LLVMValueRef
lp_build_lerp(struct lp_build_context *bld,
              LLVMValueRef x,
              LLVMValueRef v0,
              LLVMValueRef v1,
              unsigned flags)
{
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, v0));
   assert(lp_check_value(type, v1));

   assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));

   if (type.norm) {
      struct lp_type wide_type;
      struct lp_build_context wide_bld;
      LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;

      assert(type.length >= 2);

      /*
       * Create a wider integer type, enough to hold the
       * intermediate result of the multiplication.
       */
      memset(&wide_type, 0, sizeof wide_type);
      wide_type.sign   = type.sign;
      wide_type.width  = type.width*2;
      wide_type.length = type.length/2;

      lp_build_context_init(&wide_bld, bld->gallivm, wide_type);

      lp_build_unpack2(bld->gallivm, type, wide_type, x,  &xl,  &xh);
      lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
      lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);

      /*
       * Lerp both halves.
       */

      flags |= LP_BLD_LERP_WIDE_NORMALIZED;

      resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
      resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);

      res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
   } else {
      res = lp_build_lerp_simple(bld, x, v0, v1, flags);
   }

   return res;
}


/**
 * Bilinear interpolation.
 *
 * Value indices are in v_{yx}.
 */
LLVMValueRef
lp_build_lerp_2d(struct lp_build_context *bld,
                 LLVMValueRef x,
                 LLVMValueRef y,
                 LLVMValueRef v00,
                 LLVMValueRef v01,
                 LLVMValueRef v10,
                 LLVMValueRef v11,
                 unsigned flags)
{
   LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
   LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
   return lp_build_lerp(bld, y, v0, v1, flags);
}


LLVMValueRef
lp_build_lerp_3d(struct lp_build_context *bld,
                 LLVMValueRef x,
                 LLVMValueRef y,
                 LLVMValueRef z,
                 LLVMValueRef v000,
                 LLVMValueRef v001,
                 LLVMValueRef v010,
                 LLVMValueRef v011,
                 LLVMValueRef v100,
                 LLVMValueRef v101,
                 LLVMValueRef v110,
                 LLVMValueRef v111,
                 unsigned flags)
{
   LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
   LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
   return lp_build_lerp(bld, z, v0, v1, flags);
}


/**
 * Generate min(a, b)
 * Do checks for special cases, but NaN behavior is undefined.
 */
LLVMValueRef
lp_build_min(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if (bld->type.norm) {
      if (!bld->type.sign) {
         if (a == bld->zero || b == bld->zero) {
            return bld->zero;
         }
      }
      if(a == bld->one)
         return b;
      if(b == bld->one)
         return a;
   }

   return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
}


/**
 * Generate min(a, b)
 * Checks for special cases.
 * NaNs are handled according to the behavior specified by the
 * nan_behavior argument.
 */
LLVMValueRef
lp_build_min_ext(struct lp_build_context *bld,
                 LLVMValueRef a,
                 LLVMValueRef b,
                 enum gallivm_nan_behavior nan_behavior)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if (bld->type.norm) {
      if (!bld->type.sign) {
         if (a == bld->zero || b == bld->zero) {
            return bld->zero;
         }
      }
      if(a == bld->one)
         return b;
      if(b == bld->one)
         return a;
   }

   return lp_build_min_simple(bld, a, b, nan_behavior);
}

/**
 * Generate max(a, b)
 * Do checks for special cases, but NaN behavior is undefined.
 */
LLVMValueRef
lp_build_max(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if(bld->type.norm) {
      if(a == bld->one || b == bld->one)
         return bld->one;
      if (!bld->type.sign) {
         if (a == bld->zero) {
            return b;
         }
         if (b == bld->zero) {
            return a;
         }
      }
   }

   return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
}


/**
 * Generate max(a, b)
 * Checks for special cases.
 * NaNs are handled according to the behavior specified by the
 * nan_behavior argument.
 */
LLVMValueRef
lp_build_max_ext(struct lp_build_context *bld,
                 LLVMValueRef a,
                 LLVMValueRef b,
                 enum gallivm_nan_behavior nan_behavior)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, b));

   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if(a == b)
      return a;

   if(bld->type.norm) {
      if(a == bld->one || b == bld->one)
         return bld->one;
      if (!bld->type.sign) {
         if (a == bld->zero) {
            return b;
         }
         if (b == bld->zero) {
            return a;
         }
      }
   }

   return lp_build_max_simple(bld, a, b, nan_behavior);
}

/**
 * Generate clamp(a, min, max)
 * NaN behavior (for any of a, min, max) is undefined.
 * Do checks for special cases.
 */
LLVMValueRef
lp_build_clamp(struct lp_build_context *bld,
               LLVMValueRef a,
               LLVMValueRef min,
               LLVMValueRef max)
{
   assert(lp_check_value(bld->type, a));
   assert(lp_check_value(bld->type, min));
   assert(lp_check_value(bld->type, max));

   a = lp_build_min(bld, a, max);
   a = lp_build_max(bld, a, min);
   return a;
}


/**
 * Generate clamp(a, 0, 1)
 * A NaN will get converted to zero.
 */
LLVMValueRef
lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
                                LLVMValueRef a)
{
   a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
   a = lp_build_min(bld, a, bld->one);
   return a;
}
  1476.  
  1477.  
  1478. /**
  1479.  * Generate abs(a)
  1480.  */
  1481. LLVMValueRef
  1482. lp_build_abs(struct lp_build_context *bld,
  1483.              LLVMValueRef a)
  1484. {
  1485.    LLVMBuilderRef builder = bld->gallivm->builder;
  1486.    const struct lp_type type = bld->type;
  1487.    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
  1488.  
  1489.    assert(lp_check_value(type, a));
  1490.  
  1491.    if(!type.sign)
  1492.       return a;
  1493.  
  1494.    if(type.floating) {
  1495.       /* Mask out the sign bit */
  1496.       LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
  1497.       unsigned long long absMask = ~(1ULL << (type.width - 1));
  1498.       LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
  1499.       a = LLVMBuildBitCast(builder, a, int_vec_type, "");
  1500.       a = LLVMBuildAnd(builder, a, mask, "");
  1501.       a = LLVMBuildBitCast(builder, a, vec_type, "");
  1502.       return a;
  1503.    }
  1504.  
  1505.    if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
  1506.       switch(type.width) {
  1507.       case 8:
  1508.          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
  1509.       case 16:
  1510.          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
  1511.       case 32:
  1512.          return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
  1513.       }
  1514.    }
  1515.    else if (type.width*type.length == 256 && util_cpu_caps.has_ssse3 &&
  1516.             (gallivm_debug & GALLIVM_DEBUG_PERF) &&
  1517.             (type.width == 8 || type.width == 16 || type.width == 32)) {
  1518.       debug_printf("%s: inefficient code, should split vectors manually\n",
  1519.                    __FUNCTION__);
  1520.    }
  1521.  
  1522.    return lp_build_max(bld, a, LLVMBuildNeg(builder, a, ""));
  1523. }
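
/*
 * Illustrative sketch, not part of the original file: a scalar C analogue of
 * the sign-bit masking done above for floats (assumes 32-bit IEEE-754 floats
 * and <stdint.h>; the function name is hypothetical).
 */
static float example_abs_f32(float x)
{
   union { float f; uint32_t u; } v;   /* bit-level view of the float */
   v.f = x;
   v.u &= ~(1u << 31);                 /* mask out the sign bit */
   return v.f;
}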
  1524.  
  1525.  
  1526. LLVMValueRef
  1527. lp_build_negate(struct lp_build_context *bld,
  1528.                 LLVMValueRef a)
  1529. {
  1530.    LLVMBuilderRef builder = bld->gallivm->builder;
  1531.  
  1532.    assert(lp_check_value(bld->type, a));
  1533.  
  1534.    if (bld->type.floating)
  1535.       a = LLVMBuildFNeg(builder, a, "");
  1536.    else
  1537.       a = LLVMBuildNeg(builder, a, "");
  1538.  
  1539.    return a;
  1540. }
  1541.  
  1542.  
  1543. /** Return -1, 0 or +1 depending on the sign of a */
  1544. LLVMValueRef
  1545. lp_build_sgn(struct lp_build_context *bld,
  1546.              LLVMValueRef a)
  1547. {
  1548.    LLVMBuilderRef builder = bld->gallivm->builder;
  1549.    const struct lp_type type = bld->type;
  1550.    LLVMValueRef cond;
  1551.    LLVMValueRef res;
  1552.  
  1553.    assert(lp_check_value(type, a));
  1554.  
  1555.    /* Handle non-zero case */
  1556.    if(!type.sign) {
  1557.       /* if not zero then sign must be positive */
  1558.       res = bld->one;
  1559.    }
  1560.    else if(type.floating) {
  1561.       LLVMTypeRef vec_type;
  1562.       LLVMTypeRef int_type;
  1563.       LLVMValueRef mask;
  1564.       LLVMValueRef sign;
  1565.       LLVMValueRef one;
  1566.       unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
  1567.  
  1568.       int_type = lp_build_int_vec_type(bld->gallivm, type);
  1569.       vec_type = lp_build_vec_type(bld->gallivm, type);
  1570.       mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
  1571.  
   1572.       /* Take the sign bit and OR it into the 1.0 constant */
  1573.       sign = LLVMBuildBitCast(builder, a, int_type, "");
  1574.       sign = LLVMBuildAnd(builder, sign, mask, "");
  1575.       one = LLVMConstBitCast(bld->one, int_type);
  1576.       res = LLVMBuildOr(builder, sign, one, "");
  1577.       res = LLVMBuildBitCast(builder, res, vec_type, "");
  1578.    }
  1579.    else
  1580.    {
  1581.       /* signed int/norm/fixed point */
  1582.       /* could use psign with sse3 and appropriate vectors here */
  1583.       LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
  1584.       cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
  1585.       res = lp_build_select(bld, cond, bld->one, minus_one);
  1586.    }
  1587.  
  1588.    /* Handle zero */
  1589.    cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
  1590.    res = lp_build_select(bld, cond, bld->zero, res);
  1591.  
  1592.    return res;
  1593. }
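
/*
 * Illustrative sketch, not part of the original file: the floating-point
 * branch of lp_build_sgn as scalar C (32-bit IEEE-754 floats, <stdint.h>;
 * hypothetical name).
 */
static float example_sgn_f32(float x)
{
   union { float f; uint32_t u; } v, one;
   if (x == 0.0f)
      return 0.0f;               /* the zero case handled by the final select */
   v.f = x;
   one.f = 1.0f;
   one.u |= v.u & (1u << 31);    /* OR x's sign bit onto the 1.0 constant */
   return one.f;
}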
  1594.  
  1595.  
  1596. /**
  1597.  * Set the sign of float vector 'a' according to 'sign'.
  1598.  * If sign==0, return abs(a).
   1599.  * If sign==1, return -abs(a).
  1600.  * Other values for sign produce undefined results.
  1601.  */
  1602. LLVMValueRef
  1603. lp_build_set_sign(struct lp_build_context *bld,
  1604.                   LLVMValueRef a, LLVMValueRef sign)
  1605. {
  1606.    LLVMBuilderRef builder = bld->gallivm->builder;
  1607.    const struct lp_type type = bld->type;
  1608.    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
  1609.    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
  1610.    LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
  1611.    LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
  1612.                              ~((unsigned long long) 1 << (type.width - 1)));
  1613.    LLVMValueRef val, res;
  1614.  
  1615.    assert(type.floating);
  1616.    assert(lp_check_value(type, a));
  1617.  
  1618.    /* val = reinterpret_cast<int>(a) */
  1619.    val = LLVMBuildBitCast(builder, a, int_vec_type, "");
  1620.    /* val = val & mask */
  1621.    val = LLVMBuildAnd(builder, val, mask, "");
  1622.    /* sign = sign << shift */
  1623.    sign = LLVMBuildShl(builder, sign, shift, "");
  1624.    /* res = val | sign */
  1625.    res = LLVMBuildOr(builder, val, sign, "");
  1626.    /* res = reinterpret_cast<float>(res) */
  1627.    res = LLVMBuildBitCast(builder, res, vec_type, "");
  1628.  
  1629.    return res;
  1630. }
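
/*
 * Illustrative sketch, not part of the original file: lp_build_set_sign as
 * scalar C (32-bit IEEE-754 floats, <stdint.h>; hypothetical name).
 */
static float example_set_sign_f32(float x, uint32_t sign)   /* sign: 0 or 1 */
{
   union { float f; uint32_t u; } v;
   v.f = x;
   v.u &= ~(1u << 31);   /* val = val & mask: clear the old sign */
   v.u |= sign << 31;    /* res = val | (sign << shift) */
   return v.f;           /* sign==0 -> abs(x), sign==1 -> -abs(x) */
}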
  1631.  
  1632.  
  1633. /**
  1634.  * Convert vector of (or scalar) int to vector of (or scalar) float.
  1635.  */
  1636. LLVMValueRef
  1637. lp_build_int_to_float(struct lp_build_context *bld,
  1638.                       LLVMValueRef a)
  1639. {
  1640.    LLVMBuilderRef builder = bld->gallivm->builder;
  1641.    const struct lp_type type = bld->type;
  1642.    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
  1643.  
  1644.    assert(type.floating);
  1645.  
  1646.    return LLVMBuildSIToFP(builder, a, vec_type, "");
  1647. }
  1648.  
  1649. static boolean
  1650. arch_rounding_available(const struct lp_type type)
  1651. {
  1652.    if ((util_cpu_caps.has_sse4_1 &&
  1653.        (type.length == 1 || type.width*type.length == 128)) ||
  1654.        (util_cpu_caps.has_avx && type.width*type.length == 256))
  1655.       return TRUE;
  1656.    else if ((util_cpu_caps.has_altivec &&
  1657.             (type.width == 32 && type.length == 4)))
  1658.       return TRUE;
  1659.  
  1660.    return FALSE;
  1661. }
  1662.  
  1663. enum lp_build_round_mode
  1664. {
  1665.    LP_BUILD_ROUND_NEAREST = 0,
  1666.    LP_BUILD_ROUND_FLOOR = 1,
  1667.    LP_BUILD_ROUND_CEIL = 2,
  1668.    LP_BUILD_ROUND_TRUNCATE = 3
  1669. };
  1670.  
  1671. /**
  1672.  * Helper for SSE4.1's ROUNDxx instructions.
  1673.  *
   1674.  * NOTE: In SSE4.1's nearest mode, if two values are equally close, the
  1675.  * result is the even value.  That is, rounding 2.5 will be 2.0, and not 3.0.
  1676.  */
  1677. static INLINE LLVMValueRef
  1678. lp_build_round_sse41(struct lp_build_context *bld,
  1679.                      LLVMValueRef a,
  1680.                      enum lp_build_round_mode mode)
  1681. {
  1682.    LLVMBuilderRef builder = bld->gallivm->builder;
  1683.    const struct lp_type type = bld->type;
  1684.    LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
  1685.    const char *intrinsic;
  1686.    LLVMValueRef res;
  1687.  
  1688.    assert(type.floating);
  1689.  
  1690.    assert(lp_check_value(type, a));
  1691.    assert(util_cpu_caps.has_sse4_1);
  1692.  
  1693.    if (type.length == 1) {
  1694.       LLVMTypeRef vec_type;
  1695.       LLVMValueRef undef;
  1696.       LLVMValueRef args[3];
  1697.       LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
  1698.  
  1699.       switch(type.width) {
  1700.       case 32:
  1701.          intrinsic = "llvm.x86.sse41.round.ss";
  1702.          break;
  1703.       case 64:
  1704.          intrinsic = "llvm.x86.sse41.round.sd";
  1705.          break;
  1706.       default:
  1707.          assert(0);
  1708.          return bld->undef;
  1709.       }
  1710.  
  1711.       vec_type = LLVMVectorType(bld->elem_type, 4);
  1712.  
  1713.       undef = LLVMGetUndef(vec_type);
  1714.  
  1715.       args[0] = undef;
  1716.       args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
  1717.       args[2] = LLVMConstInt(i32t, mode, 0);
  1718.  
  1719.       res = lp_build_intrinsic(builder, intrinsic,
  1720.                                vec_type, args, Elements(args));
  1721.  
  1722.       res = LLVMBuildExtractElement(builder, res, index0, "");
  1723.    }
  1724.    else {
  1725.       if (type.width * type.length == 128) {
  1726.          switch(type.width) {
  1727.          case 32:
  1728.             intrinsic = "llvm.x86.sse41.round.ps";
  1729.             break;
  1730.          case 64:
  1731.             intrinsic = "llvm.x86.sse41.round.pd";
  1732.             break;
  1733.          default:
  1734.             assert(0);
  1735.             return bld->undef;
  1736.          }
  1737.       }
  1738.       else {
  1739.          assert(type.width * type.length == 256);
  1740.          assert(util_cpu_caps.has_avx);
  1741.  
  1742.          switch(type.width) {
  1743.          case 32:
  1744.             intrinsic = "llvm.x86.avx.round.ps.256";
  1745.             break;
  1746.          case 64:
  1747.             intrinsic = "llvm.x86.avx.round.pd.256";
  1748.             break;
  1749.          default:
  1750.             assert(0);
  1751.             return bld->undef;
  1752.          }
  1753.       }
  1754.  
  1755.       res = lp_build_intrinsic_binary(builder, intrinsic,
  1756.                                       bld->vec_type, a,
  1757.                                       LLVMConstInt(i32t, mode, 0));
  1758.    }
  1759.  
  1760.    return res;
  1761. }
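
/*
 * Illustrative sketch, not part of the original file: the same rounding via
 * the SSE4.1 compiler intrinsic (assumes <smmintrin.h> and SSE4.1 enabled).
 * The lp_build_round_mode values match the low two bits of the ROUNDPS
 * immediate: 00 = nearest-even, 01 = floor, 10 = ceil, 11 = truncate.
 */
#if defined(__SSE4_1__)
static __m128 example_round_nearest_ps(__m128 x)
{
   /* immediate 0x0 selects nearest-even; _MM_FROUND_NO_EXC masks exceptions */
   return _mm_round_ps(x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
#endif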
  1762.  
  1763.  
  1764. static INLINE LLVMValueRef
  1765. lp_build_iround_nearest_sse2(struct lp_build_context *bld,
  1766.                              LLVMValueRef a)
  1767. {
  1768.    LLVMBuilderRef builder = bld->gallivm->builder;
  1769.    const struct lp_type type = bld->type;
  1770.    LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
  1771.    LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
  1772.    const char *intrinsic;
  1773.    LLVMValueRef res;
  1774.  
  1775.    assert(type.floating);
  1776.    /* using the double precision conversions is a bit more complicated */
  1777.    assert(type.width == 32);
  1778.  
  1779.    assert(lp_check_value(type, a));
  1780.    assert(util_cpu_caps.has_sse2);
  1781.  
  1782.    /* This is relying on MXCSR rounding mode, which should always be nearest. */
  1783.    if (type.length == 1) {
  1784.       LLVMTypeRef vec_type;
  1785.       LLVMValueRef undef;
  1786.       LLVMValueRef arg;
  1787.       LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
  1788.  
  1789.       vec_type = LLVMVectorType(bld->elem_type, 4);
  1790.  
  1791.       intrinsic = "llvm.x86.sse.cvtss2si";
  1792.  
  1793.       undef = LLVMGetUndef(vec_type);
  1794.  
  1795.       arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
  1796.  
  1797.       res = lp_build_intrinsic_unary(builder, intrinsic,
  1798.                                      ret_type, arg);
  1799.    }
  1800.    else {
   1801.       if (type.width * type.length == 128) {
  1802.          intrinsic = "llvm.x86.sse2.cvtps2dq";
  1803.       }
  1804.       else {
  1805.          assert(type.width*type.length == 256);
  1806.          assert(util_cpu_caps.has_avx);
  1807.  
  1808.          intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
  1809.       }
  1810.       res = lp_build_intrinsic_unary(builder, intrinsic,
  1811.                                      ret_type, a);
  1812.    }
  1813.  
  1814.    return res;
  1815. }
  1816.  
  1817.  
   1818. /* Round a float vector with one of Altivec's vrfin/vrfim/vrfip/vrfiz
   1819.  * instructions, selected by 'mode'. */
  1820. static INLINE LLVMValueRef
  1821. lp_build_round_altivec(struct lp_build_context *bld,
  1822.                        LLVMValueRef a,
  1823.                        enum lp_build_round_mode mode)
  1824. {
  1825.    LLVMBuilderRef builder = bld->gallivm->builder;
  1826.    const struct lp_type type = bld->type;
  1827.    const char *intrinsic = NULL;
  1828.  
  1829.    assert(type.floating);
  1830.  
  1831.    assert(lp_check_value(type, a));
  1832.    assert(util_cpu_caps.has_altivec);
  1833.  
  1834.    (void)type;
  1835.  
  1836.    switch (mode) {
  1837.    case LP_BUILD_ROUND_NEAREST:
  1838.       intrinsic = "llvm.ppc.altivec.vrfin";
  1839.       break;
  1840.    case LP_BUILD_ROUND_FLOOR:
  1841.       intrinsic = "llvm.ppc.altivec.vrfim";
  1842.       break;
  1843.    case LP_BUILD_ROUND_CEIL:
  1844.       intrinsic = "llvm.ppc.altivec.vrfip";
  1845.       break;
  1846.    case LP_BUILD_ROUND_TRUNCATE:
  1847.       intrinsic = "llvm.ppc.altivec.vrfiz";
  1848.       break;
  1849.    }
  1850.  
  1851.    return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
  1852. }
  1853.  
  1854. static INLINE LLVMValueRef
  1855. lp_build_round_arch(struct lp_build_context *bld,
  1856.                     LLVMValueRef a,
  1857.                     enum lp_build_round_mode mode)
  1858. {
  1859.    if (util_cpu_caps.has_sse4_1)
  1860.      return lp_build_round_sse41(bld, a, mode);
  1861.    else /* (util_cpu_caps.has_altivec) */
  1862.      return lp_build_round_altivec(bld, a, mode);
  1863. }
  1864.  
  1865. /**
  1866.  * Return the integer part of a float (vector) value (== round toward zero).
  1867.  * The returned value is a float (vector).
  1868.  * Ex: trunc(-1.5) = -1.0
  1869.  */
  1870. LLVMValueRef
  1871. lp_build_trunc(struct lp_build_context *bld,
  1872.                LLVMValueRef a)
  1873. {
  1874.    LLVMBuilderRef builder = bld->gallivm->builder;
  1875.    const struct lp_type type = bld->type;
  1876.  
  1877.    assert(type.floating);
  1878.    assert(lp_check_value(type, a));
  1879.  
  1880.    if (arch_rounding_available(type)) {
  1881.       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
  1882.    }
  1883.    else {
  1884.       const struct lp_type type = bld->type;
  1885.       struct lp_type inttype;
  1886.       struct lp_build_context intbld;
  1887.       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
  1888.       LLVMValueRef trunc, res, anosign, mask;
  1889.       LLVMTypeRef int_vec_type = bld->int_vec_type;
  1890.       LLVMTypeRef vec_type = bld->vec_type;
  1891.  
  1892.       assert(type.width == 32); /* might want to handle doubles at some point */
  1893.  
  1894.       inttype = type;
  1895.       inttype.floating = 0;
  1896.       lp_build_context_init(&intbld, bld->gallivm, inttype);
  1897.  
  1898.       /* round by truncation */
  1899.       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
  1900.       res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
  1901.  
  1902.       /* mask out sign bit */
  1903.       anosign = lp_build_abs(bld, a);
  1904.       /*
  1905.        * mask out all values if anosign > 2^24
  1906.        * This should work both for large ints (all rounding is no-op for them
  1907.        * because such floats are always exact) as well as special cases like
  1908.        * NaNs, Infs (taking advantage of the fact they use max exponent).
   1909.        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
  1910.        */
  1911.       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
  1912.       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
  1913.       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
  1914.       return lp_build_select(bld, mask, a, res);
  1915.    }
  1916. }
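
/*
 * Illustrative sketch, not part of the original file: the non-arch fallback
 * above as scalar C for 32-bit floats (<stdint.h>; hypothetical name).
 */
static float example_trunc_f32(float a)
{
   union { float f; int32_t i; } abs_bits, cmp_bits;
   abs_bits.f = a;
   abs_bits.i &= 0x7fffffff;        /* |a| as raw bits (sign masked out) */
   cmp_bits.f = 16777216.0f;        /* 2^24 */
   /* integer compare of the bit patterns, as the vector code does; this
    * also catches NaN/Inf since they use the maximum exponent */
   if (abs_bits.i > cmp_bits.i)
      return a;                     /* large, Inf or NaN: pass through */
   return (float)(int32_t)a;        /* round by truncation */
}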
  1917.  
  1918.  
  1919. /**
  1920.  * Return float (vector) rounded to nearest integer (vector).  The returned
  1921.  * value is a float (vector).
  1922.  * Ex: round(0.9) = 1.0
  1923.  * Ex: round(-1.5) = -2.0
  1924.  */
  1925. LLVMValueRef
  1926. lp_build_round(struct lp_build_context *bld,
  1927.                LLVMValueRef a)
  1928. {
  1929.    LLVMBuilderRef builder = bld->gallivm->builder;
  1930.    const struct lp_type type = bld->type;
  1931.  
  1932.    assert(type.floating);
  1933.    assert(lp_check_value(type, a));
  1934.  
  1935.    if (arch_rounding_available(type)) {
  1936.       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
  1937.    }
  1938.    else {
  1939.       const struct lp_type type = bld->type;
  1940.       struct lp_type inttype;
  1941.       struct lp_build_context intbld;
  1942.       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
  1943.       LLVMValueRef res, anosign, mask;
  1944.       LLVMTypeRef int_vec_type = bld->int_vec_type;
  1945.       LLVMTypeRef vec_type = bld->vec_type;
  1946.  
  1947.       assert(type.width == 32); /* might want to handle doubles at some point */
  1948.  
  1949.       inttype = type;
  1950.       inttype.floating = 0;
  1951.       lp_build_context_init(&intbld, bld->gallivm, inttype);
  1952.  
  1953.       res = lp_build_iround(bld, a);
  1954.       res = LLVMBuildSIToFP(builder, res, vec_type, "");
  1955.  
  1956.       /* mask out sign bit */
  1957.       anosign = lp_build_abs(bld, a);
  1958.       /*
  1959.        * mask out all values if anosign > 2^24
  1960.        * This should work both for large ints (all rounding is no-op for them
  1961.        * because such floats are always exact) as well as special cases like
  1962.        * NaNs, Infs (taking advantage of the fact they use max exponent).
   1963.        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
  1964.        */
  1965.       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
  1966.       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
  1967.       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
  1968.       return lp_build_select(bld, mask, a, res);
  1969.    }
  1970. }
  1971.  
  1972.  
  1973. /**
  1974.  * Return floor of float (vector), result is a float (vector)
  1975.  * Ex: floor(1.1) = 1.0
  1976.  * Ex: floor(-1.1) = -2.0
  1977.  */
  1978. LLVMValueRef
  1979. lp_build_floor(struct lp_build_context *bld,
  1980.                LLVMValueRef a)
  1981. {
  1982.    LLVMBuilderRef builder = bld->gallivm->builder;
  1983.    const struct lp_type type = bld->type;
  1984.  
  1985.    assert(type.floating);
  1986.    assert(lp_check_value(type, a));
  1987.  
  1988.    if (arch_rounding_available(type)) {
  1989.       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
  1990.    }
  1991.    else {
  1992.       const struct lp_type type = bld->type;
  1993.       struct lp_type inttype;
  1994.       struct lp_build_context intbld;
  1995.       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
  1996.       LLVMValueRef trunc, res, anosign, mask;
  1997.       LLVMTypeRef int_vec_type = bld->int_vec_type;
  1998.       LLVMTypeRef vec_type = bld->vec_type;
  1999.  
  2000.       assert(type.width == 32); /* might want to handle doubles at some point */
  2001.  
  2002.       inttype = type;
  2003.       inttype.floating = 0;
  2004.       lp_build_context_init(&intbld, bld->gallivm, inttype);
  2005.  
  2006.       /* round by truncation */
  2007.       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
  2008.       res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
  2009.  
  2010.       if (type.sign) {
  2011.          LLVMValueRef tmp;
  2012.  
  2013.          /*
  2014.           * fix values if rounding is wrong (for non-special cases)
  2015.           * - this is the case if trunc > a
  2016.           */
  2017.          mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
  2018.          /* tmp = trunc > a ? 1.0 : 0.0 */
  2019.          tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
  2020.          tmp = lp_build_and(&intbld, mask, tmp);
  2021.          tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
  2022.          res = lp_build_sub(bld, res, tmp);
  2023.       }
  2024.  
  2025.       /* mask out sign bit */
  2026.       anosign = lp_build_abs(bld, a);
  2027.       /*
  2028.        * mask out all values if anosign > 2^24
  2029.        * This should work both for large ints (all rounding is no-op for them
  2030.        * because such floats are always exact) as well as special cases like
  2031.        * NaNs, Infs (taking advantage of the fact they use max exponent).
   2032.        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
  2033.        */
  2034.       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
  2035.       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
  2036.       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
  2037.       return lp_build_select(bld, mask, a, res);
  2038.    }
  2039. }
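
/*
 * Illustrative sketch, not part of the original file: the floor fixup above
 * as scalar C for |a| <= 2^24 (the large-value masking is the same as in
 * lp_build_trunc and omitted here; <stdint.h>, hypothetical name).
 */
static float example_floor_small_f32(float a)
{
   float res = (float)(int32_t)a;   /* round by truncation */
   if (res > a)                     /* truncation rounded up: fix downward */
      res -= 1.0f;
   return res;
}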
  2040.  
  2041.  
  2042. /**
  2043.  * Return ceiling of float (vector), returning float (vector).
  2044.  * Ex: ceil( 1.1) = 2.0
  2045.  * Ex: ceil(-1.1) = -1.0
  2046.  */
  2047. LLVMValueRef
  2048. lp_build_ceil(struct lp_build_context *bld,
  2049.               LLVMValueRef a)
  2050. {
  2051.    LLVMBuilderRef builder = bld->gallivm->builder;
  2052.    const struct lp_type type = bld->type;
  2053.  
  2054.    assert(type.floating);
  2055.    assert(lp_check_value(type, a));
  2056.  
  2057.    if (arch_rounding_available(type)) {
  2058.       return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
  2059.    }
  2060.    else {
  2061.       const struct lp_type type = bld->type;
  2062.       struct lp_type inttype;
  2063.       struct lp_build_context intbld;
  2064.       LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
  2065.       LLVMValueRef trunc, res, anosign, mask, tmp;
  2066.       LLVMTypeRef int_vec_type = bld->int_vec_type;
  2067.       LLVMTypeRef vec_type = bld->vec_type;
  2068.  
  2069.       assert(type.width == 32); /* might want to handle doubles at some point */
  2070.  
  2071.       inttype = type;
  2072.       inttype.floating = 0;
  2073.       lp_build_context_init(&intbld, bld->gallivm, inttype);
  2074.  
  2075.       /* round by truncation */
  2076.       trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
  2077.       trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
  2078.  
  2079.       /*
  2080.        * fix values if rounding is wrong (for non-special cases)
  2081.        * - this is the case if trunc < a
  2082.        */
  2083.       mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
  2084.       /* tmp = trunc < a ? 1.0 : 0.0 */
  2085.       tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
  2086.       tmp = lp_build_and(&intbld, mask, tmp);
  2087.       tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
  2088.       res = lp_build_add(bld, trunc, tmp);
  2089.  
  2090.       /* mask out sign bit */
  2091.       anosign = lp_build_abs(bld, a);
  2092.       /*
  2093.        * mask out all values if anosign > 2^24
  2094.        * This should work both for large ints (all rounding is no-op for them
  2095.        * because such floats are always exact) as well as special cases like
  2096.        * NaNs, Infs (taking advantage of the fact they use max exponent).
   2097.        * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
  2098.        */
  2099.       anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
  2100.       cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
  2101.       mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
  2102.       return lp_build_select(bld, mask, a, res);
  2103.    }
  2104. }
  2105.  
  2106.  
  2107. /**
  2108.  * Return fractional part of 'a' computed as a - floor(a)
  2109.  * Typically used in texture coord arithmetic.
  2110.  */
  2111. LLVMValueRef
  2112. lp_build_fract(struct lp_build_context *bld,
  2113.                LLVMValueRef a)
  2114. {
  2115.    assert(bld->type.floating);
  2116.    return lp_build_sub(bld, a, lp_build_floor(bld, a));
  2117. }
  2118.  
  2119.  
  2120. /**
  2121.  * Prevent returning a fractional part of 1.0 for very small negative values of
  2122.  * 'a' by clamping against 0.99999(9).
  2123.  */
  2124. static inline LLVMValueRef
  2125. clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
  2126. {
  2127.    LLVMValueRef max;
  2128.  
  2129.    /* this is the largest number smaller than 1.0 representable as float */
  2130.    max = lp_build_const_vec(bld->gallivm, bld->type,
  2131.                             1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
  2132.    return lp_build_min(bld, fract, max);
  2133. }
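
/*
 * Illustrative note, not part of the original file: for 32-bit floats
 * lp_mantissa() is 23, so the clamp constant is 1.0 - 2^-24 (bit pattern
 * 0x3f7fffff), the largest float strictly below 1.0.
 */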
  2134.  
  2135.  
  2136. /**
  2137.  * Same as lp_build_fract, but guarantees that the result is always smaller
  2138.  * than one.
  2139.  */
  2140. LLVMValueRef
  2141. lp_build_fract_safe(struct lp_build_context *bld,
  2142.                     LLVMValueRef a)
  2143. {
  2144.    return clamp_fract(bld, lp_build_fract(bld, a));
  2145. }
  2146.  
  2147.  
  2148. /**
  2149.  * Return the integer part of a float (vector) value (== round toward zero).
  2150.  * The returned value is an integer (vector).
  2151.  * Ex: itrunc(-1.5) = -1
  2152.  */
  2153. LLVMValueRef
  2154. lp_build_itrunc(struct lp_build_context *bld,
  2155.                 LLVMValueRef a)
  2156. {
  2157.    LLVMBuilderRef builder = bld->gallivm->builder;
  2158.    const struct lp_type type = bld->type;
  2159.    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
  2160.  
  2161.    assert(type.floating);
  2162.    assert(lp_check_value(type, a));
  2163.  
  2164.    return LLVMBuildFPToSI(builder, a, int_vec_type, "");
  2165. }
  2166.  
  2167.  
  2168. /**
  2169.  * Return float (vector) rounded to nearest integer (vector).  The returned
  2170.  * value is an integer (vector).
  2171.  * Ex: iround(0.9) = 1
  2172.  * Ex: iround(-1.5) = -2
  2173.  */
  2174. LLVMValueRef
  2175. lp_build_iround(struct lp_build_context *bld,
  2176.                 LLVMValueRef a)
  2177. {
  2178.    LLVMBuilderRef builder = bld->gallivm->builder;
  2179.    const struct lp_type type = bld->type;
  2180.    LLVMTypeRef int_vec_type = bld->int_vec_type;
  2181.    LLVMValueRef res;
  2182.  
  2183.    assert(type.floating);
  2184.  
  2185.    assert(lp_check_value(type, a));
  2186.  
  2187.    if ((util_cpu_caps.has_sse2 &&
  2188.        ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
  2189.        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
  2190.       return lp_build_iround_nearest_sse2(bld, a);
  2191.    }
  2192.    if (arch_rounding_available(type)) {
  2193.       res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
  2194.    }
  2195.    else {
  2196.       LLVMValueRef half;
  2197.  
  2198.       half = lp_build_const_vec(bld->gallivm, type, 0.5);
  2199.  
  2200.       if (type.sign) {
  2201.          LLVMTypeRef vec_type = bld->vec_type;
  2202.          LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
  2203.                                     (unsigned long long)1 << (type.width - 1));
  2204.          LLVMValueRef sign;
  2205.  
  2206.          /* get sign bit */
  2207.          sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
  2208.          sign = LLVMBuildAnd(builder, sign, mask, "");
  2209.  
  2210.          /* sign * 0.5 */
  2211.          half = LLVMBuildBitCast(builder, half, int_vec_type, "");
  2212.          half = LLVMBuildOr(builder, sign, half, "");
  2213.          half = LLVMBuildBitCast(builder, half, vec_type, "");
  2214.       }
  2215.  
  2216.       res = LLVMBuildFAdd(builder, a, half, "");
  2217.    }
  2218.  
  2219.    res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
  2220.  
  2221.    return res;
  2222. }
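
/*
 * Illustrative sketch, not part of the original file: the generic fallback
 * above as scalar C (hypothetical name). Note it rounds halfway cases away
 * from zero, whereas the SSE4.1/Altivec paths round to nearest-even.
 */
static int example_iround_f32(float a)
{
   float half = (a < 0.0f) ? -0.5f : 0.5f;   /* 0.5 with a's sign ORed in */
   return (int)(a + half);                   /* then truncate */
}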
  2223.  
  2224.  
  2225. /**
  2226.  * Return floor of float (vector), result is an int (vector)
   2227.  * Ex: ifloor(1.1) = 1
   2228.  * Ex: ifloor(-1.1) = -2
  2229.  */
  2230. LLVMValueRef
  2231. lp_build_ifloor(struct lp_build_context *bld,
  2232.                 LLVMValueRef a)
  2233. {
  2234.    LLVMBuilderRef builder = bld->gallivm->builder;
  2235.    const struct lp_type type = bld->type;
  2236.    LLVMTypeRef int_vec_type = bld->int_vec_type;
  2237.    LLVMValueRef res;
  2238.  
  2239.    assert(type.floating);
  2240.    assert(lp_check_value(type, a));
  2241.  
  2242.    res = a;
  2243.    if (type.sign) {
  2244.       if (arch_rounding_available(type)) {
  2245.          res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
  2246.       }
  2247.       else {
  2248.          struct lp_type inttype;
  2249.          struct lp_build_context intbld;
  2250.          LLVMValueRef trunc, itrunc, mask;
  2251.  
  2252.          assert(type.floating);
  2253.          assert(lp_check_value(type, a));
  2254.  
  2255.          inttype = type;
  2256.          inttype.floating = 0;
  2257.          lp_build_context_init(&intbld, bld->gallivm, inttype);
  2258.  
  2259.          /* round by truncation */
  2260.          itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
  2261.          trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
  2262.  
  2263.          /*
  2264.           * fix values if rounding is wrong (for non-special cases)
  2265.           * - this is the case if trunc > a
   2266.           * The results of doing this with NaNs, very large values etc.
   2267.           * are undefined, but results for such inputs are undefined anyway.
  2268.           */
  2269.          mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
  2270.          /* cheapie minus one with mask since the mask is minus one / zero */
  2271.          return lp_build_add(&intbld, itrunc, mask);
  2272.       }
  2273.    }
  2274.  
   2275.    /* round toward zero (truncate) */
  2276.    res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
  2277.  
  2278.    return res;
  2279. }
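
/*
 * Illustrative sketch, not part of the original file: the signed fallback
 * above as scalar C (<stdint.h>, hypothetical name). The vector code adds
 * the all-ones compare mask (== -1) instead of branching; lp_build_iceil
 * below is the mirror image (trunc < a ? itrunc + 1 : itrunc).
 */
static int32_t example_ifloor_f32(float a)
{
   int32_t itrunc = (int32_t)a;     /* round by truncation */
   /* if truncation rounded up (trunc > a), go one lower */
   return ((float)itrunc > a) ? itrunc - 1 : itrunc;
}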
  2280.  
  2281.  
  2282. /**
  2283.  * Return ceiling of float (vector), returning int (vector).
  2284.  * Ex: iceil( 1.1) = 2
  2285.  * Ex: iceil(-1.1) = -1
  2286.  */
  2287. LLVMValueRef
  2288. lp_build_iceil(struct lp_build_context *bld,
  2289.                LLVMValueRef a)
  2290. {
  2291.    LLVMBuilderRef builder = bld->gallivm->builder;
  2292.    const struct lp_type type = bld->type;
  2293.    LLVMTypeRef int_vec_type = bld->int_vec_type;
  2294.    LLVMValueRef res;
  2295.  
  2296.    assert(type.floating);
  2297.    assert(lp_check_value(type, a));
  2298.  
  2299.    if (arch_rounding_available(type)) {
  2300.       res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
  2301.    }
  2302.    else {
  2303.       struct lp_type inttype;
  2304.       struct lp_build_context intbld;
  2305.       LLVMValueRef trunc, itrunc, mask;
  2306.  
  2307.       assert(type.floating);
  2308.       assert(lp_check_value(type, a));
  2309.  
  2310.       inttype = type;
  2311.       inttype.floating = 0;
  2312.       lp_build_context_init(&intbld, bld->gallivm, inttype);
  2313.  
  2314.       /* round by truncation */
  2315.       itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
  2316.       trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
  2317.  
  2318.       /*
  2319.        * fix values if rounding is wrong (for non-special cases)
  2320.        * - this is the case if trunc < a
   2321.        * The results of doing this with NaNs, very large values etc.
   2322.        * are undefined, but results for such inputs are undefined anyway.
  2323.        */
  2324.       mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
  2325.       /* cheapie plus one with mask since the mask is minus one / zero */
  2326.       return lp_build_sub(&intbld, itrunc, mask);
  2327.    }
  2328.  
   2329.    /* round toward zero (truncate) */
  2330.    res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
  2331.  
  2332.    return res;
  2333. }
  2334.  
  2335.  
  2336. /**
  2337.  * Combined ifloor() & fract().
  2338.  *
  2339.  * Preferred to calling the functions separately, as it will ensure that the
  2340.  * strategy (floor() vs ifloor()) that results in less redundant work is used.
  2341.  */
  2342. void
  2343. lp_build_ifloor_fract(struct lp_build_context *bld,
  2344.                       LLVMValueRef a,
  2345.                       LLVMValueRef *out_ipart,
  2346.                       LLVMValueRef *out_fpart)
  2347. {
  2348.    LLVMBuilderRef builder = bld->gallivm->builder;
  2349.    const struct lp_type type = bld->type;
  2350.    LLVMValueRef ipart;
  2351.  
  2352.    assert(type.floating);
  2353.    assert(lp_check_value(type, a));
  2354.  
  2355.    if (arch_rounding_available(type)) {
  2356.       /*
  2357.        * floor() is easier.
  2358.        */
  2359.  
  2360.       ipart = lp_build_floor(bld, a);
  2361.       *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
  2362.       *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
  2363.    }
  2364.    else {
  2365.       /*
  2366.        * ifloor() is easier.
  2367.        */
  2368.  
  2369.       *out_ipart = lp_build_ifloor(bld, a);
  2370.       ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
  2371.       *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
  2372.    }
  2373. }
  2374.  
  2375.  
  2376. /**
  2377.  * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
  2378.  * always smaller than one.
  2379.  */
  2380. void
  2381. lp_build_ifloor_fract_safe(struct lp_build_context *bld,
  2382.                            LLVMValueRef a,
  2383.                            LLVMValueRef *out_ipart,
  2384.                            LLVMValueRef *out_fpart)
  2385. {
  2386.    lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
  2387.    *out_fpart = clamp_fract(bld, *out_fpart);
  2388. }
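
/*
 * Illustrative usage sketch, not part of the original file: typical texture
 * coordinate splitting (hypothetical helper name).
 */
static void example_split_coord(struct lp_build_context *bld,
                                LLVMValueRef coord,
                                LLVMValueRef *texel,     /* integer part */
                                LLVMValueRef *weight)    /* lerp weight */
{
   /* one call yields both the texel index and the filter weight in [0,1) */
   lp_build_ifloor_fract_safe(bld, coord, texel, weight);
}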
  2389.  
  2390.  
  2391. LLVMValueRef
  2392. lp_build_sqrt(struct lp_build_context *bld,
  2393.               LLVMValueRef a)
  2394. {
  2395.    LLVMBuilderRef builder = bld->gallivm->builder;
  2396.    const struct lp_type type = bld->type;
  2397.    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
  2398.    char intrinsic[32];
  2399.  
  2400.    assert(lp_check_value(type, a));
  2401.  
  2402.    /* TODO: optimize the constant case */
  2403.  
  2404.    assert(type.floating);
  2405.    if (type.length == 1) {
  2406.       util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.f%u", type.width);
  2407.    }
  2408.    else {
  2409.       util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
  2410.    }
  2411.  
  2412.    return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
  2413. }
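
/*
 * Illustrative note, not part of the original file: for a 4 x 32-bit float
 * vector the snprintf above produces the intrinsic name "llvm.sqrt.v4f32";
 * for a scalar double it produces "llvm.sqrt.f64".
 */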
  2414.  
  2415.  
  2416. /**
   2417.  * Do one Newton-Raphson step to improve reciprocal precision:
  2418.  *
  2419.  *   x_{i+1} = x_i * (2 - a * x_i)
  2420.  *
  2421.  * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
  2422.  * +/-Inf, giving NaN instead.  Certain applications rely on this behavior,
   2423.  * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
  2424.  * halo. It would be necessary to clamp the argument to prevent this.
  2425.  *
  2426.  * See also:
  2427.  * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
  2428.  * - http://softwarecommunity.intel.com/articles/eng/1818.htm
  2429.  */
  2430. static INLINE LLVMValueRef
  2431. lp_build_rcp_refine(struct lp_build_context *bld,
  2432.                     LLVMValueRef a,
  2433.                     LLVMValueRef rcp_a)
  2434. {
  2435.    LLVMBuilderRef builder = bld->gallivm->builder;
  2436.    LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
  2437.    LLVMValueRef res;
  2438.  
  2439.    res = LLVMBuildFMul(builder, a, rcp_a, "");
  2440.    res = LLVMBuildFSub(builder, two, res, "");
  2441.    res = LLVMBuildFMul(builder, rcp_a, res, "");
  2442.  
  2443.    return res;
  2444. }
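
/*
 * Illustrative sketch, not part of the original file: one Newton-Raphson
 * step as scalar C (hypothetical name). Each step roughly doubles the
 * number of correct bits in the estimate.
 */
static float example_rcp_refine(float a, float rcp_a)
{
   return rcp_a * (2.0f - a * rcp_a);   /* x' = x * (2 - a*x) */
}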
  2445.  
  2446.  
  2447. LLVMValueRef
  2448. lp_build_rcp(struct lp_build_context *bld,
  2449.              LLVMValueRef a)
  2450. {
  2451.    LLVMBuilderRef builder = bld->gallivm->builder;
  2452.    const struct lp_type type = bld->type;
  2453.  
  2454.    assert(lp_check_value(type, a));
  2455.  
  2456.    if(a == bld->zero)
  2457.       return bld->undef;
  2458.    if(a == bld->one)
  2459.       return bld->one;
  2460.    if(a == bld->undef)
  2461.       return bld->undef;
  2462.  
  2463.    assert(type.floating);
  2464.  
  2465.    if(LLVMIsConstant(a))
  2466.       return LLVMConstFDiv(bld->one, a);
  2467.  
  2468.    /*
  2469.     * We don't use RCPPS because:
   2470.     * - it only has 10 bits of precision
   2471.     * - it doesn't even get the reciprocal of 1.0 exactly
   2472.     * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
   2473.     * - for recent processors the benefit over DIVPS is marginal, and
   2474.     *   case dependent
  2475.     *
  2476.     * We could still use it on certain processors if benchmarks show that the
   2477.     * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
   2478.     * particular uses that require fewer workarounds.
  2479.     */
  2480.  
  2481.    if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
  2482.          (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
  2483.       const unsigned num_iterations = 0;
  2484.       LLVMValueRef res;
  2485.       unsigned i;
  2486.       const char *intrinsic = NULL;
  2487.  
  2488.       if (type.length == 4) {
  2489.          intrinsic = "llvm.x86.sse.rcp.ps";
  2490.       }
  2491.       else {
  2492.          intrinsic = "llvm.x86.avx.rcp.ps.256";
  2493.       }
  2494.  
  2495.       res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
  2496.  
  2497.       for (i = 0; i < num_iterations; ++i) {
  2498.          res = lp_build_rcp_refine(bld, a, res);
  2499.       }
  2500.  
  2501.       return res;
  2502.    }
  2503.  
  2504.    return LLVMBuildFDiv(builder, bld->one, a, "");
  2505. }
  2506.  
  2507.  
  2508. /**
  2509.  * Do one Newton-Raphson step to improve rsqrt precision:
  2510.  *
  2511.  *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
  2512.  *
  2513.  * See also Intel 64 and IA-32 Architectures Optimization Manual.
  2514.  */
  2515. static INLINE LLVMValueRef
  2516. lp_build_rsqrt_refine(struct lp_build_context *bld,
  2517.                       LLVMValueRef a,
  2518.                       LLVMValueRef rsqrt_a)
  2519. {
  2520.    LLVMBuilderRef builder = bld->gallivm->builder;
  2521.    LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
  2522.    LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
  2523.    LLVMValueRef res;
  2524.  
  2525.    res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
  2526.    res = LLVMBuildFMul(builder, a, res, "");
  2527.    res = LLVMBuildFSub(builder, three, res, "");
  2528.    res = LLVMBuildFMul(builder, rsqrt_a, res, "");
  2529.    res = LLVMBuildFMul(builder, half, res, "");
  2530.  
  2531.    return res;
  2532. }
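
/*
 * Illustrative sketch, not part of the original file: the same step as
 * scalar C (hypothetical name).
 */
static float example_rsqrt_refine(float a, float rsqrt_a)
{
   /* x' = 0.5 * x * (3 - a*x*x) */
   return 0.5f * rsqrt_a * (3.0f - a * rsqrt_a * rsqrt_a);
}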
  2533.  
  2534.  
  2535. /**
  2536.  * Generate 1/sqrt(a).
  2537.  * Result is undefined for values < 0, infinity for +0.
  2538.  */
  2539. LLVMValueRef
  2540. lp_build_rsqrt(struct lp_build_context *bld,
  2541.                LLVMValueRef a)
  2542. {
  2543.    const struct lp_type type = bld->type;
  2544.  
  2545.    assert(lp_check_value(type, a));
  2546.  
  2547.    assert(type.floating);
  2548.  
  2549.    /*
  2550.     * This should be faster but all denormals will end up as infinity.
  2551.     */
  2552.    if (0 && lp_build_fast_rsqrt_available(type)) {
  2553.       const unsigned num_iterations = 1;
  2554.       LLVMValueRef res;
  2555.       unsigned i;
  2556.  
  2557.       /* rsqrt(1.0) != 1.0 here */
  2558.       res = lp_build_fast_rsqrt(bld, a);
  2559.  
  2560.       if (num_iterations) {
  2561.          /*
  2562.           * Newton-Raphson will result in NaN instead of infinity for zero,
  2563.           * and NaN instead of zero for infinity.
  2564.           * Also, need to ensure rsqrt(1.0) == 1.0.
  2565.           * All numbers smaller than FLT_MIN will result in +infinity
  2566.           * (rsqrtps treats all denormals as zero).
  2567.           */
  2568.          LLVMValueRef cmp;
  2569.          LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
  2570.          LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
  2571.  
  2572.          for (i = 0; i < num_iterations; ++i) {
  2573.             res = lp_build_rsqrt_refine(bld, a, res);
  2574.          }
  2575.          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
  2576.          res = lp_build_select(bld, cmp, inf, res);
  2577.          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
  2578.          res = lp_build_select(bld, cmp, bld->zero, res);
  2579.          cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
  2580.          res = lp_build_select(bld, cmp, bld->one, res);
  2581.       }
  2582.  
  2583.       return res;
  2584.    }
  2585.  
  2586.    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
  2587. }
  2588.  
  2589. /**
   2590.  * Return whether a fast (but inaccurate) rsqrt instruction is available.
   2591.  * (The caller may want to avoid calling rsqrt_fast if it's not available:
   2592.  * e.g. for calculating x^0.5 it may do rsqrt_fast(x) * x, but if that is
   2593.  * unavailable the emulation would result in sqrt/div/mul, so it is
   2594.  * obviously much better to just call sqrt, skipping both div and mul.)
  2595.  */
  2596. boolean
  2597. lp_build_fast_rsqrt_available(struct lp_type type)
  2598. {
  2599.    assert(type.floating);
  2600.  
  2601.    if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
  2602.        (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
  2603.       return true;
  2604.    }
  2605.    return false;
  2606. }
  2607.  
  2608.  
  2609. /**
  2610.  * Generate 1/sqrt(a).
  2611.  * Result is undefined for values < 0, infinity for +0.
  2612.  * Precision is limited, only ~10 bits guaranteed
  2613.  * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
  2614.  */
  2615. LLVMValueRef
  2616. lp_build_fast_rsqrt(struct lp_build_context *bld,
  2617.                     LLVMValueRef a)
  2618. {
  2619.    LLVMBuilderRef builder = bld->gallivm->builder;
  2620.    const struct lp_type type = bld->type;
  2621.  
  2622.    assert(lp_check_value(type, a));
  2623.  
  2624.    if (lp_build_fast_rsqrt_available(type)) {
  2625.       const char *intrinsic = NULL;
  2626.  
  2627.       if (type.length == 4) {
  2628.          intrinsic = "llvm.x86.sse.rsqrt.ps";
  2629.       }
  2630.       else {
  2631.          intrinsic = "llvm.x86.avx.rsqrt.ps.256";
  2632.       }
  2633.       return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
  2634.    }
  2635.    else {
  2636.       debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
  2637.    }
  2638.    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
  2639. }
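
/*
 * Illustrative sketch, not part of the original file: the 4-wide fast path
 * via the SSE compiler intrinsic (assumes <xmmintrin.h>, as included under
 * PIPE_ARCH_SSE above; hypothetical name).
 */
#if defined(PIPE_ARCH_SSE)
static __m128 example_fast_rsqrt(__m128 a)
{
   return _mm_rsqrt_ps(a);   /* RSQRTPS: fast but low-precision 1/sqrt */
}
#endif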
  2640.  
  2641.  
  2642. /**
  2643.  * Generate sin(a) or cos(a) using polynomial approximation.
   2644.  * TODO: it might be worth recognizing sin and cos using the same source
   2645.  * (i.e. d3d10 sincos opcode). Obviously doing both at the same time
   2646.  * would be way cheaper than calculating (nearly) everything twice...
   2647.  * Not sure it's common enough to be worth bothering with, however; the
   2648.  * scs opcode could also benefit from calculating both, though.
  2649.  */
  2650. static LLVMValueRef
  2651. lp_build_sin_or_cos(struct lp_build_context *bld,
  2652.                     LLVMValueRef a,
  2653.                     boolean cos)
  2654. {
  2655.    struct gallivm_state *gallivm = bld->gallivm;
  2656.    LLVMBuilderRef b = gallivm->builder;
  2657.    struct lp_type int_type = lp_int_type(bld->type);
  2658.  
  2659.    /*
  2660.     *  take the absolute value,
  2661.     *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
  2662.     */
  2663.  
  2664.    LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
  2665.    LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
  2666.  
  2667.    LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
  2668.    LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
  2669.  
  2670.    /*
  2671.     * scale by 4/Pi
  2672.     * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
  2673.     */
  2674.  
  2675.    LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
  2676.    LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
  2677.  
  2678.    /*
  2679.     * store the integer part of y in mm0
  2680.     * emm2 = _mm_cvttps_epi32(y);
  2681.     */
  2682.  
  2683.    LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
  2684.  
  2685.    /*
  2686.     * j=(j+1) & (~1) (see the cephes sources)
  2687.     * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
  2688.     */
  2689.  
  2690.    LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
  2691.    LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
  2692.    /*
  2693.     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
  2694.     */
  2695.    LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
  2696.    LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
  2697.  
  2698.    /*
  2699.     * y = _mm_cvtepi32_ps(emm2);
  2700.     */
  2701.    LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
  2702.  
  2703.    LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
  2704.    LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
  2705.    LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
  2706.    LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
  2707.  
  2708.    /*
  2709.     * Argument used for poly selection and sign bit determination
  2710.     * is different for sin vs. cos.
  2711.     */
  2712.    LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
  2713.                                emm2_and;
  2714.  
  2715.    LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
  2716.                                                               LLVMBuildNot(b, emm2_2, ""), ""),
  2717.                                               const_29, "sign_bit") :
  2718.                                  LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
  2719.                                                               LLVMBuildShl(b, emm2_add,
  2720.                                                                            const_29, ""), ""),
  2721.                                               sign_mask, "sign_bit");
  2722.  
  2723.    /*
   2724.     * get the polynomial selection mask:
   2725.     * there is one polynomial for 0 <= x <= Pi/4
   2726.     * and another one for Pi/4 < x <= Pi/2
  2727.     * Both branches will be computed.
  2728.     *
  2729.     * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
  2730.     * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
  2731.     */
  2732.  
  2733.    LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
  2734.    LLVMValueRef poly_mask = lp_build_compare(gallivm,
  2735.                                              int_type, PIPE_FUNC_EQUAL,
  2736.                                              emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
  2737.  
  2738.    /*
  2739.     * _PS_CONST(minus_cephes_DP1, -0.78515625);
  2740.     * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
  2741.     * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
  2742.     */
  2743.    LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
  2744.    LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
  2745.    LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
  2746.  
  2747.    /*
  2748.     * The magic pass: "Extended precision modular arithmetic"
  2749.     * x = ((x - y * DP1) - y * DP2) - y * DP3;
  2750.     * xmm1 = _mm_mul_ps(y, xmm1);
  2751.     * xmm2 = _mm_mul_ps(y, xmm2);
  2752.     * xmm3 = _mm_mul_ps(y, xmm3);
  2753.     */
  2754.    LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
  2755.    LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
  2756.    LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
  2757.  
  2758.    /*
  2759.     * x = _mm_add_ps(x, xmm1);
  2760.     * x = _mm_add_ps(x, xmm2);
  2761.     * x = _mm_add_ps(x, xmm3);
  2762.     */
  2763.  
  2764.    LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
  2765.    LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
  2766.    LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
  2767.  
  2768.    /*
   2769.     * Evaluate the first polynomial (0 <= x <= Pi/4)
  2770.     *
  2771.     * z = _mm_mul_ps(x,x);
  2772.     */
  2773.    LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
  2774.  
  2775.    /*
  2776.     * _PS_CONST(coscof_p0,  2.443315711809948E-005);
  2777.     * _PS_CONST(coscof_p1, -1.388731625493765E-003);
  2778.     * _PS_CONST(coscof_p2,  4.166664568298827E-002);
  2779.     */
  2780.    LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
  2781.    LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
  2782.    LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
  2783.  
  2784.    /*
  2785.     * y = *(v4sf*)_ps_coscof_p0;
  2786.     * y = _mm_mul_ps(y, z);
  2787.     */
  2788.    LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
  2789.    LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
  2790.    LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
  2791.    LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
  2792.    LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
  2793.    LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
  2794.  
  2795.  
  2796.    /*
  2797.     * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
  2798.     * y = _mm_sub_ps(y, tmp);
  2799.     * y = _mm_add_ps(y, *(v4sf*)_ps_1);
  2800.     */
  2801.    LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
  2802.    LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
   2803.    LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_9");
  2804.    LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
   2805.    LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_10");
  2806.  
  2807.    /*
  2808.     * _PS_CONST(sincof_p0, -1.9515295891E-4);
  2809.     * _PS_CONST(sincof_p1,  8.3321608736E-3);
  2810.     * _PS_CONST(sincof_p2, -1.6666654611E-1);
  2811.     */
  2812.    LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
  2813.    LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
  2814.    LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
  2815.  
  2816.    /*
   2817.     * Evaluate the second polynomial (Pi/4 < x <= Pi/2)
  2818.     *
  2819.     * y2 = *(v4sf*)_ps_sincof_p0;
  2820.     * y2 = _mm_mul_ps(y2, z);
  2821.     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
  2822.     * y2 = _mm_mul_ps(y2, z);
  2823.     * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
  2824.     * y2 = _mm_mul_ps(y2, z);
  2825.     * y2 = _mm_mul_ps(y2, x);
  2826.     * y2 = _mm_add_ps(y2, x);
  2827.     */
  2828.  
  2829.    LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
  2830.    LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
  2831.    LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
  2832.    LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
  2833.    LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
  2834.    LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
  2835.    LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
  2836.  
  2837.    /*
   2838.     * select the correct result from the two polynomials
  2839.     * xmm3 = poly_mask;
  2840.     * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
  2841.     * y = _mm_andnot_ps(xmm3, y);
  2842.     * y = _mm_or_ps(y,y2);
  2843.     */
  2844.    LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
  2845.    LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
  2846.    LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
  2847.    LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
  2848.    LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
  2849.    LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
  2850.  
  2851.    /*
  2852.     * update the sign
  2853.     * y = _mm_xor_ps(y, sign_bit);
  2854.     */
  2855.    LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
  2856.    LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
  2857.  
  2858.    LLVMValueRef isfinite = lp_build_isfinite(bld, a);
  2859.  
  2860.    /* clamp output to be within [-1, 1] */
  2861.    y_result = lp_build_clamp(bld, y_result,
  2862.                              lp_build_const_vec(bld->gallivm, bld->type,  -1.f),
  2863.                              lp_build_const_vec(bld->gallivm, bld->type,  1.f));
  2864.    /* If a is -inf, inf or NaN then return NaN */
  2865.    y_result = lp_build_select(bld, isfinite, y_result,
  2866.                               lp_build_const_vec(bld->gallivm, bld->type,  NAN));
  2867.    return y_result;
  2868. }
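
/*
 * Illustrative sketch, not part of the original file: the cephes-style range
 * reduction above as scalar C (assumes <math.h>; hypothetical name). It maps
 * |a| to an even octant index j and a reduced argument in [-Pi/4, Pi/4];
 * bits of j then select the sin/cos polynomial and the sign, as above.
 */
static float example_range_reduce(float a, int *octant)
{
   float x = fabsf(a);
   int j = (int)(x * 1.27323954473516f);   /* scale by 4/Pi */
   j = (j + 1) & ~1;                       /* round odd j up to even */
   /* extended-precision modular arithmetic: x - j*Pi/4 in three parts */
   x = ((x - j * 0.78515625f)
          - j * 2.4187564849853515625e-4f)
          - j * 3.77489497744594108e-8f;
   *octant = j;
   return x;
}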
  2869.  
  2870.  
  2871. /**
  2872.  * Generate sin(a)
  2873.  */
  2874. LLVMValueRef
  2875. lp_build_sin(struct lp_build_context *bld,
  2876.              LLVMValueRef a)
  2877. {
  2878.    return lp_build_sin_or_cos(bld, a, FALSE);
  2879. }
  2880.  
  2881.  
  2882. /**
  2883.  * Generate cos(a)
  2884.  */
  2885. LLVMValueRef
  2886. lp_build_cos(struct lp_build_context *bld,
  2887.              LLVMValueRef a)
  2888. {
  2889.    return lp_build_sin_or_cos(bld, a, TRUE);
  2890. }
  2891.  
  2892.  
  2893. /**
  2894.  * Generate pow(x, y)
  2895.  */
  2896. LLVMValueRef
  2897. lp_build_pow(struct lp_build_context *bld,
  2898.              LLVMValueRef x,
  2899.              LLVMValueRef y)
  2900. {
  2901.    /* TODO: optimize the constant case */
  2902.    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
  2903.        LLVMIsConstant(x) && LLVMIsConstant(y)) {
  2904.       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
  2905.                    __FUNCTION__);
  2906.    }
  2907.  
  2908.    return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
  2909. }
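
/*
 * Illustrative sketch, not part of the original file: the identity used
 * above as scalar C (assumes <math.h>). Like the vector version, it is
 * only meaningful for x > 0.
 */
static float example_pow_f32(float x, float y)
{
   return exp2f(y * log2f(x));   /* pow(x,y) == exp2(y * log2(x)) */
}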
  2910.  
  2911.  
  2912. /**
  2913.  * Generate exp(x)
  2914.  */
  2915. LLVMValueRef
  2916. lp_build_exp(struct lp_build_context *bld,
  2917.              LLVMValueRef x)
  2918. {
  2919.    /* log2(e) = 1/log(2) */
  2920.    LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
  2921.                                            1.4426950408889634);
  2922.  
  2923.    assert(lp_check_value(bld->type, x));
  2924.  
  2925.    return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
  2926. }
  2927.  
  2928.  
  2929. /**
  2930.  * Generate log(x)
  2931.  * Behavior is undefined with infs, 0s and nans
  2932.  */
  2933. LLVMValueRef
  2934. lp_build_log(struct lp_build_context *bld,
  2935.              LLVMValueRef x)
  2936. {
  2937.    /* log(2) */
  2938.    LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
  2939.                                           0.69314718055994529);
  2940.  
  2941.    assert(lp_check_value(bld->type, x));
  2942.  
  2943.    return lp_build_mul(bld, log2, lp_build_log2(bld, x));
  2944. }
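
/*
 * Both lp_build_exp and lp_build_log above are simple changes of base on
 * top of the exp2/log2 workhorses. An illustrative scalar sketch:
 *
 *    exp(x) = exp2(x * 1.4426950408889634);   // x * log2(e)
 *    log(x) = 0.69314718055994529 * log2(x);  // ln(2) * log2(x)
 */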
  2945.  
  2946. /**
  2947.  * Generate log(x) that handles edge cases (infs, 0s and nans)
  2948.  */
  2949. LLVMValueRef
  2950. lp_build_log_safe(struct lp_build_context *bld,
  2951.                   LLVMValueRef x)
  2952. {
  2953.    /* log(2) */
  2954.    LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
  2955.                                           0.69314718055994529);
  2956.  
  2957.    assert(lp_check_value(bld->type, x));
  2958.  
  2959.    return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
  2960. }
  2961.  
  2962.  
  2963. /**
  2964.  * Generate polynomial.
  2965.  * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
  2966.  */
  2967. LLVMValueRef
  2968. lp_build_polynomial(struct lp_build_context *bld,
  2969.                     LLVMValueRef x,
  2970.                     const double *coeffs,
  2971.                     unsigned num_coeffs)
  2972. {
  2973.    const struct lp_type type = bld->type;
  2974.    LLVMValueRef even = NULL, odd = NULL;
  2975.    LLVMValueRef x2;
  2976.    unsigned i;
  2977.  
  2978.    assert(lp_check_value(bld->type, x));
  2979.  
  2980.    /* TODO: optimize the constant case */
  2981.    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
  2982.        LLVMIsConstant(x)) {
  2983.       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
  2984.                    __FUNCTION__);
  2985.    }
  2986.  
  2987.    /*
  2988.     * Calculate odd and even terms separately to decrease data dependency
  2989.     * Ex:
  2990.     *     c[0] + x^2 * c[2] + x^4 * c[4] ...
  2991.     *     + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
  2992.     */
  2993.    x2 = lp_build_mul(bld, x, x);
  2994.  
  2995.    for (i = num_coeffs; i--; ) {
  2996.       LLVMValueRef coeff;
  2997.  
  2998.       coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
  2999.  
  3000.       if (i % 2 == 0) {
  3001.          if (even)
  3002.             even = lp_build_add(bld, coeff, lp_build_mul(bld, x2, even));
  3003.          else
  3004.             even = coeff;
  3005.       } else {
  3006.          if (odd)
  3007.             odd = lp_build_add(bld, coeff, lp_build_mul(bld, x2, odd));
  3008.          else
  3009.             odd = coeff;
  3010.       }
  3011.    }
  3012.  
  3013.    if (odd)
  3014.       return lp_build_add(bld, lp_build_mul(bld, odd, x), even);
  3015.    else if (even)
  3016.       return even;
  3017.    else
  3018.       return bld->undef;
  3019. }
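
/*
 * An illustrative scalar equivalent of the even/odd split above (a sketch,
 * not part of the original source). Both halves are Horner chains in x^2,
 * which roughly halves the dependency chain versus one Horner chain in x:
 *
 *    static double
 *    poly_even_odd(double x, const double *c, unsigned n)
 *    {
 *       double x2 = x * x, even = 0.0, odd = 0.0;
 *       unsigned i;
 *       for (i = n; i--; ) {
 *          if (i % 2 == 0)
 *             even = c[i] + x2 * even;
 *          else
 *             odd = c[i] + x2 * odd;
 *       }
 *       return odd * x + even;
 *    }
 */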
  3020.  
  3021.  
  3022. /**
  3023.  * Minimax polynomial fit of 2**x, in range [0, 1[
  3024.  */
  3025. const double lp_build_exp2_polynomial[] = {
  3026. #if EXP_POLY_DEGREE == 5
  3027.    1.000000000000000000000, /* XXX: was 0.999999925063526176901, recompute others */
  3028.    0.693153073200168932794,
  3029.    0.240153617044375388211,
  3030.    0.0558263180532956664775,
  3031.    0.00898934009049466391101,
  3032.    0.00187757667519147912699
  3033. #elif EXP_POLY_DEGREE == 4
  3034.    1.00000259337069434683,
  3035.    0.693003834469974940458,
  3036.    0.24144275689150793076,
  3037.    0.0520114606103070150235,
  3038.    0.0135341679161270268764
  3039. #elif EXP_POLY_DEGREE == 3
  3040.    0.999925218562710312959,
  3041.    0.695833540494823811697,
  3042.    0.226067155427249155588,
  3043.    0.0780245226406372992967
  3044. #elif EXP_POLY_DEGREE == 2
  3045.    1.00172476321474503578,
  3046.    0.657636275736077639316,
  3047.    0.33718943461968720704
  3048. #else
  3049. #error
  3050. #endif
  3051. };
  3052.  
  3053.  
  3054. LLVMValueRef
  3055. lp_build_exp2(struct lp_build_context *bld,
  3056.               LLVMValueRef x)
  3057. {
  3058.    LLVMBuilderRef builder = bld->gallivm->builder;
  3059.    const struct lp_type type = bld->type;
  3060.    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
  3061.    LLVMValueRef ipart = NULL;
  3062.    LLVMValueRef fpart = NULL;
  3063.    LLVMValueRef expipart = NULL;
  3064.    LLVMValueRef expfpart = NULL;
  3065.    LLVMValueRef res = NULL;
  3066.  
  3067.    assert(lp_check_value(bld->type, x));
  3068.  
  3069.    /* TODO: optimize the constant case */
  3070.    if (gallivm_debug & GALLIVM_DEBUG_PERF &&
  3071.        LLVMIsConstant(x)) {
  3072.       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
  3073.                    __FUNCTION__);
  3074.    }
  3075.  
  3076.    assert(type.floating && type.width == 32);
  3077.  
  3078.    /* We want to preserve NaN, and make sure that for exp2, if x > 128
  3079.     * the result is +INF, and if it's smaller than -126.99999 the result is 0 */
  3080.    x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type,  128.0), x,
  3081.                         GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
  3082.    x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
  3083.                         x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
  3084.  
  3085.    /* ipart = floor(x) */
  3086.    /* fpart = x - ipart */
  3087.    lp_build_ifloor_fract(bld, x, &ipart, &fpart);
  3088.  
  3089.    /* expipart = (float) (1 << ipart) */
  3090.    expipart = LLVMBuildAdd(builder, ipart,
  3091.                            lp_build_const_int_vec(bld->gallivm, type, 127), "");
  3092.    expipart = LLVMBuildShl(builder, expipart,
  3093.                            lp_build_const_int_vec(bld->gallivm, type, 23), "");
  3094.    expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
  3095.  
  3096.    expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
  3097.                                   Elements(lp_build_exp2_polynomial));
  3098.  
  3099.    res = LLVMBuildFMul(builder, expipart, expfpart, "");
  3100.  
  3101.    return res;
  3102. }
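
/*
 * An illustrative scalar equivalent of the exponent trick above (a sketch,
 * assuming 32-bit IEEE-754 floats): adding the bias 127 and shifting into
 * the exponent field builds 2^ipart without an int-to-float convert.
 *
 *    #include <stdint.h>
 *    #include <string.h>
 *
 *    static float
 *    exp2_ipart(int i)   // exact for i in [-126, 127]
 *    {
 *       uint32_t bits = (uint32_t)(i + 127) << 23;
 *       float f;
 *       memcpy(&f, &bits, sizeof f);   // reinterpret as float
 *       return f;
 *    }
 *
 * The final result is then exp2_ipart(ipart) * P(fpart), with P the
 * minimax polynomial over [0, 1[ above.
 */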
  3103.  
  3104.  
  3105.  
  3106. /**
  3107.  * Extract the exponent of an IEEE-754 floating point value.
  3108.  *
  3109.  * Optionally apply an integer bias.
  3110.  *
  3111.  * Result is an integer value with
  3112.  *
  3113.  *   ifloor(log2(x)) + bias
  3114.  */
  3115. LLVMValueRef
  3116. lp_build_extract_exponent(struct lp_build_context *bld,
  3117.                           LLVMValueRef x,
  3118.                           int bias)
  3119. {
  3120.    LLVMBuilderRef builder = bld->gallivm->builder;
  3121.    const struct lp_type type = bld->type;
  3122.    unsigned mantissa = lp_mantissa(type);
  3123.    LLVMValueRef res;
  3124.  
  3125.    assert(type.floating);
  3126.  
  3127.    assert(lp_check_value(bld->type, x));
  3128.  
  3129.    x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
  3130.  
  3131.    res = LLVMBuildLShr(builder, x,
  3132.                        lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
  3133.    res = LLVMBuildAnd(builder, res,
  3134.                       lp_build_const_int_vec(bld->gallivm, type, 255), "");
  3135.    res = LLVMBuildSub(builder, res,
  3136.                       lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
  3137.  
  3138.    return res;
  3139. }
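
/*
 * Scalar sketch of the bit manipulation above (illustrative, 32-bit floats
 * assumed): the exponent lives in bits 23..30 with a +127 bias, so
 *
 *    exponent = ((bits >> 23) & 0xff) - 127 + bias;
 */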
  3140.  
  3141.  
  3142. /**
  3143.  * Extract the mantissa of an IEEE-754 floating point value.
  3144.  *
  3145.  * Result is a floating point value with
  3146.  *
  3147.  *   x / 2**floor(log2(x))
  3148.  */
  3149. LLVMValueRef
  3150. lp_build_extract_mantissa(struct lp_build_context *bld,
  3151.                           LLVMValueRef x)
  3152. {
  3153.    LLVMBuilderRef builder = bld->gallivm->builder;
  3154.    const struct lp_type type = bld->type;
  3155.    unsigned mantissa = lp_mantissa(type);
  3156.    LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
  3157.                                                   (1ULL << mantissa) - 1);
  3158.    LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
  3159.    LLVMValueRef res;
  3160.  
  3161.    assert(lp_check_value(bld->type, x));
  3162.  
  3163.    assert(type.floating);
  3164.  
  3165.    x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
  3166.  
  3167.    /* res = x / 2**ipart */
  3168.    res = LLVMBuildAnd(builder, x, mantmask, "");
  3169.    res = LLVMBuildOr(builder, res, one, "");
  3170.    res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
  3171.  
  3172.    return res;
  3173. }
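
/*
 * Scalar sketch of the above (illustrative only): keep the 23 mantissa
 * bits and OR in the bit pattern of 1.0f, yielding a float in [1, 2[ that
 * equals x / 2**floor(log2(x)) for normalized inputs.
 *
 *    uint32_t one_bits = 0x3f800000;               // bit pattern of 1.0f
 *    uint32_t m_bits = (bits & 0x007fffff) | one_bits;
 */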
  3174.  
  3175.  
  3176.  
  3177. /**
  3178.  * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in range [0, 1/9[
  3179.  * These coefficients can be generated with
  3180.  * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
  3181.  */
  3182. const double lp_build_log2_polynomial[] = {
  3183. #if LOG_POLY_DEGREE == 5
  3184.    2.88539008148777786488L,
  3185.    0.961796878841293367824L,
  3186.    0.577058946784739859012L,
  3187.    0.412914355135828735411L,
  3188.    0.308591899232910175289L,
  3189.    0.352376952300281371868L,
  3190. #elif LOG_POLY_DEGREE == 4
  3191.    2.88539009343309178325L,
  3192.    0.961791550404184197881L,
  3193.    0.577440339438736392009L,
  3194.    0.403343858251329912514L,
  3195.    0.406718052498846252698L,
  3196. #elif LOG_POLY_DEGREE == 3
  3197.    2.88538959748872753838L,
  3198.    0.961932915889597772928L,
  3199.    0.571118517972136195241L,
  3200.    0.493997535084709500285L,
  3201. #else
  3202. #error
  3203. #endif
  3204. };
  3205.  
  3206. /**
  3207.  * See http://www.devmaster.net/forums/showthread.php?p=43580
  3208.  * http://en.wikipedia.org/wiki/Logarithm#Calculation
  3209.  * http://www.nezumi.demon.co.uk/consult/logx.htm
  3210.  *
  3211.  * If handle_edge_cases is true the function will perform computations
  3212.  * to match the required D3D10+ behavior for each of the edge cases.
  3213.  * That means that if input is:
  3214.  * - less than zero (down to and including -inf), then NaN will be returned
  3215.  * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
  3216.  * - +infinity, then +infinity will be returned
  3217.  * - NaN, then NaN will be returned
  3218.  *
  3219.  * Those checks are fairly expensive so if you don't need them make sure
  3220.  * handle_edge_cases is false.
  3221.  */
  3222. void
  3223. lp_build_log2_approx(struct lp_build_context *bld,
  3224.                      LLVMValueRef x,
  3225.                      LLVMValueRef *p_exp,
  3226.                      LLVMValueRef *p_floor_log2,
  3227.                      LLVMValueRef *p_log2,
  3228.                      boolean handle_edge_cases)
  3229. {
  3230.    LLVMBuilderRef builder = bld->gallivm->builder;
  3231.    const struct lp_type type = bld->type;
  3232.    LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
  3233.    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
  3234.  
  3235.    LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
  3236.    LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
  3237.    LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
  3238.  
  3239.    LLVMValueRef i = NULL;
  3240.    LLVMValueRef y = NULL;
  3241.    LLVMValueRef z = NULL;
  3242.    LLVMValueRef exp = NULL;
  3243.    LLVMValueRef mant = NULL;
  3244.    LLVMValueRef logexp = NULL;
  3245.    LLVMValueRef logmant = NULL;
  3246.    LLVMValueRef res = NULL;
  3247.  
  3248.    assert(lp_check_value(bld->type, x));
  3249.  
  3250.    if(p_exp || p_floor_log2 || p_log2) {
  3251.       /* TODO: optimize the constant case */
  3252.       if (gallivm_debug & GALLIVM_DEBUG_PERF &&
  3253.           LLVMIsConstant(x)) {
  3254.          debug_printf("%s: inefficient/imprecise constant arithmetic\n",
  3255.                       __FUNCTION__);
  3256.       }
  3257.  
  3258.       assert(type.floating && type.width == 32);
  3259.  
  3260.       /*
  3261.        * We don't explicitly handle denormalized numbers. They will yield a
  3262.        * result in the neighbourhood of -127, which appears to be
  3263.        * adequate.
  3264.        */
  3265.  
  3266.       i = LLVMBuildBitCast(builder, x, int_vec_type, "");
  3267.  
  3268.       /* exp = (float) exponent(x) */
  3269.       exp = LLVMBuildAnd(builder, i, expmask, "");
  3270.    }
  3271.  
  3272.    if(p_floor_log2 || p_log2) {
  3273.       logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
  3274.       logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
  3275.       logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
  3276.    }
  3277.  
  3278.    if(p_log2) {
  3279.       /* mant = 1 + (float) mantissa(x) */
  3280.       mant = LLVMBuildAnd(builder, i, mantmask, "");
  3281.       mant = LLVMBuildOr(builder, mant, one, "");
  3282.       mant = LLVMBuildBitCast(builder, mant, vec_type, "");
  3283.  
  3284.       /* y = (mant - 1) / (mant + 1) */
  3285.       y = lp_build_div(bld,
  3286.          lp_build_sub(bld, mant, bld->one),
  3287.          lp_build_add(bld, mant, bld->one)
  3288.       );
  3289.  
  3290.       /* z = y^2 */
  3291.       z = lp_build_mul(bld, y, y);
  3292.  
  3293.       /* compute P(z) */
  3294.       logmant = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
  3295.                                     Elements(lp_build_log2_polynomial));
  3296.  
  3297.       /* logmant = y * P(z) */
  3298.       logmant = lp_build_mul(bld, y, logmant);
  3299.  
  3300.       res = lp_build_add(bld, logmant, logexp);
  3301.  
  3302.       if (type.floating && handle_edge_cases) {
  3303.          LLVMValueRef negmask, infmask,  zmask;
  3304.          negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
  3305.                                 lp_build_const_vec(bld->gallivm, type,  0.0f));
  3306.          zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
  3307.                               lp_build_const_vec(bld->gallivm, type,  0.0f));
  3308.          infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
  3309.                                 lp_build_const_vec(bld->gallivm, type,  INFINITY));
  3310.  
  3311.          /* If x is equal to +inf, make sure we return +inf */
  3312.          res = lp_build_select(bld, infmask,
  3313.                                lp_build_const_vec(bld->gallivm, type,  INFINITY),
  3314.                                res);
  3315.          /* If x is equal to 0, return -inf */
  3316.          res = lp_build_select(bld, zmask,
  3317.                                lp_build_const_vec(bld->gallivm, type,  -INFINITY),
  3318.                                res);
  3319.          /* If x is NaN or less than 0, return NaN */
  3320.          res = lp_build_select(bld, negmask,
  3321.                                lp_build_const_vec(bld->gallivm, type,  NAN),
  3322.                                res);
  3323.       }
  3324.    }
  3325.  
  3326.    if(p_exp) {
  3327.       exp = LLVMBuildBitCast(builder, exp, vec_type, "");
  3328.       *p_exp = exp;
  3329.    }
  3330.  
  3331.    if(p_floor_log2)
  3332.       *p_floor_log2 = logexp;
  3333.  
  3334.    if(p_log2)
  3335.       *p_log2 = res;
  3336. }
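
/*
 * An illustrative scalar sketch of the core approximation above (edge-case
 * handling omitted): with m the mantissa in [1, 2[ and e the unbiased
 * exponent extracted from the bits,
 *
 *    y = (m - 1) / (m + 1);
 *    log2(x) ~= e + y * P(y * y);
 *
 * where P is lp_build_log2_polynomial. This follows from
 * log2(m) = log2((1 + y) / (1 - y)) and the atanh-style series in y;
 * y lies in [0, 1/3[, hence the fit range [0, 1/9[ for y^2.
 */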
  3337.  
  3338.  
  3339. /*
  3340.  * log2 implementation which doesn't have special code to
  3341.  * handle edge cases (-inf, 0, inf, NaN). It's faster but
  3342.  * the results for those cases are undefined.
  3343.  */
  3344. LLVMValueRef
  3345. lp_build_log2(struct lp_build_context *bld,
  3346.               LLVMValueRef x)
  3347. {
  3348.    LLVMValueRef res;
  3349.    lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
  3350.    return res;
  3351. }
  3352.  
  3353. /*
  3354.  * Version of log2 which handles all edge cases.
  3355.  * Look at documentation of lp_build_log2_approx for
  3356.  * description of the behavior for each of the edge cases.
  3357.  */
  3358. LLVMValueRef
  3359. lp_build_log2_safe(struct lp_build_context *bld,
  3360.                    LLVMValueRef x)
  3361. {
  3362.    LLVMValueRef res;
  3363.    lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
  3364.    return res;
  3365. }
  3366.  
  3367.  
  3368. /**
  3369.  * Faster (and less accurate) log2.
  3370.  *
  3371.  *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
  3372.  *
  3373.  * Piece-wise linear approximation, with exact results when x is a
  3374.  * power of two.
  3375.  *
  3376.  * See http://www.flipcode.com/archives/Fast_log_Function.shtml
  3377.  */
  3378. LLVMValueRef
  3379. lp_build_fast_log2(struct lp_build_context *bld,
  3380.                    LLVMValueRef x)
  3381. {
  3382.    LLVMBuilderRef builder = bld->gallivm->builder;
  3383.    LLVMValueRef ipart;
  3384.    LLVMValueRef fpart;
  3385.  
  3386.    assert(lp_check_value(bld->type, x));
  3387.  
  3388.    assert(bld->type.floating);
  3389.  
  3390.    /* ipart = floor(log2(x)) - 1 */
  3391.    ipart = lp_build_extract_exponent(bld, x, -1);
  3392.    ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
  3393.  
  3394.    /* fpart = x / 2**ipart */
  3395.    fpart = lp_build_extract_mantissa(bld, x);
  3396.  
  3397.    /* ipart + fpart */
  3398.    return LLVMBuildFAdd(builder, ipart, fpart, "");
  3399. }
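
/*
 * Scalar sketch of the piecewise-linear trick above (illustrative): with
 * e = floor(log2(x)) and m = x / 2**e in [1, 2[,
 *
 *    fast_log2(x) = (e - 1) + m;
 *
 * which equals log2(x) exactly when x is a power of two (m == 1) and
 * interpolates linearly in between.
 */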
  3400.  
  3401.  
  3402. /**
  3403.  * Fast implementation of iround(log2(x)).
  3404.  *
  3405.  * Not an approximation -- it should give accurate results all the time.
  3406.  */
  3407. LLVMValueRef
  3408. lp_build_ilog2(struct lp_build_context *bld,
  3409.                LLVMValueRef x)
  3410. {
  3411.    LLVMBuilderRef builder = bld->gallivm->builder;
  3412.    LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
  3413.    LLVMValueRef ipart;
  3414.  
  3415.    assert(bld->type.floating);
  3416.  
  3417.    assert(lp_check_value(bld->type, x));
  3418.  
  3419.    /* x * 2^0.5, i.e., add 0.5 to log2(x) */
  3420.    x = LLVMBuildFMul(builder, x, sqrt2, "");
  3421.  
  3422.    /* ipart = floor(log2(x) + 0.5)  */
  3423.    ipart = lp_build_extract_exponent(bld, x, 0);
  3424.  
  3425.    return ipart;
  3426. }
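
/*
 * Why the sqrt(2) multiply above rounds (an illustrative worked example):
 * scaling x by 2^0.5 adds 0.5 to log2(x), so flooring via the exponent
 * field yields round-to-nearest. E.g. for x = 3, log2(3) ~= 1.58;
 * log2(3 * sqrt(2)) ~= 2.08, and exponent extraction gives 2 == iround(1.58).
 */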
  3427.  
  3428. LLVMValueRef
  3429. lp_build_mod(struct lp_build_context *bld,
  3430.              LLVMValueRef x,
  3431.              LLVMValueRef y)
  3432. {
  3433.    LLVMBuilderRef builder = bld->gallivm->builder;
  3434.    LLVMValueRef res;
  3435.    const struct lp_type type = bld->type;
  3436.  
  3437.    assert(lp_check_value(type, x));
  3438.    assert(lp_check_value(type, y));
  3439.  
  3440.    if (type.floating)
  3441.       res = LLVMBuildFRem(builder, x, y, "");
  3442.    else if (type.sign)
  3443.       res = LLVMBuildSRem(builder, x, y, "");
  3444.    else
  3445.       res = LLVMBuildURem(builder, x, y, "");
  3446.    return res;
  3447. }
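
/*
 * A note on semantics (illustrative): the frem/srem/urem instructions used
 * above follow C's remainder rules, so the result takes the sign of the
 * dividend, e.g.
 *
 *    srem: -7 % 3 == -1          frem: fmodf(-7.5f, 2.0f) == -1.5f
 */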
  3448.  
  3449.  
  3450. /*
  3451.  * For floating-point inputs it creates and returns a mask
  3452.  * which is all 1's for channels which are NaN,
  3453.  * and all 0's for channels which are not.
  3454.  */
  3455. LLVMValueRef
  3456. lp_build_isnan(struct lp_build_context *bld,
  3457.                LLVMValueRef x)
  3458. {
  3459.    LLVMValueRef mask;
  3460.    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
  3461.  
  3462.    assert(bld->type.floating);
  3463.    assert(lp_check_value(bld->type, x));
  3464.  
  3465.    mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
  3466.                         "isnotnan");
  3467.    mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
  3468.    mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
  3469.    return mask;
  3470. }
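
/*
 * Scalar sketch of the trick above (illustrative): NaN is the only value
 * that compares unequal to itself, so
 *
 *    is_nan = !(x == x);
 *
 * The ordered-equal compare plus negation above is the vector form of this.
 */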
  3471.  
  3472. /* Returns all 1's for floating point numbers that are
  3473.  * finite, and all 0's for -inf,
  3474.  * +inf and NaNs */
  3475. LLVMValueRef
  3476. lp_build_isfinite(struct lp_build_context *bld,
  3477.                   LLVMValueRef x)
  3478. {
  3479.    LLVMBuilderRef builder = bld->gallivm->builder;
  3480.    LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
  3481.    struct lp_type int_type = lp_int_type(bld->type);
  3482.    LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
  3483.    LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
  3484.                                                     0x7f800000);
  3485.  
  3486.    if (!bld->type.floating) {
  3487.       return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
  3488.    }
  3489.    assert(bld->type.floating);
  3490.    assert(lp_check_value(bld->type, x));
  3491.    assert(bld->type.width == 32);
  3492.  
  3493.    intx = LLVMBuildAnd(builder, intx, infornan32, "");
  3494.    return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
  3495.                            intx, infornan32);
  3496. }
  3497.  
  3498. /*
  3499.  * Returns true if the number is nan or inf and false otherwise.
  3500.  * The input has to be a floating point vector.
  3501.  */
  3502. LLVMValueRef
  3503. lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
  3504.                        const struct lp_type type,
  3505.                        LLVMValueRef x)
  3506. {
  3507.    LLVMBuilderRef builder = gallivm->builder;
  3508.    struct lp_type int_type = lp_int_type(type);
  3509.    LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
  3510.                                                 0x7f800000);
  3511.    LLVMValueRef ret;
  3512.  
  3513.    assert(type.floating);
  3514.  
  3515.    ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
  3516.    ret = LLVMBuildAnd(builder, ret, const0, "");
  3517.    ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
  3518.                           ret, const0);
  3519.  
  3520.    return ret;
  3521. }
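
/*
 * Scalar sketch covering both predicates above (illustrative, 32-bit
 * floats): a value is inf or NaN exactly when its exponent field is all
 * ones.
 *
 *    inf_or_nan = (bits & 0x7f800000) == 0x7f800000;
 *    finite     = (bits & 0x7f800000) != 0x7f800000;
 */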
  3522.  
  3523.  
  3524. LLVMValueRef
  3525. lp_build_fpstate_get(struct gallivm_state *gallivm)
  3526. {
  3527.    if (util_cpu_caps.has_sse) {
  3528.       LLVMBuilderRef builder = gallivm->builder;
  3529.       LLVMValueRef mxcsr_ptr = lp_build_alloca(
  3530.          gallivm,
  3531.          LLVMInt32TypeInContext(gallivm->context),
  3532.          "mxcsr_ptr");
  3533.       LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
  3534.           LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
  3535.       lp_build_intrinsic(builder,
  3536.                          "llvm.x86.sse.stmxcsr",
  3537.                          LLVMVoidTypeInContext(gallivm->context),
  3538.                          &mxcsr_ptr8, 1);
  3539.       return mxcsr_ptr;
  3540.    }
  3541.    return 0;
  3542. }
  3543.  
  3544. void
  3545. lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
  3546.                                   boolean zero)
  3547. {
  3548.    if (util_cpu_caps.has_sse) {
  3549.       /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
  3550.       int daz_ftz = _MM_FLUSH_ZERO_MASK;
  3551.  
  3552.       LLVMBuilderRef builder = gallivm->builder;
  3553.       LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
  3554.       LLVMValueRef mxcsr =
  3555.          LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
  3556.  
  3557.       if (util_cpu_caps.has_daz) {
  3558.          /* Enable denormals-are-zero (DAZ) mode */
  3559.          daz_ftz |= _MM_DENORMALS_ZERO_MASK;
  3560.       }
  3561.       if (zero) {
  3562.          mxcsr = LLVMBuildOr(builder, mxcsr,
  3563.                              LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
  3564.       } else {
  3565.          mxcsr = LLVMBuildAnd(builder, mxcsr,
  3566.                               LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
  3567.       }
  3568.  
  3569.       LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
  3570.       lp_build_fpstate_set(gallivm, mxcsr_ptr);
  3571.    }
  3572. }
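
/*
 * The generated code above has the same effect as the following host-side
 * sequence written with the standard <xmmintrin.h> intrinsics (an
 * illustrative sketch, not the JIT path itself):
 *
 *    unsigned csr = _mm_getcsr();
 *    csr |= _MM_FLUSH_ZERO_MASK;        // FTZ: flush denormal results to zero
 *    if (cpu_has_daz)                   // DAZ needs hardware support
 *       csr |= _MM_DENORMALS_ZERO_MASK; // treat denormal inputs as zero
 *    _mm_setcsr(csr);
 *
 * cpu_has_daz here is a stand-in for the util_cpu_caps.has_daz check above.
 */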
  3573.  
  3574. void
  3575. lp_build_fpstate_set(struct gallivm_state *gallivm,
  3576.                      LLVMValueRef mxcsr_ptr)
  3577. {
  3578.    if (util_cpu_caps.has_sse) {
  3579.       LLVMBuilderRef builder = gallivm->builder;
  3580.       mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
  3581.                      LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
  3582.       lp_build_intrinsic(builder,
  3583.                          "llvm.x86.sse.ldmxcsr",
  3584.                          LLVMVoidTypeInContext(gallivm->context),
  3585.                          &mxcsr_ptr, 1);
  3586.    }
  3587. }
  3588.