Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Blame | Last modification | View Log | RSS feed

  1. /**************************************************************************
  2.  *
  3.  * Copyright 2009 VMware, Inc.
  4.  * All Rights Reserved.
  5.  *
  6.  * Permission is hereby granted, free of charge, to any person obtaining a
  7.  * copy of this software and associated documentation files (the
  8.  * "Software"), to deal in the Software without restriction, including
  9.  * without limitation the rights to use, copy, modify, merge, publish,
  10.  * distribute, sub license, and/or sell copies of the Software, and to
  11.  * permit persons to whom the Software is furnished to do so, subject to
  12.  * the following conditions:
  13.  *
  14.  * The above copyright notice and this permission notice (including the
  15.  * next paragraph) shall be included in all copies or substantial portions
  16.  * of the Software.
  17.  *
  18.  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19.  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20.  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21.  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  22.  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23.  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24.  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25.  *
  26.  **************************************************************************/
  27.  
  28. /**
  29.  * @file
  30.  * Texture sampling -- common code.
  31.  *
  32.  * @author Jose Fonseca <jfonseca@vmware.com>
  33.  */
  34.  
  35. #include "pipe/p_defines.h"
  36. #include "pipe/p_state.h"
  37. #include "util/u_format.h"
  38. #include "util/u_math.h"
  39. #include "lp_bld_arit.h"
  40. #include "lp_bld_const.h"
  41. #include "lp_bld_debug.h"
  42. #include "lp_bld_printf.h"
  43. #include "lp_bld_flow.h"
  44. #include "lp_bld_sample.h"
  45. #include "lp_bld_swizzle.h"
  46. #include "lp_bld_type.h"
  47. #include "lp_bld_logic.h"
  48. #include "lp_bld_pack.h"
  49. #include "lp_bld_quad.h"
  50.  
  51.  
  52. /*
  53.  * Bri-linear factor: steepness of the linear LOD segment (the 'factor' of lp_build_brilinear_lod). Should be greater than one.
  54.  */
  55. #define BRILINEAR_FACTOR 2
  56.  
  57. /**
  58.  * Does the given texture wrap mode allow sampling the texture border color?
  59.  * XXX maybe move this into gallium util code.
  60.  */
  61. boolean
  62. lp_sampler_wrap_mode_uses_border_color(unsigned mode,
  63.                                        unsigned min_img_filter,
  64.                                        unsigned mag_img_filter)
  65. {
  66.    switch (mode) {
  67.    case PIPE_TEX_WRAP_REPEAT:
  68.    case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
  69.    case PIPE_TEX_WRAP_MIRROR_REPEAT:
  70.    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
  71.       return FALSE;
  72.    case PIPE_TEX_WRAP_CLAMP:
  73.    case PIPE_TEX_WRAP_MIRROR_CLAMP:
  74.       if (min_img_filter == PIPE_TEX_FILTER_NEAREST &&
  75.           mag_img_filter == PIPE_TEX_FILTER_NEAREST) {
  76.          return FALSE;
  77.       } else {
  78.          return TRUE;
  79.       }
  80.    case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
  81.    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
  82.       return TRUE;
  83.    default:
  84.       assert(0 && "unexpected wrap mode");
  85.       return FALSE;
  86.    }
  87. }
  88.  
  89.  
  90. /**
  91.  * Initialize lp_sampler_static_texture_state object with the gallium
  92.  * texture/sampler_view state (this contains the parts which are
  93.  * considered static).
  94.  */
  95. void
  96. lp_sampler_static_texture_state(struct lp_static_texture_state *state,
  97.                                 const struct pipe_sampler_view *view)
  98. {
  99.    const struct pipe_resource *texture;
  100.  
  101.    memset(state, 0, sizeof *state);
  102.  
  103.    if (!view || !view->texture)
  104.       return;
  105.  
  106.    texture = view->texture;
  107.  
  108.    state->format            = view->format;
  109.    state->swizzle_r         = view->swizzle_r;
  110.    state->swizzle_g         = view->swizzle_g;
  111.    state->swizzle_b         = view->swizzle_b;
  112.    state->swizzle_a         = view->swizzle_a;
  113.  
  114.    state->target            = texture->target;
  115.    state->pot_width         = util_is_power_of_two(texture->width0);
  116.    state->pot_height        = util_is_power_of_two(texture->height0);
  117.    state->pot_depth         = util_is_power_of_two(texture->depth0);
  118.    state->level_zero_only   = !view->u.tex.last_level;
  119.  
  120.    /*
  121.     * the layer / element / level parameters are all either dynamic
  122.     * state or handled transparently wrt execution.
  123.     */
  124. }
  125.  
  126.  
  127. /**
  128.  * Initialize lp_sampler_static_sampler_state object with the gallium sampler
  129.  * state (this contains the parts which are considered static).
  130.  */
  131. void
  132. lp_sampler_static_sampler_state(struct lp_static_sampler_state *state,
  133.                                 const struct pipe_sampler_state *sampler)
  134. {
  135.    memset(state, 0, sizeof *state);
  136.  
  137.    if (!sampler)
  138.       return;
  139.  
  140.    /*
  141.     * We don't copy sampler state over unless it is actually enabled, to avoid
  142.     * spurious recompiles, as the sampler static state is part of the shader
  143.     * key.
  144.     *
  145.     * Ideally the state tracker or cso_cache module would make all state
  146.     * canonical, but until that happens it's better to be safe than sorry here.
  147.     *
  148.     * XXX: Actually there's much more than can be done here, especially
  149.     * regarding 1D/2D/3D/CUBE textures, wrap modes, etc.
  150.     */
  151.  
  152.    state->wrap_s            = sampler->wrap_s;
  153.    state->wrap_t            = sampler->wrap_t;
  154.    state->wrap_r            = sampler->wrap_r;
  155.    state->min_img_filter    = sampler->min_img_filter;
  156.    state->mag_img_filter    = sampler->mag_img_filter;
  157.  
  158.    if (sampler->max_lod > 0.0f) {
  159.       state->min_mip_filter = sampler->min_mip_filter;
  160.    } else {
  161.       state->min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
  162.    }
  163.  
  164.    if (state->min_mip_filter != PIPE_TEX_MIPFILTER_NONE) {
  165.       if (sampler->lod_bias != 0.0f) {
  166.          state->lod_bias_non_zero = 1;
  167.       }
  168.  
  169.       /* If min_lod == max_lod we can greatly simplify mipmap selection.
  170.        * This is a case that occurs during automatic mipmap generation.
  171.        */
  172.       if (sampler->min_lod == sampler->max_lod) {
  173.          state->min_max_lod_equal = 1;
  174.       } else {
  175.          if (sampler->min_lod > 0.0f) {
  176.             state->apply_min_lod = 1;
  177.          }
  178.  
  179.          /*
  180.           * XXX this won't do anything with the mesa state tracker which always
  181.           * sets max_lod to not more than actually present mip maps...
  182.           */
  183.          if (sampler->max_lod < (PIPE_MAX_TEXTURE_LEVELS - 1)) {
  184.             state->apply_max_lod = 1;
  185.          }
  186.       }
  187.    }
  188.  
  189.    state->compare_mode      = sampler->compare_mode;
  190.    if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE) {
  191.       state->compare_func   = sampler->compare_func;
  192.    }
  193.  
  194.    state->normalized_coords = sampler->normalized_coords;
  195. }
  196.  
  197.  
  198. /**
  199.  * Generate code to compute coordinate gradient (rho).
  200.  * \param derivs  partial derivatives of (s, t, r, q) with respect to X and Y
  201.  *
  202.  * The resulting rho is scalar per quad.
  203.  */
  204. static LLVMValueRef
  205. lp_build_rho(struct lp_build_sample_context *bld,
  206.              unsigned texture_unit,
  207.              LLVMValueRef s,
  208.              LLVMValueRef t,
  209.              LLVMValueRef r,
  210.              LLVMValueRef cube_rho,
  211.              const struct lp_derivatives *derivs)
  212. {
  213.    struct gallivm_state *gallivm = bld->gallivm;
  214.    struct lp_build_context *int_size_bld = &bld->int_size_in_bld;
  215.    struct lp_build_context *float_size_bld = &bld->float_size_in_bld;
  216.    struct lp_build_context *float_bld = &bld->float_bld;
  217.    struct lp_build_context *coord_bld = &bld->coord_bld;
  218.    struct lp_build_context *levelf_bld = &bld->levelf_bld;
  219.    const unsigned dims = bld->dims;
  220.    LLVMValueRef ddx_ddy[2];
  221.    LLVMBuilderRef builder = bld->gallivm->builder;
  222.    LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
  223.    LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
  224.    LLVMValueRef index1 = LLVMConstInt(i32t, 1, 0);
  225.    LLVMValueRef index2 = LLVMConstInt(i32t, 2, 0);
  226.    LLVMValueRef rho_vec;
  227.    LLVMValueRef int_size, float_size;
  228.    LLVMValueRef rho;
  229.    LLVMValueRef first_level, first_level_vec;
  230.    unsigned length = coord_bld->type.length;
  231.    unsigned num_quads = length / 4;
  232.    unsigned i;
  233.    LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
  234.    LLVMValueRef rho_xvec, rho_yvec;
  235.  
  236.    /* Note that all simplified calculations will only work for isotropic filtering */
  237.  
  238.    assert(bld->num_lods != length);
  239.  
  240.    first_level = bld->dynamic_state->first_level(bld->dynamic_state,
  241.                                                  bld->gallivm, texture_unit);
  242.    first_level_vec = lp_build_broadcast_scalar(int_size_bld, first_level);
  243.    int_size = lp_build_minify(int_size_bld, bld->int_size, first_level_vec);
  244.    float_size = lp_build_int_to_float(float_size_bld, int_size);
  245.  
  246.    if (cube_rho) {
  247.       LLVMValueRef cubesize;
  248.       LLVMValueRef index0 = lp_build_const_int32(gallivm, 0);
  249.       /*
  250.        * Cube map code did already everything except size mul and per-quad extraction.
  251.        */
  252.       rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
  253.                                       levelf_bld->type, cube_rho, 0);
  254.       if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
  255.          rho = lp_build_sqrt(levelf_bld, rho);
  256.       }
  257.       /* Could optimize this for single quad just skip the broadcast */
  258.       cubesize = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
  259.                                             levelf_bld->type, float_size, index0);
  260.       rho = lp_build_mul(levelf_bld, cubesize, rho);
  261.    }
  262.    else if (derivs && !(bld->static_texture_state->target == PIPE_TEXTURE_CUBE)) {
  263.       LLVMValueRef ddmax[3], ddx[3], ddy[3];
  264.       for (i = 0; i < dims; i++) {
  265.          LLVMValueRef floatdim;
  266.          LLVMValueRef indexi = lp_build_const_int32(gallivm, i);
  267.  
  268.          floatdim = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
  269.                                                coord_bld->type, float_size, indexi);
  270.  
  271.          if ((gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1)) {
  272.             ddx[i] = lp_build_mul(coord_bld, floatdim, derivs->ddx[i]);
  273.             ddy[i] = lp_build_mul(coord_bld, floatdim, derivs->ddy[i]);
  274.             ddx[i] = lp_build_mul(coord_bld, ddx[i], ddx[i]);
  275.             ddy[i] = lp_build_mul(coord_bld, ddy[i], ddy[i]);
  276.          }
  277.          else {
  278.             LLVMValueRef tmpx, tmpy;
  279.             tmpx = lp_build_abs(coord_bld, derivs->ddx[i]);
  280.             tmpy = lp_build_abs(coord_bld, derivs->ddy[i]);
  281.             ddmax[i] = lp_build_max(coord_bld, tmpx, tmpy);
  282.             ddmax[i] = lp_build_mul(coord_bld, floatdim, ddmax[i]);
  283.          }
  284.       }
  285.       if ((gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1)) {
  286.          rho_xvec = lp_build_add(coord_bld, ddx[0], ddx[1]);
  287.          rho_yvec = lp_build_add(coord_bld, ddy[0], ddy[1]);
  288.          if (dims > 2) {
  289.             rho_xvec = lp_build_add(coord_bld, rho_xvec, ddx[2]);
  290.             rho_yvec = lp_build_add(coord_bld, rho_yvec, ddy[2]);
  291.          }
  292.          rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec);
  293.          rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
  294.                                          levelf_bld->type, rho_vec, 0);
  295.          /*
  296.           * note that as long as we don't care about per-pixel lod could reduce math
  297.           * more (at some shuffle cost), but for now only do sqrt after packing.
  298.           */
  299.          rho = lp_build_sqrt(levelf_bld, rho);
  300.       }
  301.       else {
  302.          rho_vec = ddmax[0];
  303.          if (dims > 1) {
  304.             rho_vec = lp_build_max(coord_bld, rho_vec, ddmax[1]);
  305.             if (dims > 2) {
  306.                rho_vec = lp_build_max(coord_bld, rho_vec, ddmax[2]);
  307.             }
  308.          }
  309.          /*
  310.           * rho_vec now still contains per-pixel rho, convert to scalar per quad
  311.           * since we can't handle per-pixel rho/lod from now on (TODO).
  312.           */
  313.          rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
  314.                                          levelf_bld->type, rho_vec, 0);
  315.       }
  316.    }
  317.    else {
  318.       /*
  319.        * This looks all a bit complex, but it's not that bad
  320.        * (the shuffle code makes it look worse than it is).
  321.        * Still, might not be ideal for all cases.
  322.        */
  323.       static const unsigned char swizzle0[] = { /* no-op swizzle */
  324.          0, LP_BLD_SWIZZLE_DONTCARE,
  325.          LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
  326.       };
  327.       static const unsigned char swizzle1[] = {
  328.          1, LP_BLD_SWIZZLE_DONTCARE,
  329.          LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
  330.       };
  331.       static const unsigned char swizzle2[] = {
  332.          2, LP_BLD_SWIZZLE_DONTCARE,
  333.          LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
  334.       };
  335.  
  336.       if (dims < 2) {
  337.          ddx_ddy[0] = lp_build_packed_ddx_ddy_onecoord(coord_bld, s);
  338.       }
  339.       else if (dims >= 2) {
  340.          ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(coord_bld, s, t);
  341.          if (dims > 2) {
  342.             ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(coord_bld, r);
  343.          }
  344.       }
  345.  
  346.       if ((gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1)) {
  347.          static const unsigned char swizzle01[] = { /* no-op swizzle */
  348.             0, 1,
  349.             LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
  350.          };
  351.          static const unsigned char swizzle23[] = {
  352.             2, 3,
  353.             LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
  354.          };
  355.          LLVMValueRef ddx_ddys, ddx_ddyt, floatdim, shuffles[LP_MAX_VECTOR_LENGTH / 4];
  356.  
  357.          for (i = 0; i < num_quads; i++) {
  358.             shuffles[i*4+0] = shuffles[i*4+1] = index0;
  359.             shuffles[i*4+2] = shuffles[i*4+3] = index1;
  360.          }
  361.          floatdim = LLVMBuildShuffleVector(builder, float_size, float_size,
  362.                                            LLVMConstVector(shuffles, length), "");
  363.          ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], floatdim);
  364.          ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], ddx_ddy[0]);
  365.          ddx_ddys = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle01);
  366.          ddx_ddyt = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle23);
  367.          rho_vec = lp_build_add(coord_bld, ddx_ddys, ddx_ddyt);
  368.  
  369.          if (dims > 2) {
  370.             static const unsigned char swizzle02[] = {
  371.                0, 2,
  372.                LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
  373.             };
  374.             floatdim = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
  375.                                                   coord_bld->type, float_size, index2);
  376.             ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], floatdim);
  377.             ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], ddx_ddy[1]);
  378.             ddx_ddy[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle02);
  379.             rho_vec = lp_build_add(coord_bld, rho_vec, ddx_ddy[1]);
  380.          }
  381.          rho_xvec = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
  382.          rho_yvec = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
  383.          rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec);
  384.  
  385.          rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
  386.                                          levelf_bld->type, rho_vec, 0);
  387.          rho = lp_build_sqrt(levelf_bld, rho);
  388.       }
  389.       else {
  390.          ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
  391.          if (dims > 2) {
  392.             ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]);
  393.          }
  394.  
  395.          if (dims < 2) {
  396.             rho_xvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle0);
  397.             rho_yvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle1);
  398.          }
  399.          else if (dims == 2) {
  400.             static const unsigned char swizzle02[] = {
  401.                0, 2,
  402.                LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
  403.             };
  404.             static const unsigned char swizzle13[] = {
  405.                1, 3,
  406.                LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
  407.             };
  408.             rho_xvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle02);
  409.             rho_yvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle13);
  410.          }
  411.          else {
  412.             LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH];
  413.             LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH];
  414.             assert(dims == 3);
  415.             for (i = 0; i < num_quads; i++) {
  416.                shuffles1[4*i + 0] = lp_build_const_int32(gallivm, 4*i);
  417.                shuffles1[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 2);
  418.                shuffles1[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i);
  419.                shuffles1[4*i + 3] = i32undef;
  420.                shuffles2[4*i + 0] = lp_build_const_int32(gallivm, 4*i + 1);
  421.                shuffles2[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 3);
  422.                shuffles2[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i + 2);
  423.                shuffles2[4*i + 3] = i32undef;
  424.             }
  425.             rho_xvec = LLVMBuildShuffleVector(builder, ddx_ddy[0], ddx_ddy[1],
  426.                                               LLVMConstVector(shuffles1, length), "");
  427.             rho_yvec = LLVMBuildShuffleVector(builder, ddx_ddy[0], ddx_ddy[1],
  428.                                               LLVMConstVector(shuffles2, length), "");
  429.          }
  430.  
  431.          rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec);
  432.  
  433.          if (bld->coord_type.length > 4) {
  434.             /* expand size to each quad */
  435.             if (dims > 1) {
  436.                /* could use some broadcast_vector helper for this? */
  437.                LLVMValueRef src[LP_MAX_VECTOR_LENGTH/4];
  438.                for (i = 0; i < num_quads; i++) {
  439.                   src[i] = float_size;
  440.                }
  441.                float_size = lp_build_concat(bld->gallivm, src, float_size_bld->type, num_quads);
  442.             }
  443.             else {
  444.                float_size = lp_build_broadcast_scalar(coord_bld, float_size);
  445.             }
  446.             rho_vec = lp_build_mul(coord_bld, rho_vec, float_size);
  447.  
  448.             if (dims <= 1) {
  449.                rho = rho_vec;
  450.             }
  451.             else {
  452.                if (dims >= 2) {
  453.                   LLVMValueRef rho_s, rho_t, rho_r;
  454.  
  455.                   rho_s = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
  456.                   rho_t = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
  457.  
  458.                   rho = lp_build_max(coord_bld, rho_s, rho_t);
  459.  
  460.                   if (dims >= 3) {
  461.                      rho_r = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle2);
  462.                      rho = lp_build_max(coord_bld, rho, rho_r);
  463.                   }
  464.                }
  465.             }
  466.             rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
  467.                                             levelf_bld->type, rho, 0);
  468.          }
  469.          else {
  470.             if (dims <= 1) {
  471.                rho_vec = LLVMBuildExtractElement(builder, rho_vec, index0, "");
  472.             }
  473.             rho_vec = lp_build_mul(float_size_bld, rho_vec, float_size);
  474.  
  475.             if (dims <= 1) {
  476.                rho = rho_vec;
  477.             }
  478.             else {
  479.                if (dims >= 2) {
  480.                   LLVMValueRef rho_s, rho_t, rho_r;
  481.  
  482.                   rho_s = LLVMBuildExtractElement(builder, rho_vec, index0, "");
  483.                   rho_t = LLVMBuildExtractElement(builder, rho_vec, index1, "");
  484.  
  485.                   rho = lp_build_max(float_bld, rho_s, rho_t);
  486.  
  487.                   if (dims >= 3) {
  488.                      rho_r = LLVMBuildExtractElement(builder, rho_vec, index2, "");
  489.                      rho = lp_build_max(float_bld, rho, rho_r);
  490.                   }
  491.                }
  492.             }
  493.          }
  494.       }
  495.    }
  496.  
  497.    return rho;
  498. }
  499.  
  500.  
  501. /*
  502.  * Bri-linear lod computation
  503.  *
  504.  * Use a piece-wise linear approximation of log2 such that:
  505.  * - round to nearest, for values in the neighborhood of -1, 0, 1, 2, etc.
  506.  * - linear approximation for values in the neighborhood of 0.5, 1.5., etc,
  507.  *   with the steepness specified in 'factor'
  508.  * - exact result for 0.5, 1.5, etc.
  509.  *
  510.  *
  511.  *   1.0 -              /----*
  512.  *                     /
  513.  *                    /
  514.  *                   /
  515.  *   0.5 -          *
  516.  *                 /
  517.  *                /
  518.  *               /
  519.  *   0.0 - *----/
  520.  *
  521.  *         |                 |
  522.  *        2^0               2^1
  523.  *
  524.  * This is a technique also commonly used in hardware:
  525.  * - http://ixbtlabs.com/articles2/gffx/nv40-rx800-3.html
  526.  *
  527.  * TODO: For correctness, this should only be applied when texture is known to
  528.  * have regular mipmaps, i.e., mipmaps derived from the base level.
  529.  *
  530.  * TODO: This could be done in fixed point, where applicable.
  531.  */
  532. static void
  533. lp_build_brilinear_lod(struct lp_build_context *bld,
  534.                        LLVMValueRef lod,
  535.                        double factor,
  536.                        LLVMValueRef *out_lod_ipart,
  537.                        LLVMValueRef *out_lod_fpart)
  538. {
  539.    LLVMValueRef lod_fpart;
  540.    double pre_offset = (factor - 0.5)/factor - 0.5;
  541.    double post_offset = 1 - factor;
  542.  
  543.    if (0) {
  544.       lp_build_printf(bld->gallivm, "lod = %f\n", lod);
  545.    }
  546.  
  547.    lod = lp_build_add(bld, lod,
  548.                       lp_build_const_vec(bld->gallivm, bld->type, pre_offset));
  549.  
  550.    lp_build_ifloor_fract(bld, lod, out_lod_ipart, &lod_fpart);
  551.  
  552.    lod_fpart = lp_build_mul(bld, lod_fpart,
  553.                             lp_build_const_vec(bld->gallivm, bld->type, factor));
  554.  
  555.    lod_fpart = lp_build_add(bld, lod_fpart,
  556.                             lp_build_const_vec(bld->gallivm, bld->type, post_offset));
  557.  
  558.    /*
  559.     * It's not necessary to clamp lod_fpart since:
  560.     * - the above expression will never produce numbers greater than one.
  561.     * - the mip filtering branch is only taken if lod_fpart is positive
  562.     */
  563.  
  564.    *out_lod_fpart = lod_fpart;
  565.  
  566.    if (0) {
  567.       lp_build_printf(bld->gallivm, "lod_ipart = %i\n", *out_lod_ipart);
  568.       lp_build_printf(bld->gallivm, "lod_fpart = %f\n\n", *out_lod_fpart);
  569.    }
  570. }
  571.  
  572.  
  573. /*
  574.  * Combined log2 and brilinear lod computation.
  575.  *
  576.  * It's in all identical to calling lp_build_fast_log2() and
  577.  * lp_build_brilinear_lod() above, but by combining we can compute the integer
  578.  * and fractional part independently.
  579.  */
  580. static void
  581. lp_build_brilinear_rho(struct lp_build_context *bld,
  582.                        LLVMValueRef rho,
  583.                        double factor,
  584.                        LLVMValueRef *out_lod_ipart,
  585.                        LLVMValueRef *out_lod_fpart)
  586. {
  587.    LLVMValueRef lod_ipart;
  588.    LLVMValueRef lod_fpart;
  589.  
  590.    const double pre_factor = (2*factor - 0.5)/(M_SQRT2*factor);
  591.    const double post_offset = 1 - 2*factor;
  592.  
  593.    assert(bld->type.floating);
  594.  
  595.    assert(lp_check_value(bld->type, rho));
  596.  
  597.    /*
  598.     * The pre factor will make the intersections with the exact powers of two
  599.     * happen precisely where we want then to be, which means that the integer
  600.     * part will not need any post adjustments.
  601.     */
  602.    rho = lp_build_mul(bld, rho,
  603.                       lp_build_const_vec(bld->gallivm, bld->type, pre_factor));
  604.  
  605.    /* ipart = ifloor(log2(rho)) */
  606.    lod_ipart = lp_build_extract_exponent(bld, rho, 0);
  607.  
  608.    /* fpart = rho / 2**ipart */
  609.    lod_fpart = lp_build_extract_mantissa(bld, rho);
  610.  
  611.    lod_fpart = lp_build_mul(bld, lod_fpart,
  612.                             lp_build_const_vec(bld->gallivm, bld->type, factor));
  613.  
  614.    lod_fpart = lp_build_add(bld, lod_fpart,
  615.                             lp_build_const_vec(bld->gallivm, bld->type, post_offset));
  616.  
  617.    /*
  618.     * Like lp_build_brilinear_lod, it's not necessary to clamp lod_fpart since:
  619.     * - the above expression will never produce numbers greater than one.
  620.     * - the mip filtering branch is only taken if lod_fpart is positive
  621.     */
  622.  
  623.    *out_lod_ipart = lod_ipart;
  624.    *out_lod_fpart = lod_fpart;
  625. }
  626.  
  627.  
  628. /**
  629.  * Generate code to compute texture level of detail (lambda).
  630.  * \param derivs  partial derivatives of (s, t, r, q) with respect to X and Y
  631.  * \param lod_bias  optional float vector with the shader lod bias
  632.  * \param explicit_lod  optional float vector with the explicit lod
  633.  * \param width  scalar int texture width
  634.  * \param height  scalar int texture height
  635.  * \param depth  scalar int texture depth
  636.  *
  637.  * The resulting lod is scalar per quad, so only the first value per quad
  638.  * passed in from lod_bias, explicit_lod is used.
  639.  */
  640. void
  641. lp_build_lod_selector(struct lp_build_sample_context *bld,
  642.                       unsigned texture_unit,
  643.                       unsigned sampler_unit,
  644.                       LLVMValueRef s,
  645.                       LLVMValueRef t,
  646.                       LLVMValueRef r,
  647.                       LLVMValueRef cube_rho,
  648.                       const struct lp_derivatives *derivs,
  649.                       LLVMValueRef lod_bias, /* optional */
  650.                       LLVMValueRef explicit_lod, /* optional */
  651.                       unsigned mip_filter,
  652.                       LLVMValueRef *out_lod_ipart,
  653.                       LLVMValueRef *out_lod_fpart)
  654.  
  655. {
  656.    LLVMBuilderRef builder = bld->gallivm->builder;
  657.    struct lp_build_context *levelf_bld = &bld->levelf_bld;
  658.    LLVMValueRef lod;
  659.  
  660.    *out_lod_ipart = bld->leveli_bld.zero;
  661.    *out_lod_fpart = levelf_bld->zero;
  662.  
  663.    if (bld->static_sampler_state->min_max_lod_equal) {
  664.       /* User is forcing sampling from a particular mipmap level.
  665.        * This is hit during mipmap generation.
  666.        */
  667.       LLVMValueRef min_lod =
  668.          bld->dynamic_state->min_lod(bld->dynamic_state,
  669.                                      bld->gallivm, sampler_unit);
  670.  
  671.       lod = lp_build_broadcast_scalar(levelf_bld, min_lod);
  672.    }
  673.    else {
  674.       if (explicit_lod) {
  675.          if (bld->num_lods != bld->coord_type.length)
  676.             lod = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
  677.                                             levelf_bld->type, explicit_lod, 0);
  678.          else
  679.             lod = explicit_lod;
  680.       }
  681.       else {
  682.          LLVMValueRef rho;
  683.  
  684.          rho = lp_build_rho(bld, texture_unit, s, t, r, cube_rho, derivs);
  685.  
  686.          /*
  687.           * Compute lod = log2(rho)
  688.           */
  689.  
  690.          if (!lod_bias &&
  691.              !bld->static_sampler_state->lod_bias_non_zero &&
  692.              !bld->static_sampler_state->apply_max_lod &&
  693.              !bld->static_sampler_state->apply_min_lod) {
  694.             /*
  695.              * Special case when there are no post-log2 adjustments, which
  696.              * saves instructions but keeping the integer and fractional lod
  697.              * computations separate from the start.
  698.              */
  699.  
  700.             if (mip_filter == PIPE_TEX_MIPFILTER_NONE ||
  701.                 mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
  702.                *out_lod_ipart = lp_build_ilog2(levelf_bld, rho);
  703.                *out_lod_fpart = levelf_bld->zero;
  704.                return;
  705.             }
  706.             if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR &&
  707.                 !(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
  708.                lp_build_brilinear_rho(levelf_bld, rho, BRILINEAR_FACTOR,
  709.                                       out_lod_ipart, out_lod_fpart);
  710.                return;
  711.             }
  712.          }
  713.  
  714.          if (0) {
  715.             lod = lp_build_log2(levelf_bld, rho);
  716.          }
  717.          else {
  718.             lod = lp_build_fast_log2(levelf_bld, rho);
  719.          }
  720.  
  721.          /* add shader lod bias */
  722.          if (lod_bias) {
  723.             lod_bias = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
  724.                   levelf_bld->type, lod_bias, 0);
  725.             lod = LLVMBuildFAdd(builder, lod, lod_bias, "shader_lod_bias");
  726.          }
  727.       }
  728.  
  729.       /* add sampler lod bias */
  730.       if (bld->static_sampler_state->lod_bias_non_zero) {
  731.          LLVMValueRef sampler_lod_bias =
  732.             bld->dynamic_state->lod_bias(bld->dynamic_state,
  733.                                          bld->gallivm, sampler_unit);
  734.          sampler_lod_bias = lp_build_broadcast_scalar(levelf_bld,
  735.                                                       sampler_lod_bias);
  736.          lod = LLVMBuildFAdd(builder, lod, sampler_lod_bias, "sampler_lod_bias");
  737.       }
  738.  
  739.       /* clamp lod */
  740.       if (bld->static_sampler_state->apply_max_lod) {
  741.          LLVMValueRef max_lod =
  742.             bld->dynamic_state->max_lod(bld->dynamic_state,
  743.                                         bld->gallivm, sampler_unit);
  744.          max_lod = lp_build_broadcast_scalar(levelf_bld, max_lod);
  745.  
  746.          lod = lp_build_min(levelf_bld, lod, max_lod);
  747.       }
  748.       if (bld->static_sampler_state->apply_min_lod) {
  749.          LLVMValueRef min_lod =
  750.             bld->dynamic_state->min_lod(bld->dynamic_state,
  751.                                         bld->gallivm, sampler_unit);
  752.          min_lod = lp_build_broadcast_scalar(levelf_bld, min_lod);
  753.  
  754.          lod = lp_build_max(levelf_bld, lod, min_lod);
  755.       }
  756.    }
  757.  
  758.    if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
  759.       if (!(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
  760.          lp_build_brilinear_lod(levelf_bld, lod, BRILINEAR_FACTOR,
  761.                                 out_lod_ipart, out_lod_fpart);
  762.       }
  763.       else {
  764.          lp_build_ifloor_fract(levelf_bld, lod, out_lod_ipart, out_lod_fpart);
  765.       }
  766.  
  767.       lp_build_name(*out_lod_fpart, "lod_fpart");
  768.    }
  769.    else {
  770.       *out_lod_ipart = lp_build_iround(levelf_bld, lod);
  771.    }
  772.  
  773.    lp_build_name(*out_lod_ipart, "lod_ipart");
  774.  
  775.    return;
  776. }
  777.  
  778.  
  779. /**
  780.  * For PIPE_TEX_MIPFILTER_NEAREST, convert float LOD to integer
  781.  * mipmap level index.
  782.  * Note: this is all scalar per quad code.
  783.  * \param lod_ipart  int texture level of detail
  784.  * \param level_out  returns integer
  785.  */
  786. void
  787. lp_build_nearest_mip_level(struct lp_build_sample_context *bld,
  788.                            unsigned texture_unit,
  789.                            LLVMValueRef lod_ipart,
  790.                            LLVMValueRef *level_out)
  791. {
  792.    struct lp_build_context *leveli_bld = &bld->leveli_bld;
  793.    LLVMValueRef first_level, last_level, level;
  794.  
  795.    first_level = bld->dynamic_state->first_level(bld->dynamic_state,
  796.                                                  bld->gallivm, texture_unit);
  797.    last_level = bld->dynamic_state->last_level(bld->dynamic_state,
  798.                                                bld->gallivm, texture_unit);
  799.    first_level = lp_build_broadcast_scalar(leveli_bld, first_level);
  800.    last_level = lp_build_broadcast_scalar(leveli_bld, last_level);
  801.  
  802.    level = lp_build_add(leveli_bld, lod_ipart, first_level);
  803.  
  804.    /* clamp level to legal range of levels */
  805.    *level_out = lp_build_clamp(leveli_bld, level, first_level, last_level);
  806. }
  807.  
  808.  
  809. /**
  810.  * For PIPE_TEX_MIPFILTER_LINEAR, convert per-quad int LOD(s) to two (per-quad)
  811.  * (adjacent) mipmap level indexes, and fix up float lod part accordingly.
  812.  * Later, we'll sample from those two mipmap levels and interpolate between them.
  813.  */
  814. void
  815. lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
  816.                            unsigned texture_unit,
  817.                            LLVMValueRef lod_ipart,
  818.                            LLVMValueRef *lod_fpart_inout,
  819.                            LLVMValueRef *level0_out,
  820.                            LLVMValueRef *level1_out)
  821. {
  822.    LLVMBuilderRef builder = bld->gallivm->builder;
  823.    struct lp_build_context *leveli_bld = &bld->leveli_bld;
  824.    struct lp_build_context *levelf_bld = &bld->levelf_bld;
  825.    LLVMValueRef first_level, last_level;
  826.    LLVMValueRef clamp_min;
  827.    LLVMValueRef clamp_max;
  828.  
  829.    first_level = bld->dynamic_state->first_level(bld->dynamic_state,
  830.                                                  bld->gallivm, texture_unit);
  831.    last_level = bld->dynamic_state->last_level(bld->dynamic_state,
  832.                                                bld->gallivm, texture_unit);
  833.    first_level = lp_build_broadcast_scalar(leveli_bld, first_level);
  834.    last_level = lp_build_broadcast_scalar(leveli_bld, last_level);
  835.  
  836.    *level0_out = lp_build_add(leveli_bld, lod_ipart, first_level);
  837.    *level1_out = lp_build_add(leveli_bld, *level0_out, leveli_bld->one);
  838.  
  839.    /*
  840.     * Clamp both *level0_out and *level1_out to [first_level, last_level], with
  841.     * the minimum number of comparisons, and zeroing lod_fpart in the extreme
  842.     * ends in the process.
  843.     */
  844.  
  845.    /*
  846.     * This code (vector select in particular) only works with llvm 3.1
  847.     * (if there's more than one quad, with x86 backend). Might consider
  848.     * converting to our lp_bld_logic helpers.
  849.     */
  850. #if HAVE_LLVM < 0x0301
  851.    assert(leveli_bld->type.length == 1);
  852. #endif
  853.  
  854.    /* *level0_out < first_level */
  855.    clamp_min = LLVMBuildICmp(builder, LLVMIntSLT,
  856.                              *level0_out, first_level,
  857.                              "clamp_lod_to_first");
  858.  
  859.    *level0_out = LLVMBuildSelect(builder, clamp_min,
  860.                                  first_level, *level0_out, "");
  861.  
  862.    *level1_out = LLVMBuildSelect(builder, clamp_min,
  863.                                  first_level, *level1_out, "");
  864.  
  865.    *lod_fpart_inout = LLVMBuildSelect(builder, clamp_min,
  866.                                       levelf_bld->zero, *lod_fpart_inout, "");
  867.  
  868.    /* *level0_out >= last_level */
  869.    clamp_max = LLVMBuildICmp(builder, LLVMIntSGE,
  870.                              *level0_out, last_level,
  871.                              "clamp_lod_to_last");
  872.  
  873.    *level0_out = LLVMBuildSelect(builder, clamp_max,
  874.                                  last_level, *level0_out, "");
  875.  
  876.    *level1_out = LLVMBuildSelect(builder, clamp_max,
  877.                                  last_level, *level1_out, "");
  878.  
  879.    *lod_fpart_inout = LLVMBuildSelect(builder, clamp_max,
  880.                                       levelf_bld->zero, *lod_fpart_inout, "");
  881.  
  882.    lp_build_name(*level0_out, "texture%u_miplevel0", texture_unit);
  883.    lp_build_name(*level1_out, "texture%u_miplevel1", texture_unit);
  884.    lp_build_name(*lod_fpart_inout, "texture%u_mipweight", texture_unit);
  885. }
  886.  
  887.  
  888. /**
  889.  * Return pointer to a single mipmap level.
  890.  * \param level  integer mipmap level
  891.  */
  892. LLVMValueRef
  893. lp_build_get_mipmap_level(struct lp_build_sample_context *bld,
  894.                           LLVMValueRef level)
  895. {
  896.    LLVMBuilderRef builder = bld->gallivm->builder;
  897.    LLVMValueRef indexes[2], data_ptr, mip_offset;
  898.  
  899.    indexes[0] = lp_build_const_int32(bld->gallivm, 0);
  900.    indexes[1] = level;
  901.    mip_offset = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
  902.    mip_offset = LLVMBuildLoad(builder, mip_offset, "");
  903.    data_ptr = LLVMBuildGEP(builder, bld->base_ptr, &mip_offset, 1, "");
  904.    return data_ptr;
  905. }
  906.  
  907. /**
  908.  * Return (per-pixel) offsets to mip levels.
  909.  * \param level  integer mipmap level
  910.  */
  911. LLVMValueRef
  912. lp_build_get_mip_offsets(struct lp_build_sample_context *bld,
  913.                          LLVMValueRef level)
  914. {
  915.    LLVMBuilderRef builder = bld->gallivm->builder;
  916.    LLVMValueRef indexes[2], offsets, offset1;
  917.  
  918.    indexes[0] = lp_build_const_int32(bld->gallivm, 0);
  919.    if (bld->num_lods == 1) {
  920.       indexes[1] = level;
  921.       offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
  922.       offset1 = LLVMBuildLoad(builder, offset1, "");
  923.       offsets = lp_build_broadcast_scalar(&bld->int_coord_bld, offset1);
  924.    }
  925.    else if (bld->num_lods == bld->coord_bld.type.length / 4) {
  926.       unsigned i;
  927.  
  928.       offsets = bld->int_coord_bld.undef;
  929.       for (i = 0; i < bld->num_lods; i++) {
  930.          LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
  931.          LLVMValueRef indexo = lp_build_const_int32(bld->gallivm, 4 * i);
  932.          indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
  933.          offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
  934.          offset1 = LLVMBuildLoad(builder, offset1, "");
  935.          offsets = LLVMBuildInsertElement(builder, offsets, offset1, indexo, "");
  936.       }
  937.       offsets = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, offsets, 0, 4);
  938.    }
  939.    else {
  940.       unsigned i;
  941.  
  942.       assert (bld->num_lods == bld->coord_bld.type.length);
  943.  
  944.       offsets = bld->int_coord_bld.undef;
  945.       for (i = 0; i < bld->num_lods; i++) {
  946.          LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
  947.          indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
  948.          offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
  949.          offset1 = LLVMBuildLoad(builder, offset1, "");
  950.          offsets = LLVMBuildInsertElement(builder, offsets, offset1, indexi, "");
  951.       }
  952.    }
  953.    return offsets;
  954. }
  955.  
  956.  
  957. /**
  958.  * Codegen equivalent for u_minify().
  959.  * Return max(1, base_size >> level);
  960.  */
  961. LLVMValueRef
  962. lp_build_minify(struct lp_build_context *bld,
  963.                 LLVMValueRef base_size,
  964.                 LLVMValueRef level)
  965. {
  966.    LLVMBuilderRef builder = bld->gallivm->builder;
  967.    assert(lp_check_value(bld->type, base_size));
  968.    assert(lp_check_value(bld->type, level));
  969.  
  970.    if (level == bld->zero) {
  971.       /* if we're using mipmap level zero, no minification is needed */
  972.       return base_size;
  973.    }
  974.    else {
  975.       LLVMValueRef size =
  976.          LLVMBuildLShr(builder, base_size, level, "minify");
  977.       assert(bld->type.sign);
  978.       size = lp_build_max(bld, size, bld->one);
  979.       return size;
  980.    }
  981. }
  982.  
  983.  
  984. /**
  985.  * Dereference stride_array[mipmap_level] array to get a stride.
  986.  * Return stride as a vector.
  987.  */
  988. static LLVMValueRef
  989. lp_build_get_level_stride_vec(struct lp_build_sample_context *bld,
  990.                               LLVMValueRef stride_array, LLVMValueRef level)
  991. {
  992.    LLVMBuilderRef builder = bld->gallivm->builder;
  993.    LLVMValueRef indexes[2], stride, stride1;
  994.    indexes[0] = lp_build_const_int32(bld->gallivm, 0);
  995.    if (bld->num_lods == 1) {
  996.       indexes[1] = level;
  997.       stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
  998.       stride1 = LLVMBuildLoad(builder, stride1, "");
  999.       stride = lp_build_broadcast_scalar(&bld->int_coord_bld, stride1);
  1000.    }
  1001.    else if (bld->num_lods == bld->coord_bld.type.length / 4) {
  1002.       LLVMValueRef stride1;
  1003.       unsigned i;
  1004.  
  1005.       stride = bld->int_coord_bld.undef;
  1006.       for (i = 0; i < bld->num_lods; i++) {
  1007.          LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
  1008.          LLVMValueRef indexo = lp_build_const_int32(bld->gallivm, 4 * i);
  1009.          indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
  1010.          stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
  1011.          stride1 = LLVMBuildLoad(builder, stride1, "");
  1012.          stride = LLVMBuildInsertElement(builder, stride, stride1, indexo, "");
  1013.       }
  1014.       stride = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, stride, 0, 4);
  1015.    }
  1016.    else {
  1017.       LLVMValueRef stride1;
  1018.       unsigned i;
  1019.  
  1020.       assert (bld->num_lods == bld->coord_bld.type.length);
  1021.  
  1022.       stride = bld->int_coord_bld.undef;
  1023.       for (i = 0; i < bld->coord_bld.type.length; i++) {
  1024.          LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
  1025.          indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
  1026.          stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
  1027.          stride1 = LLVMBuildLoad(builder, stride1, "");
  1028.          stride = LLVMBuildInsertElement(builder, stride, stride1, indexi, "");
  1029.       }
  1030.    }
  1031.    return stride;
  1032. }
  1033.  
  1034.  
  1035. /**
  1036.  * When sampling a mipmap, we need to compute the width, height, depth
  1037.  * of the source levels from the level indexes.  This helper function
  1038.  * does that.
  1039.  */
  1040. void
  1041. lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
  1042.                             LLVMValueRef ilevel,
  1043.                             LLVMValueRef *out_size,
  1044.                             LLVMValueRef *row_stride_vec,
  1045.                             LLVMValueRef *img_stride_vec)
  1046. {
  1047.    const unsigned dims = bld->dims;
  1048.    LLVMValueRef ilevel_vec;
  1049.  
  1050.    /*
  1051.     * Compute width, height, depth at mipmap level 'ilevel'
  1052.     */
  1053.    if (bld->num_lods == 1) {
  1054.       ilevel_vec = lp_build_broadcast_scalar(&bld->int_size_bld, ilevel);
  1055.       *out_size = lp_build_minify(&bld->int_size_bld, bld->int_size, ilevel_vec);
  1056.    }
  1057.    else {
  1058.       LLVMValueRef int_size_vec;
  1059.       LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
  1060.       unsigned num_quads = bld->coord_bld.type.length / 4;
  1061.       unsigned i;
  1062.  
  1063.       if (bld->num_lods == num_quads) {
  1064.          /*
  1065.           * XXX: this should be #ifndef SANE_INSTRUCTION_SET.
  1066.           * intel "forgot" the variable shift count instruction until avx2.
  1067.           * A harmless 8x32 shift gets translated into 32 instructions
  1068.           * (16 extracts, 8 scalar shifts, 8 inserts), llvm is apparently
  1069.           * unable to recognize if there are really just 2 different shift
  1070.           * count values. So do the shift 4-wide before expansion.
  1071.           */
  1072.          struct lp_build_context bld4;
  1073.          struct lp_type type4;
  1074.  
  1075.          type4 = bld->int_coord_bld.type;
  1076.          type4.length = 4;
  1077.  
  1078.          lp_build_context_init(&bld4, bld->gallivm, type4);
  1079.  
  1080.          if (bld->dims == 1) {
  1081.             assert(bld->int_size_in_bld.type.length == 1);
  1082.             int_size_vec = lp_build_broadcast_scalar(&bld4,
  1083.                                                      bld->int_size);
  1084.          }
  1085.          else {
  1086.             assert(bld->int_size_in_bld.type.length == 4);
  1087.             int_size_vec = bld->int_size;
  1088.          }
  1089.  
  1090.          for (i = 0; i < num_quads; i++) {
  1091.             LLVMValueRef ileveli;
  1092.             LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
  1093.  
  1094.             ileveli = lp_build_extract_broadcast(bld->gallivm,
  1095.                                                  bld->leveli_bld.type,
  1096.                                                  bld4.type,
  1097.                                                  ilevel,
  1098.                                                  indexi);
  1099.             tmp[i] = lp_build_minify(&bld4, int_size_vec, ileveli);
  1100.          }
  1101.          /*
  1102.           * out_size is [w0, h0, d0, _, w1, h1, d1, _, ...] vector for dims > 1,
  1103.           * [w0, w0, w0, w0, w1, w1, w1, w1, ...] otherwise.
  1104.           */
  1105.          *out_size = lp_build_concat(bld->gallivm,
  1106.                                      tmp,
  1107.                                      bld4.type,
  1108.                                      num_quads);
  1109.       }
  1110.       else {
  1111.         /* FIXME: this is terrible and results in _huge_ vector
  1112.          * (for the dims > 1 case).
  1113.          * Should refactor this (together with extract_image_sizes) and do
  1114.          * something more useful. Could for instance if we have width,height
  1115.          * with 4-wide vector pack all elements into a 8xi16 vector
  1116.          * (on which we can still do useful math) instead of using a 16xi32
  1117.          * vector.
  1118.          * FIXME: some callers can't handle this yet.
  1119.          * For dims == 1 this will create [w0, w1, w2, w3, ...] vector.
  1120.          * For dims > 1 this will create [w0, h0, d0, _, w1, h1, d1, _, ...] vector.
  1121.          */
  1122.          assert(bld->num_lods == bld->coord_bld.type.length);
  1123.          if (bld->dims == 1) {
  1124.             assert(bld->int_size_in_bld.type.length == 1);
  1125.             int_size_vec = lp_build_broadcast_scalar(&bld->int_coord_bld,
  1126.                                                      bld->int_size);
  1127.             /* vector shift with variable shift count alert... */
  1128.             *out_size = lp_build_minify(&bld->int_coord_bld, int_size_vec, ilevel);
  1129.          }
  1130.          else {
  1131.             LLVMValueRef ilevel1;
  1132.             for (i = 0; i < bld->num_lods; i++) {
  1133.                LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
  1134.                ilevel1 = lp_build_extract_broadcast(bld->gallivm, bld->int_coord_type,
  1135.                                                     bld->int_size_in_bld.type, ilevel, indexi);
  1136.                tmp[i] = bld->int_size;
  1137.                tmp[i] = lp_build_minify(&bld->int_size_in_bld, tmp[i], ilevel1);
  1138.             }
  1139.             *out_size = lp_build_concat(bld->gallivm, tmp,
  1140.                                         bld->int_size_in_bld.type,
  1141.                                         bld->num_lods);
  1142.          }
  1143.       }
  1144.    }
  1145.  
  1146.    if (dims >= 2) {
  1147.       *row_stride_vec = lp_build_get_level_stride_vec(bld,
  1148.                                                       bld->row_stride_array,
  1149.                                                       ilevel);
  1150.    }
  1151.    if (dims == 3 ||
  1152.        bld->static_texture_state->target == PIPE_TEXTURE_CUBE ||
  1153.        bld->static_texture_state->target == PIPE_TEXTURE_1D_ARRAY ||
  1154.        bld->static_texture_state->target == PIPE_TEXTURE_2D_ARRAY) {
  1155.       *img_stride_vec = lp_build_get_level_stride_vec(bld,
  1156.                                                       bld->img_stride_array,
  1157.                                                       ilevel);
  1158.    }
  1159. }
  1160.  
  1161.  
  1162. /**
  1163.  * Extract and broadcast texture size.
  1164.  *
  1165.  * @param size_type   type of the texture size vector (either
  1166.  *                    bld->int_size_type or bld->float_size_type)
  1167.  * @param coord_type  type of the texture size vector (either
  1168.  *                    bld->int_coord_type or bld->coord_type)
  1169.  * @param size        vector with the texture size (width, height, depth)
  1170.  */
  1171. void
  1172. lp_build_extract_image_sizes(struct lp_build_sample_context *bld,
  1173.                              struct lp_build_context *size_bld,
  1174.                              struct lp_type coord_type,
  1175.                              LLVMValueRef size,
  1176.                              LLVMValueRef *out_width,
  1177.                              LLVMValueRef *out_height,
  1178.                              LLVMValueRef *out_depth)
  1179. {
  1180.    const unsigned dims = bld->dims;
  1181.    LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
  1182.    struct lp_type size_type = size_bld->type;
  1183.  
  1184.    if (bld->num_lods == 1) {
  1185.       *out_width = lp_build_extract_broadcast(bld->gallivm,
  1186.                                               size_type,
  1187.                                               coord_type,
  1188.                                               size,
  1189.                                               LLVMConstInt(i32t, 0, 0));
  1190.       if (dims >= 2) {
  1191.          *out_height = lp_build_extract_broadcast(bld->gallivm,
  1192.                                                   size_type,
  1193.                                                   coord_type,
  1194.                                                   size,
  1195.                                                   LLVMConstInt(i32t, 1, 0));
  1196.          if (dims == 3) {
  1197.             *out_depth = lp_build_extract_broadcast(bld->gallivm,
  1198.                                                     size_type,
  1199.                                                     coord_type,
  1200.                                                     size,
  1201.                                                     LLVMConstInt(i32t, 2, 0));
  1202.          }
  1203.       }
  1204.    }
  1205.    else {
  1206.       unsigned num_quads = bld->coord_bld.type.length / 4;
  1207.  
  1208.       if (dims == 1) {
  1209.          *out_width = size;
  1210.       }
  1211.       else if (bld->num_lods == num_quads) {
  1212.          *out_width = lp_build_swizzle_scalar_aos(size_bld, size, 0, 4);
  1213.          if (dims >= 2) {
  1214.             *out_height = lp_build_swizzle_scalar_aos(size_bld, size, 1, 4);
  1215.             if (dims == 3) {
  1216.                *out_depth = lp_build_swizzle_scalar_aos(size_bld, size, 2, 4);
  1217.             }
  1218.          }
  1219.       }
  1220.       else {
  1221.          assert(bld->num_lods == bld->coord_type.length);
  1222.          *out_width = lp_build_pack_aos_scalars(bld->gallivm, size_type,
  1223.                                                 coord_type, size, 0);
  1224.          if (dims >= 2) {
  1225.             *out_height = lp_build_pack_aos_scalars(bld->gallivm, size_type,
  1226.                                                     coord_type, size, 1);
  1227.             if (dims == 3) {
  1228.                *out_depth = lp_build_pack_aos_scalars(bld->gallivm, size_type,
  1229.                                                       coord_type, size, 2);
  1230.             }
  1231.          }
  1232.       }
  1233.    }
  1234. }
  1235.  
  1236.  
  1237. /**
  1238.  * Unnormalize coords.
  1239.  *
  1240.  * @param flt_size  vector with the integer texture size (width, height, depth)
  1241.  */
  1242. void
  1243. lp_build_unnormalized_coords(struct lp_build_sample_context *bld,
  1244.                              LLVMValueRef flt_size,
  1245.                              LLVMValueRef *s,
  1246.                              LLVMValueRef *t,
  1247.                              LLVMValueRef *r)
  1248. {
  1249.    const unsigned dims = bld->dims;
  1250.    LLVMValueRef width;
  1251.    LLVMValueRef height;
  1252.    LLVMValueRef depth;
  1253.  
  1254.    lp_build_extract_image_sizes(bld,
  1255.                                 &bld->float_size_bld,
  1256.                                 bld->coord_type,
  1257.                                 flt_size,
  1258.                                 &width,
  1259.                                 &height,
  1260.                                 &depth);
  1261.  
  1262.    /* s = s * width, t = t * height */
  1263.    *s = lp_build_mul(&bld->coord_bld, *s, width);
  1264.    if (dims >= 2) {
  1265.       *t = lp_build_mul(&bld->coord_bld, *t, height);
  1266.       if (dims >= 3) {
  1267.          *r = lp_build_mul(&bld->coord_bld, *r, depth);
  1268.       }
  1269.    }
  1270. }
  1271.  
  1272.  
  1273. /** Helper used by lp_build_cube_lookup() */
  1274. static LLVMValueRef
  1275. lp_build_cube_imapos(struct lp_build_context *coord_bld, LLVMValueRef coord)
  1276. {
  1277.    /* ima = +0.5 / abs(coord); */
  1278.    LLVMValueRef posHalf = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, 0.5);
  1279.    LLVMValueRef absCoord = lp_build_abs(coord_bld, coord);
  1280.    LLVMValueRef ima = lp_build_div(coord_bld, posHalf, absCoord);
  1281.    return ima;
  1282. }
  1283.  
  1284. /** Helper used by lp_build_cube_lookup() */
  1285. static LLVMValueRef
  1286. lp_build_cube_imaneg(struct lp_build_context *coord_bld, LLVMValueRef coord)
  1287. {
  1288.    /* ima = -0.5 / abs(coord); */
  1289.    LLVMValueRef negHalf = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, -0.5);
  1290.    LLVMValueRef absCoord = lp_build_abs(coord_bld, coord);
  1291.    LLVMValueRef ima = lp_build_div(coord_bld, negHalf, absCoord);
  1292.    return ima;
  1293. }
  1294.  
  1295. /**
  1296.  * Helper used by lp_build_cube_lookup()
  1297.  * FIXME: the sign here can also be 0.
  1298.  * Arithmetically this could definitely make a difference. Either
  1299.  * fix the comment or use other (simpler) sign function, not sure
  1300.  * which one it should be.
  1301.  * \param sign  scalar +1 or -1
  1302.  * \param coord  float vector
  1303.  * \param ima  float vector
  1304.  */
  1305. static LLVMValueRef
  1306. lp_build_cube_coord(struct lp_build_context *coord_bld,
  1307.                     LLVMValueRef sign, int negate_coord,
  1308.                     LLVMValueRef coord, LLVMValueRef ima)
  1309. {
  1310.    /* return negate(coord) * ima * sign + 0.5; */
  1311.    LLVMValueRef half = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, 0.5);
  1312.    LLVMValueRef res;
  1313.  
  1314.    assert(negate_coord == +1 || negate_coord == -1);
  1315.  
  1316.    if (negate_coord == -1) {
  1317.       coord = lp_build_negate(coord_bld, coord);
  1318.    }
  1319.  
  1320.    res = lp_build_mul(coord_bld, coord, ima);
  1321.    if (sign) {
  1322.       sign = lp_build_broadcast_scalar(coord_bld, sign);
  1323.       res = lp_build_mul(coord_bld, res, sign);
  1324.    }
  1325.    res = lp_build_add(coord_bld, res, half);
  1326.  
  1327.    return res;
  1328. }
  1329.  
  1330.  
  1331. /** Helper used by lp_build_cube_lookup()
  1332.  * Return (major_coord >= 0) ? pos_face : neg_face;
  1333.  */
  1334. static LLVMValueRef
  1335. lp_build_cube_face(struct lp_build_sample_context *bld,
  1336.                    LLVMValueRef major_coord,
  1337.                    unsigned pos_face, unsigned neg_face)
  1338. {
  1339.    struct gallivm_state *gallivm = bld->gallivm;
  1340.    LLVMBuilderRef builder = gallivm->builder;
  1341.    LLVMValueRef cmp = LLVMBuildFCmp(builder, LLVMRealUGE,
  1342.                                     major_coord,
  1343.                                     bld->float_bld.zero, "");
  1344.    LLVMValueRef pos = lp_build_const_int32(gallivm, pos_face);
  1345.    LLVMValueRef neg = lp_build_const_int32(gallivm, neg_face);
  1346.    LLVMValueRef res = LLVMBuildSelect(builder, cmp, pos, neg, "");
  1347.    return res;
  1348. }
  1349.  
  1350.  
  1351.  
  1352. /**
  1353.  * Generate code to do cube face selection and compute per-face texcoords.
  1354.  */
  1355. void
  1356. lp_build_cube_lookup(struct lp_build_sample_context *bld,
  1357.                      LLVMValueRef s,
  1358.                      LLVMValueRef t,
  1359.                      LLVMValueRef r,
  1360.                      const struct lp_derivatives *derivs, /* optional */
  1361.                      LLVMValueRef *face,
  1362.                      LLVMValueRef *face_s,
  1363.                      LLVMValueRef *face_t,
  1364.                      LLVMValueRef *rho,
  1365.                      boolean need_derivs)
  1366. {
  1367.    struct lp_build_context *coord_bld = &bld->coord_bld;
  1368.    LLVMBuilderRef builder = bld->gallivm->builder;
  1369.    struct gallivm_state *gallivm = bld->gallivm;
  1370.    LLVMValueRef si, ti, ri;
  1371.  
  1372.    if (1 || coord_bld->type.length > 4) {
  1373.       /*
  1374.        * Do per-pixel face selection. We cannot however (as we used to do)
  1375.        * simply calculate the derivs afterwards (which is very bogus for
  1376.        * explicit derivs btw) because the values would be "random" when
  1377.        * not all pixels lie on the same face. So what we do here is just
  1378.        * calculate the derivatives after scaling the coords by the absolute
  1379.        * value of the inverse major axis, and essentially do rho calculation
  1380.        * steps as if it were a 3d texture. This is perfect if all pixels hit
  1381.        * the same face, but not so great at edges, I believe the max error
  1382.        * should be sqrt(2) with no_rho_approx or 2 otherwise (essentially measuring
  1383.        * the 3d distance between 2 points on the cube instead of measuring up/down
  1384.        * the edge). Still this is possibly a win over just selecting the same face
  1385.        * for all pixels. Unfortunately, something like that doesn't work for
  1386.        * explicit derivatives.
  1387.        * TODO: handle explicit derivatives by transforming them alongside coords
  1388.        * somehow.
  1389.        */
  1390.       struct lp_build_context *cint_bld = &bld->int_coord_bld;
  1391.       struct lp_type intctype = cint_bld->type;
  1392.       LLVMValueRef signs, signt, signr, signma;
  1393.       LLVMValueRef as, at, ar;
  1394.       LLVMValueRef as_ge_at, maxasat, ar_ge_as_at;
  1395.       LLVMValueRef snewx, tnewx, snewy, tnewy, snewz, tnewz;
  1396.       LLVMValueRef tnegi, rnegi;
  1397.       LLVMValueRef ma, mai, ima;
  1398.       LLVMValueRef posHalf = lp_build_const_vec(gallivm, coord_bld->type, 0.5);
  1399.       LLVMValueRef signmask = lp_build_const_int_vec(gallivm, intctype,
  1400.                                                      1 << (intctype.width - 1));
  1401.       LLVMValueRef signshift = lp_build_const_int_vec(gallivm, intctype,
  1402.                                                       intctype.width -1);
  1403.       LLVMValueRef facex = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_X);
  1404.       LLVMValueRef facey = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Y);
  1405.       LLVMValueRef facez = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Z);
  1406.  
  1407.       assert(PIPE_TEX_FACE_NEG_X == PIPE_TEX_FACE_POS_X + 1);
  1408.       assert(PIPE_TEX_FACE_NEG_Y == PIPE_TEX_FACE_POS_Y + 1);
  1409.       assert(PIPE_TEX_FACE_NEG_Z == PIPE_TEX_FACE_POS_Z + 1);
  1410.  
  1411.       /*
  1412.        * get absolute value (for x/y/z face selection) and sign bit
  1413.        * (for mirroring minor coords and pos/neg face selection)
  1414.        * of the original coords.
  1415.        */
  1416.       as = lp_build_abs(&bld->coord_bld, s);
  1417.       at = lp_build_abs(&bld->coord_bld, t);
  1418.       ar = lp_build_abs(&bld->coord_bld, r);
  1419.  
  1420.       /*
  1421.        * major face determination: select x if x > y else select y
  1422.        * select z if z >= max(x,y) else select previous result
  1423.        * if some axis are the same we chose z over y, y over x - the
  1424.        * dx10 spec seems to ask for it while OpenGL doesn't care (if we
  1425.        * wouldn't care could save a select or two if using different
  1426.        * compares and doing at_g_as_ar last since tnewx and tnewz are the
  1427.        * same).
  1428.        */
  1429.       as_ge_at = lp_build_cmp(coord_bld, PIPE_FUNC_GREATER, as, at);
  1430.       maxasat = lp_build_max(coord_bld, as, at);
  1431.       ar_ge_as_at = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, ar, maxasat);
  1432.  
  1433.       if (need_derivs) {
  1434.          LLVMValueRef ddx_ddy[2], tmp[3], rho_vec;
  1435.          static const unsigned char swizzle0[] = { /* no-op swizzle */
  1436.             0, LP_BLD_SWIZZLE_DONTCARE,
  1437.             LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
  1438.          };
  1439.          static const unsigned char swizzle1[] = {
  1440.             1, LP_BLD_SWIZZLE_DONTCARE,
  1441.             LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
  1442.          };
  1443.          static const unsigned char swizzle01[] = { /* no-op swizzle */
  1444.             0, 1,
  1445.             LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
  1446.          };
  1447.          static const unsigned char swizzle23[] = {
  1448.             2, 3,
  1449.             LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
  1450.          };
  1451.          static const unsigned char swizzle02[] = {
  1452.             0, 2,
  1453.             LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
  1454.          };
  1455.  
  1456.          /*
  1457.           * scale the s/t/r coords pre-select/mirror so we can calculate
  1458.           * "reasonable" derivs.
  1459.           */
  1460.          ma = lp_build_select(coord_bld, as_ge_at, s, t);
  1461.          ma = lp_build_select(coord_bld, ar_ge_as_at, r, ma);
  1462.          ima = lp_build_cube_imapos(coord_bld, ma);
  1463.          s = lp_build_mul(coord_bld, s, ima);
  1464.          t = lp_build_mul(coord_bld, t, ima);
  1465.          r = lp_build_mul(coord_bld, r, ima);
  1466.  
  1467.          /*
  1468.           * This isn't quite the same as the "ordinary" (3d deriv) path since we
  1469.           * know the texture is square which simplifies things (we can omit the
  1470.           * size mul which happens very early completely here and do it at the
  1471.           * very end).
  1472.           */
  1473.          ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(coord_bld, s, t);
  1474.          ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(coord_bld, r);
  1475.  
  1476.          if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
  1477.             ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], ddx_ddy[0]);
  1478.             ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], ddx_ddy[1]);
  1479.          }
  1480.          else {
  1481.             ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
  1482.             ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]);
  1483.          }
  1484.  
  1485.          tmp[0] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle01);
  1486.          tmp[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle23);
  1487.          tmp[2] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle02);
  1488.  
  1489.          if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
  1490.             rho_vec = lp_build_add(coord_bld, tmp[0], tmp[1]);
  1491.             rho_vec = lp_build_add(coord_bld, rho_vec, tmp[2]);
  1492.          }
  1493.          else {
  1494.             rho_vec = lp_build_max(coord_bld, tmp[0], tmp[1]);
  1495.             rho_vec = lp_build_max(coord_bld, rho_vec, tmp[2]);
  1496.          }
  1497.  
  1498.          tmp[0] = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
  1499.          tmp[1] = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
  1500.          *rho = lp_build_max(coord_bld, tmp[0], tmp[1]);
  1501.       }
  1502.  
  1503.       si = LLVMBuildBitCast(builder, s, lp_build_vec_type(gallivm, intctype), "");
  1504.       ti = LLVMBuildBitCast(builder, t, lp_build_vec_type(gallivm, intctype), "");
  1505.       ri = LLVMBuildBitCast(builder, r, lp_build_vec_type(gallivm, intctype), "");
  1506.       signs = LLVMBuildAnd(builder, si, signmask, "");
  1507.       signt = LLVMBuildAnd(builder, ti, signmask, "");
  1508.       signr = LLVMBuildAnd(builder, ri, signmask, "");
  1509.  
  1510.       /*
  1511.        * compute all possible new s/t coords
  1512.        * snewx = signs * -r;
  1513.        * tnewx = -t;
  1514.        * snewy = s;
  1515.        * tnewy = signt * r;
  1516.        * snewz = signr * s;
  1517.        * tnewz = -t;
  1518.        */
  1519.       tnegi = LLVMBuildXor(builder, ti, signmask, "");
  1520.       rnegi = LLVMBuildXor(builder, ri, signmask, "");
  1521.  
  1522.       snewx = LLVMBuildXor(builder, signs, rnegi, "");
  1523.       tnewx = tnegi;
  1524.  
  1525.       snewy = si;
  1526.       tnewy = LLVMBuildXor(builder, signt, ri, "");
  1527.  
  1528.       snewz = LLVMBuildXor(builder, signr, si, "");
  1529.       tnewz = tnegi;
  1530.  
  1531.       /* XXX on x86 unclear if we should cast the values back to float
  1532.        * or not - on some cpus (nehalem) pblendvb has twice the throughput
  1533.        * of blendvps though on others there just might be domain
  1534.        * transition penalties when using it (this depends on what llvm
  1535.        * will chose for the bit ops above so there appears no "right way",
  1536.        * but given the boatload of selects let's just use the int type).
  1537.        */
  1538.  
  1539.       /* select/mirror */
  1540.       if (!need_derivs) {
  1541.          ma = lp_build_select(coord_bld, as_ge_at, s, t);
  1542.       }
  1543.       *face_s = lp_build_select(cint_bld, as_ge_at, snewx, snewy);
  1544.       *face_t = lp_build_select(cint_bld, as_ge_at, tnewx, tnewy);
  1545.       *face = lp_build_select(cint_bld, as_ge_at, facex, facey);
  1546.  
  1547.       if (!need_derivs) {
  1548.          ma = lp_build_select(coord_bld, ar_ge_as_at, r, ma);
  1549.       }
  1550.       *face_s = lp_build_select(cint_bld, ar_ge_as_at, snewz, *face_s);
  1551.       *face_t = lp_build_select(cint_bld, ar_ge_as_at, tnewz, *face_t);
  1552.       *face = lp_build_select(cint_bld, ar_ge_as_at, facez, *face);
  1553.  
  1554.       *face_s = LLVMBuildBitCast(builder, *face_s,
  1555.                                lp_build_vec_type(gallivm, coord_bld->type), "");
  1556.       *face_t = LLVMBuildBitCast(builder, *face_t,
  1557.                                lp_build_vec_type(gallivm, coord_bld->type), "");
  1558.  
  1559.       /* add +1 for neg face */
  1560.       /* XXX with AVX probably want to use another select here -
  1561.        * as long as we ensure vblendvps gets used we can actually
  1562.        * skip the comparison and just use sign as a "mask" directly.
  1563.        */
  1564.       mai = LLVMBuildBitCast(builder, ma, lp_build_vec_type(gallivm, intctype), "");
  1565.       signma = LLVMBuildLShr(builder, mai, signshift, "");
  1566.       *face = LLVMBuildOr(builder, *face, signma, "face");
  1567.  
  1568.       /* project coords */
  1569.       if (!need_derivs) {
  1570.          ima = lp_build_cube_imapos(coord_bld, ma);
  1571.          *face_s = lp_build_mul(coord_bld, *face_s, ima);
  1572.          *face_t = lp_build_mul(coord_bld, *face_t, ima);
  1573.       }
  1574.  
  1575.       *face_s = lp_build_add(coord_bld, *face_s, posHalf);
  1576.       *face_t = lp_build_add(coord_bld, *face_t, posHalf);
  1577.    }
  1578.  
  1579.    else {
  1580.       struct lp_build_if_state if_ctx;
  1581.       LLVMValueRef face_s_var;
  1582.       LLVMValueRef face_t_var;
  1583.       LLVMValueRef face_var;
  1584.       LLVMValueRef arx_ge_ary_arz, ary_ge_arx_arz;
  1585.       LLVMValueRef shuffles[4];
  1586.       LLVMValueRef arxy_ge_aryx, arxy_ge_arzz, arxy_ge_arxy_arzz;
  1587.       LLVMValueRef arxyxy, aryxzz, arxyxy_ge_aryxzz;
  1588.       LLVMValueRef tmp[4], rxyz, arxyz;
  1589.       struct lp_build_context *float_bld = &bld->float_bld;
  1590.  
  1591.       assert(bld->coord_bld.type.length == 4);
  1592.  
  1593.       tmp[0] = s;
  1594.       tmp[1] = t;
  1595.       tmp[2] = r;
  1596.       rxyz = lp_build_hadd_partial4(&bld->coord_bld, tmp, 3);
  1597.       arxyz = lp_build_abs(&bld->coord_bld, rxyz);
  1598.  
  1599.       shuffles[0] = lp_build_const_int32(gallivm, 0);
  1600.       shuffles[1] = lp_build_const_int32(gallivm, 1);
  1601.       shuffles[2] = lp_build_const_int32(gallivm, 0);
  1602.       shuffles[3] = lp_build_const_int32(gallivm, 1);
  1603.       arxyxy = LLVMBuildShuffleVector(builder, arxyz, arxyz, LLVMConstVector(shuffles, 4), "");
  1604.       shuffles[0] = lp_build_const_int32(gallivm, 1);
  1605.       shuffles[1] = lp_build_const_int32(gallivm, 0);
  1606.       shuffles[2] = lp_build_const_int32(gallivm, 2);
  1607.       shuffles[3] = lp_build_const_int32(gallivm, 2);
  1608.       aryxzz = LLVMBuildShuffleVector(builder, arxyz, arxyz, LLVMConstVector(shuffles, 4), "");
  1609.       arxyxy_ge_aryxzz = lp_build_cmp(&bld->coord_bld, PIPE_FUNC_GEQUAL, arxyxy, aryxzz);
  1610.  
  1611.       shuffles[0] = lp_build_const_int32(gallivm, 0);
  1612.       shuffles[1] = lp_build_const_int32(gallivm, 1);
  1613.       arxy_ge_aryx = LLVMBuildShuffleVector(builder, arxyxy_ge_aryxzz, arxyxy_ge_aryxzz,
  1614.                                             LLVMConstVector(shuffles, 2), "");
  1615.       shuffles[0] = lp_build_const_int32(gallivm, 2);
  1616.       shuffles[1] = lp_build_const_int32(gallivm, 3);
  1617.       arxy_ge_arzz = LLVMBuildShuffleVector(builder, arxyxy_ge_aryxzz, arxyxy_ge_aryxzz,
  1618.                                             LLVMConstVector(shuffles, 2), "");
  1619.       arxy_ge_arxy_arzz = LLVMBuildAnd(builder, arxy_ge_aryx, arxy_ge_arzz, "");
  1620.  
  1621.       arx_ge_ary_arz = LLVMBuildExtractElement(builder, arxy_ge_arxy_arzz,
  1622.                                                lp_build_const_int32(gallivm, 0), "");
  1623.       arx_ge_ary_arz = LLVMBuildICmp(builder, LLVMIntNE, arx_ge_ary_arz,
  1624.                                                lp_build_const_int32(gallivm, 0), "");
  1625.       ary_ge_arx_arz = LLVMBuildExtractElement(builder, arxy_ge_arxy_arzz,
  1626.                                                lp_build_const_int32(gallivm, 1), "");
  1627.       ary_ge_arx_arz = LLVMBuildICmp(builder, LLVMIntNE, ary_ge_arx_arz,
  1628.                                                lp_build_const_int32(gallivm, 0), "");
  1629.       face_s_var = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "face_s_var");
  1630.       face_t_var = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "face_t_var");
  1631.       face_var = lp_build_alloca(gallivm, bld->int_bld.vec_type, "face_var");
  1632.  
  1633.       lp_build_if(&if_ctx, gallivm, arx_ge_ary_arz);
  1634.       {
  1635.          /* +/- X face */
  1636.          LLVMValueRef sign, ima;
  1637.          si = LLVMBuildExtractElement(builder, rxyz,
  1638.                                       lp_build_const_int32(gallivm, 0), "");
  1639.          /* +/- X face */
  1640.          sign = lp_build_sgn(float_bld, si);
  1641.          ima = lp_build_cube_imaneg(coord_bld, s);
  1642.          *face_s = lp_build_cube_coord(coord_bld, sign, +1, r, ima);
  1643.          *face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
  1644.          *face = lp_build_cube_face(bld, si,
  1645.                                     PIPE_TEX_FACE_POS_X,
  1646.                                     PIPE_TEX_FACE_NEG_X);
  1647.          LLVMBuildStore(builder, *face_s, face_s_var);
  1648.          LLVMBuildStore(builder, *face_t, face_t_var);
  1649.          LLVMBuildStore(builder, *face, face_var);
  1650.       }
  1651.       lp_build_else(&if_ctx);
  1652.       {
  1653.          struct lp_build_if_state if_ctx2;
  1654.  
  1655.          lp_build_if(&if_ctx2, gallivm, ary_ge_arx_arz);
  1656.          {
  1657.             LLVMValueRef sign, ima;
  1658.             /* +/- Y face */
  1659.             ti = LLVMBuildExtractElement(builder, rxyz,
  1660.                                          lp_build_const_int32(gallivm, 1), "");
  1661.             sign = lp_build_sgn(float_bld, ti);
  1662.             ima = lp_build_cube_imaneg(coord_bld, t);
  1663.             *face_s = lp_build_cube_coord(coord_bld, NULL, -1, s, ima);
  1664.             *face_t = lp_build_cube_coord(coord_bld, sign, -1, r, ima);
  1665.             *face = lp_build_cube_face(bld, ti,
  1666.                                        PIPE_TEX_FACE_POS_Y,
  1667.                                        PIPE_TEX_FACE_NEG_Y);
  1668.             LLVMBuildStore(builder, *face_s, face_s_var);
  1669.             LLVMBuildStore(builder, *face_t, face_t_var);
  1670.             LLVMBuildStore(builder, *face, face_var);
  1671.          }
  1672.          lp_build_else(&if_ctx2);
  1673.          {
  1674.             /* +/- Z face */
  1675.             LLVMValueRef sign, ima;
  1676.             ri = LLVMBuildExtractElement(builder, rxyz,
  1677.                                          lp_build_const_int32(gallivm, 2), "");
  1678.             sign = lp_build_sgn(float_bld, ri);
  1679.             ima = lp_build_cube_imaneg(coord_bld, r);
  1680.             *face_s = lp_build_cube_coord(coord_bld, sign, -1, s, ima);
  1681.             *face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
  1682.             *face = lp_build_cube_face(bld, ri,
  1683.                                        PIPE_TEX_FACE_POS_Z,
  1684.                                        PIPE_TEX_FACE_NEG_Z);
  1685.             LLVMBuildStore(builder, *face_s, face_s_var);
  1686.             LLVMBuildStore(builder, *face_t, face_t_var);
  1687.             LLVMBuildStore(builder, *face, face_var);
  1688.          }
  1689.          lp_build_endif(&if_ctx2);
  1690.       }
  1691.  
  1692.       lp_build_endif(&if_ctx);
  1693.  
  1694.       *face_s = LLVMBuildLoad(builder, face_s_var, "face_s");
  1695.       *face_t = LLVMBuildLoad(builder, face_t_var, "face_t");
  1696.       *face   = LLVMBuildLoad(builder, face_var, "face");
  1697.       *face   = lp_build_broadcast_scalar(&bld->int_coord_bld, *face);
  1698.    }
  1699. }
  1700.  
  1701.  
  1702. /**
  1703.  * Compute the partial offset of a pixel block along an arbitrary axis.
  1704.  *
  1705.  * @param coord   coordinate in pixels
  1706.  * @param stride  number of bytes between rows of successive pixel blocks
  1707.  * @param block_length  number of pixels in a pixels block along the coordinate
  1708.  *                      axis
  1709.  * @param out_offset    resulting relative offset of the pixel block in bytes
  1710.  * @param out_subcoord  resulting sub-block pixel coordinate
  1711.  */
  1712. void
  1713. lp_build_sample_partial_offset(struct lp_build_context *bld,
  1714.                                unsigned block_length,
  1715.                                LLVMValueRef coord,
  1716.                                LLVMValueRef stride,
  1717.                                LLVMValueRef *out_offset,
  1718.                                LLVMValueRef *out_subcoord)
  1719. {
  1720.    LLVMBuilderRef builder = bld->gallivm->builder;
  1721.    LLVMValueRef offset;
  1722.    LLVMValueRef subcoord;
  1723.  
  1724.    if (block_length == 1) {
  1725.       subcoord = bld->zero;
  1726.    }
  1727.    else {
  1728.       /*
  1729.        * Pixel blocks have power of two dimensions. LLVM should convert the
  1730.        * rem/div to bit arithmetic.
  1731.        * TODO: Verify this.
  1732.        * It does indeed BUT it does transform it to scalar (and back) when doing so
  1733.        * (using roughly extract, shift/and, mov, unpack) (llvm 2.7).
  1734.        * The generated code looks seriously unfunny and is quite expensive.
  1735.        */
  1736. #if 0
  1737.       LLVMValueRef block_width = lp_build_const_int_vec(bld->type, block_length);
  1738.       subcoord = LLVMBuildURem(builder, coord, block_width, "");
  1739.       coord    = LLVMBuildUDiv(builder, coord, block_width, "");
  1740. #else
  1741.       unsigned logbase2 = util_logbase2(block_length);
  1742.       LLVMValueRef block_shift = lp_build_const_int_vec(bld->gallivm, bld->type, logbase2);
  1743.       LLVMValueRef block_mask = lp_build_const_int_vec(bld->gallivm, bld->type, block_length - 1);
  1744.       subcoord = LLVMBuildAnd(builder, coord, block_mask, "");
  1745.       coord = LLVMBuildLShr(builder, coord, block_shift, "");
  1746. #endif
  1747.    }
  1748.  
  1749.    offset = lp_build_mul(bld, coord, stride);
  1750.  
  1751.    assert(out_offset);
  1752.    assert(out_subcoord);
  1753.  
  1754.    *out_offset = offset;
  1755.    *out_subcoord = subcoord;
  1756. }
  1757.  
  1758.  
  1759. /**
  1760.  * Compute the offset of a pixel block.
  1761.  *
  1762.  * x, y, z, y_stride, z_stride are vectors, and they refer to pixels.
  1763.  *
  1764.  * Returns the relative offset and i,j sub-block coordinates
  1765.  */
  1766. void
  1767. lp_build_sample_offset(struct lp_build_context *bld,
  1768.                        const struct util_format_description *format_desc,
  1769.                        LLVMValueRef x,
  1770.                        LLVMValueRef y,
  1771.                        LLVMValueRef z,
  1772.                        LLVMValueRef y_stride,
  1773.                        LLVMValueRef z_stride,
  1774.                        LLVMValueRef *out_offset,
  1775.                        LLVMValueRef *out_i,
  1776.                        LLVMValueRef *out_j)
  1777. {
  1778.    LLVMValueRef x_stride;
  1779.    LLVMValueRef offset;
  1780.  
  1781.    x_stride = lp_build_const_vec(bld->gallivm, bld->type,
  1782.                                  format_desc->block.bits/8);
  1783.  
  1784.    lp_build_sample_partial_offset(bld,
  1785.                                   format_desc->block.width,
  1786.                                   x, x_stride,
  1787.                                   &offset, out_i);
  1788.  
  1789.    if (y && y_stride) {
  1790.       LLVMValueRef y_offset;
  1791.       lp_build_sample_partial_offset(bld,
  1792.                                      format_desc->block.height,
  1793.                                      y, y_stride,
  1794.                                      &y_offset, out_j);
  1795.       offset = lp_build_add(bld, offset, y_offset);
  1796.    }
  1797.    else {
  1798.       *out_j = bld->zero;
  1799.    }
  1800.  
  1801.    if (z && z_stride) {
  1802.       LLVMValueRef z_offset;
  1803.       LLVMValueRef k;
  1804.       lp_build_sample_partial_offset(bld,
  1805.                                      1, /* pixel blocks are always 2D */
  1806.                                      z, z_stride,
  1807.                                      &z_offset, &k);
  1808.       offset = lp_build_add(bld, offset, z_offset);
  1809.    }
  1810.  
  1811.    *out_offset = offset;
  1812. }
  1813.