
/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/**
 * @file
 * Texture sampling -- common code.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */

#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "util/u_format.h"
#include "util/u_math.h"
#include "util/u_cpu_detect.h"
#include "lp_bld_arit.h"
#include "lp_bld_const.h"
#include "lp_bld_debug.h"
#include "lp_bld_printf.h"
#include "lp_bld_flow.h"
#include "lp_bld_sample.h"
#include "lp_bld_swizzle.h"
#include "lp_bld_type.h"
#include "lp_bld_logic.h"
#include "lp_bld_pack.h"
#include "lp_bld_quad.h"
#include "lp_bld_bitarit.h"


/*
 * Bri-linear factor. Should be greater than one.
 */
#define BRILINEAR_FACTOR 2

/**
 * Does the given texture wrap mode allow sampling the texture border color?
 * XXX maybe move this into gallium util code.
 */
boolean
lp_sampler_wrap_mode_uses_border_color(unsigned mode,
                                       unsigned min_img_filter,
                                       unsigned mag_img_filter)
{
   switch (mode) {
   case PIPE_TEX_WRAP_REPEAT:
   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
   case PIPE_TEX_WRAP_MIRROR_REPEAT:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
      return FALSE;
   case PIPE_TEX_WRAP_CLAMP:
   case PIPE_TEX_WRAP_MIRROR_CLAMP:
      if (min_img_filter == PIPE_TEX_FILTER_NEAREST &&
          mag_img_filter == PIPE_TEX_FILTER_NEAREST) {
         return FALSE;
      } else {
         return TRUE;
      }
   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
      return TRUE;
   default:
      assert(0 && "unexpected wrap mode");
      return FALSE;
   }
}
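
/*
 * For illustration: PIPE_TEX_WRAP_CLAMP clamps coords to the texture
 * extent, so with a linear filter the 2x2 footprint at the edge can
 * still straddle the border and blend in the border color, while a
 * nearest filter never reads outside the clamped texel -- hence the
 * filter-dependent answer above.
 */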


/**
 * Initialize the lp_static_texture_state object with the gallium
 * texture/sampler_view state (this contains the parts which are
 * considered static).
 */
void
lp_sampler_static_texture_state(struct lp_static_texture_state *state,
                                const struct pipe_sampler_view *view)
{
   const struct pipe_resource *texture;

   memset(state, 0, sizeof *state);

   if (!view || !view->texture)
      return;

   texture = view->texture;

   state->format            = view->format;
   state->swizzle_r         = view->swizzle_r;
   state->swizzle_g         = view->swizzle_g;
   state->swizzle_b         = view->swizzle_b;
   state->swizzle_a         = view->swizzle_a;

   state->target            = view->target;
   state->pot_width         = util_is_power_of_two(texture->width0);
   state->pot_height        = util_is_power_of_two(texture->height0);
   state->pot_depth         = util_is_power_of_two(texture->depth0);
   state->level_zero_only   = !view->u.tex.last_level;

   /*
    * The layer / element / level parameters are all either dynamic
    * state or handled transparently wrt execution.
    */
}


/**
 * Initialize the lp_static_sampler_state object with the gallium sampler
 * state (this contains the parts which are considered static).
 */
void
lp_sampler_static_sampler_state(struct lp_static_sampler_state *state,
                                const struct pipe_sampler_state *sampler)
{
   memset(state, 0, sizeof *state);

   if (!sampler)
      return;

   /*
    * We don't copy sampler state over unless it is actually enabled, to avoid
    * spurious recompiles, as the sampler static state is part of the shader
    * key.
    *
    * Ideally the state tracker or cso_cache module would make all state
    * canonical, but until that happens it's better to be safe than sorry here.
    *
    * XXX: Actually there's much more that can be done here, especially
    * regarding 1D/2D/3D/CUBE textures, wrap modes, etc.
    */

   state->wrap_s            = sampler->wrap_s;
   state->wrap_t            = sampler->wrap_t;
   state->wrap_r            = sampler->wrap_r;
   state->min_img_filter    = sampler->min_img_filter;
   state->mag_img_filter    = sampler->mag_img_filter;
   state->seamless_cube_map = sampler->seamless_cube_map;

   if (sampler->max_lod > 0.0f) {
      state->min_mip_filter = sampler->min_mip_filter;
   } else {
      state->min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
   }

   if (state->min_mip_filter != PIPE_TEX_MIPFILTER_NONE ||
       state->min_img_filter != state->mag_img_filter) {
      if (sampler->lod_bias != 0.0f) {
         state->lod_bias_non_zero = 1;
      }

      /* If min_lod == max_lod we can greatly simplify mipmap selection.
       * This is a case that occurs during automatic mipmap generation.
       */
      if (sampler->min_lod == sampler->max_lod) {
         state->min_max_lod_equal = 1;
      } else {
         if (sampler->min_lod > 0.0f) {
            state->apply_min_lod = 1;
         }

         /*
          * XXX this won't do anything with the mesa state tracker, which
          * always sets max_lod to no more than the number of mip levels
          * actually present...
          */
         if (sampler->max_lod < (PIPE_MAX_TEXTURE_LEVELS - 1)) {
            state->apply_max_lod = 1;
         }
      }
   }

   state->compare_mode      = sampler->compare_mode;
   if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE) {
      state->compare_func   = sampler->compare_func;
   }

   state->normalized_coords = sampler->normalized_coords;
}
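
/*
 * Hypothetical usage sketch (the key structure and field names are made
 * up): both helpers above are meant to fill the static part of a shader
 * key, so that JIT code is only regenerated when static state changes:
 *
 *    struct lp_static_texture_state tex;
 *    struct lp_static_sampler_state samp;
 *    lp_sampler_static_texture_state(&tex, view);
 *    lp_sampler_static_sampler_state(&samp, sampler);
 *    memcpy(&key->tex[i], &tex, sizeof tex);
 *    memcpy(&key->samp[i], &samp, sizeof samp);
 */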


/**
 * Generate code to compute the coordinate gradient (rho).
 * \param derivs  partial derivatives of (s, t, r, q) with respect to X and Y
 *
 * The resulting rho has bld->levelf format (per quad or per element).
 */
static LLVMValueRef
lp_build_rho(struct lp_build_sample_context *bld,
             unsigned texture_unit,
             LLVMValueRef s,
             LLVMValueRef t,
             LLVMValueRef r,
             LLVMValueRef cube_rho,
             const struct lp_derivatives *derivs)
{
   struct gallivm_state *gallivm = bld->gallivm;
   struct lp_build_context *int_size_bld = &bld->int_size_in_bld;
   struct lp_build_context *float_size_bld = &bld->float_size_in_bld;
   struct lp_build_context *float_bld = &bld->float_bld;
   struct lp_build_context *coord_bld = &bld->coord_bld;
   struct lp_build_context *rho_bld = &bld->lodf_bld;
   const unsigned dims = bld->dims;
   LLVMValueRef ddx_ddy[2] = {NULL};
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
   LLVMValueRef index1 = LLVMConstInt(i32t, 1, 0);
   LLVMValueRef index2 = LLVMConstInt(i32t, 2, 0);
   LLVMValueRef rho_vec;
   LLVMValueRef int_size, float_size;
   LLVMValueRef rho;
   LLVMValueRef first_level, first_level_vec;
   unsigned length = coord_bld->type.length;
   unsigned num_quads = length / 4;
   boolean rho_per_quad = rho_bld->type.length != length;
   boolean no_rho_opt = (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1);
   unsigned i;
   LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
   LLVMValueRef rho_xvec, rho_yvec;

   /* Note that all simplified calculations will only work for isotropic filtering */

   /*
    * rho calcs are always per quad, except for explicit derivs (excluding
    * the messy cube maps for now) when per-element lod is requested.
    */

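   /*
    * Roughly, rho is the classic isotropic lod measure: the largest
    * texel-space derivative magnitude across the x/y directions and the
    * active coordinates (the texture size is multiplied in to convert
    * normalized derivatives to texel units). lp_build_lod_selector()
    * later computes lod = log2(rho).
    */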
   first_level = bld->dynamic_state->first_level(bld->dynamic_state, bld->gallivm,
                                                 bld->context_ptr, texture_unit);
   first_level_vec = lp_build_broadcast_scalar(int_size_bld, first_level);
   int_size = lp_build_minify(int_size_bld, bld->int_size, first_level_vec, TRUE);
   float_size = lp_build_int_to_float(float_size_bld, int_size);

   if (cube_rho) {
      LLVMValueRef cubesize;
      LLVMValueRef index0 = lp_build_const_int32(gallivm, 0);

      /*
       * The cube map code already did everything except the size mul and
       * the per-quad extraction. Luckily cube map faces are always square!
       */
      if (rho_per_quad) {
         rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
                                         rho_bld->type, cube_rho, 0);
      }
      else {
         rho = lp_build_swizzle_scalar_aos(coord_bld, cube_rho, 0, 4);
      }
      /* Could optimize the single quad case by skipping the broadcast. */
      cubesize = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
                                            rho_bld->type, float_size, index0);
      /* skipping sqrt hence returning rho squared */
      cubesize = lp_build_mul(rho_bld, cubesize, cubesize);
      rho = lp_build_mul(rho_bld, cubesize, rho);
   }
   else if (derivs) {
      LLVMValueRef ddmax[3], ddx[3], ddy[3];
      for (i = 0; i < dims; i++) {
         LLVMValueRef floatdim;
         LLVMValueRef indexi = lp_build_const_int32(gallivm, i);

         floatdim = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
                                               coord_bld->type, float_size, indexi);

         /*
          * Note that for the rho_per_quad case we could reduce the math
          * (at some shuffle cost), but for now use the same code as for
          * the per-pixel lod case.
          */
         if (no_rho_opt) {
            ddx[i] = lp_build_mul(coord_bld, floatdim, derivs->ddx[i]);
            ddy[i] = lp_build_mul(coord_bld, floatdim, derivs->ddy[i]);
            ddx[i] = lp_build_mul(coord_bld, ddx[i], ddx[i]);
            ddy[i] = lp_build_mul(coord_bld, ddy[i], ddy[i]);
         }
         else {
            LLVMValueRef tmpx, tmpy;
            tmpx = lp_build_abs(coord_bld, derivs->ddx[i]);
            tmpy = lp_build_abs(coord_bld, derivs->ddy[i]);
            ddmax[i] = lp_build_max(coord_bld, tmpx, tmpy);
            ddmax[i] = lp_build_mul(coord_bld, floatdim, ddmax[i]);
         }
      }
      if (no_rho_opt) {
         rho_xvec = lp_build_add(coord_bld, ddx[0], ddx[1]);
         rho_yvec = lp_build_add(coord_bld, ddy[0], ddy[1]);
         if (dims > 2) {
            rho_xvec = lp_build_add(coord_bld, rho_xvec, ddx[2]);
            rho_yvec = lp_build_add(coord_bld, rho_yvec, ddy[2]);
         }
         rho = lp_build_max(coord_bld, rho_xvec, rho_yvec);
         /* skipping sqrt hence returning rho squared */
      }
      else {
         rho = ddmax[0];
         if (dims > 1) {
            rho = lp_build_max(coord_bld, rho, ddmax[1]);
            if (dims > 2) {
               rho = lp_build_max(coord_bld, rho, ddmax[2]);
            }
         }
      }
      if (rho_per_quad) {
         /*
          * rho_vec contains per-pixel rho, convert to scalar per quad.
          */
         rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
                                         rho_bld->type, rho, 0);
      }
   }
   else {
      /*
       * This looks all a bit complex, but it's not that bad
       * (the shuffle code makes it look worse than it is).
       * Still, might not be ideal for all cases.
       */
      static const unsigned char swizzle0[] = { /* no-op swizzle */
         0, LP_BLD_SWIZZLE_DONTCARE,
         LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
      };
      static const unsigned char swizzle1[] = {
         1, LP_BLD_SWIZZLE_DONTCARE,
         LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
      };
      static const unsigned char swizzle2[] = {
         2, LP_BLD_SWIZZLE_DONTCARE,
         LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
      };

      if (dims < 2) {
         ddx_ddy[0] = lp_build_packed_ddx_ddy_onecoord(coord_bld, s);
      }
      else if (dims >= 2) {
         ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(coord_bld, s, t);
         if (dims > 2) {
            ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(coord_bld, r);
         }
      }

      if (no_rho_opt) {
         static const unsigned char swizzle01[] = { /* no-op swizzle */
            0, 1,
            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
         };
         static const unsigned char swizzle23[] = {
            2, 3,
            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
         };
         LLVMValueRef ddx_ddys, ddx_ddyt, floatdim, shuffles[LP_MAX_VECTOR_LENGTH / 4];

         for (i = 0; i < num_quads; i++) {
            shuffles[i*4+0] = shuffles[i*4+1] = index0;
            shuffles[i*4+2] = shuffles[i*4+3] = index1;
         }
         floatdim = LLVMBuildShuffleVector(builder, float_size, float_size,
                                           LLVMConstVector(shuffles, length), "");
         ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], floatdim);
         ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], ddx_ddy[0]);
         ddx_ddys = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle01);
         ddx_ddyt = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle23);
         rho_vec = lp_build_add(coord_bld, ddx_ddys, ddx_ddyt);

         if (dims > 2) {
            static const unsigned char swizzle02[] = {
               0, 2,
               LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
            };
            floatdim = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
                                                  coord_bld->type, float_size, index2);
            ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], floatdim);
            ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], ddx_ddy[1]);
            ddx_ddy[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle02);
            rho_vec = lp_build_add(coord_bld, rho_vec, ddx_ddy[1]);
         }

         rho_xvec = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
         rho_yvec = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
         rho = lp_build_max(coord_bld, rho_xvec, rho_yvec);

         if (rho_per_quad) {
            rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
                                            rho_bld->type, rho, 0);
         }
         else {
            rho = lp_build_swizzle_scalar_aos(coord_bld, rho, 0, 4);
         }
         /* skipping sqrt hence returning rho squared */
      }
      else {
         ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
         if (dims > 2) {
            ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]);
         }
         else {
            ddx_ddy[1] = NULL; /* silence compiler warning */
         }

         if (dims < 2) {
            rho_xvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle0);
            rho_yvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle2);
         }
         else if (dims == 2) {
            static const unsigned char swizzle02[] = {
               0, 2,
               LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
            };
            static const unsigned char swizzle13[] = {
               1, 3,
               LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
            };
            rho_xvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle02);
            rho_yvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle13);
         }
         else {
            LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH];
            LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH];
            assert(dims == 3);
            for (i = 0; i < num_quads; i++) {
               shuffles1[4*i + 0] = lp_build_const_int32(gallivm, 4*i);
               shuffles1[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 2);
               shuffles1[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i);
               shuffles1[4*i + 3] = i32undef;
               shuffles2[4*i + 0] = lp_build_const_int32(gallivm, 4*i + 1);
               shuffles2[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 3);
               shuffles2[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i + 2);
               shuffles2[4*i + 3] = i32undef;
            }
            rho_xvec = LLVMBuildShuffleVector(builder, ddx_ddy[0], ddx_ddy[1],
                                              LLVMConstVector(shuffles1, length), "");
            rho_yvec = LLVMBuildShuffleVector(builder, ddx_ddy[0], ddx_ddy[1],
                                              LLVMConstVector(shuffles2, length), "");
         }

         rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec);

         if (bld->coord_type.length > 4) {
            /* expand size to each quad */
            if (dims > 1) {
               /* could use some broadcast_vector helper for this? */
               LLVMValueRef src[LP_MAX_VECTOR_LENGTH/4];
               for (i = 0; i < num_quads; i++) {
                  src[i] = float_size;
               }
               float_size = lp_build_concat(bld->gallivm, src, float_size_bld->type, num_quads);
            }
            else {
               float_size = lp_build_broadcast_scalar(coord_bld, float_size);
            }
            rho_vec = lp_build_mul(coord_bld, rho_vec, float_size);

            if (dims <= 1) {
               rho = rho_vec;
            }
            else {
               if (dims >= 2) {
                  LLVMValueRef rho_s, rho_t, rho_r;

                  rho_s = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
                  rho_t = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);

                  rho = lp_build_max(coord_bld, rho_s, rho_t);

                  if (dims >= 3) {
                     rho_r = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle2);
                     rho = lp_build_max(coord_bld, rho, rho_r);
                  }
               }
            }
            if (rho_per_quad) {
               rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
                                               rho_bld->type, rho, 0);
            }
            else {
               rho = lp_build_swizzle_scalar_aos(coord_bld, rho, 0, 4);
            }
         }
         else {
            if (dims <= 1) {
               rho_vec = LLVMBuildExtractElement(builder, rho_vec, index0, "");
            }
            rho_vec = lp_build_mul(float_size_bld, rho_vec, float_size);

            if (dims <= 1) {
               rho = rho_vec;
            }
            else {
               if (dims >= 2) {
                  LLVMValueRef rho_s, rho_t, rho_r;

                  rho_s = LLVMBuildExtractElement(builder, rho_vec, index0, "");
                  rho_t = LLVMBuildExtractElement(builder, rho_vec, index1, "");

                  rho = lp_build_max(float_bld, rho_s, rho_t);

                  if (dims >= 3) {
                     rho_r = LLVMBuildExtractElement(builder, rho_vec, index2, "");
                     rho = lp_build_max(float_bld, rho, rho_r);
                  }
               }
            }
            if (!rho_per_quad) {
               rho = lp_build_broadcast_scalar(rho_bld, rho);
            }
         }
      }
   }

   return rho;
}
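
/*
 * Note the returned value is rho squared wherever a "skipping sqrt"
 * comment above applies (the cube_rho and no_rho_opt paths);
 * lp_build_lod_selector() compensates either by halving log2(rho^2)
 * or via lp_build_ilog2_sqrt().
 */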


/*
 * Bri-linear lod computation
 *
 * Use a piece-wise linear approximation of log2 such that:
 * - round to nearest, for values in the neighborhood of -1, 0, 1, 2, etc.
 * - linear approximation for values in the neighborhood of 0.5, 1.5, etc.,
 *   with the steepness specified in 'factor'
 * - exact result for 0.5, 1.5, etc.
 *
 *
 *   1.0 -              /----*
 *                     /
 *                    /
 *                   /
 *   0.5 -          *
 *                 /
 *                /
 *               /
 *   0.0 - *----/
 *
 *         |                 |
 *        2^0               2^1
 *
 * This is a technique also commonly used in hardware:
 * - http://ixbtlabs.com/articles2/gffx/nv40-rx800-3.html
 *
 * TODO: For correctness, this should only be applied when the texture is
 * known to have regular mipmaps, i.e., mipmaps derived from the base level.
 *
 * TODO: This could be done in fixed point, where applicable.
 */
static void
lp_build_brilinear_lod(struct lp_build_context *bld,
                       LLVMValueRef lod,
                       double factor,
                       LLVMValueRef *out_lod_ipart,
                       LLVMValueRef *out_lod_fpart)
{
   LLVMValueRef lod_fpart;
   double pre_offset = (factor - 0.5)/factor - 0.5;
   double post_offset = 1 - factor;
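
   /*
    * Worked numbers for the factor 2 used in this file:
    * pre_offset = (2 - 0.5)/2 - 0.5 = 0.25 and post_offset = -1, so the
    * net result is lod_fpart = 2*fract(lod + 0.25) - 1. Within each
    * octave the fraction thus ramps 0..1 with slope 2 over the middle
    * half, and comes out negative (i.e. no mip blend) in the quarters
    * nearest the integer lods.
    */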

   if (0) {
      lp_build_printf(bld->gallivm, "lod = %f\n", lod);
   }

   lod = lp_build_add(bld, lod,
                      lp_build_const_vec(bld->gallivm, bld->type, pre_offset));

   lp_build_ifloor_fract(bld, lod, out_lod_ipart, &lod_fpart);

   lod_fpart = lp_build_mul(bld, lod_fpart,
                            lp_build_const_vec(bld->gallivm, bld->type, factor));

   lod_fpart = lp_build_add(bld, lod_fpart,
                            lp_build_const_vec(bld->gallivm, bld->type, post_offset));

   /*
    * It's not necessary to clamp lod_fpart since:
    * - the above expression will never produce numbers greater than one.
    * - the mip filtering branch is only taken if lod_fpart is positive
    */

   *out_lod_fpart = lod_fpart;

   if (0) {
      lp_build_printf(bld->gallivm, "lod_ipart = %i\n", *out_lod_ipart);
      lp_build_printf(bld->gallivm, "lod_fpart = %f\n\n", *out_lod_fpart);
   }
}


/*
 * Combined log2 and brilinear lod computation.
 *
 * It is essentially identical to calling lp_build_fast_log2() and
 * lp_build_brilinear_lod() above, but by combining the two we can compute
 * the integer and fractional lod parts independently.
 */
static void
lp_build_brilinear_rho(struct lp_build_context *bld,
                       LLVMValueRef rho,
                       double factor,
                       LLVMValueRef *out_lod_ipart,
                       LLVMValueRef *out_lod_fpart)
{
   LLVMValueRef lod_ipart;
   LLVMValueRef lod_fpart;

   const double pre_factor = (2*factor - 0.5)/(M_SQRT2*factor);
   const double post_offset = 1 - 2*factor;
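
   /*
    * Worked numbers for factor 2: pre_factor = 3.5/(2*M_SQRT2) ~= 1.237
    * and post_offset = -3; since the extracted mantissa m is in [1,2),
    * lod_fpart = 2*m - 3 lies in [-1,1), matching the range produced by
    * lp_build_brilinear_lod() above.
    */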

   assert(bld->type.floating);

   assert(lp_check_value(bld->type, rho));

   /*
    * The pre factor will make the intersections with the exact powers of two
    * happen precisely where we want them to be, which means that the integer
    * part will not need any post adjustments.
    */
   rho = lp_build_mul(bld, rho,
                      lp_build_const_vec(bld->gallivm, bld->type, pre_factor));

   /* ipart = ifloor(log2(rho)) */
   lod_ipart = lp_build_extract_exponent(bld, rho, 0);

   /* fpart = rho / 2**ipart */
   lod_fpart = lp_build_extract_mantissa(bld, rho);

   lod_fpart = lp_build_mul(bld, lod_fpart,
                            lp_build_const_vec(bld->gallivm, bld->type, factor));

   lod_fpart = lp_build_add(bld, lod_fpart,
                            lp_build_const_vec(bld->gallivm, bld->type, post_offset));

   /*
    * Like lp_build_brilinear_lod, it's not necessary to clamp lod_fpart since:
    * - the above expression will never produce numbers greater than one.
    * - the mip filtering branch is only taken if lod_fpart is positive
    */

   *out_lod_ipart = lod_ipart;
   *out_lod_fpart = lod_fpart;
}


/**
 * Fast implementation of iround(log2(sqrt(x))), based on
 * log2(x^n) == n*log2(x).
 *
 * Gives accurate results all the time.
 * (Could be trivially extended to handle other power-of-two roots.)
 */
static LLVMValueRef
lp_build_ilog2_sqrt(struct lp_build_context *bld,
                    LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef ipart;
   struct lp_type i_type = lp_int_type(bld->type);
   LLVMValueRef one = lp_build_const_int_vec(bld->gallivm, i_type, 1);

   assert(bld->type.floating);

   assert(lp_check_value(bld->type, x));

   /* ipart = log2(x) + 0.5 = 0.5*(log2(x^2) + 1.0) */
   ipart = lp_build_extract_exponent(bld, x, 1);
   ipart = LLVMBuildAShr(builder, ipart, one, "");

   return ipart;
}
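
/*
 * Example: x = 16.0f has (unbiased) exponent 4, so
 * lp_build_extract_exponent(bld, x, 1) yields 5 and the shift gives
 * 2 == iround(log2(sqrt(16))). In the caller below, x is the squared
 * rho from lp_build_rho(), so the result is iround(log2(rho)).
 */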


/**
 * Generate code to compute the texture level of detail (lambda).
 * \param derivs  partial derivatives of (s, t, r, q) with respect to X and Y
 * \param lod_bias  optional float vector with the shader lod bias
 * \param explicit_lod  optional float vector with the explicit lod
 * \param cube_rho  rho calculated by cube coord mapping (optional)
 * \param out_lod_ipart  integer part of lod
 * \param out_lod_fpart  float part of lod (never larger than 1 but may be negative)
 * \param out_lod_positive  (mask) if lod is positive (i.e. texture is minified)
 *
 * The resulting lod can be scalar per quad or be per element.
 */
void
lp_build_lod_selector(struct lp_build_sample_context *bld,
                      unsigned texture_unit,
                      unsigned sampler_unit,
                      LLVMValueRef s,
                      LLVMValueRef t,
                      LLVMValueRef r,
                      LLVMValueRef cube_rho,
                      const struct lp_derivatives *derivs,
                      LLVMValueRef lod_bias, /* optional */
                      LLVMValueRef explicit_lod, /* optional */
                      unsigned mip_filter,
                      LLVMValueRef *out_lod_ipart,
                      LLVMValueRef *out_lod_fpart,
                      LLVMValueRef *out_lod_positive)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct lp_sampler_dynamic_state *dynamic_state = bld->dynamic_state;
   struct lp_build_context *lodf_bld = &bld->lodf_bld;
   LLVMValueRef lod;

   *out_lod_ipart = bld->lodi_bld.zero;
   *out_lod_positive = bld->lodi_bld.zero;
   *out_lod_fpart = lodf_bld->zero;

   /*
    * For determining min/mag, we follow the GL 4.1 spec, 3.9.12 Texture
    * Magnification:
    * "Implementations may either unconditionally assume c = 0 for the
    * minification vs. magnification switch-over point, or may choose to
    * make c depend on the combination of minification and magnification
    * modes as follows: if the magnification filter is given by LINEAR and
    * the minification filter is given by NEAREST_MIPMAP_NEAREST or
    * NEAREST_MIPMAP_LINEAR, then c = 0.5. This is done to ensure that a
    * minified texture does not appear "sharper" than a magnified texture.
    * Otherwise c = 0."
    * And 3.9.11 Texture Minification:
    * "If lod is less than or equal to the constant c (see section 3.9.12)
    * the texture is said to be magnified; if it is greater, the texture is
    * minified."
    * So we always use 0 as the switch-over point, and use magnification for
    * lod == 0. Note that the "always c = 0" behavior is new (first appearing
    * in the GL 3.1 spec); older GL versions required 0.5 for the modes
    * listed above. I have no clue about the (undocumented) wishes of
    * d3d9/d3d10 here!
    */

   if (bld->static_sampler_state->min_max_lod_equal) {
      /* User is forcing sampling from a particular mipmap level.
       * This is hit during mipmap generation.
       */
      LLVMValueRef min_lod =
         dynamic_state->min_lod(dynamic_state, bld->gallivm,
                                bld->context_ptr, sampler_unit);

      lod = lp_build_broadcast_scalar(lodf_bld, min_lod);
   }
   else {
      if (explicit_lod) {
         if (bld->num_lods != bld->coord_type.length)
            lod = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
                                            lodf_bld->type, explicit_lod, 0);
         else
            lod = explicit_lod;
      }
      else {
         LLVMValueRef rho;
         boolean rho_squared = ((gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) &&
                                (bld->dims > 1)) || cube_rho;

         rho = lp_build_rho(bld, texture_unit, s, t, r, cube_rho, derivs);

         /*
          * Compute lod = log2(rho)
          */

         if (!lod_bias &&
             !bld->static_sampler_state->lod_bias_non_zero &&
             !bld->static_sampler_state->apply_max_lod &&
             !bld->static_sampler_state->apply_min_lod) {
            /*
             * Special case when there are no post-log2 adjustments, which
             * saves instructions by keeping the integer and fractional lod
             * computations separate from the start.
             */

            if (mip_filter == PIPE_TEX_MIPFILTER_NONE ||
                mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
               /*
                * We don't actually need both values all the time: lod_ipart
                * is only needed for the nearest mipfilter, lod_positive only
                * if min != mag.
                */
               if (rho_squared) {
                  *out_lod_ipart = lp_build_ilog2_sqrt(lodf_bld, rho);
               }
               else {
                  *out_lod_ipart = lp_build_ilog2(lodf_bld, rho);
               }
               *out_lod_positive = lp_build_cmp(lodf_bld, PIPE_FUNC_GREATER,
                                                rho, lodf_bld->one);
               return;
            }
            if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR &&
                !(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR) &&
                !rho_squared) {
               /*
                * This can't work if rho is squared. Not sure if it could be
                * fixed while keeping it worthwhile; we could also do a sqrt
                * here, but brilinear together with no_rho_opt seems like a
                * combination that doesn't make much sense anyway, so just
                * use the ordinary path below.
                */
               lp_build_brilinear_rho(lodf_bld, rho, BRILINEAR_FACTOR,
                                      out_lod_ipart, out_lod_fpart);
               *out_lod_positive = lp_build_cmp(lodf_bld, PIPE_FUNC_GREATER,
                                                rho, lodf_bld->one);
               return;
            }
         }

         if (0) {
            lod = lp_build_log2(lodf_bld, rho);
         }
         else {
            lod = lp_build_fast_log2(lodf_bld, rho);
         }
         if (rho_squared) {
            /* lod holds log2(rho^2); log2(rho) == 0.5*log2(rho^2) */
            lod = lp_build_mul(lodf_bld, lod,
                               lp_build_const_vec(bld->gallivm, lodf_bld->type, 0.5F));
         }

         /* add shader lod bias */
         if (lod_bias) {
            if (bld->num_lods != bld->coord_type.length)
               lod_bias = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
                                                    lodf_bld->type, lod_bias, 0);
            lod = LLVMBuildFAdd(builder, lod, lod_bias, "shader_lod_bias");
         }
      }

      /* add sampler lod bias */
      if (bld->static_sampler_state->lod_bias_non_zero) {
         LLVMValueRef sampler_lod_bias =
            dynamic_state->lod_bias(dynamic_state, bld->gallivm,
                                    bld->context_ptr, sampler_unit);
         sampler_lod_bias = lp_build_broadcast_scalar(lodf_bld,
                                                      sampler_lod_bias);
         lod = LLVMBuildFAdd(builder, lod, sampler_lod_bias, "sampler_lod_bias");
      }

      /* clamp lod */
      if (bld->static_sampler_state->apply_max_lod) {
         LLVMValueRef max_lod =
            dynamic_state->max_lod(dynamic_state, bld->gallivm,
                                   bld->context_ptr, sampler_unit);
         max_lod = lp_build_broadcast_scalar(lodf_bld, max_lod);

         lod = lp_build_min(lodf_bld, lod, max_lod);
      }
      if (bld->static_sampler_state->apply_min_lod) {
         LLVMValueRef min_lod =
            dynamic_state->min_lod(dynamic_state, bld->gallivm,
                                   bld->context_ptr, sampler_unit);
         min_lod = lp_build_broadcast_scalar(lodf_bld, min_lod);

         lod = lp_build_max(lodf_bld, lod, min_lod);
      }
   }

   *out_lod_positive = lp_build_cmp(lodf_bld, PIPE_FUNC_GREATER,
                                    lod, lodf_bld->zero);

   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
      if (!(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
         lp_build_brilinear_lod(lodf_bld, lod, BRILINEAR_FACTOR,
                                out_lod_ipart, out_lod_fpart);
      }
      else {
         lp_build_ifloor_fract(lodf_bld, lod, out_lod_ipart, out_lod_fpart);
      }

      lp_build_name(*out_lod_fpart, "lod_fpart");
   }
   else {
      *out_lod_ipart = lp_build_iround(lodf_bld, lod);
   }

   lp_build_name(*out_lod_ipart, "lod_ipart");

   return;
}


/**
 * For PIPE_TEX_MIPFILTER_NEAREST, convert the int part of the lod
 * to the actual mip level.
 * Note: this is all scalar per quad code.
 * \param lod_ipart  int texture level of detail
 * \param level_out  returns the integer mipmap level
 * \param out_of_bounds  returns a per coord out_of_bounds mask if provided
 */
void
lp_build_nearest_mip_level(struct lp_build_sample_context *bld,
                           unsigned texture_unit,
                           LLVMValueRef lod_ipart,
                           LLVMValueRef *level_out,
                           LLVMValueRef *out_of_bounds)
{
   struct lp_build_context *leveli_bld = &bld->leveli_bld;
   struct lp_sampler_dynamic_state *dynamic_state = bld->dynamic_state;
   LLVMValueRef first_level, last_level, level;

   first_level = dynamic_state->first_level(dynamic_state, bld->gallivm,
                                            bld->context_ptr, texture_unit);
   last_level = dynamic_state->last_level(dynamic_state, bld->gallivm,
                                          bld->context_ptr, texture_unit);
   first_level = lp_build_broadcast_scalar(leveli_bld, first_level);
   last_level = lp_build_broadcast_scalar(leveli_bld, last_level);

   level = lp_build_add(leveli_bld, lod_ipart, first_level);

   if (out_of_bounds) {
      LLVMValueRef out, out1;
      out = lp_build_cmp(leveli_bld, PIPE_FUNC_LESS, level, first_level);
      out1 = lp_build_cmp(leveli_bld, PIPE_FUNC_GREATER, level, last_level);
      out = lp_build_or(leveli_bld, out, out1);
      if (bld->num_mips == bld->coord_bld.type.length) {
         *out_of_bounds = out;
      }
      else if (bld->num_mips == 1) {
         *out_of_bounds = lp_build_broadcast_scalar(&bld->int_coord_bld, out);
      }
      else {
         assert(bld->num_mips == bld->coord_bld.type.length / 4);
         *out_of_bounds = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
                                                                leveli_bld->type,
                                                                bld->int_coord_bld.type,
                                                                out);
      }
      level = lp_build_andnot(&bld->int_coord_bld, level, *out_of_bounds);
      *level_out = level;
   }
   else {
      /* clamp level to the legal range of levels */
      *level_out = lp_build_clamp(leveli_bld, level, first_level, last_level);
   }
}
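
/*
 * E.g. a view with first_level 0 and last_level 10 (a 1024x1024 texture
 * with a full mip chain): lod_ipart 3 selects level 3, while lod_ipart
 * 12 either clamps to 10 or, when out_of_bounds is requested, yields
 * level 0 together with an all-ones mask for those lanes.
 */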


/**
 * For PIPE_TEX_MIPFILTER_LINEAR, convert per-quad (or per-element) int LODs
 * to two (adjacent) mipmap level indexes, and fix up the float lod part
 * accordingly.
 * Later, we'll sample from those two mipmap levels and interpolate between them.
 */
void
lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
                           unsigned texture_unit,
                           LLVMValueRef lod_ipart,
                           LLVMValueRef *lod_fpart_inout,
                           LLVMValueRef *level0_out,
                           LLVMValueRef *level1_out)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct lp_sampler_dynamic_state *dynamic_state = bld->dynamic_state;
   struct lp_build_context *leveli_bld = &bld->leveli_bld;
   struct lp_build_context *levelf_bld = &bld->levelf_bld;
   LLVMValueRef first_level, last_level;
   LLVMValueRef clamp_min;
   LLVMValueRef clamp_max;

   assert(bld->num_lods == bld->num_mips);

   first_level = dynamic_state->first_level(dynamic_state, bld->gallivm,
                                            bld->context_ptr, texture_unit);
   last_level = dynamic_state->last_level(dynamic_state, bld->gallivm,
                                          bld->context_ptr, texture_unit);
   first_level = lp_build_broadcast_scalar(leveli_bld, first_level);
   last_level = lp_build_broadcast_scalar(leveli_bld, last_level);

   *level0_out = lp_build_add(leveli_bld, lod_ipart, first_level);
   *level1_out = lp_build_add(leveli_bld, *level0_out, leveli_bld->one);

   /*
    * Clamp both *level0_out and *level1_out to [first_level, last_level], with
    * the minimum number of comparisons, and zeroing lod_fpart in the extreme
    * ends in the process.
    */

   /* *level0_out < first_level */
   clamp_min = LLVMBuildICmp(builder, LLVMIntSLT,
                             *level0_out, first_level,
                             "clamp_lod_to_first");

   *level0_out = LLVMBuildSelect(builder, clamp_min,
                                 first_level, *level0_out, "");

   *level1_out = LLVMBuildSelect(builder, clamp_min,
                                 first_level, *level1_out, "");

   *lod_fpart_inout = LLVMBuildSelect(builder, clamp_min,
                                      levelf_bld->zero, *lod_fpart_inout, "");

   /* *level0_out >= last_level */
   clamp_max = LLVMBuildICmp(builder, LLVMIntSGE,
                             *level0_out, last_level,
                             "clamp_lod_to_last");

   *level0_out = LLVMBuildSelect(builder, clamp_max,
                                 last_level, *level0_out, "");

   *level1_out = LLVMBuildSelect(builder, clamp_max,
                                 last_level, *level1_out, "");

   *lod_fpart_inout = LLVMBuildSelect(builder, clamp_max,
                                      levelf_bld->zero, *lod_fpart_inout, "");

   lp_build_name(*level0_out, "texture%u_miplevel0", texture_unit);
   lp_build_name(*level1_out, "texture%u_miplevel1", texture_unit);
   lp_build_name(*lod_fpart_inout, "texture%u_mipweight", texture_unit);
}


/**
 * Return pointer to a single mipmap level.
 * \param level  integer mipmap level
 */
LLVMValueRef
lp_build_get_mipmap_level(struct lp_build_sample_context *bld,
                          LLVMValueRef level)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef indexes[2], data_ptr, mip_offset;

   indexes[0] = lp_build_const_int32(bld->gallivm, 0);
   indexes[1] = level;
   mip_offset = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
   mip_offset = LLVMBuildLoad(builder, mip_offset, "");
   data_ptr = LLVMBuildGEP(builder, bld->base_ptr, &mip_offset, 1, "");
   return data_ptr;
}

/**
 * Return (per-pixel) offsets to mip levels.
 * \param level  integer mipmap level
 */
LLVMValueRef
lp_build_get_mip_offsets(struct lp_build_sample_context *bld,
                         LLVMValueRef level)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef indexes[2], offsets, offset1;

   indexes[0] = lp_build_const_int32(bld->gallivm, 0);
   if (bld->num_mips == 1) {
      indexes[1] = level;
      offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
      offset1 = LLVMBuildLoad(builder, offset1, "");
      offsets = lp_build_broadcast_scalar(&bld->int_coord_bld, offset1);
   }
   else if (bld->num_mips == bld->coord_bld.type.length / 4) {
      unsigned i;

      offsets = bld->int_coord_bld.undef;
      for (i = 0; i < bld->num_mips; i++) {
         LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
         LLVMValueRef indexo = lp_build_const_int32(bld->gallivm, 4 * i);
         indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
         offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
         offset1 = LLVMBuildLoad(builder, offset1, "");
         offsets = LLVMBuildInsertElement(builder, offsets, offset1, indexo, "");
      }
      offsets = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, offsets, 0, 4);
   }
   else {
      unsigned i;

      assert(bld->num_mips == bld->coord_bld.type.length);

      offsets = bld->int_coord_bld.undef;
      for (i = 0; i < bld->num_mips; i++) {
         LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
         indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
         offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
         offset1 = LLVMBuildLoad(builder, offset1, "");
         offsets = LLVMBuildInsertElement(builder, offsets, offset1, indexi, "");
      }
   }
   return offsets;
}
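
/*
 * The three branches above mirror the possible mip-level layouts:
 * num_mips == 1 (one level for all lanes, broadcast), num_mips ==
 * length/4 (one level per quad, inserted every 4th lane and then
 * smeared across the quad), and num_mips == length (fully per-element).
 * lp_build_get_level_stride_vec() below follows the same pattern.
 */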


/**
 * Codegen equivalent for u_minify().
 * \param lod_scalar  if lod is a (broadcasted) scalar
 * Return max(1, base_size >> level);
 */
LLVMValueRef
lp_build_minify(struct lp_build_context *bld,
                LLVMValueRef base_size,
                LLVMValueRef level,
                boolean lod_scalar)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   assert(lp_check_value(bld->type, base_size));
   assert(lp_check_value(bld->type, level));

   if (level == bld->zero) {
      /* if we're using mipmap level zero, no minification is needed */
      return base_size;
   }
   else {
      LLVMValueRef size;
      assert(bld->type.sign);
      if (lod_scalar ||
         (util_cpu_caps.has_avx2 || !util_cpu_caps.has_sse)) {
         size = LLVMBuildLShr(builder, base_size, level, "minify");
         size = lp_build_max(bld, size, bld->one);
      }
      else {
         /*
          * Emulate the shift with a float mul, since intel "forgot" shifts
          * with per-element shift counts until avx2; without them this
          * results in terrible scalar extraction (of both count and value),
          * scalar shift, and vector reinsertion. Should not be an issue on
          * any non-x86 cpu with a vector instruction set.
          * On cpus with AMD's XOP this should also be unnecessary, but I'm
          * not sure if llvm would emit this with current flags.
          */
         LLVMValueRef const127, const23, lf;
         struct lp_type ftype;
         struct lp_build_context fbld;
         ftype = lp_type_float_vec(32, bld->type.length * bld->type.width);
         lp_build_context_init(&fbld, bld->gallivm, ftype);
         const127 = lp_build_const_int_vec(bld->gallivm, bld->type, 127);
         const23 = lp_build_const_int_vec(bld->gallivm, bld->type, 23);

         /* calculate 2^(-level) float */
         lf = lp_build_sub(bld, const127, level);
         lf = lp_build_shl(bld, lf, const23);
         lf = LLVMBuildBitCast(builder, lf, fbld.vec_type, "");
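
         /*
          * E.g. level = 2: (127 - 2) << 23 is the IEEE-754 single
          * precision bit pattern of 2^-2 = 0.25, so the multiply below
          * effectively computes base_size >> 2, element-wise.
          */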

         /* finish shift operation by doing float mul */
         base_size = lp_build_int_to_float(&fbld, base_size);
         size = lp_build_mul(&fbld, base_size, lf);
         /*
          * do the max also with floats because
          * a) non-emulated int max requires sse41
          *    (this is actually a lie as we could cast to 16bit values
          *    as 16bit is sufficient and 16bit int max is sse2)
          * b) with avx we can do int max 4-wide but float max 8-wide
          */
         size = lp_build_max(&fbld, size, fbld.one);
         size = lp_build_itrunc(&fbld, size);
      }
      return size;
   }
}


/**
 * Dereference stride_array[mipmap_level] array to get a stride.
 * Return stride as a vector.
 */
static LLVMValueRef
lp_build_get_level_stride_vec(struct lp_build_sample_context *bld,
                              LLVMValueRef stride_array, LLVMValueRef level)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef indexes[2], stride, stride1;
   indexes[0] = lp_build_const_int32(bld->gallivm, 0);
   if (bld->num_mips == 1) {
      indexes[1] = level;
      stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
      stride1 = LLVMBuildLoad(builder, stride1, "");
      stride = lp_build_broadcast_scalar(&bld->int_coord_bld, stride1);
   }
   else if (bld->num_mips == bld->coord_bld.type.length / 4) {
      LLVMValueRef stride1;
      unsigned i;

      stride = bld->int_coord_bld.undef;
      for (i = 0; i < bld->num_mips; i++) {
         LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
         LLVMValueRef indexo = lp_build_const_int32(bld->gallivm, 4 * i);
         indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
         stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
         stride1 = LLVMBuildLoad(builder, stride1, "");
         stride = LLVMBuildInsertElement(builder, stride, stride1, indexo, "");
      }
      stride = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, stride, 0, 4);
   }
   else {
      LLVMValueRef stride1;
      unsigned i;

      assert(bld->num_mips == bld->coord_bld.type.length);

      stride = bld->int_coord_bld.undef;
      for (i = 0; i < bld->coord_bld.type.length; i++) {
         LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
         indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
         stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
         stride1 = LLVMBuildLoad(builder, stride1, "");
         stride = LLVMBuildInsertElement(builder, stride, stride1, indexi, "");
      }
   }
   return stride;
}
  1206.  
  1207.  
  1208. /**
  1209.  * When sampling a mipmap, we need to compute the width, height, depth
  1210.  * of the source levels from the level indexes.  This helper function
  1211.  * does that.
  1212.  */
  1213. void
  1214. lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
  1215.                             LLVMValueRef ilevel,
  1216.                             LLVMValueRef *out_size,
  1217.                             LLVMValueRef *row_stride_vec,
  1218.                             LLVMValueRef *img_stride_vec)
  1219. {
  1220.    const unsigned dims = bld->dims;
  1221.    LLVMValueRef ilevel_vec;
  1222.  
  1223.    /*
  1224.     * Compute width, height, depth at mipmap level 'ilevel'
  1225.     */
  1226.    if (bld->num_mips == 1) {
  1227.       ilevel_vec = lp_build_broadcast_scalar(&bld->int_size_bld, ilevel);
  1228.       *out_size = lp_build_minify(&bld->int_size_bld, bld->int_size, ilevel_vec, TRUE);
  1229.    }
  1230.    else {
  1231.       LLVMValueRef int_size_vec;
  1232.       LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
  1233.       unsigned num_quads = bld->coord_bld.type.length / 4;
  1234.       unsigned i;
  1235.  
  1236.       if (bld->num_mips == num_quads) {
  1237.          /*
  1238.           * XXX: this should be #ifndef SANE_INSTRUCTION_SET.
  1239.           * intel "forgot" the variable shift count instruction until avx2.
  1240.           * A harmless 8x32 shift gets translated into 32 instructions
  1241.           * (16 extracts, 8 scalar shifts, 8 inserts), llvm is apparently
  1242.           * unable to recognize if there are really just 2 different shift
  1243.           * count values. So do the shift 4-wide before expansion.
  1244.           */
  1245.          struct lp_build_context bld4;
  1246.          struct lp_type type4;
  1247.  
  1248.          type4 = bld->int_coord_bld.type;
  1249.          type4.length = 4;
  1250.  
  1251.          lp_build_context_init(&bld4, bld->gallivm, type4);
  1252.  
  1253.          if (bld->dims == 1) {
  1254.             assert(bld->int_size_in_bld.type.length == 1);
  1255.             int_size_vec = lp_build_broadcast_scalar(&bld4,
  1256.                                                      bld->int_size);
  1257.          }
  1258.          else {
  1259.             assert(bld->int_size_in_bld.type.length == 4);
  1260.             int_size_vec = bld->int_size;
  1261.          }
  1262.  
  1263.          for (i = 0; i < num_quads; i++) {
  1264.             LLVMValueRef ileveli;
  1265.             LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
  1266.  
  1267.             ileveli = lp_build_extract_broadcast(bld->gallivm,
  1268.                                                  bld->leveli_bld.type,
  1269.                                                  bld4.type,
  1270.                                                  ilevel,
  1271.                                                  indexi);
  1272.             tmp[i] = lp_build_minify(&bld4, int_size_vec, ileveli, TRUE);
  1273.          }
  1274.          /*
  1275.           * out_size is [w0, h0, d0, _, w1, h1, d1, _, ...] vector for dims > 1,
  1276.           * [w0, w0, w0, w0, w1, w1, w1, w1, ...] otherwise.
  1277.           */
  1278.          *out_size = lp_build_concat(bld->gallivm,
  1279.                                      tmp,
  1280.                                      bld4.type,
  1281.                                      num_quads);
  1282.       }
  1283.       else {
  1284.          /* FIXME: this is terrible and results in a _huge_ vector
  1285.           * (for the dims > 1 case).
  1286.           * Should refactor this (together with extract_image_sizes) and do
  1287.           * something more useful. Could, for instance, if we have width and
  1288.           * height in a 4-wide vector, pack all elements into an 8xi16 vector
  1289.           * (on which we can still do useful math) instead of using a 16xi32
  1290.           * vector.
  1291.           * For dims == 1 this will create a [w0, w1, w2, w3, ...] vector.
  1292.           * For dims > 1 this will create a [w0, h0, d0, _, w1, h1, d1, _, ...] vector.
  1293.           */
  1294.          assert(bld->num_mips == bld->coord_bld.type.length);
  1295.          if (bld->dims == 1) {
  1296.             assert(bld->int_size_in_bld.type.length == 1);
  1297.             int_size_vec = lp_build_broadcast_scalar(&bld->int_coord_bld,
  1298.                                                      bld->int_size);
  1299.             *out_size = lp_build_minify(&bld->int_coord_bld, int_size_vec, ilevel, FALSE);
  1300.          }
  1301.          else {
  1302.             LLVMValueRef ilevel1;
  1303.             for (i = 0; i < bld->num_mips; i++) {
  1304.                LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
  1305.                ilevel1 = lp_build_extract_broadcast(bld->gallivm, bld->int_coord_type,
  1306.                                                     bld->int_size_in_bld.type, ilevel, indexi);
  1307.                tmp[i] = bld->int_size;
  1308.                tmp[i] = lp_build_minify(&bld->int_size_in_bld, tmp[i], ilevel1, TRUE);
  1309.             }
  1310.             *out_size = lp_build_concat(bld->gallivm, tmp,
  1311.                                         bld->int_size_in_bld.type,
  1312.                                         bld->num_mips);
  1313.          }
  1314.       }
  1315.    }
  1316.  
  1317.    if (dims >= 2) {
  1318.       *row_stride_vec = lp_build_get_level_stride_vec(bld,
  1319.                                                       bld->row_stride_array,
  1320.                                                       ilevel);
  1321.    }
  1322.    if (dims == 3 || has_layer_coord(bld->static_texture_state->target)) {
  1323.       *img_stride_vec = lp_build_get_level_stride_vec(bld,
  1324.                                                       bld->img_stride_array,
  1325.                                                       ilevel);
  1326.    }
  1327. }
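
/*
 * All three paths above apply the usual minification rule per texel
 * dimension. A scalar sketch of what lp_build_minify() computes for each
 * lane (cf. u_minify() in u_math.h), kept disabled as illustration only:
 */
#if 0
static unsigned
minify_ref(unsigned base_size, unsigned level)
{
   unsigned size = base_size >> level;
   return size ? size : 1;   /* mip sizes are clamped to 1, never 0 */
}
#endif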
  1328.  
  1329.  
  1330. /**
  1331.  * Extract and broadcast texture size.
  1332.  *
  1333.  * @param size_bld    build context of the texture size vector (either
  1334.  *                     bld->int_size_bld or bld->float_size_bld)
  1335.  * @param coord_type  type of the coordinate vector (either
  1336.  *                    bld->int_coord_type or bld->coord_type)
  1337.  * @param size        vector with the texture size (width, height, depth)
  1338.  */
  1339. void
  1340. lp_build_extract_image_sizes(struct lp_build_sample_context *bld,
  1341.                              struct lp_build_context *size_bld,
  1342.                              struct lp_type coord_type,
  1343.                              LLVMValueRef size,
  1344.                              LLVMValueRef *out_width,
  1345.                              LLVMValueRef *out_height,
  1346.                              LLVMValueRef *out_depth)
  1347. {
  1348.    const unsigned dims = bld->dims;
  1349.    LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
  1350.    struct lp_type size_type = size_bld->type;
  1351.  
  1352.    if (bld->num_mips == 1) {
  1353.       *out_width = lp_build_extract_broadcast(bld->gallivm,
  1354.                                               size_type,
  1355.                                               coord_type,
  1356.                                               size,
  1357.                                               LLVMConstInt(i32t, 0, 0));
  1358.       if (dims >= 2) {
  1359.          *out_height = lp_build_extract_broadcast(bld->gallivm,
  1360.                                                   size_type,
  1361.                                                   coord_type,
  1362.                                                   size,
  1363.                                                   LLVMConstInt(i32t, 1, 0));
  1364.          if (dims == 3) {
  1365.             *out_depth = lp_build_extract_broadcast(bld->gallivm,
  1366.                                                     size_type,
  1367.                                                     coord_type,
  1368.                                                     size,
  1369.                                                     LLVMConstInt(i32t, 2, 0));
  1370.          }
  1371.       }
  1372.    }
  1373.    else {
  1374.       unsigned num_quads = bld->coord_bld.type.length / 4;
  1375.  
  1376.       if (dims == 1) {
  1377.          *out_width = size;
  1378.       }
  1379.       else if (bld->num_mips == num_quads) {
  1380.          *out_width = lp_build_swizzle_scalar_aos(size_bld, size, 0, 4);
  1381.          if (dims >= 2) {
  1382.             *out_height = lp_build_swizzle_scalar_aos(size_bld, size, 1, 4);
  1383.             if (dims == 3) {
  1384.                *out_depth = lp_build_swizzle_scalar_aos(size_bld, size, 2, 4);
  1385.             }
  1386.          }
  1387.       }
  1388.       else {
  1389.          assert(bld->num_mips == bld->coord_type.length);
  1390.          *out_width = lp_build_pack_aos_scalars(bld->gallivm, size_type,
  1391.                                                 coord_type, size, 0);
  1392.          if (dims >= 2) {
  1393.             *out_height = lp_build_pack_aos_scalars(bld->gallivm, size_type,
  1394.                                                     coord_type, size, 1);
  1395.             if (dims == 3) {
  1396.                *out_depth = lp_build_pack_aos_scalars(bld->gallivm, size_type,
  1397.                                                       coord_type, size, 2);
  1398.             }
  1399.          }
  1400.       }
  1401.    }
  1402. }
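
/*
 * Example of the layouts handled above: with dims == 2, num_quads == 2 and
 * per-quad mip levels, size is [w0, h0, _, _, w1, h1, _, _] and the
 * swizzles produce
 *    out_width  = [w0, w0, w0, w0, w1, w1, w1, w1]
 *    out_height = [h0, h0, h0, h0, h1, h1, h1, h1]
 * i.e. each size channel broadcast across its quad's lanes.
 */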
  1403.  
  1404.  
  1405. /**
  1406.  * Unnormalize coords.
  1407.  *
  1408.  * @param flt_size  vector with the float texture size (width, height, depth)
  1409.  */
  1410. void
  1411. lp_build_unnormalized_coords(struct lp_build_sample_context *bld,
  1412.                              LLVMValueRef flt_size,
  1413.                              LLVMValueRef *s,
  1414.                              LLVMValueRef *t,
  1415.                              LLVMValueRef *r)
  1416. {
  1417.    const unsigned dims = bld->dims;
  1418.    LLVMValueRef width;
  1419.    LLVMValueRef height;
  1420.    LLVMValueRef depth;
  1421.  
  1422.    lp_build_extract_image_sizes(bld,
  1423.                                 &bld->float_size_bld,
  1424.                                 bld->coord_type,
  1425.                                 flt_size,
  1426.                                 &width,
  1427.                                 &height,
  1428.                                 &depth);
  1429.  
  1430.    /* s = s * width, t = t * height */
  1431.    *s = lp_build_mul(&bld->coord_bld, *s, width);
  1432.    if (dims >= 2) {
  1433.       *t = lp_build_mul(&bld->coord_bld, *t, height);
  1434.       if (dims >= 3) {
  1435.          *r = lp_build_mul(&bld->coord_bld, *r, depth);
  1436.       }
  1437.    }
  1438. }
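
/*
 * E.g. for an 8x4 texture and normalized coords (s, t) = (0.5, 0.25), the
 * lanes above become (0.5 * 8.0, 0.25 * 4.0) = (4.0, 1.0), i.e. coords
 * measured in texels as the wrap/filter code expects.
 */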
  1439.  
  1440. /**
  1441.  * Generate new coords and faces for cubemap texels falling off the face.
  1442.  *
  1443.  * @param face   face (center) of the pixel
  1444.  * @param x0     lower x coord
  1445.  * @param x1     higher x coord (must be x0 + 1)
  1446.  * @param y0     lower y coord
  1447.  * @param y1     higher y coord (must be y0 + 1)
  1448.  * @param max_coord     texture cube (level) size - 1
  1449.  * @param next_faces    new face values when falling off
  1450.  * @param next_xcoords  new x coord values when falling off
  1451.  * @param next_ycoords  new y coord values when falling off
  1452.  *
  1453.  * The arrays hold the new values when under/overflow of
  1454.  * lower x, higher x, lower y, higher y coord would occur (in this order).
  1455.  * next_xcoords/next_ycoords have two entries each (for both new lower and
  1456.  * higher coord).
  1457.  */
  1458. void
  1459. lp_build_cube_new_coords(struct lp_build_context *ivec_bld,
  1460.                         LLVMValueRef face,
  1461.                         LLVMValueRef x0,
  1462.                         LLVMValueRef x1,
  1463.                         LLVMValueRef y0,
  1464.                         LLVMValueRef y1,
  1465.                         LLVMValueRef max_coord,
  1466.                         LLVMValueRef next_faces[4],
  1467.                         LLVMValueRef next_xcoords[4][2],
  1468.                         LLVMValueRef next_ycoords[4][2])
  1469. {
  1470.    /*
  1471.     * Lookup tables aren't nice for SIMD code, hence try some logic here.
  1472.     * (Note that while it would not be necessary to do per-sample (4) lookups
  1473.     * when using a LUT, since it's impossible for texels to fall off of positive
  1474.     * and negative edges simultaneously, it would still be necessary to
  1475.     * do 2 lookups for corner handling, as in that case texels fall off of
  1476.     * both the x and y axes.)
  1477.     */
  1478.    /*
  1479.     * Next faces (for face 012345):
  1480.     * x < 0.0  : 451110
  1481.     * x >= 1.0 : 540001
  1482.     * y < 0.0  : 225422
  1483.     * y >= 1.0 : 334533
  1484.     * Hence nfx+ (and nfy+) == nfx- (nfy-) xor 1
  1485.     * nfx-: face > 1 ? (face == 5 ? 0 : 1) : (4 + (face & 1))
  1486.     * nfy+: (face & ~4) > 1 ? face + 2 : 3
  1487.     * This could also use pshufb instead, but would need (manually coded)
  1488.     * ssse3 intrinsic (llvm won't do non-constant shuffles).
  1489.     */
  1490.    struct gallivm_state *gallivm = ivec_bld->gallivm;
  1491.    LLVMValueRef sel, sel_f2345, sel_f23, sel_f2, tmpsel, tmp;
  1492.    LLVMValueRef faceand1, sel_fand1, maxmx0, maxmx1, maxmy0, maxmy1;
  1493.    LLVMValueRef c2 = lp_build_const_int_vec(gallivm, ivec_bld->type, 2);
  1494.    LLVMValueRef c3 = lp_build_const_int_vec(gallivm, ivec_bld->type, 3);
  1495.    LLVMValueRef c4 = lp_build_const_int_vec(gallivm, ivec_bld->type, 4);
  1496.    LLVMValueRef c5 = lp_build_const_int_vec(gallivm, ivec_bld->type, 5);
  1497.  
  1498.    sel = lp_build_cmp(ivec_bld, PIPE_FUNC_EQUAL, face, c5);
  1499.    tmpsel = lp_build_select(ivec_bld, sel, ivec_bld->zero, ivec_bld->one);
  1500.    sel_f2345 = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, face, ivec_bld->one);
  1501.    faceand1 = lp_build_and(ivec_bld, face, ivec_bld->one);
  1502.    tmp = lp_build_add(ivec_bld, faceand1, c4);
  1503.    next_faces[0] = lp_build_select(ivec_bld, sel_f2345, tmpsel, tmp);
  1504.    next_faces[1] = lp_build_xor(ivec_bld, next_faces[0], ivec_bld->one);
  1505.  
  1506.    tmp = lp_build_andnot(ivec_bld, face, c4);
  1507.    sel_f23 = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, tmp, ivec_bld->one);
  1508.    tmp = lp_build_add(ivec_bld, face, c2);
  1509.    next_faces[3] = lp_build_select(ivec_bld, sel_f23, tmp, c3);
  1510.    next_faces[2] = lp_build_xor(ivec_bld, next_faces[3], ivec_bld->one);
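
   /*
    * Worked example, face 2 (PIPE_TEX_FACE_POS_Y): sel_f2345 is true and
    * face != 5, so nfx- = 1 (NEG_X) and nfx+ = 1 ^ 1 = 0 (POS_X); face & ~4
    * is 2, so nfy+ = 2 + 2 = 4 (POS_Z) and nfy- = 4 ^ 1 = 5 (NEG_Z),
    * matching column 2 of the 451110/540001/225422/334533 tables above.
    */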
  1511.  
  1512.    /*
  1513.     * new xcoords (for face 012345):
  1514.     * x < 0.0  : max   max   t     max-t max  max
  1515.     * x >= 1.0 : 0     0     max-t t     0    0
  1516.     * y < 0.0  : max   0     max-s s     s    max-s
  1517.     * y >= 1.0 : max   0     s     max-s s    max-s
  1518.     *
  1519.     * ncx[1] = (face & ~4) > 1 ? (face == 2 ? max-t : t) : 0
  1520.     * ncx[0] = max - ncx[1]
  1521.     * ncx[3] = face > 1 ? (face & 1 ? max-s : s) : ((face & 1) ? 0 : max)
  1522.     * ncx[2] = (face & ~4) > 1 ? max - ncx[3] : ncx[3]
  1523.     */
  1524.    sel_f2 = lp_build_cmp(ivec_bld, PIPE_FUNC_EQUAL, face, c2);
  1525.    maxmy0 = lp_build_sub(ivec_bld, max_coord, y0);
  1526.    tmp = lp_build_select(ivec_bld, sel_f2, maxmy0, y0);
  1527.    next_xcoords[1][0] = lp_build_select(ivec_bld, sel_f23, tmp, ivec_bld->zero);
  1528.    next_xcoords[0][0] = lp_build_sub(ivec_bld, max_coord, next_xcoords[1][0]);
  1529.    maxmy1 = lp_build_sub(ivec_bld, max_coord, y1);
  1530.    tmp = lp_build_select(ivec_bld, sel_f2, maxmy1, y1);
  1531.    next_xcoords[1][1] = lp_build_select(ivec_bld, sel_f23, tmp, ivec_bld->zero);
  1532.    next_xcoords[0][1] = lp_build_sub(ivec_bld, max_coord, next_xcoords[1][1]);
  1533.  
  1534.    sel_fand1 = lp_build_cmp(ivec_bld, PIPE_FUNC_EQUAL, faceand1, ivec_bld->one);
  1535.  
  1536.    tmpsel = lp_build_select(ivec_bld, sel_fand1, ivec_bld->zero, max_coord);
  1537.    maxmx0 = lp_build_sub(ivec_bld, max_coord, x0);
  1538.    tmp = lp_build_select(ivec_bld, sel_fand1, maxmx0, x0);
  1539.    next_xcoords[3][0] = lp_build_select(ivec_bld, sel_f2345, tmp, tmpsel);
  1540.    tmp = lp_build_sub(ivec_bld, max_coord, next_xcoords[3][0]);
  1541.    next_xcoords[2][0] = lp_build_select(ivec_bld, sel_f23, tmp, next_xcoords[3][0]);
  1542.    maxmx1 = lp_build_sub(ivec_bld, max_coord, x1);
  1543.    tmp = lp_build_select(ivec_bld, sel_fand1, maxmx1, x1);
  1544.    next_xcoords[3][1] = lp_build_select(ivec_bld, sel_f2345, tmp, tmpsel);
  1545.    tmp = lp_build_sub(ivec_bld, max_coord, next_xcoords[3][1]);
  1546.    next_xcoords[2][1] = lp_build_select(ivec_bld, sel_f23, tmp, next_xcoords[3][1]);
  1547.  
  1548.    /*
  1549.     * new ycoords (for face 012345):
  1550.     * x < 0.0  : t     t     0     max   t    t
  1551.     * x >= 1.0 : t     t     0     max   t    t
  1552.     * y < 0.0  : max-s s     0     max   max  0
  1553.     * y >= 1.0 : s     max-s 0     max   0    max
  1554.     *
  1555.     * ncy[0] = (face & ~4) > 1 ? (face == 2 ? 0 : max) : t
  1556.     * ncy[1] = ncy[0]
  1557.     * ncy[3] = face > 1 ? (face & 1 ? max : 0) : ((face & 1) ? max-s : s)
  1558.     * ncy[2] = (face & ~4) > 1 ? ncy[3] : max - ncy[3]
  1559.     */
  1560.    tmp = lp_build_select(ivec_bld, sel_f2, ivec_bld->zero, max_coord);
  1561.    next_ycoords[0][0] = lp_build_select(ivec_bld, sel_f23, tmp, y0);
  1562.    next_ycoords[1][0] = next_ycoords[0][0];
  1563.    next_ycoords[0][1] = lp_build_select(ivec_bld, sel_f23, tmp, y1);
  1564.    next_ycoords[1][1] = next_ycoords[0][1];
  1565.  
  1566.    tmpsel = lp_build_select(ivec_bld, sel_fand1, maxmx0, x0);
  1567.    tmp = lp_build_select(ivec_bld, sel_fand1, max_coord, ivec_bld->zero);
  1568.    next_ycoords[3][0] = lp_build_select(ivec_bld, sel_f2345, tmp, tmpsel);
  1569.    tmp = lp_build_sub(ivec_bld, max_coord, next_ycoords[3][0]);
  1570.    next_ycoords[2][0] = lp_build_select(ivec_bld, sel_f23, next_ycoords[3][0], tmp);
  1571.    tmpsel = lp_build_select(ivec_bld, sel_fand1, maxmx1, x1);
  1572.    tmp = lp_build_select(ivec_bld, sel_fand1, max_coord, ivec_bld->zero);
  1573.    next_ycoords[3][1] = lp_build_select(ivec_bld, sel_f2345, tmp, tmpsel);
  1574.    tmp = lp_build_sub(ivec_bld, max_coord, next_ycoords[3][1]);
  1575.    next_ycoords[2][1] = lp_build_select(ivec_bld, sel_f23, next_ycoords[3][1], tmp);
  1576. }
  1577.  
  1578.  
  1579. /** Helper used by lp_build_cube_lookup() */
  1580. static LLVMValueRef
  1581. lp_build_cube_imapos(struct lp_build_context *coord_bld, LLVMValueRef coord)
  1582. {
  1583.    /* ima = +0.5 / abs(coord); */
  1584.    LLVMValueRef posHalf = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, 0.5);
  1585.    LLVMValueRef absCoord = lp_build_abs(coord_bld, coord);
  1586.    LLVMValueRef ima = lp_build_div(coord_bld, posHalf, absCoord);
  1587.    return ima;
  1588. }
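
/*
 * Note the 0.5 in the numerator folds the [-1,1] -> [0,1] remapping of the
 * per-face coords into the reciprocal: callers compute 0.5 * sc/|ma| + 0.5
 * with a single multiply plus the final posHalf add.
 */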
  1589.  
  1590.  
  1591. /** Helper for doing 3-wise selection.
  1592.  * Returns sel1 ? val2 : (sel0 ? val0 : val1).
  1593.  */
  1594. static LLVMValueRef
  1595. lp_build_select3(struct lp_build_context *sel_bld,
  1596.                  LLVMValueRef sel0,
  1597.                  LLVMValueRef sel1,
  1598.                  LLVMValueRef val0,
  1599.                  LLVMValueRef val1,
  1600.                  LLVMValueRef val2)
  1601. {
  1602.    LLVMValueRef tmp;
  1603.    tmp = lp_build_select(sel_bld, sel0, val0, val1);
  1604.    return lp_build_select(sel_bld, sel1, val2, tmp);
  1605. }
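
/*
 * E.g. lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, s, t, r) below
 * yields r in all lanes where |r| >= max(|s|, |t|), else s or t depending
 * on as_ge_at - exactly the per-pixel major axis pick cube face selection
 * needs.
 */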
  1606.  
  1607.  
  1608. /**
  1609.  * Generate code to do cube face selection and compute per-face texcoords.
  1610.  */
  1611. void
  1612. lp_build_cube_lookup(struct lp_build_sample_context *bld,
  1613.                      LLVMValueRef *coords,
  1614.                      const struct lp_derivatives *derivs_in, /* optional */
  1615.                      LLVMValueRef *rho,
  1616.                      struct lp_derivatives *derivs_out, /* optional */
  1617.                      boolean need_derivs)
  1618. {
  1619.    struct lp_build_context *coord_bld = &bld->coord_bld;
  1620.    LLVMBuilderRef builder = bld->gallivm->builder;
  1621.    struct gallivm_state *gallivm = bld->gallivm;
  1622.    LLVMValueRef si, ti, ri;
  1623.  
  1624.    /*
  1625.     * Do per-pixel face selection. We cannot, however (as we used to do),
  1626.     * simply calculate the derivs afterwards (which is very bogus for
  1627.     * explicit derivs btw) because the values would be "random" when
  1628.     * not all pixels lie on the same face. So what we do here is just
  1629.     * calculate the derivatives after scaling the coords by the absolute
  1630.     * value of the inverse major axis, and essentially do the rho calculation
  1631.     * steps as if it were a 3d texture. This is perfect if all pixels hit
  1632.     * the same face, but not so great at edges; I believe the max error
  1633.     * should be sqrt(2) with no_rho_approx or 2 otherwise (essentially measuring
  1634.     * the 3d distance between 2 points on the cube instead of measuring up/down
  1635.     * the edge). Still, this is possibly a win over just selecting the same face
  1636.     * for all pixels. Unfortunately, something like that doesn't work for
  1637.     * explicit derivatives.
  1638.     */
  1639.    struct lp_build_context *cint_bld = &bld->int_coord_bld;
  1640.    struct lp_type intctype = cint_bld->type;
  1641.    LLVMTypeRef coord_vec_type = coord_bld->vec_type;
  1642.    LLVMTypeRef cint_vec_type = cint_bld->vec_type;
  1643.    LLVMValueRef as, at, ar, face, face_s, face_t;
  1644.    LLVMValueRef as_ge_at, maxasat, ar_ge_as_at;
  1645.    LLVMValueRef snewx, tnewx, snewy, tnewy, snewz, tnewz;
  1646.    LLVMValueRef tnegi, rnegi;
  1647.    LLVMValueRef ma, mai, signma, signmabit, imahalfpos;
  1648.    LLVMValueRef posHalf = lp_build_const_vec(gallivm, coord_bld->type, 0.5);
  1649.    LLVMValueRef signmask = lp_build_const_int_vec(gallivm, intctype,
  1650.                                                   1LL << (intctype.width - 1));
  1651.    LLVMValueRef signshift = lp_build_const_int_vec(gallivm, intctype,
  1652.                                                    intctype.width -1);
  1653.    LLVMValueRef facex = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_X);
  1654.    LLVMValueRef facey = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Y);
  1655.    LLVMValueRef facez = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Z);
  1656.    LLVMValueRef s = coords[0];
  1657.    LLVMValueRef t = coords[1];
  1658.    LLVMValueRef r = coords[2];
  1659.  
  1660.    assert(PIPE_TEX_FACE_NEG_X == PIPE_TEX_FACE_POS_X + 1);
  1661.    assert(PIPE_TEX_FACE_NEG_Y == PIPE_TEX_FACE_POS_Y + 1);
  1662.    assert(PIPE_TEX_FACE_NEG_Z == PIPE_TEX_FACE_POS_Z + 1);
  1663.  
  1664.    /*
  1665.     * get absolute value (for x/y/z face selection) and sign bit
  1666.     * (for mirroring minor coords and pos/neg face selection)
  1667.     * of the original coords.
  1668.     */
  1669.    as = lp_build_abs(&bld->coord_bld, s);
  1670.    at = lp_build_abs(&bld->coord_bld, t);
  1671.    ar = lp_build_abs(&bld->coord_bld, r);
  1672.  
  1673.    /*
  1674.     * major face determination: select x if x > y, else select y;
  1675.     * select z if z >= max(x,y), else select the previous result.
  1676.     * If some axes are equal we choose z over y, y over x - the
  1677.     * dx10 spec seems to ask for it while OpenGL doesn't care (if we
  1678.     * didn't care we could save a select or two by using different
  1679.     * compares and doing at_g_as_ar last, since tnewx and tnewz are the
  1680.     * same).
  1681.     */
  1682.    as_ge_at = lp_build_cmp(coord_bld, PIPE_FUNC_GREATER, as, at);
  1683.    maxasat = lp_build_max(coord_bld, as, at);
  1684.    ar_ge_as_at = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, ar, maxasat);
  1685.  
  1686.    if (need_derivs && (derivs_in ||
  1687.        ((gallivm_debug & GALLIVM_DEBUG_NO_QUAD_LOD) &&
  1688.         (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX)))) {
  1689.       /*
  1690.        * XXX: This is really, really complex.
  1691.        * It is a bit overkill to use this for implicit derivatives as well -
  1692.        * there is no way this is worth the cost in practice - but it seems
  1693.        * to be the only way to get accurate, per-pixel lod values.
  1694.        */
  1695.       LLVMValueRef ima, imahalf, tmp, ddx[3], ddy[3];
  1696.       LLVMValueRef madx, mady, madxdivma, madydivma;
  1697.       LLVMValueRef sdxi, tdxi, rdxi, sdyi, tdyi, rdyi;
  1698.       LLVMValueRef tdxnegi, rdxnegi, tdynegi, rdynegi;
  1699.       LLVMValueRef sdxnewx, sdxnewy, sdxnewz, tdxnewx, tdxnewy, tdxnewz;
  1700.       LLVMValueRef sdynewx, sdynewy, sdynewz, tdynewx, tdynewy, tdynewz;
  1701.       LLVMValueRef face_sdx, face_tdx, face_sdy, face_tdy;
  1702.       /*
  1703.        * s = 1/2 * ( sc / ma + 1)
  1704.        * t = 1/2 * ( tc / ma + 1)
  1705.        *
  1706.        * s' = 1/2 * (sc' * ma - sc * ma') / ma^2
  1707.        * t' = 1/2 * (tc' * ma - tc * ma') / ma^2
  1708.        *
  1709.        * dx.s = 0.5 * (dx.sc - sc * dx.ma / ma) / ma
  1710.        * dx.t = 0.5 * (dx.tc - tc * dx.ma / ma) / ma
  1711.        * dy.s = 0.5 * (dy.sc - sc * dy.ma / ma) / ma
  1712.        * dy.t = 0.5 * (dy.tc - tc * dy.ma / ma) / ma
  1713.        */
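      /*
       * (The four derivative lines follow from the quotient rule applied to
       * sc/ma: (sc/ma)' = (sc'*ma - sc*ma') / ma^2 = (sc' - sc*ma'/ma) / ma,
       * times the 1/2 of the coord remapping - the imahalf factor below.)
       */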
  1714.  
  1715.       /* select ma, calculate ima */
  1716.       ma = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, s, t, r);
  1717.       mai = LLVMBuildBitCast(builder, ma, cint_vec_type, "");
  1718.       signmabit = LLVMBuildAnd(builder, mai, signmask, "");
  1719.       ima = lp_build_div(coord_bld, coord_bld->one, ma);
  1720.       imahalf = lp_build_mul(coord_bld, posHalf, ima);
  1721.       imahalfpos = lp_build_abs(coord_bld, imahalf);
  1722.  
  1723.       if (!derivs_in) {
  1724.          ddx[0] = lp_build_ddx(coord_bld, s);
  1725.          ddx[1] = lp_build_ddx(coord_bld, t);
  1726.          ddx[2] = lp_build_ddx(coord_bld, r);
  1727.          ddy[0] = lp_build_ddy(coord_bld, s);
  1728.          ddy[1] = lp_build_ddy(coord_bld, t);
  1729.          ddy[2] = lp_build_ddy(coord_bld, r);
  1730.       }
  1731.       else {
  1732.          ddx[0] = derivs_in->ddx[0];
  1733.          ddx[1] = derivs_in->ddx[1];
  1734.          ddx[2] = derivs_in->ddx[2];
  1735.          ddy[0] = derivs_in->ddy[0];
  1736.          ddy[1] = derivs_in->ddy[1];
  1737.          ddy[2] = derivs_in->ddy[2];
  1738.       }
  1739.  
  1740.       /* select major derivatives */
  1741.       madx = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, ddx[0], ddx[1], ddx[2]);
  1742.       mady = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, ddy[0], ddy[1], ddy[2]);
  1743.  
  1744.       si = LLVMBuildBitCast(builder, s, cint_vec_type, "");
  1745.       ti = LLVMBuildBitCast(builder, t, cint_vec_type, "");
  1746.       ri = LLVMBuildBitCast(builder, r, cint_vec_type, "");
  1747.  
  1748.       sdxi = LLVMBuildBitCast(builder, ddx[0], cint_vec_type, "");
  1749.       tdxi = LLVMBuildBitCast(builder, ddx[1], cint_vec_type, "");
  1750.       rdxi = LLVMBuildBitCast(builder, ddx[2], cint_vec_type, "");
  1751.  
  1752.       sdyi = LLVMBuildBitCast(builder, ddy[0], cint_vec_type, "");
  1753.       tdyi = LLVMBuildBitCast(builder, ddy[1], cint_vec_type, "");
  1754.       rdyi = LLVMBuildBitCast(builder, ddy[2], cint_vec_type, "");
  1755.  
  1756.       /*
  1757.        * compute all possible new s/t coords, which does the mirroring,
  1758.        * and do the same for the derivs of the minor axes.
  1759.        * snewx = signma * -r;
  1760.        * tnewx = -t;
  1761.        * snewy = s;
  1762.        * tnewy = signma * r;
  1763.        * snewz = signma * s;
  1764.        * tnewz = -t;
  1765.        */
  1766.       tnegi = LLVMBuildXor(builder, ti, signmask, "");
  1767.       rnegi = LLVMBuildXor(builder, ri, signmask, "");
  1768.       tdxnegi = LLVMBuildXor(builder, tdxi, signmask, "");
  1769.       rdxnegi = LLVMBuildXor(builder, rdxi, signmask, "");
  1770.       tdynegi = LLVMBuildXor(builder, tdyi, signmask, "");
  1771.       rdynegi = LLVMBuildXor(builder, rdyi, signmask, "");
  1772.  
  1773.       snewx = LLVMBuildXor(builder, signmabit, rnegi, "");
  1774.       tnewx = tnegi;
  1775.       sdxnewx = LLVMBuildXor(builder, signmabit, rdxnegi, "");
  1776.       tdxnewx = tdxnegi;
  1777.       sdynewx = LLVMBuildXor(builder, signmabit, rdynegi, "");
  1778.       tdynewx = tdynegi;
  1779.  
  1780.       snewy = si;
  1781.       tnewy = LLVMBuildXor(builder, signmabit, ri, "");
  1782.       sdxnewy = sdxi;
  1783.       tdxnewy = LLVMBuildXor(builder, signmabit, rdxi, "");
  1784.       sdynewy = sdyi;
  1785.       tdynewy = LLVMBuildXor(builder, signmabit, rdyi, "");
  1786.  
  1787.       snewz = LLVMBuildXor(builder, signmabit, si, "");
  1788.       tnewz = tnegi;
  1789.       sdxnewz = LLVMBuildXor(builder, signmabit, sdxi, "");
  1790.       tdxnewz = tdxnegi;
  1791.       sdynewz = LLVMBuildXor(builder, signmabit, sdyi, "");
  1792.       tdynewz = tdynegi;
  1793.  
  1794.       /* select the mirrored values */
  1795.       face = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, facex, facey, facez);
  1796.       face_s = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, snewx, snewy, snewz);
  1797.       face_t = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, tnewx, tnewy, tnewz);
  1798.       face_sdx = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, sdxnewx, sdxnewy, sdxnewz);
  1799.       face_tdx = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, tdxnewx, tdxnewy, tdxnewz);
  1800.       face_sdy = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, sdynewx, sdynewy, sdynewz);
  1801.       face_tdy = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, tdynewx, tdynewy, tdynewz);
  1802.  
  1803.       face_s = LLVMBuildBitCast(builder, face_s, coord_vec_type, "");
  1804.       face_t = LLVMBuildBitCast(builder, face_t, coord_vec_type, "");
  1805.       face_sdx = LLVMBuildBitCast(builder, face_sdx, coord_vec_type, "");
  1806.       face_tdx = LLVMBuildBitCast(builder, face_tdx, coord_vec_type, "");
  1807.       face_sdy = LLVMBuildBitCast(builder, face_sdy, coord_vec_type, "");
  1808.       face_tdy = LLVMBuildBitCast(builder, face_tdy, coord_vec_type, "");
  1809.  
  1810.       /* deriv math, dx.s = 0.5 * (dx.sc - sc * dx.ma / ma) / ma */
  1811.       madxdivma = lp_build_mul(coord_bld, madx, ima);
  1812.       tmp = lp_build_mul(coord_bld, madxdivma, face_s);
  1813.       tmp = lp_build_sub(coord_bld, face_sdx, tmp);
  1814.       derivs_out->ddx[0] = lp_build_mul(coord_bld, tmp, imahalf);
  1815.  
  1816.       /* dx.t = 0.5 * (dx.tc - tc * dx.ma / ma) / ma */
  1817.       tmp = lp_build_mul(coord_bld, madxdivma, face_t);
  1818.       tmp = lp_build_sub(coord_bld, face_tdx, tmp);
  1819.       derivs_out->ddx[1] = lp_build_mul(coord_bld, tmp, imahalf);
  1820.  
  1821.       /* dy.s = 0.5 * (dy.sc - sc * dy.ma / ma) / ma */
  1822.       madydivma = lp_build_mul(coord_bld, mady, ima);
  1823.       tmp = lp_build_mul(coord_bld, madydivma, face_s);
  1824.       tmp = lp_build_sub(coord_bld, face_sdy, tmp);
  1825.       derivs_out->ddy[0] = lp_build_mul(coord_bld, tmp, imahalf);
  1826.  
  1827.       /* dy.t = 0.5 * (dy.tc - tc * dy.ma / ma) / ma */
  1828.       tmp = lp_build_mul(coord_bld, madydivma, face_t);
  1829.       tmp = lp_build_sub(coord_bld, face_tdy, tmp);
  1830.       derivs_out->ddy[1] = lp_build_mul(coord_bld, tmp, imahalf);
  1831.  
  1832.       signma = LLVMBuildLShr(builder, mai, signshift, "");
  1833.       coords[2] = LLVMBuildOr(builder, face, signma, "face");
  1834.  
  1835.       /* project coords */
  1836.       face_s = lp_build_mul(coord_bld, face_s, imahalfpos);
  1837.       face_t = lp_build_mul(coord_bld, face_t, imahalfpos);
  1838.  
  1839.       coords[0] = lp_build_add(coord_bld, face_s, posHalf);
  1840.       coords[1] = lp_build_add(coord_bld, face_t, posHalf);
  1841.  
  1842.       return;
  1843.    }
  1844.  
  1845.    else if (need_derivs) {
  1846.       LLVMValueRef ddx_ddy[2], tmp[3], rho_vec;
  1847.       static const unsigned char swizzle0[] = { /* no-op swizzle */
  1848.          0, LP_BLD_SWIZZLE_DONTCARE,
  1849.          LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
  1850.       };
  1851.       static const unsigned char swizzle1[] = {
  1852.          1, LP_BLD_SWIZZLE_DONTCARE,
  1853.          LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
  1854.       };
  1855.       static const unsigned char swizzle01[] = { /* no-op swizzle */
  1856.          0, 1,
  1857.          LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
  1858.       };
  1859.       static const unsigned char swizzle23[] = {
  1860.          2, 3,
  1861.          LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
  1862.       };
  1863.       static const unsigned char swizzle02[] = {
  1864.          0, 2,
  1865.          LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
  1866.       };
  1867.  
  1868.       /*
  1869.        * scale the s/t/r coords pre-select/mirror so we can calculate
  1870.        * "reasonable" derivs.
  1871.        */
  1872.       ma = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, s, t, r);
  1873.       imahalfpos = lp_build_cube_imapos(coord_bld, ma);
  1874.       s = lp_build_mul(coord_bld, s, imahalfpos);
  1875.       t = lp_build_mul(coord_bld, t, imahalfpos);
  1876.       r = lp_build_mul(coord_bld, r, imahalfpos);
  1877.  
  1878.       /*
  1879.        * This isn't quite the same as the "ordinary" (3d deriv) path since we
  1880.        * know the texture is square, which simplifies things (we can completely
  1881.        * omit the size mul, which normally happens very early, and do it at the
  1882.        * very end).
  1883.        * Also always do calculations according to GALLIVM_DEBUG_NO_RHO_APPROX
  1884.        * since the error can get quite big otherwise at edges.
  1885.        * (With no_rho_approx max error is sqrt(2) at edges, same as it is
  1886.        * without no_rho_approx for 2d textures, otherwise it would be factor 2.)
  1887.        */
  1888.       ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(coord_bld, s, t);
  1889.       ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(coord_bld, r);
  1890.  
  1891.       ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], ddx_ddy[0]);
  1892.       ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], ddx_ddy[1]);
  1893.  
  1894.       tmp[0] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle01);
  1895.       tmp[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle23);
  1896.       tmp[2] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle02);
  1897.  
  1898.       rho_vec = lp_build_add(coord_bld, tmp[0], tmp[1]);
  1899.       rho_vec = lp_build_add(coord_bld, rho_vec, tmp[2]);
  1900.  
  1901.       tmp[0] = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
  1902.       tmp[1] = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
  1903.       *rho = lp_build_max(coord_bld, tmp[0], tmp[1]);
  1904.    }
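
   /*
    * Note rho at this point holds squared gradient lengths - the max of the
    * dx and dy sums of squares per quad. No square root is taken here; this
    * works out since log2(rho^2) * 0.5 == log2(rho), which the later lod
    * computation can exploit.
    */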
  1905.  
  1906.    if (!need_derivs) {
  1907.       ma = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, s, t, r);
  1908.    }
  1909.    mai = LLVMBuildBitCast(builder, ma, cint_vec_type, "");
  1910.    signmabit = LLVMBuildAnd(builder, mai, signmask, "");
  1911.  
  1912.    si = LLVMBuildBitCast(builder, s, cint_vec_type, "");
  1913.    ti = LLVMBuildBitCast(builder, t, cint_vec_type, "");
  1914.    ri = LLVMBuildBitCast(builder, r, cint_vec_type, "");
  1915.  
  1916.    /*
  1917.     * compute all possible new s/t coords, which does the mirroring
  1918.     * snewx = signma * -r;
  1919.     * tnewx = -t;
  1920.     * snewy = s;
  1921.     * tnewy = signma * r;
  1922.     * snewz = signma * s;
  1923.     * tnewz = -t;
  1924.     */
  1925.    tnegi = LLVMBuildXor(builder, ti, signmask, "");
  1926.    rnegi = LLVMBuildXor(builder, ri, signmask, "");
  1927.  
  1928.    snewx = LLVMBuildXor(builder, signmabit, rnegi, "");
  1929.    tnewx = tnegi;
  1930.  
  1931.    snewy = si;
  1932.    tnewy = LLVMBuildXor(builder, signmabit, ri, "");
  1933.  
  1934.    snewz = LLVMBuildXor(builder, signmabit, si, "");
  1935.    tnewz = tnegi;
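
   /*
    * Note all the negations above happen in integer land: XOR with signmask
    * flips a float's sign bit (negation), and XOR with signmabit (the sign
    * bit of ma) conditionally negates, i.e. multiplies by sign(ma) - cheaper
    * than actual float multiplies.
    */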
  1936.  
  1937.    /* select the mirrored values */
  1938.    face_s = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, snewx, snewy, snewz);
  1939.    face_t = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, tnewx, tnewy, tnewz);
  1940.    face = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, facex, facey, facez);
  1941.  
  1942.    face_s = LLVMBuildBitCast(builder, face_s, coord_vec_type, "");
  1943.    face_t = LLVMBuildBitCast(builder, face_t, coord_vec_type, "");
  1944.  
  1945.    /* add +1 for neg face */
  1946.    /* XXX with AVX probably want to use another select here -
  1947.     * as long as we ensure vblendvps gets used we can actually
  1948.     * skip the comparison and just use sign as a "mask" directly.
  1949.     */
  1950.    signma = LLVMBuildLShr(builder, mai, signshift, "");
  1951.    coords[2] = LLVMBuildOr(builder, face, signma, "face");
  1952.  
  1953.    /* project coords */
  1954.    if (!need_derivs) {
  1955.       imahalfpos = lp_build_cube_imapos(coord_bld, ma);
  1956.       face_s = lp_build_mul(coord_bld, face_s, imahalfpos);
  1957.       face_t = lp_build_mul(coord_bld, face_t, imahalfpos);
  1958.    }
  1959.  
  1960.    coords[0] = lp_build_add(coord_bld, face_s, posHalf);
  1961.    coords[1] = lp_build_add(coord_bld, face_t, posHalf);
  1962. }
  1963.  
  1964.  
  1965. /**
  1966.  * Compute the partial offset of a pixel block along an arbitrary axis.
  1967.  *
  1968.  * @param coord   coordinate in pixels
  1969.  * @param stride  number of bytes between rows of successive pixel blocks
  1970.  * @param block_length  number of pixels in a pixels block along the coordinate
  1971.  *                      axis
  1972.  * @param out_offset    resulting relative offset of the pixel block in bytes
  1973.  * @param out_subcoord  resulting sub-block pixel coordinate
  1974.  */
  1975. void
  1976. lp_build_sample_partial_offset(struct lp_build_context *bld,
  1977.                                unsigned block_length,
  1978.                                LLVMValueRef coord,
  1979.                                LLVMValueRef stride,
  1980.                                LLVMValueRef *out_offset,
  1981.                                LLVMValueRef *out_subcoord)
  1982. {
  1983.    LLVMBuilderRef builder = bld->gallivm->builder;
  1984.    LLVMValueRef offset;
  1985.    LLVMValueRef subcoord;
  1986.  
  1987.    if (block_length == 1) {
  1988.       subcoord = bld->zero;
  1989.    }
  1990.    else {
  1991.       /*
  1992.        * Pixel blocks have power of two dimensions. LLVM should convert the
  1993.        * rem/div to bit arithmetic.
  1994.        * TODO: Verify this.
  1995.        * It does indeed, BUT it transforms the operation to scalar (and back)
  1996.        * when doing so (using roughly extract, shift/and, mov, unpack; llvm 2.7).
  1997.        * The generated code looks seriously unfunny and is quite expensive.
  1998.        */
  1999. #if 0
  2000.       LLVMValueRef block_width = lp_build_const_int_vec(bld->type, block_length);
  2001.       subcoord = LLVMBuildURem(builder, coord, block_width, "");
  2002.       coord    = LLVMBuildUDiv(builder, coord, block_width, "");
  2003. #else
  2004.       unsigned logbase2 = util_logbase2(block_length);
  2005.       LLVMValueRef block_shift = lp_build_const_int_vec(bld->gallivm, bld->type, logbase2);
  2006.       LLVMValueRef block_mask = lp_build_const_int_vec(bld->gallivm, bld->type, block_length - 1);
  2007.       subcoord = LLVMBuildAnd(builder, coord, block_mask, "");
  2008.       coord = LLVMBuildLShr(builder, coord, block_shift, "");
  2009. #endif
  2010.    }
  2011.  
  2012.    offset = lp_build_mul(bld, coord, stride);
  2013.  
  2014.    assert(out_offset);
  2015.    assert(out_subcoord);
  2016.  
  2017.    *out_offset = offset;
  2018.    *out_subcoord = subcoord;
  2019. }
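
/*
 * E.g. for a 4-pixel-wide block (block_length == 4): logbase2 == 2 and
 * block_mask == 3, so coord 13 splits into subcoord 13 & 3 == 1 and block
 * coord 13 >> 2 == 3 - the same results as 13 % 4 and 13 / 4, but without
 * the scalarized urem/udiv code mentioned above.
 */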
  2020.  
  2021.  
  2022. /**
  2023.  * Compute the offset of a pixel block.
  2024.  *
  2025.  * x, y, z, y_stride, z_stride are vectors, and they refer to pixels.
  2026.  *
  2027.  * Returns the relative offset and i,j sub-block coordinates
  2028.  */
  2029. void
  2030. lp_build_sample_offset(struct lp_build_context *bld,
  2031.                        const struct util_format_description *format_desc,
  2032.                        LLVMValueRef x,
  2033.                        LLVMValueRef y,
  2034.                        LLVMValueRef z,
  2035.                        LLVMValueRef y_stride,
  2036.                        LLVMValueRef z_stride,
  2037.                        LLVMValueRef *out_offset,
  2038.                        LLVMValueRef *out_i,
  2039.                        LLVMValueRef *out_j)
  2040. {
  2041.    LLVMValueRef x_stride;
  2042.    LLVMValueRef offset;
  2043.  
  2044.    x_stride = lp_build_const_vec(bld->gallivm, bld->type,
  2045.                                  format_desc->block.bits/8);
  2046.  
  2047.    lp_build_sample_partial_offset(bld,
  2048.                                   format_desc->block.width,
  2049.                                   x, x_stride,
  2050.                                   &offset, out_i);
  2051.  
  2052.    if (y && y_stride) {
  2053.       LLVMValueRef y_offset;
  2054.       lp_build_sample_partial_offset(bld,
  2055.                                      format_desc->block.height,
  2056.                                      y, y_stride,
  2057.                                      &y_offset, out_j);
  2058.       offset = lp_build_add(bld, offset, y_offset);
  2059.    }
  2060.    else {
  2061.       *out_j = bld->zero;
  2062.    }
  2063.  
  2064.    if (z && z_stride) {
  2065.       LLVMValueRef z_offset;
  2066.       LLVMValueRef k;
  2067.       lp_build_sample_partial_offset(bld,
  2068.                                      1, /* pixel blocks are always 2D */
  2069.                                      z, z_stride,
  2070.                                      &z_offset, &k);
  2071.       offset = lp_build_add(bld, offset, z_offset);
  2072.    }
  2073.  
  2074.    *out_offset = offset;
  2075. }
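
/*
 * E.g. for an uncompressed 32bpp 2D format (1x1 blocks, block.bits == 32)
 * x_stride is 4 bytes, so texel (x = 5, y = 3) with y_stride == 256 gives
 * offset == 5*4 + 3*256 == 788, with i == j == 0 since there is no
 * sub-block coordinate.
 */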
  2076.