Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Blame | Last modification | View Log | RSS feed

  1. /**************************************************************************
  2.  *
  3.  * Copyright 2009 VMware, Inc.
  4.  * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
  5.  * All Rights Reserved.
  6.  *
  7.  * Permission is hereby granted, free of charge, to any person obtaining a
  8.  * copy of this software and associated documentation files (the
  9.  * "Software"), to deal in the Software without restriction, including
  10.  * without limitation the rights to use, copy, modify, merge, publish,
  11.  * distribute, sub license, and/or sell copies of the Software, and to
  12.  * permit persons to whom the Software is furnished to do so, subject to
  13.  * the following conditions:
  14.  *
  15.  * The above copyright notice and this permission notice (including the
  16.  * next paragraph) shall be included in all copies or substantial portions
  17.  * of the Software.
  18.  *
  19.  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  20.  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  21.  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  22.  * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
  23.  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  24.  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  25.  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  26.  *
  27.  **************************************************************************/
  28.  
  29. /**
  30.  * @file
  31.  * Position and shader input interpolation.
  32.  *
  33.  * @author Jose Fonseca <jfonseca@vmware.com>
  34.  */
  35.  
  36. #include "pipe/p_shader_tokens.h"
  37. #include "util/u_debug.h"
  38. #include "util/u_memory.h"
  39. #include "util/u_math.h"
  40. #include "tgsi/tgsi_scan.h"
  41. #include "gallivm/lp_bld_debug.h"
  42. #include "gallivm/lp_bld_const.h"
  43. #include "gallivm/lp_bld_arit.h"
  44. #include "gallivm/lp_bld_swizzle.h"
  45. #include "gallivm/lp_bld_flow.h"
  46. #include "lp_bld_interp.h"
  47.  
  48.  
  49. /*
  50.  * The shader JIT function operates on blocks of quads.
  51.  * Each block has 2x2 quads and each quad has 2x2 pixels.
  52.  *
  53.  * We iterate over the quads in order 0, 1, 2, 3:
  54.  *
  55.  * #################
  56.  * #   |   #   |   #
  57.  * #---0---#---1---#
  58.  * #   |   #   |   #
  59.  * #################
  60.  * #   |   #   |   #
  61.  * #---2---#---3---#
  62.  * #   |   #   |   #
  63.  * #################
  64.  *
  65.  * If we iterate over multiple quads at once, quads 01 and 23 are processed
  66.  * together.
  67.  *
  68.  * Within each quad, we have four pixels which are represented in SOA
  69.  * order:
  70.  *
  71.  * #########
  72.  * # 0 | 1 #
  73.  * #---+---#
  74.  * # 2 | 3 #
  75.  * #########
  76.  *
  77.  * So the green channel (for example) of the four pixels is stored in
  78.  * a single vector register: {g0, g1, g2, g3}.
  79.  * The order stays the same even with multiple quads:
  80.  * 0 1 4 5
  81.  * 2 3 6 7
  82.  * is stored as g0..g7
  83.  */
  84.  
  85.  
  86. /**
  87.  * Do one perspective divide per quad.
  88.  *
  89.  * For perspective interpolation, the final attribute value is given
  90.  *
  91.  *  a' = a/w = a * oow
  92.  *
  93.  * where
  94.  *
  95.  *  a = a0 + dadx*x + dady*y
  96.  *  w = w0 + dwdx*x + dwdy*y
  97.  *  oow = 1/w = 1/(w0 + dwdx*x + dwdy*y)
  98.  *
  99.  * Instead of computing the division per pixel, with this macro we compute the
  100.  * division on the upper left pixel of each quad, and use a linear
  101.  * approximation in the remaining pixels, given by:
  102.  *
  103.  *  da'dx = (dadx - dwdx*a)*oow
  104.  *  da'dy = (dady - dwdy*a)*oow
  105.  *
  106.  * Ironically, this actually makes things slower -- probably because the
  107.  * divide hardware unit is rarely used, whereas the multiply unit is typically
  108.  * already saturated.
  109.  */
  110. #define PERSPECTIVE_DIVIDE_PER_QUAD 0
  111.  
  112.  
  113. static const unsigned char quad_offset_x[16] = {0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3};
  114. static const unsigned char quad_offset_y[16] = {0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3};
  115.  
  116.  
  117. static void
  118. attrib_name(LLVMValueRef val, unsigned attrib, unsigned chan, const char *suffix)
  119. {
  120.    if(attrib == 0)
  121.       lp_build_name(val, "pos.%c%s", "xyzw"[chan], suffix);
  122.    else
  123.       lp_build_name(val, "input%u.%c%s", attrib - 1, "xyzw"[chan], suffix);
  124. }
  125.  
  126. static void
  127. calc_offsets(struct lp_build_context *coeff_bld,
  128.              unsigned quad_start_index,
  129.              LLVMValueRef *pixoffx,
  130.              LLVMValueRef *pixoffy)
  131. {
  132.    unsigned i;
  133.    unsigned num_pix = coeff_bld->type.length;
  134.    struct gallivm_state *gallivm = coeff_bld->gallivm;
  135.    LLVMBuilderRef builder = coeff_bld->gallivm->builder;
  136.    LLVMValueRef nr, pixxf, pixyf;
  137.  
  138.    *pixoffx = coeff_bld->undef;
  139.    *pixoffy = coeff_bld->undef;
  140.  
  141.    for (i = 0; i < num_pix; i++) {
  142.       nr = lp_build_const_int32(gallivm, i);
  143.       pixxf = lp_build_const_float(gallivm, quad_offset_x[i % num_pix] +
  144.                                    (quad_start_index & 1) * 2);
  145.       pixyf = lp_build_const_float(gallivm, quad_offset_y[i % num_pix] +
  146.                                    (quad_start_index & 2));
  147.       *pixoffx = LLVMBuildInsertElement(builder, *pixoffx, pixxf, nr, "");
  148.       *pixoffy = LLVMBuildInsertElement(builder, *pixoffy, pixyf, nr, "");
  149.    }
  150. }
  151.  
  152.  
  153. /* Much easier, and significantly less instructions in the per-stamp
  154.  * part (less than half) but overall more instructions so a loss if
  155.  * most quads are active. Might be a win though with larger vectors.
  156.  * No ability to do per-quad divide (doable but not implemented)
  157.  * Could be made to work with passed in pixel offsets (i.e. active quad merging).
  158.  */
  159. static void
  160. coeffs_init_simple(struct lp_build_interp_soa_context *bld,
  161.                    LLVMValueRef a0_ptr,
  162.                    LLVMValueRef dadx_ptr,
  163.                    LLVMValueRef dady_ptr)
  164. {
  165.    struct lp_build_context *coeff_bld = &bld->coeff_bld;
  166.    struct lp_build_context *setup_bld = &bld->setup_bld;
  167.    struct gallivm_state *gallivm = coeff_bld->gallivm;
  168.    LLVMBuilderRef builder = gallivm->builder;
  169.    unsigned attrib;
  170.  
  171.    for (attrib = 0; attrib < bld->num_attribs; ++attrib) {
  172.       /*
  173.        * always fetch all 4 values for performance/simplicity
  174.        * Note: we do that here because it seems to generate better
  175.        * code. It generates a lot of moves initially but less
  176.        * moves later. As far as I can tell this looks like a
  177.        * llvm issue, instead of simply reloading the values from
  178.        * the passed in pointers it if it runs out of registers
  179.        * it spills/reloads them. Maybe some optimization passes
  180.        * would help.
  181.        * Might want to investigate this again later.
  182.        */
  183.       const unsigned interp = bld->interp[attrib];
  184.       LLVMValueRef index = lp_build_const_int32(gallivm,
  185.                                 attrib * TGSI_NUM_CHANNELS);
  186.       LLVMValueRef ptr;
  187.       LLVMValueRef dadxaos = setup_bld->zero;
  188.       LLVMValueRef dadyaos = setup_bld->zero;
  189.       LLVMValueRef a0aos = setup_bld->zero;
  190.  
  191.       switch (interp) {
  192.       case LP_INTERP_PERSPECTIVE:
  193.          /* fall-through */
  194.  
  195.       case LP_INTERP_LINEAR:
  196.          ptr = LLVMBuildGEP(builder, dadx_ptr, &index, 1, "");
  197.          ptr = LLVMBuildBitCast(builder, ptr,
  198.                LLVMPointerType(setup_bld->vec_type, 0), "");
  199.          dadxaos = LLVMBuildLoad(builder, ptr, "");
  200.  
  201.          ptr = LLVMBuildGEP(builder, dady_ptr, &index, 1, "");
  202.          ptr = LLVMBuildBitCast(builder, ptr,
  203.                LLVMPointerType(setup_bld->vec_type, 0), "");
  204.          dadyaos = LLVMBuildLoad(builder, ptr, "");
  205.  
  206.          attrib_name(dadxaos, attrib, 0, ".dadxaos");
  207.          attrib_name(dadyaos, attrib, 0, ".dadyaos");
  208.          /* fall-through */
  209.  
  210.       case LP_INTERP_CONSTANT:
  211.       case LP_INTERP_FACING:
  212.          ptr = LLVMBuildGEP(builder, a0_ptr, &index, 1, "");
  213.          ptr = LLVMBuildBitCast(builder, ptr,
  214.                LLVMPointerType(setup_bld->vec_type, 0), "");
  215.          a0aos = LLVMBuildLoad(builder, ptr, "");
  216.          attrib_name(a0aos, attrib, 0, ".a0aos");
  217.          break;
  218.  
  219.       case LP_INTERP_POSITION:
  220.          /* Nothing to do as the position coeffs are already setup in slot 0 */
  221.          continue;
  222.  
  223.       default:
  224.          assert(0);
  225.          break;
  226.       }
  227.       bld->a0aos[attrib] = a0aos;
  228.       bld->dadxaos[attrib] = dadxaos;
  229.       bld->dadyaos[attrib] = dadyaos;
  230.    }
  231. }
  232.  
  233. /**
  234.  * Interpolate the shader input attribute values.
  235.  * This is called for each (group of) quad(s).
  236.  */
  237. static void
  238. attribs_update_simple(struct lp_build_interp_soa_context *bld,
  239.                       struct gallivm_state *gallivm,
  240.                       LLVMValueRef loop_iter,
  241.                       int start,
  242.                       int end)
  243. {
  244.    LLVMBuilderRef builder = gallivm->builder;
  245.    struct lp_build_context *coeff_bld = &bld->coeff_bld;
  246.    struct lp_build_context *setup_bld = &bld->setup_bld;
  247.    LLVMValueRef oow = NULL;
  248.    unsigned attrib;
  249.    LLVMValueRef pixoffx;
  250.    LLVMValueRef pixoffy;
  251.    LLVMValueRef ptr;
  252.  
  253.    /* could do this with code-generated passed in pixel offsets too */
  254.  
  255.    assert(loop_iter);
  256.    ptr = LLVMBuildGEP(builder, bld->xoffset_store, &loop_iter, 1, "");
  257.    pixoffx = LLVMBuildLoad(builder, ptr, "");
  258.    ptr = LLVMBuildGEP(builder, bld->yoffset_store, &loop_iter, 1, "");
  259.    pixoffy = LLVMBuildLoad(builder, ptr, "");
  260.  
  261.    pixoffx = LLVMBuildFAdd(builder, pixoffx,
  262.                            lp_build_broadcast_scalar(coeff_bld, bld->x), "");
  263.    pixoffy = LLVMBuildFAdd(builder, pixoffy,
  264.                            lp_build_broadcast_scalar(coeff_bld, bld->y), "");
  265.  
  266.    for (attrib = start; attrib < end; attrib++) {
  267.       const unsigned mask = bld->mask[attrib];
  268.       const unsigned interp = bld->interp[attrib];
  269.       unsigned chan;
  270.  
  271.       for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
  272.          if (mask & (1 << chan)) {
  273.             LLVMValueRef index;
  274.             LLVMValueRef dadx = coeff_bld->zero;
  275.             LLVMValueRef dady = coeff_bld->zero;
  276.             LLVMValueRef a = coeff_bld->zero;
  277.  
  278.             index = lp_build_const_int32(gallivm, chan);
  279.             switch (interp) {
  280.             case LP_INTERP_PERSPECTIVE:
  281.                /* fall-through */
  282.  
  283.             case LP_INTERP_LINEAR:
  284.                if (attrib == 0 && chan == 0) {
  285.                   dadx = coeff_bld->one;
  286.                   if (bld->pos_offset) {
  287.                      a = lp_build_const_vec(gallivm, coeff_bld->type, bld->pos_offset);
  288.                   }
  289.                }
  290.                else if (attrib == 0 && chan == 1) {
  291.                   dady = coeff_bld->one;
  292.                   if (bld->pos_offset) {
  293.                      a = lp_build_const_vec(gallivm, coeff_bld->type, bld->pos_offset);
  294.                   }
  295.                }
  296.                else {
  297.                   dadx = lp_build_extract_broadcast(gallivm, setup_bld->type,
  298.                                                     coeff_bld->type, bld->dadxaos[attrib],
  299.                                                     index);
  300.                   dady = lp_build_extract_broadcast(gallivm, setup_bld->type,
  301.                                                     coeff_bld->type, bld->dadyaos[attrib],
  302.                                                     index);
  303.                   a = lp_build_extract_broadcast(gallivm, setup_bld->type,
  304.                                                  coeff_bld->type, bld->a0aos[attrib],
  305.                                                  index);
  306.                }
  307.                /*
  308.                 * a = a0 + (x * dadx + y * dady)
  309.                 */
  310.                dadx = LLVMBuildFMul(builder, dadx, pixoffx, "");
  311.                dady = LLVMBuildFMul(builder, dady, pixoffy, "");
  312.                a = LLVMBuildFAdd(builder, a, dadx, "");
  313.                a = LLVMBuildFAdd(builder, a, dady, "");
  314.  
  315.                if (interp == LP_INTERP_PERSPECTIVE) {
  316.                   if (oow == NULL) {
  317.                      LLVMValueRef w = bld->attribs[0][3];
  318.                      assert(attrib != 0);
  319.                      assert(bld->mask[0] & TGSI_WRITEMASK_W);
  320.                      oow = lp_build_rcp(coeff_bld, w);
  321.                   }
  322.                   a = lp_build_mul(coeff_bld, a, oow);
  323.                }
  324.                break;
  325.  
  326.             case LP_INTERP_CONSTANT:
  327.             case LP_INTERP_FACING:
  328.                a = lp_build_extract_broadcast(gallivm, setup_bld->type,
  329.                                               coeff_bld->type, bld->a0aos[attrib],
  330.                                               index);
  331.                break;
  332.  
  333.             case LP_INTERP_POSITION:
  334.                assert(attrib > 0);
  335.                a = bld->attribs[0][chan];
  336.                break;
  337.  
  338.             default:
  339.                assert(0);
  340.                break;
  341.             }
  342.  
  343.             if ((attrib == 0) && (chan == 2)){
  344.                /* FIXME: Depth values can exceed 1.0, due to the fact that
  345.                 * setup interpolation coefficients refer to (0,0) which causes
  346.                 * precision loss. So we must clamp to 1.0 here to avoid artifacts
  347.                 */
  348.                a = lp_build_min(coeff_bld, a, coeff_bld->one);
  349.             }
  350.             bld->attribs[attrib][chan] = a;
  351.          }
  352.       }
  353.    }
  354. }
  355.  
  356. /**
  357.  * Initialize the bld->a, dadq fields.  This involves fetching
  358.  * those values from the arrays which are passed into the JIT function.
  359.  */
  360. static void
  361. coeffs_init(struct lp_build_interp_soa_context *bld,
  362.             LLVMValueRef a0_ptr,
  363.             LLVMValueRef dadx_ptr,
  364.             LLVMValueRef dady_ptr)
  365. {
  366.    struct lp_build_context *coeff_bld = &bld->coeff_bld;
  367.    struct lp_build_context *setup_bld = &bld->setup_bld;
  368.    struct gallivm_state *gallivm = coeff_bld->gallivm;
  369.    LLVMBuilderRef builder = gallivm->builder;
  370.    LLVMValueRef pixoffx, pixoffy;
  371.    unsigned attrib;
  372.    unsigned chan;
  373.    unsigned i;
  374.  
  375.    pixoffx = coeff_bld->undef;
  376.    pixoffy = coeff_bld->undef;
  377.    for (i = 0; i < coeff_bld->type.length; i++) {
  378.       LLVMValueRef nr = lp_build_const_int32(gallivm, i);
  379.       LLVMValueRef pixxf = lp_build_const_float(gallivm, quad_offset_x[i]);
  380.       LLVMValueRef pixyf = lp_build_const_float(gallivm, quad_offset_y[i]);
  381.       pixoffx = LLVMBuildInsertElement(builder, pixoffx, pixxf, nr, "");
  382.       pixoffy = LLVMBuildInsertElement(builder, pixoffy, pixyf, nr, "");
  383.    }
  384.  
  385.  
  386.    for (attrib = 0; attrib < bld->num_attribs; ++attrib) {
  387.       const unsigned mask = bld->mask[attrib];
  388.       const unsigned interp = bld->interp[attrib];
  389.       LLVMValueRef index = lp_build_const_int32(gallivm,
  390.                                 attrib * TGSI_NUM_CHANNELS);
  391.       LLVMValueRef ptr;
  392.       LLVMValueRef dadxaos = setup_bld->zero;
  393.       LLVMValueRef dadyaos = setup_bld->zero;
  394.       LLVMValueRef a0aos = setup_bld->zero;
  395.  
  396.       /* always fetch all 4 values for performance/simplicity */
  397.       switch (interp) {
  398.       case LP_INTERP_PERSPECTIVE:
  399.          /* fall-through */
  400.  
  401.       case LP_INTERP_LINEAR:
  402.          ptr = LLVMBuildGEP(builder, dadx_ptr, &index, 1, "");
  403.          ptr = LLVMBuildBitCast(builder, ptr,
  404.                LLVMPointerType(setup_bld->vec_type, 0), "");
  405.          dadxaos = LLVMBuildLoad(builder, ptr, "");
  406.  
  407.          ptr = LLVMBuildGEP(builder, dady_ptr, &index, 1, "");
  408.          ptr = LLVMBuildBitCast(builder, ptr,
  409.                LLVMPointerType(setup_bld->vec_type, 0), "");
  410.          dadyaos = LLVMBuildLoad(builder, ptr, "");
  411.  
  412.          attrib_name(dadxaos, attrib, 0, ".dadxaos");
  413.          attrib_name(dadyaos, attrib, 0, ".dadyaos");
  414.          /* fall-through */
  415.  
  416.       case LP_INTERP_CONSTANT:
  417.       case LP_INTERP_FACING:
  418.          ptr = LLVMBuildGEP(builder, a0_ptr, &index, 1, "");
  419.          ptr = LLVMBuildBitCast(builder, ptr,
  420.                LLVMPointerType(setup_bld->vec_type, 0), "");
  421.          a0aos = LLVMBuildLoad(builder, ptr, "");
  422.          attrib_name(a0aos, attrib, 0, ".a0aos");
  423.          break;
  424.  
  425.       case LP_INTERP_POSITION:
  426.          /* Nothing to do as the position coeffs are already setup in slot 0 */
  427.          continue;
  428.  
  429.       default:
  430.          assert(0);
  431.          break;
  432.       }
  433.  
  434.       /*
  435.        * a = a0 + (x * dadx + y * dady)
  436.        * a0aos is the attrib value at top left corner of stamp
  437.        */
  438.       if (interp != LP_INTERP_CONSTANT &&
  439.           interp != LP_INTERP_FACING) {
  440.          LLVMValueRef axaos, ayaos;
  441.          axaos = LLVMBuildFMul(builder, lp_build_broadcast_scalar(setup_bld, bld->x),
  442.                                dadxaos, "");
  443.          ayaos = LLVMBuildFMul(builder, lp_build_broadcast_scalar(setup_bld, bld->y),
  444.                                dadyaos, "");
  445.          a0aos = LLVMBuildFAdd(builder, a0aos, ayaos, "");
  446.          a0aos = LLVMBuildFAdd(builder, a0aos, axaos, "");
  447.       }
  448.  
  449.       /*
  450.        * dadq = {0, dadx, dady, dadx + dady}
  451.        * for two quads (side by side) this is:
  452.        * {0, dadx, dady, dadx+dady, 2*dadx, 2*dadx+dady, 3*dadx+dady}
  453.        */
  454.       for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
  455.          /* this generates a CRAPLOAD of shuffles... */
  456.          if (mask & (1 << chan)) {
  457.             LLVMValueRef dadx, dady;
  458.             LLVMValueRef dadq, dadq2;
  459.             LLVMValueRef a;
  460.             LLVMValueRef chan_index = lp_build_const_int32(gallivm, chan);
  461.  
  462.             if (attrib == 0 && chan == 0) {
  463.                a = bld->x;
  464.                if (bld->pos_offset) {
  465.                   a = LLVMBuildFAdd(builder, a, lp_build_const_float(gallivm, bld->pos_offset), "");
  466.                }
  467.                a = lp_build_broadcast_scalar(coeff_bld, a);
  468.                dadx = coeff_bld->one;
  469.                dady = coeff_bld->zero;
  470.             }
  471.             else if (attrib == 0 && chan == 1) {
  472.                a = bld->y;
  473.                if (bld->pos_offset) {
  474.                   a = LLVMBuildFAdd(builder, a, lp_build_const_float(gallivm, bld->pos_offset), "");
  475.                }
  476.                a = lp_build_broadcast_scalar(coeff_bld, a);
  477.                dady = coeff_bld->one;
  478.                dadx = coeff_bld->zero;
  479.             }
  480.             else {
  481.                dadx = lp_build_extract_broadcast(gallivm, setup_bld->type,
  482.                                               coeff_bld->type, dadxaos, chan_index);
  483.                dady = lp_build_extract_broadcast(gallivm, setup_bld->type,
  484.                                               coeff_bld->type, dadyaos, chan_index);
  485.  
  486.                /*
  487.                 * a = {a, a, a, a}
  488.                 */
  489.                a = lp_build_extract_broadcast(gallivm, setup_bld->type,
  490.                                               coeff_bld->type, a0aos, chan_index);
  491.             }
  492.  
  493.             dadx = LLVMBuildFMul(builder, dadx, pixoffx, "");
  494.             dady = LLVMBuildFMul(builder, dady, pixoffy, "");
  495.             dadq = LLVMBuildFAdd(builder, dadx, dady, "");
  496.  
  497.             /*
  498.              * Compute the attrib values on the upper-left corner of each
  499.              * group of quads.
  500.              * Note that if we process 2 quads at once this doesn't
  501.              * really exactly to what we want.
  502.              * We need to access elem 0 and 2 respectively later if we process
  503.              * 2 quads at once.
  504.              */
  505.  
  506.             if (interp != LP_INTERP_CONSTANT &&
  507.                 interp != LP_INTERP_FACING) {
  508.                dadq2 = LLVMBuildFAdd(builder, dadq, dadq, "");
  509.                a = LLVMBuildFAdd(builder, a, dadq2, "");
  510.             }
  511.  
  512. #if PERSPECTIVE_DIVIDE_PER_QUAD
  513.             /*
  514.              * a *= 1 / w
  515.              */
  516.  
  517.             /*
  518.              * XXX since we're only going to access elements 0,2 out of 8
  519.              * if we have 8-wide vectors we should do the division only 4-wide.
  520.              * a is really a 2-elements in a 4-wide vector disguised as 8-wide
  521.              * in this case.
  522.              */
  523.             if (interp == LP_INTERP_PERSPECTIVE) {
  524.                LLVMValueRef w = bld->a[0][3];
  525.                assert(attrib != 0);
  526.                assert(bld->mask[0] & TGSI_WRITEMASK_W);
  527.                if (!bld->oow) {
  528.                   bld->oow = lp_build_rcp(coeff_bld, w);
  529.                   lp_build_name(bld->oow, "oow");
  530.                }
  531.                a = lp_build_mul(coeff_bld, a, bld->oow);
  532.             }
  533. #endif
  534.  
  535.             attrib_name(a, attrib, chan, ".a");
  536.             attrib_name(dadq, attrib, chan, ".dadq");
  537.  
  538.             bld->a[attrib][chan] = lp_build_alloca(gallivm,
  539.                                                    LLVMTypeOf(a), "");
  540.             LLVMBuildStore(builder, a, bld->a[attrib][chan]);
  541.             bld->dadq[attrib][chan] = dadq;
  542.          }
  543.       }
  544.    }
  545. }
  546.  
  547.  
  548. /**
  549.  * Increment the shader input attribute values.
  550.  * This is called when we move from one quad to the next.
  551.  */
  552. static void
  553. attribs_update(struct lp_build_interp_soa_context *bld,
  554.                struct gallivm_state *gallivm,
  555.                LLVMValueRef loop_iter,
  556.                int start,
  557.                int end)
  558. {
  559.    LLVMBuilderRef builder = gallivm->builder;
  560.    struct lp_build_context *coeff_bld = &bld->coeff_bld;
  561.    LLVMValueRef oow = NULL;
  562.    unsigned attrib;
  563.    unsigned chan;
  564.  
  565.    for(attrib = start; attrib < end; ++attrib) {
  566.       const unsigned mask = bld->mask[attrib];
  567.       const unsigned interp = bld->interp[attrib];
  568.       for(chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
  569.          if(mask & (1 << chan)) {
  570.             LLVMValueRef a;
  571.             if (interp == LP_INTERP_CONSTANT ||
  572.                 interp == LP_INTERP_FACING) {
  573.                a = LLVMBuildLoad(builder, bld->a[attrib][chan], "");
  574.             }
  575.             else if (interp == LP_INTERP_POSITION) {
  576.                assert(attrib > 0);
  577.                a = bld->attribs[0][chan];
  578.             }
  579.             else {
  580.                LLVMValueRef dadq;
  581.  
  582.                a = bld->a[attrib][chan];
  583.  
  584.                /*
  585.                 * Broadcast the attribute value for this quad into all elements
  586.                 */
  587.  
  588.                {
  589.                   /* stored as vector load as float */
  590.                   LLVMTypeRef ptr_type = LLVMPointerType(LLVMFloatTypeInContext(
  591.                                                             gallivm->context), 0);
  592.                   LLVMValueRef ptr;
  593.                   a = LLVMBuildBitCast(builder, a, ptr_type, "");
  594.                   ptr = LLVMBuildGEP(builder, a, &loop_iter, 1, "");
  595.                   a = LLVMBuildLoad(builder, ptr, "");
  596.                   a = lp_build_broadcast_scalar(&bld->coeff_bld, a);
  597.                }
  598.  
  599.                /*
  600.                 * Get the derivatives.
  601.                 */
  602.  
  603.                dadq = bld->dadq[attrib][chan];
  604.  
  605. #if PERSPECTIVE_DIVIDE_PER_QUAD
  606.                if (interp == LP_INTERP_PERSPECTIVE) {
  607.                   LLVMValueRef dwdq = bld->dadq[0][3];
  608.  
  609.                   if (oow == NULL) {
  610.                      assert(bld->oow);
  611.                      oow = LLVMBuildShuffleVector(coeff_bld->builder,
  612.                                                   bld->oow, coeff_bld->undef,
  613.                                                   shuffle, "");
  614.                   }
  615.  
  616.                   dadq = lp_build_sub(coeff_bld,
  617.                                       dadq,
  618.                                       lp_build_mul(coeff_bld, a, dwdq));
  619.                   dadq = lp_build_mul(coeff_bld, dadq, oow);
  620.                }
  621. #endif
  622.  
  623.                /*
  624.                 * Add the derivatives
  625.                 */
  626.  
  627.                a = lp_build_add(coeff_bld, a, dadq);
  628.  
  629. #if !PERSPECTIVE_DIVIDE_PER_QUAD
  630.                if (interp == LP_INTERP_PERSPECTIVE) {
  631.                   if (oow == NULL) {
  632.                      LLVMValueRef w = bld->attribs[0][3];
  633.                      assert(attrib != 0);
  634.                      assert(bld->mask[0] & TGSI_WRITEMASK_W);
  635.                      oow = lp_build_rcp(coeff_bld, w);
  636.                   }
  637.                   a = lp_build_mul(coeff_bld, a, oow);
  638.                }
  639. #endif
  640.  
  641.                if (attrib == 0 && chan == 2) {
  642.                   /* FIXME: Depth values can exceed 1.0, due to the fact that
  643.                    * setup interpolation coefficients refer to (0,0) which causes
  644.                    * precision loss. So we must clamp to 1.0 here to avoid artifacts
  645.                    */
  646.                   a = lp_build_min(coeff_bld, a, coeff_bld->one);
  647.                }
  648.  
  649.                attrib_name(a, attrib, chan, "");
  650.             }
  651.             bld->attribs[attrib][chan] = a;
  652.          }
  653.       }
  654.    }
  655. }
  656.  
  657.  
  658. /**
  659.  * Generate the position vectors.
  660.  *
  661.  * Parameter x0, y0 are the integer values with upper left coordinates.
  662.  */
  663. static void
  664. pos_init(struct lp_build_interp_soa_context *bld,
  665.          LLVMValueRef x0,
  666.          LLVMValueRef y0)
  667. {
  668.    LLVMBuilderRef builder = bld->coeff_bld.gallivm->builder;
  669.    struct lp_build_context *coeff_bld = &bld->coeff_bld;
  670.  
  671.    bld->x = LLVMBuildSIToFP(builder, x0, coeff_bld->elem_type, "");
  672.    bld->y = LLVMBuildSIToFP(builder, y0, coeff_bld->elem_type, "");
  673. }
  674.  
  675.  
  676. /**
  677.  * Initialize fragment shader input attribute info.
  678.  */
  679. void
  680. lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
  681.                          struct gallivm_state *gallivm,
  682.                          unsigned num_inputs,
  683.                          const struct lp_shader_input *inputs,
  684.                          boolean pixel_center_integer,
  685.                          LLVMBuilderRef builder,
  686.                          struct lp_type type,
  687.                          LLVMValueRef a0_ptr,
  688.                          LLVMValueRef dadx_ptr,
  689.                          LLVMValueRef dady_ptr,
  690.                          LLVMValueRef x0,
  691.                          LLVMValueRef y0)
  692. {
  693.    struct lp_type coeff_type;
  694.    struct lp_type setup_type;
  695.    unsigned attrib;
  696.    unsigned chan;
  697.  
  698.    memset(bld, 0, sizeof *bld);
  699.  
  700.    memset(&coeff_type, 0, sizeof coeff_type);
  701.    coeff_type.floating = TRUE;
  702.    coeff_type.sign = TRUE;
  703.    coeff_type.width = 32;
  704.    coeff_type.length = type.length;
  705.  
  706.    memset(&setup_type, 0, sizeof setup_type);
  707.    setup_type.floating = TRUE;
  708.    setup_type.sign = TRUE;
  709.    setup_type.width = 32;
  710.    setup_type.length = TGSI_NUM_CHANNELS;
  711.  
  712.  
  713.    /* XXX: we don't support interpolating into any other types */
  714.    assert(memcmp(&coeff_type, &type, sizeof coeff_type) == 0);
  715.  
  716.    lp_build_context_init(&bld->coeff_bld, gallivm, coeff_type);
  717.    lp_build_context_init(&bld->setup_bld, gallivm, setup_type);
  718.  
  719.    /* For convenience */
  720.    bld->pos = bld->attribs[0];
  721.    bld->inputs = (const LLVMValueRef (*)[TGSI_NUM_CHANNELS]) bld->attribs[1];
  722.  
  723.    /* Position */
  724.    bld->mask[0] = TGSI_WRITEMASK_XYZW;
  725.    bld->interp[0] = LP_INTERP_LINEAR;
  726.  
  727.    /* Inputs */
  728.    for (attrib = 0; attrib < num_inputs; ++attrib) {
  729.       bld->mask[1 + attrib] = inputs[attrib].usage_mask;
  730.       bld->interp[1 + attrib] = inputs[attrib].interp;
  731.    }
  732.    bld->num_attribs = 1 + num_inputs;
  733.  
  734.    /* Ensure all masked out input channels have a valid value */
  735.    for (attrib = 0; attrib < bld->num_attribs; ++attrib) {
  736.       for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
  737.          bld->attribs[attrib][chan] = bld->coeff_bld.undef;
  738.       }
  739.    }
  740.  
  741.    if (pixel_center_integer) {
  742.       bld->pos_offset = 0.0;
  743.    } else {
  744.       bld->pos_offset = 0.5;
  745.    }
  746.  
  747.    pos_init(bld, x0, y0);
  748.  
  749.    if (coeff_type.length > 4) {
  750.       bld->simple_interp = TRUE;
  751.       {
  752.          /* XXX this should use a global static table */
  753.          unsigned i;
  754.          unsigned num_loops = 16 / type.length;
  755.          LLVMValueRef pixoffx, pixoffy, index;
  756.          LLVMValueRef ptr;
  757.  
  758.          bld->xoffset_store = lp_build_array_alloca(gallivm,
  759.                                                     lp_build_vec_type(gallivm, type),
  760.                                                     lp_build_const_int32(gallivm, num_loops),
  761.                                                     "");
  762.          bld->yoffset_store = lp_build_array_alloca(gallivm,
  763.                                                     lp_build_vec_type(gallivm, type),
  764.                                                     lp_build_const_int32(gallivm, num_loops),
  765.                                                     "");
  766.          for (i = 0; i < num_loops; i++) {
  767.             index = lp_build_const_int32(gallivm, i);
  768.             calc_offsets(&bld->coeff_bld, i*type.length/4, &pixoffx, &pixoffy);
  769.             ptr = LLVMBuildGEP(builder, bld->xoffset_store, &index, 1, "");
  770.             LLVMBuildStore(builder, pixoffx, ptr);
  771.             ptr = LLVMBuildGEP(builder, bld->yoffset_store, &index, 1, "");
  772.             LLVMBuildStore(builder, pixoffy, ptr);
  773.          }
  774.       }
  775.       coeffs_init_simple(bld, a0_ptr, dadx_ptr, dady_ptr);
  776.    }
  777.    else {
  778.       bld->simple_interp = FALSE;
  779.       coeffs_init(bld, a0_ptr, dadx_ptr, dady_ptr);
  780.    }
  781.  
  782. }
  783.  
  784.  
  785. /*
  786.  * Advance the position and inputs to the given quad within the block.
  787.  */
  788.  
  789. void
  790. lp_build_interp_soa_update_inputs_dyn(struct lp_build_interp_soa_context *bld,
  791.                                       struct gallivm_state *gallivm,
  792.                                       LLVMValueRef quad_start_index)
  793. {
  794.    if (bld->simple_interp) {
  795.       attribs_update_simple(bld, gallivm, quad_start_index, 1, bld->num_attribs);
  796.    }
  797.    else {
  798.       attribs_update(bld, gallivm, quad_start_index, 1, bld->num_attribs);
  799.    }
  800. }
  801.  
  802. void
  803. lp_build_interp_soa_update_pos_dyn(struct lp_build_interp_soa_context *bld,
  804.                                    struct gallivm_state *gallivm,
  805.                                    LLVMValueRef quad_start_index)
  806. {
  807.    if (bld->simple_interp) {
  808.       attribs_update_simple(bld, gallivm, quad_start_index, 0, 1);
  809.    }
  810.    else {
  811.       attribs_update(bld, gallivm, quad_start_index, 0, 1);
  812.    }
  813. }
  814.  
  815.