/**************************************************************************
 *
 * Copyright 2009-2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/**
 * @file
 * Depth/stencil testing to LLVM IR translation.
 *
 * To be done accurately/efficiently the depth/stencil test must be done with
 * the same type/format of the depth/stencil buffer, which implies massaging
 * the incoming depths to fit into place. Using a more straightforward
 * type/format for depth/stencil values internally and only converting when
 * flushing would avoid this, but it would most likely result in depth fighting
 * artifacts.
 *
 * Since we're using linear layout for everything, but we need to deal with
 * 2x2 quads, we need to load/store multiple values and swizzle them into
 * place (we could avoid this by doing depth/stencil testing in linear format,
 * which would be easy for late depth/stencil test as we could do that after
 * the fragment shader loop just as we do for color buffers, but more tricky
 * for early depth test as we'd need both masks and interpolated depth in
 * linear format).
 *
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 * @author Brian Paul <jfonseca@vmware.com>
 */

#include "pipe/p_state.h"
#include "util/u_format.h"
#include "util/u_cpu_detect.h"

#include "gallivm/lp_bld_type.h"
#include "gallivm/lp_bld_arit.h"
#include "gallivm/lp_bld_bitarit.h"
#include "gallivm/lp_bld_const.h"
#include "gallivm/lp_bld_conv.h"
#include "gallivm/lp_bld_logic.h"
#include "gallivm/lp_bld_flow.h"
#include "gallivm/lp_bld_intr.h"
#include "gallivm/lp_bld_debug.h"
#include "gallivm/lp_bld_swizzle.h"
#include "gallivm/lp_bld_pack.h"

#include "lp_bld_depth.h"


/**
 * Used to select fields from pipe_stencil_state: identifies which of the
 * three per-face stencil operations is to be applied.
 */
enum stencil_op {
   S_FAIL_OP,   /**< selects pipe_stencil_state::fail_op */
   Z_FAIL_OP,   /**< selects pipe_stencil_state::zfail_op */
   Z_PASS_OP    /**< selects pipe_stencil_state::zpass_op */
};



/** |
* Do the stencil test comparison (compare FB stencil values against ref value). |
* This will be used twice when generating two-sided stencil code. |
* \param stencil the front/back stencil state |
* \param stencilRef the stencil reference value, replicated as a vector |
* \param stencilVals vector of stencil values from framebuffer |
* \return vector mask of pass/fail values (~0 or 0) |
*/ |
static LLVMValueRef |
lp_build_stencil_test_single(struct lp_build_context *bld, |
const struct pipe_stencil_state *stencil, |
LLVMValueRef stencilRef, |
LLVMValueRef stencilVals) |
{ |
LLVMBuilderRef builder = bld->gallivm->builder; |
const unsigned stencilMax = 255; /* XXX fix */ |
struct lp_type type = bld->type; |
LLVMValueRef res; |
|
/* |
* SSE2 has intrinsics for signed comparisons, but not unsigned ones. Values |
* are between 0..255 so ensure we generate the fastest comparisons for |
* wider elements. |
*/ |
if (type.width <= 8) { |
assert(!type.sign); |
} else { |
assert(type.sign); |
} |
|
assert(stencil->enabled); |
|
if (stencil->valuemask != stencilMax) { |
/* compute stencilRef = stencilRef & valuemask */ |
LLVMValueRef valuemask = lp_build_const_int_vec(bld->gallivm, type, stencil->valuemask); |
stencilRef = LLVMBuildAnd(builder, stencilRef, valuemask, ""); |
/* compute stencilVals = stencilVals & valuemask */ |
stencilVals = LLVMBuildAnd(builder, stencilVals, valuemask, ""); |
} |
|
res = lp_build_cmp(bld, stencil->func, stencilRef, stencilVals); |
|
return res; |
} |
|
|
/** |
* Do the one or two-sided stencil test comparison. |
* \sa lp_build_stencil_test_single |
* \param front_facing an integer vector mask, indicating front (~0) or back |
* (0) facing polygon. If NULL, assume front-facing. |
*/ |
static LLVMValueRef |
lp_build_stencil_test(struct lp_build_context *bld, |
const struct pipe_stencil_state stencil[2], |
LLVMValueRef stencilRefs[2], |
LLVMValueRef stencilVals, |
LLVMValueRef front_facing) |
{ |
LLVMValueRef res; |
|
assert(stencil[0].enabled); |
|
/* do front face test */ |
res = lp_build_stencil_test_single(bld, &stencil[0], |
stencilRefs[0], stencilVals); |
|
if (stencil[1].enabled && front_facing != NULL) { |
/* do back face test */ |
LLVMValueRef back_res; |
|
back_res = lp_build_stencil_test_single(bld, &stencil[1], |
stencilRefs[1], stencilVals); |
|
res = lp_build_select(bld, front_facing, res, back_res); |
} |
|
return res; |
} |
|
|
/** |
* Apply the stencil operator (add/sub/keep/etc) to the given vector |
* of stencil values. |
* \return new stencil values vector |
*/ |
static LLVMValueRef |
lp_build_stencil_op_single(struct lp_build_context *bld, |
const struct pipe_stencil_state *stencil, |
enum stencil_op op, |
LLVMValueRef stencilRef, |
LLVMValueRef stencilVals) |
|
{ |
LLVMBuilderRef builder = bld->gallivm->builder; |
struct lp_type type = bld->type; |
LLVMValueRef res; |
LLVMValueRef max = lp_build_const_int_vec(bld->gallivm, type, 0xff); |
unsigned stencil_op; |
|
assert(type.sign); |
|
switch (op) { |
case S_FAIL_OP: |
stencil_op = stencil->fail_op; |
break; |
case Z_FAIL_OP: |
stencil_op = stencil->zfail_op; |
break; |
case Z_PASS_OP: |
stencil_op = stencil->zpass_op; |
break; |
default: |
assert(0 && "Invalid stencil_op mode"); |
stencil_op = PIPE_STENCIL_OP_KEEP; |
} |
|
switch (stencil_op) { |
case PIPE_STENCIL_OP_KEEP: |
res = stencilVals; |
/* we can return early for this case */ |
return res; |
case PIPE_STENCIL_OP_ZERO: |
res = bld->zero; |
break; |
case PIPE_STENCIL_OP_REPLACE: |
res = stencilRef; |
break; |
case PIPE_STENCIL_OP_INCR: |
res = lp_build_add(bld, stencilVals, bld->one); |
res = lp_build_min(bld, res, max); |
break; |
case PIPE_STENCIL_OP_DECR: |
res = lp_build_sub(bld, stencilVals, bld->one); |
res = lp_build_max(bld, res, bld->zero); |
break; |
case PIPE_STENCIL_OP_INCR_WRAP: |
res = lp_build_add(bld, stencilVals, bld->one); |
res = LLVMBuildAnd(builder, res, max, ""); |
break; |
case PIPE_STENCIL_OP_DECR_WRAP: |
res = lp_build_sub(bld, stencilVals, bld->one); |
res = LLVMBuildAnd(builder, res, max, ""); |
break; |
case PIPE_STENCIL_OP_INVERT: |
res = LLVMBuildNot(builder, stencilVals, ""); |
res = LLVMBuildAnd(builder, res, max, ""); |
break; |
default: |
assert(0 && "bad stencil op mode"); |
res = bld->undef; |
} |
|
return res; |
} |
|
|
/** |
* Do the one or two-sided stencil test op/update. |
*/ |
static LLVMValueRef |
lp_build_stencil_op(struct lp_build_context *bld, |
const struct pipe_stencil_state stencil[2], |
enum stencil_op op, |
LLVMValueRef stencilRefs[2], |
LLVMValueRef stencilVals, |
LLVMValueRef mask, |
LLVMValueRef front_facing) |
|
{ |
LLVMBuilderRef builder = bld->gallivm->builder; |
LLVMValueRef res; |
|
assert(stencil[0].enabled); |
|
/* do front face op */ |
res = lp_build_stencil_op_single(bld, &stencil[0], op, |
stencilRefs[0], stencilVals); |
|
if (stencil[1].enabled && front_facing != NULL) { |
/* do back face op */ |
LLVMValueRef back_res; |
|
back_res = lp_build_stencil_op_single(bld, &stencil[1], op, |
stencilRefs[1], stencilVals); |
|
res = lp_build_select(bld, front_facing, res, back_res); |
} |
|
if (stencil[0].writemask != 0xff || |
(stencil[1].enabled && front_facing != NULL && stencil[1].writemask != 0xff)) { |
/* mask &= stencil[0].writemask */ |
LLVMValueRef writemask = lp_build_const_int_vec(bld->gallivm, bld->type, |
stencil[0].writemask); |
if (stencil[1].enabled && stencil[1].writemask != stencil[0].writemask && front_facing != NULL) { |
LLVMValueRef back_writemask = lp_build_const_int_vec(bld->gallivm, bld->type, |
stencil[1].writemask); |
writemask = lp_build_select(bld, front_facing, writemask, back_writemask); |
} |
|
mask = LLVMBuildAnd(builder, mask, writemask, ""); |
/* res = (res & mask) | (stencilVals & ~mask) */ |
res = lp_build_select_bitwise(bld, mask, res, stencilVals); |
} |
else { |
/* res = mask ? res : stencilVals */ |
res = lp_build_select(bld, mask, res, stencilVals); |
} |
|
return res; |
} |
|
|
|
/** |
* Return a type that matches the depth/stencil format. |
*/ |
struct lp_type |
lp_depth_type(const struct util_format_description *format_desc, |
unsigned length) |
{ |
struct lp_type type; |
unsigned z_swizzle; |
|
assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS); |
assert(format_desc->block.width == 1); |
assert(format_desc->block.height == 1); |
|
memset(&type, 0, sizeof type); |
type.width = format_desc->block.bits; |
|
z_swizzle = format_desc->swizzle[0]; |
if (z_swizzle < 4) { |
if (format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_FLOAT) { |
type.floating = TRUE; |
assert(z_swizzle == 0); |
assert(format_desc->channel[z_swizzle].size == 32); |
} |
else if(format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED) { |
assert(format_desc->block.bits <= 32); |
assert(format_desc->channel[z_swizzle].normalized); |
if (format_desc->channel[z_swizzle].size < format_desc->block.bits) { |
/* Prefer signed integers when possible, as SSE has less support |
* for unsigned comparison; |
*/ |
type.sign = TRUE; |
} |
} |
else |
assert(0); |
} |
|
type.length = length; |
|
return type; |
} |
|
|
/** |
* Compute bitmask and bit shift to apply to the incoming fragment Z values |
* and the Z buffer values needed before doing the Z comparison. |
* |
* Note that we leave the Z bits in the position that we find them |
* in the Z buffer (typically 0xffffff00 or 0x00ffffff). That lets us |
* get by with fewer bit twiddling steps. |
*/ |
static boolean |
get_z_shift_and_mask(const struct util_format_description *format_desc, |
unsigned *shift, unsigned *width, unsigned *mask) |
{ |
unsigned total_bits; |
unsigned z_swizzle; |
|
assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS); |
assert(format_desc->block.width == 1); |
assert(format_desc->block.height == 1); |
|
/* 64bit d/s format is special already extracted 32 bits */ |
total_bits = format_desc->block.bits > 32 ? 32 : format_desc->block.bits; |
|
z_swizzle = format_desc->swizzle[0]; |
|
if (z_swizzle == UTIL_FORMAT_SWIZZLE_NONE) |
return FALSE; |
|
*width = format_desc->channel[z_swizzle].size; |
/* & 31 is for the same reason as the 32-bit limit above */ |
*shift = format_desc->channel[z_swizzle].shift & 31; |
|
if (*width == total_bits) { |
*mask = 0xffffffff; |
} else { |
*mask = ((1 << *width) - 1) << *shift; |
} |
|
return TRUE; |
} |
|
|
/** |
* Compute bitmask and bit shift to apply to the framebuffer pixel values |
* to put the stencil bits in the least significant position. |
* (i.e. 0x000000ff) |
*/ |
static boolean |
get_s_shift_and_mask(const struct util_format_description *format_desc, |
unsigned *shift, unsigned *mask) |
{ |
unsigned s_swizzle; |
unsigned sz; |
|
s_swizzle = format_desc->swizzle[1]; |
|
if (s_swizzle == UTIL_FORMAT_SWIZZLE_NONE) |
return FALSE; |
|
/* just special case 64bit d/s format */ |
if (format_desc->block.bits > 32) { |
/* XXX big-endian? */ |
assert(format_desc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT); |
*shift = 0; |
*mask = 0xff; |
return TRUE; |
} |
|
*shift = format_desc->channel[s_swizzle].shift; |
sz = format_desc->channel[s_swizzle].size; |
*mask = (1U << sz) - 1U; |
|
return TRUE; |
} |
|
|
/** |
* Perform the occlusion test and increase the counter. |
* Test the depth mask. Add the number of channel which has none zero mask |
* into the occlusion counter. e.g. maskvalue is {-1, -1, -1, -1}. |
* The counter will add 4. |
* TODO: could get that out of the fs loop. |
* |
* \param type holds element type of the mask vector. |
* \param maskvalue is the depth test mask. |
* \param counter is a pointer of the uint32 counter. |
*/ |
void |
lp_build_occlusion_count(struct gallivm_state *gallivm, |
struct lp_type type, |
LLVMValueRef maskvalue, |
LLVMValueRef counter) |
{ |
LLVMBuilderRef builder = gallivm->builder; |
LLVMContextRef context = gallivm->context; |
LLVMValueRef countmask = lp_build_const_int_vec(gallivm, type, 1); |
LLVMValueRef count, newcount; |
|
assert(type.length <= 16); |
assert(type.floating); |
|
if(util_cpu_caps.has_sse && type.length == 4) { |
const char *movmskintr = "llvm.x86.sse.movmsk.ps"; |
const char *popcntintr = "llvm.ctpop.i32"; |
LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue, |
lp_build_vec_type(gallivm, type), ""); |
bits = lp_build_intrinsic_unary(builder, movmskintr, |
LLVMInt32TypeInContext(context), bits); |
count = lp_build_intrinsic_unary(builder, popcntintr, |
LLVMInt32TypeInContext(context), bits); |
count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), ""); |
} |
else if(util_cpu_caps.has_avx && type.length == 8) { |
const char *movmskintr = "llvm.x86.avx.movmsk.ps.256"; |
const char *popcntintr = "llvm.ctpop.i32"; |
LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue, |
lp_build_vec_type(gallivm, type), ""); |
bits = lp_build_intrinsic_unary(builder, movmskintr, |
LLVMInt32TypeInContext(context), bits); |
count = lp_build_intrinsic_unary(builder, popcntintr, |
LLVMInt32TypeInContext(context), bits); |
count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), ""); |
} |
else { |
unsigned i; |
LLVMValueRef countv = LLVMBuildAnd(builder, maskvalue, countmask, "countv"); |
LLVMTypeRef counttype = LLVMIntTypeInContext(context, type.length * 8); |
LLVMTypeRef i8vntype = LLVMVectorType(LLVMInt8TypeInContext(context), type.length * 4); |
LLVMValueRef shufflev, countd; |
LLVMValueRef shuffles[16]; |
const char *popcntintr = NULL; |
|
countv = LLVMBuildBitCast(builder, countv, i8vntype, ""); |
|
for (i = 0; i < type.length; i++) { |
shuffles[i] = lp_build_const_int32(gallivm, 4*i); |
} |
|
shufflev = LLVMConstVector(shuffles, type.length); |
countd = LLVMBuildShuffleVector(builder, countv, LLVMGetUndef(i8vntype), shufflev, ""); |
countd = LLVMBuildBitCast(builder, countd, counttype, "countd"); |
|
/* |
* XXX FIXME |
* this is bad on cpus without popcount (on x86 supported by intel |
* nehalem, amd barcelona, and up - not tied to sse42). |
* Would be much faster to just sum the 4 elements of the vector with |
* some horizontal add (shuffle/add/shuffle/add after the initial and). |
*/ |
switch (type.length) { |
case 4: |
popcntintr = "llvm.ctpop.i32"; |
break; |
case 8: |
popcntintr = "llvm.ctpop.i64"; |
break; |
case 16: |
popcntintr = "llvm.ctpop.i128"; |
break; |
default: |
assert(0); |
} |
count = lp_build_intrinsic_unary(builder, popcntintr, counttype, countd); |
|
if (type.length > 8) { |
count = LLVMBuildTrunc(builder, count, LLVMIntTypeInContext(context, 64), ""); |
} |
else if (type.length < 8) { |
count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), ""); |
} |
} |
newcount = LLVMBuildLoad(builder, counter, "origcount"); |
newcount = LLVMBuildAdd(builder, newcount, count, "newcount"); |
LLVMBuildStore(builder, newcount, counter); |
} |
|
|
/** |
* Load depth/stencil values. |
* The stored values are linear, swizzle them. |
* |
* \param type the data type of the fragment depth/stencil values |
* \param format_desc description of the depth/stencil surface |
* \param is_1d whether this resource has only one dimension |
* \param loop_counter the current loop iteration |
* \param depth_ptr pointer to the depth/stencil values of this 4x4 block |
* \param depth_stride stride of the depth/stencil buffer |
* \param z_fb contains z values loaded from fb (may include padding) |
* \param s_fb contains s values loaded from fb (may include padding) |
*/ |
void |
lp_build_depth_stencil_load_swizzled(struct gallivm_state *gallivm, |
struct lp_type z_src_type, |
const struct util_format_description *format_desc, |
boolean is_1d, |
LLVMValueRef depth_ptr, |
LLVMValueRef depth_stride, |
LLVMValueRef *z_fb, |
LLVMValueRef *s_fb, |
LLVMValueRef loop_counter) |
{ |
LLVMBuilderRef builder = gallivm->builder; |
LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 4]; |
LLVMValueRef zs_dst1, zs_dst2; |
LLVMValueRef zs_dst_ptr; |
LLVMValueRef depth_offset1, depth_offset2; |
LLVMTypeRef load_ptr_type; |
unsigned depth_bytes = format_desc->block.bits / 8; |
struct lp_type zs_type = lp_depth_type(format_desc, z_src_type.length); |
struct lp_type zs_load_type = zs_type; |
|
zs_load_type.length = zs_load_type.length / 2; |
load_ptr_type = LLVMPointerType(lp_build_vec_type(gallivm, zs_load_type), 0); |
|
if (z_src_type.length == 4) { |
unsigned i; |
LLVMValueRef looplsb = LLVMBuildAnd(builder, loop_counter, |
lp_build_const_int32(gallivm, 1), ""); |
LLVMValueRef loopmsb = LLVMBuildAnd(builder, loop_counter, |
lp_build_const_int32(gallivm, 2), ""); |
LLVMValueRef offset2 = LLVMBuildMul(builder, loopmsb, |
depth_stride, ""); |
depth_offset1 = LLVMBuildMul(builder, looplsb, |
lp_build_const_int32(gallivm, depth_bytes * 2), ""); |
depth_offset1 = LLVMBuildAdd(builder, depth_offset1, offset2, ""); |
|
/* just concatenate the loaded 2x2 values into 4-wide vector */ |
for (i = 0; i < 4; i++) { |
shuffles[i] = lp_build_const_int32(gallivm, i); |
} |
} |
else { |
unsigned i; |
LLVMValueRef loopx2 = LLVMBuildShl(builder, loop_counter, |
lp_build_const_int32(gallivm, 1), ""); |
assert(z_src_type.length == 8); |
depth_offset1 = LLVMBuildMul(builder, loopx2, depth_stride, ""); |
/* |
* We load 2x4 values, and need to swizzle them (order |
* 0,1,4,5,2,3,6,7) - not so hot with avx unfortunately. |
*/ |
for (i = 0; i < 8; i++) { |
shuffles[i] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2); |
} |
} |
|
depth_offset2 = LLVMBuildAdd(builder, depth_offset1, depth_stride, ""); |
|
/* Load current z/stencil values from z/stencil buffer */ |
zs_dst_ptr = LLVMBuildGEP(builder, depth_ptr, &depth_offset1, 1, ""); |
zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr, load_ptr_type, ""); |
zs_dst1 = LLVMBuildLoad(builder, zs_dst_ptr, ""); |
if (is_1d) { |
zs_dst2 = lp_build_undef(gallivm, zs_load_type); |
} |
else { |
zs_dst_ptr = LLVMBuildGEP(builder, depth_ptr, &depth_offset2, 1, ""); |
zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr, load_ptr_type, ""); |
zs_dst2 = LLVMBuildLoad(builder, zs_dst_ptr, ""); |
} |
|
*z_fb = LLVMBuildShuffleVector(builder, zs_dst1, zs_dst2, |
LLVMConstVector(shuffles, zs_type.length), ""); |
*s_fb = *z_fb; |
|
if (format_desc->block.bits < z_src_type.width) { |
/* Extend destination ZS values (e.g., when reading from Z16_UNORM) */ |
*z_fb = LLVMBuildZExt(builder, *z_fb, |
lp_build_int_vec_type(gallivm, z_src_type), ""); |
} |
|
else if (format_desc->block.bits > 32) { |
/* rely on llvm to handle too wide vector we have here nicely */ |
unsigned i; |
struct lp_type typex2 = zs_type; |
struct lp_type s_type = zs_type; |
LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 4]; |
LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 4]; |
LLVMValueRef tmp; |
|
typex2.width = typex2.width / 2; |
typex2.length = typex2.length * 2; |
s_type.width = s_type.width / 2; |
s_type.floating = 0; |
|
tmp = LLVMBuildBitCast(builder, *z_fb, |
lp_build_vec_type(gallivm, typex2), ""); |
|
for (i = 0; i < zs_type.length; i++) { |
shuffles1[i] = lp_build_const_int32(gallivm, i * 2); |
shuffles2[i] = lp_build_const_int32(gallivm, i * 2 + 1); |
} |
*z_fb = LLVMBuildShuffleVector(builder, tmp, tmp, |
LLVMConstVector(shuffles1, zs_type.length), ""); |
*s_fb = LLVMBuildShuffleVector(builder, tmp, tmp, |
LLVMConstVector(shuffles2, zs_type.length), ""); |
*s_fb = LLVMBuildBitCast(builder, *s_fb, |
lp_build_vec_type(gallivm, s_type), ""); |
lp_build_name(*s_fb, "s_dst"); |
} |
|
lp_build_name(*z_fb, "z_dst"); |
lp_build_name(*s_fb, "s_dst"); |
lp_build_name(*z_fb, "z_dst"); |
} |
|
/** |
* Store depth/stencil values. |
* Incoming values are swizzled (typically n 2x2 quads), stored linear. |
* If there's a mask it will do select/store otherwise just store. |
* |
* \param type the data type of the fragment depth/stencil values |
* \param format_desc description of the depth/stencil surface |
* \param is_1d whether this resource has only one dimension |
* \param mask the alive/dead pixel mask for the quad (vector) |
* \param z_fb z values read from fb (with padding) |
* \param s_fb s values read from fb (with padding) |
* \param loop_counter the current loop iteration |
* \param depth_ptr pointer to the depth/stencil values of this 4x4 block |
* \param depth_stride stride of the depth/stencil buffer |
* \param z_value the depth values to store (with padding) |
* \param s_value the stencil values to store (with padding) |
*/ |
void |
lp_build_depth_stencil_write_swizzled(struct gallivm_state *gallivm, |
struct lp_type z_src_type, |
const struct util_format_description *format_desc, |
boolean is_1d, |
struct lp_build_mask_context *mask, |
LLVMValueRef z_fb, |
LLVMValueRef s_fb, |
LLVMValueRef loop_counter, |
LLVMValueRef depth_ptr, |
LLVMValueRef depth_stride, |
LLVMValueRef z_value, |
LLVMValueRef s_value) |
{ |
struct lp_build_context z_bld; |
LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 4]; |
LLVMBuilderRef builder = gallivm->builder; |
LLVMValueRef mask_value = NULL; |
LLVMValueRef zs_dst1, zs_dst2; |
LLVMValueRef zs_dst_ptr1, zs_dst_ptr2; |
LLVMValueRef depth_offset1, depth_offset2; |
LLVMTypeRef load_ptr_type; |
unsigned depth_bytes = format_desc->block.bits / 8; |
struct lp_type zs_type = lp_depth_type(format_desc, z_src_type.length); |
struct lp_type z_type = zs_type; |
struct lp_type zs_load_type = zs_type; |
|
zs_load_type.length = zs_load_type.length / 2; |
load_ptr_type = LLVMPointerType(lp_build_vec_type(gallivm, zs_load_type), 0); |
|
z_type.width = z_src_type.width; |
|
lp_build_context_init(&z_bld, gallivm, z_type); |
|
/* |
* This is far from ideal, at least for late depth write we should do this |
* outside the fs loop to avoid all the swizzle stuff. |
*/ |
if (z_src_type.length == 4) { |
LLVMValueRef looplsb = LLVMBuildAnd(builder, loop_counter, |
lp_build_const_int32(gallivm, 1), ""); |
LLVMValueRef loopmsb = LLVMBuildAnd(builder, loop_counter, |
lp_build_const_int32(gallivm, 2), ""); |
LLVMValueRef offset2 = LLVMBuildMul(builder, loopmsb, |
depth_stride, ""); |
depth_offset1 = LLVMBuildMul(builder, looplsb, |
lp_build_const_int32(gallivm, depth_bytes * 2), ""); |
depth_offset1 = LLVMBuildAdd(builder, depth_offset1, offset2, ""); |
} |
else { |
unsigned i; |
LLVMValueRef loopx2 = LLVMBuildShl(builder, loop_counter, |
lp_build_const_int32(gallivm, 1), ""); |
assert(z_src_type.length == 8); |
depth_offset1 = LLVMBuildMul(builder, loopx2, depth_stride, ""); |
/* |
* We load 2x4 values, and need to swizzle them (order |
* 0,1,4,5,2,3,6,7) - not so hot with avx unfortunately. |
*/ |
for (i = 0; i < 8; i++) { |
shuffles[i] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2); |
} |
} |
|
depth_offset2 = LLVMBuildAdd(builder, depth_offset1, depth_stride, ""); |
|
zs_dst_ptr1 = LLVMBuildGEP(builder, depth_ptr, &depth_offset1, 1, ""); |
zs_dst_ptr1 = LLVMBuildBitCast(builder, zs_dst_ptr1, load_ptr_type, ""); |
zs_dst_ptr2 = LLVMBuildGEP(builder, depth_ptr, &depth_offset2, 1, ""); |
zs_dst_ptr2 = LLVMBuildBitCast(builder, zs_dst_ptr2, load_ptr_type, ""); |
|
if (format_desc->block.bits > 32) { |
s_value = LLVMBuildBitCast(builder, s_value, z_bld.vec_type, ""); |
} |
|
if (mask) { |
mask_value = lp_build_mask_value(mask); |
z_value = lp_build_select(&z_bld, mask_value, z_value, z_fb); |
if (format_desc->block.bits > 32) { |
s_fb = LLVMBuildBitCast(builder, s_fb, z_bld.vec_type, ""); |
s_value = lp_build_select(&z_bld, mask_value, s_value, s_fb); |
} |
} |
|
if (zs_type.width < z_src_type.width) { |
/* Truncate ZS values (e.g., when writing to Z16_UNORM) */ |
z_value = LLVMBuildTrunc(builder, z_value, |
lp_build_int_vec_type(gallivm, zs_type), ""); |
} |
|
if (format_desc->block.bits <= 32) { |
if (z_src_type.length == 4) { |
zs_dst1 = lp_build_extract_range(gallivm, z_value, 0, 2); |
zs_dst2 = lp_build_extract_range(gallivm, z_value, 2, 2); |
} |
else { |
assert(z_src_type.length == 8); |
zs_dst1 = LLVMBuildShuffleVector(builder, z_value, z_value, |
LLVMConstVector(&shuffles[0], |
zs_load_type.length), ""); |
zs_dst2 = LLVMBuildShuffleVector(builder, z_value, z_value, |
LLVMConstVector(&shuffles[4], |
zs_load_type.length), ""); |
} |
} |
else { |
if (z_src_type.length == 4) { |
zs_dst1 = lp_build_interleave2(gallivm, z_type, |
z_value, s_value, 0); |
zs_dst2 = lp_build_interleave2(gallivm, z_type, |
z_value, s_value, 1); |
} |
else { |
unsigned i; |
LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 2]; |
assert(z_src_type.length == 8); |
for (i = 0; i < 8; i++) { |
shuffles[i*2] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2); |
shuffles[i*2+1] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2 + |
z_src_type.length); |
} |
zs_dst1 = LLVMBuildShuffleVector(builder, z_value, s_value, |
LLVMConstVector(&shuffles[0], |
z_src_type.length), ""); |
zs_dst2 = LLVMBuildShuffleVector(builder, z_value, s_value, |
LLVMConstVector(&shuffles[8], |
z_src_type.length), ""); |
} |
zs_dst1 = LLVMBuildBitCast(builder, zs_dst1, |
lp_build_vec_type(gallivm, zs_load_type), ""); |
zs_dst2 = LLVMBuildBitCast(builder, zs_dst2, |
lp_build_vec_type(gallivm, zs_load_type), ""); |
} |
|
LLVMBuildStore(builder, zs_dst1, zs_dst_ptr1); |
if (!is_1d) { |
LLVMBuildStore(builder, zs_dst2, zs_dst_ptr2); |
} |
} |
|
/** |
* Generate code for performing depth and/or stencil tests. |
* We operate on a vector of values (typically n 2x2 quads). |
* |
* \param depth the depth test state |
* \param stencil the front/back stencil state |
* \param type the data type of the fragment depth/stencil values |
* \param format_desc description of the depth/stencil surface |
* \param mask the alive/dead pixel mask for the quad (vector) |
* \param stencil_refs the front/back stencil ref values (scalar) |
* \param z_src the incoming depth/stencil values (n 2x2 quad values, float32) |
* \param zs_dst the depth/stencil values in framebuffer |
* \param face contains boolean value indicating front/back facing polygon |
*/ |
void |
lp_build_depth_stencil_test(struct gallivm_state *gallivm, |
const struct pipe_depth_state *depth, |
const struct pipe_stencil_state stencil[2], |
struct lp_type z_src_type, |
const struct util_format_description *format_desc, |
struct lp_build_mask_context *mask, |
LLVMValueRef stencil_refs[2], |
LLVMValueRef z_src, |
LLVMValueRef z_fb, |
LLVMValueRef s_fb, |
LLVMValueRef face, |
LLVMValueRef *z_value, |
LLVMValueRef *s_value, |
boolean do_branch) |
{ |
LLVMBuilderRef builder = gallivm->builder; |
struct lp_type z_type; |
struct lp_build_context z_bld; |
struct lp_build_context s_bld; |
struct lp_type s_type; |
unsigned z_shift = 0, z_width = 0, z_mask = 0; |
LLVMValueRef z_dst = NULL; |
LLVMValueRef stencil_vals = NULL; |
LLVMValueRef z_bitmask = NULL, stencil_shift = NULL; |
LLVMValueRef z_pass = NULL, s_pass_mask = NULL; |
LLVMValueRef current_mask = lp_build_mask_value(mask); |
LLVMValueRef front_facing = NULL; |
boolean have_z, have_s; |
|
/* |
* Depths are expected to be between 0 and 1, even if they are stored in |
* floats. Setting these bits here will ensure that the lp_build_conv() call |
* below won't try to unnecessarily clamp the incoming values. |
*/ |
if(z_src_type.floating) { |
z_src_type.sign = FALSE; |
z_src_type.norm = TRUE; |
} |
else { |
assert(!z_src_type.sign); |
assert(z_src_type.norm); |
} |
|
/* Pick the type matching the depth-stencil format. */ |
z_type = lp_depth_type(format_desc, z_src_type.length); |
|
/* Pick the intermediate type for depth operations. */ |
z_type.width = z_src_type.width; |
assert(z_type.length == z_src_type.length); |
|
/* FIXME: for non-float depth/stencil might generate better code |
* if we'd always split it up to use 128bit operations. |
* For stencil we'd almost certainly want to pack to 8xi16 values, |
* for z just run twice. |
*/ |
|
/* Sanity checking */ |
{ |
const unsigned z_swizzle = format_desc->swizzle[0]; |
const unsigned s_swizzle = format_desc->swizzle[1]; |
|
assert(z_swizzle != UTIL_FORMAT_SWIZZLE_NONE || |
s_swizzle != UTIL_FORMAT_SWIZZLE_NONE); |
|
assert(depth->enabled || stencil[0].enabled); |
|
assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS); |
assert(format_desc->block.width == 1); |
assert(format_desc->block.height == 1); |
|
if (stencil[0].enabled) { |
assert(s_swizzle < 4); |
assert(format_desc->channel[s_swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED); |
assert(format_desc->channel[s_swizzle].pure_integer); |
assert(!format_desc->channel[s_swizzle].normalized); |
assert(format_desc->channel[s_swizzle].size == 8); |
} |
|
if (depth->enabled) { |
assert(z_swizzle < 4); |
if (z_type.floating) { |
assert(z_swizzle == 0); |
assert(format_desc->channel[z_swizzle].type == |
UTIL_FORMAT_TYPE_FLOAT); |
assert(format_desc->channel[z_swizzle].size == 32); |
} |
else { |
assert(format_desc->channel[z_swizzle].type == |
UTIL_FORMAT_TYPE_UNSIGNED); |
assert(format_desc->channel[z_swizzle].normalized); |
assert(!z_type.fixed); |
} |
} |
} |
|
|
/* Setup build context for Z vals */ |
lp_build_context_init(&z_bld, gallivm, z_type); |
|
/* Setup build context for stencil vals */ |
s_type = lp_int_type(z_type); |
lp_build_context_init(&s_bld, gallivm, s_type); |
|
/* Compute and apply the Z/stencil bitmasks and shifts. |
*/ |
{ |
unsigned s_shift, s_mask; |
|
z_dst = z_fb; |
stencil_vals = s_fb; |
|
have_z = get_z_shift_and_mask(format_desc, &z_shift, &z_width, &z_mask); |
have_s = get_s_shift_and_mask(format_desc, &s_shift, &s_mask); |
|
if (have_z) { |
if (z_mask != 0xffffffff) { |
z_bitmask = lp_build_const_int_vec(gallivm, z_type, z_mask); |
} |
|
/* |
* Align the framebuffer Z 's LSB to the right. |
*/ |
if (z_shift) { |
LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_type, z_shift); |
z_dst = LLVMBuildLShr(builder, z_dst, shift, "z_dst"); |
} else if (z_bitmask) { |
z_dst = LLVMBuildAnd(builder, z_dst, z_bitmask, "z_dst"); |
} else { |
lp_build_name(z_dst, "z_dst"); |
} |
} |
|
if (have_s) { |
if (s_shift) { |
LLVMValueRef shift = lp_build_const_int_vec(gallivm, s_type, s_shift); |
stencil_vals = LLVMBuildLShr(builder, stencil_vals, shift, ""); |
stencil_shift = shift; /* used below */ |
} |
|
if (s_mask != 0xffffffff) { |
LLVMValueRef mask = lp_build_const_int_vec(gallivm, s_type, s_mask); |
stencil_vals = LLVMBuildAnd(builder, stencil_vals, mask, ""); |
} |
|
lp_build_name(stencil_vals, "s_dst"); |
} |
} |
|
if (stencil[0].enabled) { |
|
if (face) { |
LLVMValueRef zero = lp_build_const_int32(gallivm, 0); |
|
/* front_facing = face != 0 ? ~0 : 0 */ |
front_facing = LLVMBuildICmp(builder, LLVMIntNE, face, zero, ""); |
front_facing = LLVMBuildSExt(builder, front_facing, |
LLVMIntTypeInContext(gallivm->context, |
s_bld.type.length*s_bld.type.width), |
""); |
front_facing = LLVMBuildBitCast(builder, front_facing, |
s_bld.int_vec_type, ""); |
} |
|
/* convert scalar stencil refs into vectors */ |
stencil_refs[0] = lp_build_broadcast_scalar(&s_bld, stencil_refs[0]); |
stencil_refs[1] = lp_build_broadcast_scalar(&s_bld, stencil_refs[1]); |
|
s_pass_mask = lp_build_stencil_test(&s_bld, stencil, |
stencil_refs, stencil_vals, |
front_facing); |
|
/* apply stencil-fail operator */ |
{ |
LLVMValueRef s_fail_mask = lp_build_andnot(&s_bld, current_mask, s_pass_mask); |
stencil_vals = lp_build_stencil_op(&s_bld, stencil, S_FAIL_OP, |
stencil_refs, stencil_vals, |
s_fail_mask, front_facing); |
} |
} |
|
if (depth->enabled) { |
/* |
* Convert fragment Z to the desired type, aligning the LSB to the right. |
*/ |
|
assert(z_type.width == z_src_type.width); |
assert(z_type.length == z_src_type.length); |
assert(lp_check_value(z_src_type, z_src)); |
if (z_src_type.floating) { |
/* |
* Convert from floating point values |
*/ |
|
if (!z_type.floating) { |
z_src = lp_build_clamped_float_to_unsigned_norm(gallivm, |
z_src_type, |
z_width, |
z_src); |
} |
} else { |
/* |
* Convert from unsigned normalized values. |
*/ |
|
assert(!z_src_type.sign); |
assert(!z_src_type.fixed); |
assert(z_src_type.norm); |
assert(!z_type.floating); |
if (z_src_type.width > z_width) { |
LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_src_type, |
z_src_type.width - z_width); |
z_src = LLVMBuildLShr(builder, z_src, shift, ""); |
} |
} |
assert(lp_check_value(z_type, z_src)); |
|
lp_build_name(z_src, "z_src"); |
|
/* compare src Z to dst Z, returning 'pass' mask */ |
z_pass = lp_build_cmp(&z_bld, depth->func, z_src, z_dst); |
|
/* mask off bits that failed stencil test */ |
if (s_pass_mask) { |
current_mask = LLVMBuildAnd(builder, current_mask, s_pass_mask, ""); |
} |
|
if (!stencil[0].enabled) { |
/* We can potentially skip all remaining operations here, but only |
* if stencil is disabled because we still need to update the stencil |
* buffer values. Don't need to update Z buffer values. |
*/ |
lp_build_mask_update(mask, z_pass); |
|
if (do_branch) { |
lp_build_mask_check(mask); |
} |
} |
|
if (depth->writemask) { |
LLVMValueRef z_pass_mask; |
|
/* mask off bits that failed Z test */ |
z_pass_mask = LLVMBuildAnd(builder, current_mask, z_pass, ""); |
|
/* Mix the old and new Z buffer values. |
* z_dst[i] = zselectmask[i] ? z_src[i] : z_dst[i] |
*/ |
z_dst = lp_build_select(&z_bld, z_pass_mask, z_src, z_dst); |
} |
|
if (stencil[0].enabled) { |
/* update stencil buffer values according to z pass/fail result */ |
LLVMValueRef z_fail_mask, z_pass_mask; |
|
/* apply Z-fail operator */ |
z_fail_mask = lp_build_andnot(&s_bld, current_mask, z_pass); |
stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_FAIL_OP, |
stencil_refs, stencil_vals, |
z_fail_mask, front_facing); |
|
/* apply Z-pass operator */ |
z_pass_mask = LLVMBuildAnd(builder, current_mask, z_pass, ""); |
stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP, |
stencil_refs, stencil_vals, |
z_pass_mask, front_facing); |
} |
} |
else { |
/* No depth test: apply Z-pass operator to stencil buffer values which |
* passed the stencil test. |
*/ |
s_pass_mask = LLVMBuildAnd(builder, current_mask, s_pass_mask, ""); |
stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP, |
stencil_refs, stencil_vals, |
s_pass_mask, front_facing); |
} |
|
/* Put Z and stencil bits in the right place */ |
if (have_z && z_shift) { |
LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_type, z_shift); |
z_dst = LLVMBuildShl(builder, z_dst, shift, ""); |
} |
if (stencil_vals && stencil_shift) |
stencil_vals = LLVMBuildShl(builder, stencil_vals, |
stencil_shift, ""); |
|
/* Finally, merge the z/stencil values */ |
if (format_desc->block.bits <= 32) { |
if (have_z && have_s) |
*z_value = LLVMBuildOr(builder, z_dst, stencil_vals, ""); |
else if (have_z) |
*z_value = z_dst; |
else |
*z_value = stencil_vals; |
*s_value = *z_value; |
} |
else { |
*z_value = z_dst; |
*s_value = stencil_vals; |
} |
|
if (s_pass_mask) |
lp_build_mask_update(mask, s_pass_mask); |
|
if (depth->enabled && stencil[0].enabled) |
lp_build_mask_update(mask, z_pass); |
} |
|