Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Blame | Last modification | View Log | RSS feed

  1. /**************************************************************************
  2.  *
  3.  * Copyright 2013 VMware, Inc.
  4.  * All Rights Reserved.
  5.  *
  6.  * Permission is hereby granted, free of charge, to any person obtaining a
  7.  * copy of this software and associated documentation files (the
  8.  * "Software"), to deal in the Software without restriction, including
  9.  * without limitation the rights to use, copy, modify, merge, publish,
  10.  * distribute, sub license, and/or sell copies of the Software, and to
  11.  * permit persons to whom the Software is furnished to do so, subject to
  12.  * the following conditions:
  13.  *
  14.  * The above copyright notice and this permission notice (including the
  15.  * next paragraph) shall be included in all copies or substantial portions
  16.  * of the Software.
  17.  *
  18.  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19.  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20.  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21.  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  22.  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23.  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24.  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25.  *
  26.  **************************************************************************/
  27.  
  28.  
  29. /**
  30.  * @file
  31.  * Format conversion code for "special" float formats.
  32.  *
  33.  * @author Roland Scheidegger <sroland@vmware.com>
  34.  */
  35.  
  36.  
  37. #include "util/u_debug.h"
  38.  
  39. #include "lp_bld_type.h"
  40. #include "lp_bld_const.h"
  41. #include "lp_bld_arit.h"
  42. #include "lp_bld_bitarit.h"
  43. #include "lp_bld_logic.h"
  44. #include "lp_bld_format.h"
  45.  
  46.  
  /**
   * Convert float32 to a float-like value with less exponent and mantissa
   * bits. The exponent is still biased, and the mantissa still has an implied 1,
   * and there may be a sign bit.
   *
   * @param gallivm         code generation state; IR is emitted through
   *                        gallivm->builder
   * @param i32_type        int32 (vector) type used for the integer math and
   *                        the result; its length also sets the length of the
   *                        float32 vector type used internally
   * @param src             (vector) float value to convert
   * @param mantissa_bits   the number of mantissa bits
   * @param exponent_bits   the number of exponent bits
   * @param mantissa_start  the start position of the small float in result value
   * @param has_sign        if the small float has a sign bit
   *
   * @return (vector) int32 value holding the packed small float, shifted so
   *         that its mantissa starts at bit mantissa_start
   *
   * This implements round-towards-zero (trunc) hence too large numbers get
   * converted to largest representable number, not infinity.
   * Small numbers may get converted to denorms, depending on normal
   * float denorm handling of the cpu.
   * Note that compared to the references, below, we skip any rounding bias
   * since we do rounding towards zero - OpenGL allows rounding towards zero
   * (though not preferred) and DX10 even seems to require it.
   * Note that this will pack mantissa, exponent and sign bit (if any) together,
   * and shift the result to mantissa_start.
   *
   * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
   * ref https://gist.github.com/rygorous/2156668
   */
  LLVMValueRef
  lp_build_float_to_smallfloat(struct gallivm_state *gallivm,
                               struct lp_type i32_type,
                               LLVMValueRef src,
                               unsigned mantissa_bits,
                               unsigned exponent_bits,
                               unsigned mantissa_start,
                               boolean has_sign)
  {
     LLVMBuilderRef builder = gallivm->builder;
     LLVMValueRef i32_floatexpmask, i32_smallexpmask, magic, normal;
     LLVMValueRef rescale_src, i32_roundmask, small_max;
     LLVMValueRef i32_qnanbit, shift, res;
     LLVMValueRef is_nan_or_inf, nan_or_inf, mask, i32_src;
     struct lp_type f32_type = lp_type_float_vec(32, 32 * i32_type.length);
     struct lp_build_context f32_bld, i32_bld;
     LLVMValueRef zero = lp_build_const_vec(gallivm, f32_type, 0.0f);
     unsigned exponent_start = mantissa_start + mantissa_bits;
     /*
      * Code generation toggles. Both are hard-wired to true, so the
      * alternative branches guarded by them below are never taken.
      */
     boolean always_preserve_nans = true;
     boolean maybe_correct_denorm_rounding = true;

     lp_build_context_init(&f32_bld, gallivm, f32_type);
     lp_build_context_init(&i32_bld, gallivm, i32_type);

     /* all-ones small float exponent, placed at the float32 exponent position */
     i32_smallexpmask = lp_build_const_int_vec(gallivm, i32_type,
                                               ((1 << exponent_bits) - 1) << 23);
     /* float32 exponent mask (all 8 exponent bits set, mantissa and sign zero) */
     i32_floatexpmask = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23);

     /* raw bits of the unmodified source, used for the Inf check and the sign */
     i32_src = LLVMBuildBitCast(builder, src, i32_bld.vec_type, "");

     if (has_sign) {
        rescale_src = src;
     }
     else {
        /* clamp to pos range (can still have sign bit if NaN or negative zero) */
        rescale_src = lp_build_max(&f32_bld, zero, src);
     }
     rescale_src = LLVMBuildBitCast(builder, rescale_src, i32_bld.vec_type, "");

     /* "ordinary" number */
     /*
      * get rid of excess mantissa bits and sign bit
      * This is only really needed for correct rounding of denorms I think
      * but only if we use the preserve NaN path does using
      * src_abs instead save us any instruction.
      */
     if (maybe_correct_denorm_rounding || !always_preserve_nans) {
        /* clears the low (23 - mantissa_bits) mantissa bits and the sign bit */
        i32_roundmask = lp_build_const_int_vec(gallivm, i32_type,
                                               ~((1 << (23 - mantissa_bits)) - 1) &
                                               0x7fffffff);
        /*
         * NOTE(review): rescale_src was already cast to the int vector type
         * just above, so this cast looks redundant.
         */
        rescale_src = LLVMBuildBitCast(builder, rescale_src, i32_bld.vec_type, "");
        rescale_src = lp_build_and(&i32_bld, rescale_src, i32_roundmask);
        rescale_src = LLVMBuildBitCast(builder, rescale_src, f32_bld.vec_type, "");
     }
     else {
        /* not reached while maybe_correct_denorm_rounding is true */
        rescale_src = lp_build_abs(&f32_bld, src);
     }

     /* bias exponent (and denormalize if necessary) */
     /*
      * magic is a float whose exponent field is (1 << (exponent_bits - 1)) - 1
      * and whose mantissa is zero; the rescaling is done by a float multiply.
      */
     magic = lp_build_const_int_vec(gallivm, i32_type,
                                    ((1 << (exponent_bits - 1)) - 1) << 23);
     magic = LLVMBuildBitCast(builder, magic, f32_bld.vec_type, "");
     normal = lp_build_mul(&f32_bld, rescale_src, magic);

     /* clamp to max value - largest non-infinity number */
     /*
      * exponent field (1 << exponent_bits) - 2 with the top mantissa_bits
      * mantissa bits all set
      */
     small_max = lp_build_const_int_vec(gallivm, i32_type,
                                        (((1 << exponent_bits) - 2) << 23) |
                                        (((1 << mantissa_bits) - 1) << (23 - mantissa_bits)));
     small_max = LLVMBuildBitCast(builder, small_max, f32_bld.vec_type, "");
     normal = lp_build_min(&f32_bld, normal, small_max);
     normal = LLVMBuildBitCast(builder, normal, i32_bld.vec_type, "");

     /*
      * handle nan/inf cases
      * a little bit tricky since -Inf -> 0, +Inf -> +Inf, +-Nan -> +Nan
      * (for no sign), else -Inf -> -Inf too.
      * could use explicit "unordered" comparison checking for NaNs
      * which might save us from calculating src_abs too.
      * (Cannot actually save the comparison since we need to distinguish
      * Inf and NaN cases anyway, but it would be better for AVX.)
      */
     if (always_preserve_nans) {
        LLVMValueRef infcheck_src, is_inf, is_nan;
        LLVMValueRef src_abs = lp_build_abs(&f32_bld, src);
        src_abs = LLVMBuildBitCast(builder, src_abs, i32_bld.vec_type, "");

        /*
         * With a sign bit both +Inf and -Inf count as Inf (compare the
         * absolute value); without one only the raw +Inf bit pattern matches.
         */
        if (has_sign) {
           infcheck_src = src_abs;
        }
        else {
           infcheck_src = i32_src;
        }
        /* integer compares on the float bit patterns against the exponent mask */
        is_nan = lp_build_compare(gallivm, i32_type, PIPE_FUNC_GREATER,
                                  src_abs, i32_floatexpmask);
        is_inf = lp_build_compare(gallivm, i32_type, PIPE_FUNC_EQUAL,
                                  infcheck_src, i32_floatexpmask);
        is_nan_or_inf = lp_build_or(&i32_bld, is_nan, is_inf);
        /* could also set more mantissa bits but need at least the highest mantissa bit */
        i32_qnanbit = lp_build_const_vec(gallivm, i32_type, 1 << 22);
        /* combine maxexp with qnanbit */
        nan_or_inf = lp_build_or(&i32_bld, i32_smallexpmask,
                                 lp_build_and(&i32_bld, is_nan, i32_qnanbit));
     }
     else {
        /*
         * A couple simplifications, with mostly 2 drawbacks (so disabled):
         * - it will promote some SNaNs (those which only had bits set
         * in the mantissa part which got chopped off) to +-Infinity.
         * (Those bits get chopped off anyway later so can as well use
         * rescale_src instead of src_abs here saving the calculation of that.)
         * - for no sign case, it relies on the max() being used for rescale_src
         * to give back the NaN (which is NOT ieee754r behavior, but should work
         * with sse2 on a full moon (rather if I got the operand order right) -
         * we _don't_ have well-defined behavior specified with min/max wrt NaNs,
         * however, and if it gets converted to cmp/select it may not work (we
         * don't really have specified behavior for cmp wrt NaNs neither).
         */
        rescale_src = LLVMBuildBitCast(builder, rescale_src, i32_bld.vec_type, "");
        is_nan_or_inf = lp_build_compare(gallivm, i32_type, PIPE_FUNC_GEQUAL,
                                         rescale_src, i32_floatexpmask);
        /* note this will introduce excess exponent bits */
        nan_or_inf = rescale_src;
     }
     /* per element: take the NaN/Inf encoding where flagged, else the rescaled value */
     res = lp_build_select(&i32_bld, is_nan_or_inf, nan_or_inf, normal);

     /*
      * NOTE(review): skipped for mantissa_start == 0 on the NaN-preserving
      * path - presumably because the final right shift drops the low bits
      * anyway; confirm before relying on it.
      */
     if (mantissa_start > 0 || !always_preserve_nans) {
        /* mask off excess bits */
        unsigned maskbits = (1 << (mantissa_bits + exponent_bits)) - 1;
        mask = lp_build_const_int_vec(gallivm, i32_type,
                                      maskbits << (23 - mantissa_bits));
        res = lp_build_and(&i32_bld, res, mask);
     }

     /* add back sign bit at right position */
     if (has_sign) {
        LLVMValueRef sign;
        /* unsigned context for the sign shift, presumably to get a logical shift */
        struct lp_type u32_type = lp_type_uint_vec(32, 32 * i32_type.length);
        struct lp_build_context u32_bld;
        lp_build_context_init(&u32_bld, gallivm, u32_type);

        /* move the sign from bit 31 down to bit 23 + exponent_bits */
        mask = lp_build_const_int_vec(gallivm, i32_type, 0x80000000);
        shift = lp_build_const_int_vec(gallivm, i32_type, 8 - exponent_bits);
        sign = lp_build_and(&i32_bld, mask, i32_src);
        sign = lp_build_shr(&u32_bld, sign, shift);
        res = lp_build_or(&i32_bld, sign, res);
     }

     /* shift to final position */
     /* the exponent currently sits at bit 23; move it to exponent_start */
     if (exponent_start < 23) {
        shift = lp_build_const_int_vec(gallivm, i32_type, 23 - exponent_start);
        res = lp_build_shr(&i32_bld, res, shift);
     }
     else {
        shift = lp_build_const_int_vec(gallivm, i32_type, exponent_start - 23);
        res = lp_build_shl(&i32_bld, res, shift);
     }
     return res;
  }
  229.  
  230.  
  231. /**
  232.  * Convert rgba float SoA values to packed r11g11b10 values.
  233.  *
  234.  * @param src   SoA float (vector) values to convert.
  235.  */
  236. LLVMValueRef
  237. lp_build_float_to_r11g11b10(struct gallivm_state *gallivm,
  238.                             LLVMValueRef *src)
  239. {
  240.    LLVMValueRef dst, rcomp, bcomp, gcomp;
  241.    struct lp_build_context i32_bld;
  242.    LLVMTypeRef src_type = LLVMTypeOf(*src);
  243.    unsigned src_length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
  244.                             LLVMGetVectorSize(src_type) : 1;
  245.    struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);
  246.  
  247.    lp_build_context_init(&i32_bld, gallivm, i32_type);
  248.  
  249.    /* "rescale" and put in right position */
  250.    rcomp = lp_build_float_to_smallfloat(gallivm, i32_type, src[0], 6, 5, 0, false);
  251.    gcomp = lp_build_float_to_smallfloat(gallivm, i32_type, src[1], 6, 5, 11, false);
  252.    bcomp = lp_build_float_to_smallfloat(gallivm, i32_type, src[2], 5, 5, 22, false);
  253.  
  254.    /* combine the values */
  255.    dst = lp_build_or(&i32_bld, rcomp, gcomp);
  256.    return lp_build_or(&i32_bld, dst, bcomp);
  257. }
  258.  
  259.  
  260. /**
  261.  * Convert a float-like value with less exponent and mantissa
  262.  * bits than a normal float32 to a float32. The mantissa of
  263.  * the source value is assumed to have an implied 1, and the exponent
  264.  * is biased. There may be a sign bit.
  265.  * The source value to extract must be in a 32bit int (bits not part of
  266.  * the value to convert will be masked off).
  267.  * This works for things like 11-bit floats or half-floats,
  268.  * mantissa, exponent (and sign if present) must be packed
  269.  * the same as they are in a ordinary float.
  270.  *
  271.  * @param src             (vector) value to convert
  272.  * @param mantissa_bits   the number of mantissa bits
  273.  * @param exponent_bits   the number of exponent bits
  274.  * @param mantissa_start  the bit start position of the packed component
  275.  * @param has_sign        if the small float has a sign bit
  276.  *
  277.  * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
  278.  * ref https://gist.github.com/rygorous/2156668
  279.  */
  280. LLVMValueRef
  281. lp_build_smallfloat_to_float(struct gallivm_state *gallivm,
  282.                              struct lp_type f32_type,
  283.                              LLVMValueRef src,
  284.                              unsigned mantissa_bits,
  285.                              unsigned exponent_bits,
  286.                              unsigned mantissa_start,
  287.                              boolean has_sign)
  288. {
  289.    LLVMBuilderRef builder = gallivm->builder;
  290.    LLVMValueRef smallexpmask, i32_floatexpmask, magic;
  291.    LLVMValueRef wasinfnan, tmp, res, shift, maskabs, srcabs, sign;
  292.    unsigned exponent_start = mantissa_start + mantissa_bits;
  293.    struct lp_type i32_type = lp_type_int_vec(32, 32 * f32_type.length);
  294.    struct lp_build_context f32_bld, i32_bld;
  295.  
  296.    lp_build_context_init(&f32_bld, gallivm, f32_type);
  297.    lp_build_context_init(&i32_bld, gallivm, i32_type);
  298.  
  299.    /* extract the component to "float position" */
  300.    if (exponent_start < 23) {
  301.       shift = lp_build_const_int_vec(gallivm, i32_type, 23 - exponent_start);
  302.       src = lp_build_shl(&i32_bld, src, shift);
  303.    }
  304.    else {
  305.       shift = lp_build_const_int_vec(gallivm, i32_type, exponent_start - 23);
  306.       src = lp_build_shr(&i32_bld, src, shift);
  307.    }
  308.    maskabs = lp_build_const_int_vec(gallivm, i32_type,
  309.                                     ((1 << (mantissa_bits + exponent_bits)) - 1)
  310.                                     << (23 - mantissa_bits));
  311.    srcabs = lp_build_and(&i32_bld, src, maskabs);
  312.    srcabs = LLVMBuildBitCast(builder, srcabs, f32_bld.vec_type, "");
  313.  
  314.    /* now do the actual scaling */
  315.    smallexpmask = lp_build_const_int_vec(gallivm, i32_type,
  316.                                          ((1 << exponent_bits) - 1) << 23);
  317.    i32_floatexpmask = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23);
  318.    /*
  319.     * magic number has exponent new exp bias + (new exp bias - old exp bias),
  320.     * mantissa is 0.
  321.     */
  322.    magic = lp_build_const_int_vec(gallivm, i32_type,
  323.                                   (255 - (1 << (exponent_bits - 1))) << 23);
  324.    magic = LLVMBuildBitCast(builder, magic, f32_bld.vec_type, "");
  325.  
  326.    /* adjust exponent and fix denorms */
  327.    res = lp_build_mul(&f32_bld, srcabs, magic);
  328.  
  329.    /*
  330.     * if exp was max (== NaN or Inf) set new exp to max (keep mantissa),
  331.     * so a simple "or" will do (because exp adjust will leave mantissa intact)
  332.     */
  333.    /* use float compare (better for AVX 8-wide / no AVX2 but else should use int) */
  334.    smallexpmask = LLVMBuildBitCast(builder, smallexpmask, f32_bld.vec_type, "");
  335.    wasinfnan = lp_build_compare(gallivm, f32_type, PIPE_FUNC_GEQUAL, srcabs, smallexpmask);
  336.    res = LLVMBuildBitCast(builder, res, i32_bld.vec_type, "");
  337.    tmp = lp_build_and(&i32_bld, i32_floatexpmask, wasinfnan);
  338.    res = lp_build_or(&i32_bld, tmp, res);
  339.  
  340.    if (has_sign) {
  341.       LLVMValueRef signmask = lp_build_const_int_vec(gallivm, i32_type, 0x80000000);
  342.       shift = lp_build_const_int_vec(gallivm, i32_type, 8 - exponent_bits);
  343.       sign = lp_build_shl(&i32_bld, src, shift);
  344.       sign = lp_build_and(&i32_bld, signmask, sign);
  345.       res = lp_build_or(&i32_bld, res, sign);
  346.    }
  347.  
  348.    return LLVMBuildBitCast(builder, res, f32_bld.vec_type, "");
  349. }
  350.  
  351.  
  352. /**
  353.  * Convert packed float format (r11g11b10) value(s) to rgba float SoA values.
  354.  *
  355.  * @param src   packed AoS r11g11b10 values (as (vector) int32)
  356.  * @param dst   pointer to the SoA result values
  357.  */
  358. void
  359. lp_build_r11g11b10_to_float(struct gallivm_state *gallivm,
  360.                             LLVMValueRef src,
  361.                             LLVMValueRef *dst)
  362. {
  363.    LLVMTypeRef src_type = LLVMTypeOf(src);
  364.    unsigned src_length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
  365.                             LLVMGetVectorSize(src_type) : 1;
  366.    struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length);
  367.  
  368.    dst[0] = lp_build_smallfloat_to_float(gallivm, f32_type, src, 6, 5, 0, false);
  369.    dst[1] = lp_build_smallfloat_to_float(gallivm, f32_type, src, 6, 5, 11, false);
  370.    dst[2] = lp_build_smallfloat_to_float(gallivm, f32_type, src, 5, 5, 22, false);
  371.  
  372.    /* Just set alpha to one */
  373.    dst[3] = lp_build_one(gallivm, f32_type);
  374. }
  375.  
  376.  
  377. static LLVMValueRef
  378. lp_build_rgb9_to_float_helper(struct gallivm_state *gallivm,
  379.                               struct lp_type f32_type,
  380.                               LLVMValueRef src,
  381.                               LLVMValueRef scale,
  382.                               unsigned mantissa_start)
  383. {
  384.    LLVMValueRef shift, mask;
  385.  
  386.    struct lp_type i32_type = lp_type_int_vec(32, 32 * f32_type.length);
  387.    struct lp_build_context i32_bld, f32_bld;
  388.  
  389.    lp_build_context_init(&i32_bld, gallivm, i32_type);
  390.    lp_build_context_init(&f32_bld, gallivm, f32_type);
  391.  
  392.    /*
  393.     * This is much easier as other weirdo float formats, since
  394.     * there's no sign, no Inf/NaN, and there's nothing special
  395.     * required for normals/denormals neither (as without the implied one
  396.     * for the mantissa for other formats, everything looks like a denormal).
  397.     * So just do (float)comp_bits * scale
  398.     */
  399.    shift = lp_build_const_int_vec(gallivm, i32_type, mantissa_start);
  400.    mask = lp_build_const_int_vec(gallivm, i32_type, 0x1ff);
  401.    src = lp_build_shr(&i32_bld, src, shift);
  402.    src = lp_build_and(&i32_bld, src, mask);
  403.    src = lp_build_int_to_float(&f32_bld, src);
  404.    return lp_build_mul(&f32_bld, src, scale);
  405. }
  406.  
  407.  
  408. /**
  409.  * Convert shared exponent format (rgb9e5) value(s) to rgba float SoA values.
  410.  *
  411.  * @param src   packed AoS rgb9e5 values (as (vector) int32)
  412.  * @param dst   pointer to the SoA result values
  413.  */
  414. void
  415. lp_build_rgb9e5_to_float(struct gallivm_state *gallivm,
  416.                          LLVMValueRef src,
  417.                          LLVMValueRef *dst)
  418. {
  419.    LLVMBuilderRef builder = gallivm->builder;
  420.    LLVMTypeRef src_type = LLVMTypeOf(src);
  421.    LLVMValueRef shift, scale, bias, exp;
  422.    unsigned src_length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
  423.                             LLVMGetVectorSize(src_type) : 1;
  424.    struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);
  425.    struct lp_type u32_type = lp_type_uint_vec(32, 32 * src_length);
  426.    struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length);
  427.    struct lp_build_context i32_bld, u32_bld, f32_bld;
  428.  
  429.    lp_build_context_init(&i32_bld, gallivm, i32_type);
  430.    lp_build_context_init(&u32_bld, gallivm, u32_type);
  431.    lp_build_context_init(&f32_bld, gallivm, f32_type);
  432.  
  433.    /* extract exponent */
  434.    shift = lp_build_const_int_vec(gallivm, i32_type, 27);
  435.    /* this shift needs to be unsigned otherwise need mask */
  436.    exp = lp_build_shr(&u32_bld, src, shift);
  437.  
  438.    /*
  439.     * scale factor is 2 ^ (exp - bias)
  440.     * (and additionally corrected here for the mantissa bits)
  441.     * not using shift because
  442.     * a) don't have vector shift in a lot of cases
  443.     * b) shift direction changes hence need 2 shifts + conditional
  444.     *    (or rotate instruction which is even more rare (for instance XOP))
  445.     * so use whacky float 2 ^ function instead manipulating exponent
  446.     * (saves us the float conversion at the end too)
  447.     */
  448.    bias = lp_build_const_int_vec(gallivm, i32_type, 127 - (15 + 9));
  449.    scale = lp_build_add(&i32_bld, exp, bias);
  450.    shift = lp_build_const_int_vec(gallivm, i32_type, 23);
  451.    scale = lp_build_shl(&i32_bld, scale, shift);
  452.    scale = LLVMBuildBitCast(builder, scale, f32_bld.vec_type, "");
  453.  
  454.    dst[0] = lp_build_rgb9_to_float_helper(gallivm, f32_type, src, scale, 0);
  455.    dst[1] = lp_build_rgb9_to_float_helper(gallivm, f32_type, src, scale, 9);
  456.    dst[2] = lp_build_rgb9_to_float_helper(gallivm, f32_type, src, scale, 18);
  457.  
  458.    /* Just set alpha to one */
  459.    dst[3] = f32_bld.one;
  460. }
  461.