Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Blame | Last modification | View Log | RSS feed

  1. /**************************************************************************
  2.  *
  3.  * Copyright 2013 VMware, Inc.
  4.  * All Rights Reserved.
  5.  *
  6.  * Permission is hereby granted, free of charge, to any person obtaining a
  7.  * copy of this software and associated documentation files (the
  8.  * "Software"), to deal in the Software without restriction, including
  9.  * without limitation the rights to use, copy, modify, merge, publish,
  10.  * distribute, sub license, and/or sell copies of the Software, and to
  11.  * permit persons to whom the Software is furnished to do so, subject to
  12.  * the following conditions:
  13.  *
  14.  * The above copyright notice and this permission notice (including the
  15.  * next paragraph) shall be included in all copies or substantial portions
  16.  * of the Software.
  17.  *
  18.  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19.  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20.  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21.  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  22.  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23.  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24.  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25.  *
  26.  **************************************************************************/
  27.  
  28.  
  29. /**
  30.  * @file
  31.  * Format conversion code for srgb formats.
  32.  *
  33.  * Functions for converting from srgb to linear and vice versa.
  34.  * From http://www.opengl.org/registry/specs/EXT/texture_sRGB.txt:
  35.  *
  36.  * srgb->linear:
  37.  * cl = cs / 12.92,                 cs <= 0.04045
  38.  * cl = ((cs + 0.055)/1.055)^2.4,   cs >  0.04045
  39.  *
  40.  * linear->srgb:
  41.  * if (isnan(cl)) {
  42.  *    Map IEEE-754 Not-a-number to zero.
  43.  *    cs = 0.0;
  44.  * } else if (cl > 1.0) {
  45.  *    cs = 1.0;
  46.  * } else if (cl < 0.0) {
  47.  *    cs = 0.0;
  48.  * } else if (cl < 0.0031308) {
  49.  *    cs = 12.92 * cl;
  50.  * } else {
  51.  *    cs = 1.055 * pow(cl, 0.41666) - 0.055;
  52.  * }
  53.  *
  54.  * This does not need to be accurate, however at least for d3d10
  55.  * (http://msdn.microsoft.com/en-us/library/windows/desktop/dd607323%28v=vs.85%29.aspx):
  56.  * 1) For srgb->linear, it is required that the error on the srgb side is
  57.  *    not larger than 0.5f, which I interpret that if you map the value back
  58.  *    to srgb from linear using the ideal conversion, it would not be off by
  59.  *    more than 0.5f (that is, it would map to the same 8-bit integer value
  60.  *    as it was before conversion to linear).
  61.  * 2) linear->srgb is permitted 0.6f which luckily looks like quite a large
  62.  *    error is allowed.
  63.  * 3) Additionally, all srgb values converted to linear and back must result
  64.  *    in the same value as they were originally.
  65.  *
  66.  * @author Roland Scheidegger <sroland@vmware.com>
  67.  */
  68.  
  69.  
  70. #include "util/u_debug.h"
  71.  
  72. #include "lp_bld_type.h"
  73. #include "lp_bld_const.h"
  74. #include "lp_bld_arit.h"
  75. #include "lp_bld_bitarit.h"
  76. #include "lp_bld_logic.h"
  77. #include "lp_bld_format.h"
  78.  
  79.  
  80.  
  81. /**
  82.  * Convert srgb int values to linear float values.
  83.  * Several possibilities how to do this, e.g.
  84.  * - table
  85.  * - doing the pow() with int-to-float and float-to-int tricks
  86.  *   (http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent)
  87.  * - just using standard polynomial approximation
  88.  *   (3rd order polynomial is required for crappy but just sufficient accuracy)
  89.  *
  90.  * @param src   integer (vector) value(s) to convert
  91.  *              (8 bit values unpacked to 32 bit already).
  92.  */
  93. LLVMValueRef
  94. lp_build_srgb_to_linear(struct gallivm_state *gallivm,
  95.                         struct lp_type src_type,
  96.                         LLVMValueRef src)
  97. {
  98.    struct lp_type f32_type = lp_type_float_vec(32, src_type.length * 32);
  99.    struct lp_build_context f32_bld;
  100.    LLVMValueRef srcf, part_lin, part_pow, is_linear, lin_const, lin_thresh;
  101.    double coeffs[4] = {0.0023f,
  102.                        0.0030f / 255.0f,
  103.                        0.6935f / (255.0f * 255.0f),
  104.                        0.3012f / (255.0f * 255.0f * 255.0f)
  105.    };
  106.  
  107.    assert(src_type.width == 32);
  108.  
  109.    lp_build_context_init(&f32_bld, gallivm, f32_type);
  110.  
  111.    /*
  112.     * using polynomial: (src * (src * (src * 0.3012 + 0.6935) + 0.0030) + 0.0023)
  113.     * ( poly =  0.3012*x^3 + 0.6935*x^2 + 0.0030*x + 0.0023)
  114.     * (found with octave polyfit and some magic as I couldn't get the error
  115.     * function right). Using the above mentioned error function, the values stay
  116.     * within +-0.35, except for the lowest values - hence tweaking linear segment
  117.     * to cover the first 16 instead of the first 11 values (the error stays
  118.     * just about acceptable there too).
  119.     * Hence: lin = src > 15 ? poly : src / 12.6
  120.     * This function really only makes sense for vectors, should use LUT otherwise.
  121.     * All in all (including float conversion) 11 instructions (with sse4.1),
  122.     * 6 constants (polynomial could be done with 1 instruction less at the cost
  123.     * of slightly worse dependency chain, fma should also help).
  124.     */
  125.    /* doing the 1/255 mul as part of the approximation */
  126.    srcf = lp_build_int_to_float(&f32_bld, src);
  127.    lin_const = lp_build_const_vec(gallivm, f32_type, 1.0f / (12.6f * 255.0f));
  128.    part_lin = lp_build_mul(&f32_bld, srcf, lin_const);
  129.  
  130.    part_pow = lp_build_polynomial(&f32_bld, srcf, coeffs, 4);
  131.  
  132.    lin_thresh = lp_build_const_vec(gallivm, f32_type, 15.0f);
  133.    is_linear = lp_build_compare(gallivm, f32_type, PIPE_FUNC_LEQUAL, srcf, lin_thresh);
  134.    return lp_build_select(&f32_bld, is_linear, part_lin, part_pow);
  135. }
  136.  
  137.  
  138. /**
  139.  * Convert linear float values to srgb int values.
  140.  * Several possibilities how to do this, e.g.
  141.  * - use table (based on exponent/highest order mantissa bits) and do
  142.  *   linear interpolation (https://gist.github.com/rygorous/2203834)
  143.  * - Chebyshev polynomial
  144.  * - Approximation using reciprocals
  145.  * - using int-to-float and float-to-int tricks for pow()
  146.  *   (http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent)
  147.  *
  148.  * @param src   float (vector) value(s) to convert.
  149.  */
  150. static LLVMValueRef
  151. lp_build_linear_to_srgb(struct gallivm_state *gallivm,
  152.                         struct lp_type src_type,
  153.                         LLVMValueRef src)
  154. {
  155.    LLVMBuilderRef builder = gallivm->builder;
  156.    struct lp_build_context f32_bld;
  157.    LLVMValueRef lin_thresh, lin, lin_const, is_linear, tmp, pow_final;
  158.  
  159.    lp_build_context_init(&f32_bld, gallivm, src_type);
  160.  
  161.    src = lp_build_clamp(&f32_bld, src, f32_bld.zero, f32_bld.one);
  162.  
  163.    if (0) {
  164.       /*
  165.        * using int-to-float and float-to-int trick for pow().
  166.        * This is much more accurate than necessary thanks to the correction,
  167.        * but it most certainly makes no sense without rsqrt available.
  168.        * Bonus points if you understand how this works...
  169.        * All in all (including min/max clamp, conversion) 19 instructions.
  170.        */
  171.  
  172.       float exp_f = 2.0f / 3.0f;
  173.       /* some compilers can't do exp2f, so this is exp2f(127.0f/exp_f - 127.0f) */
  174.       float exp2f_c = 1.30438178253e+19f;
  175.       float coeff_f = 0.62996f;
  176.       LLVMValueRef pow_approx, coeff, x2, exponent, pow_1, pow_2;
  177.       struct lp_type int_type = lp_int_type(src_type);
  178.  
  179.       /*
  180.        * First calculate approx x^8/12
  181.        */
  182.       exponent = lp_build_const_vec(gallivm, src_type, exp_f);
  183.       coeff = lp_build_const_vec(gallivm, src_type,
  184.                                  exp2f_c * powf(coeff_f, 1.0f / exp_f));
  185.  
  186.       /* premultiply src */
  187.       tmp = lp_build_mul(&f32_bld, coeff, src);
  188.       /* "log2" */
  189.       tmp = LLVMBuildBitCast(builder, tmp, lp_build_vec_type(gallivm, int_type), "");
  190.       tmp = lp_build_int_to_float(&f32_bld, tmp);
  191.       /* multiply for pow */
  192.       tmp = lp_build_mul(&f32_bld, tmp, exponent);
  193.       /* "exp2" */
  194.       pow_approx = lp_build_itrunc(&f32_bld, tmp);
  195.       pow_approx = LLVMBuildBitCast(builder, pow_approx,
  196.                                     lp_build_vec_type(gallivm, src_type), "");
  197.  
  198.       /*
  199.        * Since that pow was inaccurate (like 3 bits, though each sqrt step would
  200.        * give another bit), compensate the error (which is why we chose another
  201.        * exponent in the first place).
  202.        */
  203.       /* x * x^(8/12) = x^(20/12) */
  204.       pow_1 = lp_build_mul(&f32_bld, pow_approx, src);
  205.  
  206.       /* x * x * x^(-4/12) = x^(20/12) */
  207.       /* Should avoid using rsqrt if it's not available, but
  208.        * using x * x^(4/12) * x^(4/12) instead will change error weight */
  209.       tmp = lp_build_fast_rsqrt(&f32_bld, pow_approx);
  210.       x2 = lp_build_mul(&f32_bld, src, src);
  211.       pow_2 = lp_build_mul(&f32_bld, x2, tmp);
  212.  
  213.       /* average the values so the errors cancel out, compensate bias,
  214.        * we also squeeze the 1.055 mul of the srgb conversion plus the 255.0 mul
  215.        * for conversion to int in here */
  216.       tmp = lp_build_add(&f32_bld, pow_1, pow_2);
  217.       coeff = lp_build_const_vec(gallivm, src_type,
  218.                                  1.0f / (3.0f * coeff_f) * 0.999852f *
  219.                                  powf(1.055f * 255.0f, 4.0f));
  220.       pow_final = lp_build_mul(&f32_bld, tmp, coeff);
  221.  
  222.       /* x^(5/12) = rsqrt(rsqrt(x^20/12)) */
  223.       if (lp_build_fast_rsqrt_available(src_type)) {
  224.          pow_final = lp_build_fast_rsqrt(&f32_bld,
  225.                         lp_build_fast_rsqrt(&f32_bld, pow_final));
  226.       }
  227.       else {
  228.          pow_final = lp_build_sqrt(&f32_bld, lp_build_sqrt(&f32_bld, pow_final));
  229.       }
  230.       pow_final = lp_build_add(&f32_bld, pow_final,
  231.                                lp_build_const_vec(gallivm, src_type, -0.055f * 255.0f));
  232.    }
  233.  
  234.    else {
  235.       /*
  236.        * using "rational polynomial" approximation here.
  237.        * Essentially y = a*x^0.375 + b*x^0.5 + c, with also
  238.        * factoring in the 255.0 mul and the scaling mul.
  239.        * (a is closer to actual value so has higher weight than b.)
  240.        * Note: the constants are magic values. They were found empirically,
  241.        * possibly could be improved but good enough (be VERY careful with
  242.        * error metric if you'd want to tweak them, they also MUST fit with
  243.        * the crappy polynomial above for srgb->linear since it is required
  244.        * that each srgb value maps back to the same value).
  245.        * This function has an error of max +-0.17 (and we'd only require +-0.6),
  246.        * for the approximated srgb->linear values the error is naturally larger
  247.        * (+-0.42) but still accurate enough (required +-0.5 essentially).
  248.        * All in all (including min/max clamp, conversion) 15 instructions.
  249.        * FMA would help (minus 2 instructions).
  250.        */
  251.  
  252.       LLVMValueRef x05, x0375, a_const, b_const, c_const, tmp2;
  253.  
  254.       if (lp_build_fast_rsqrt_available(src_type)) {
  255.          tmp = lp_build_fast_rsqrt(&f32_bld, src);
  256.          x05 = lp_build_mul(&f32_bld, src, tmp);
  257.       }
  258.       else {
  259.          /*
  260.           * I don't really expect this to be practical without rsqrt
  261.           * but there's no reason for triple punishment so at least
  262.           * save the otherwise resulting division and unnecessary mul...
  263.           */
  264.          x05 = lp_build_sqrt(&f32_bld, src);
  265.       }
  266.  
  267.       tmp = lp_build_mul(&f32_bld, x05, src);
  268.       if (lp_build_fast_rsqrt_available(src_type)) {
  269.          x0375 = lp_build_fast_rsqrt(&f32_bld, lp_build_fast_rsqrt(&f32_bld, tmp));
  270.       }
  271.       else {
  272.          x0375 = lp_build_sqrt(&f32_bld, lp_build_sqrt(&f32_bld, tmp));
  273.       }
  274.  
  275.       a_const = lp_build_const_vec(gallivm, src_type, 0.675f * 1.0622 * 255.0f);
  276.       b_const = lp_build_const_vec(gallivm, src_type, 0.325f * 1.0622 * 255.0f);
  277.       c_const = lp_build_const_vec(gallivm, src_type, -0.0620f * 255.0f);
  278.  
  279.       tmp = lp_build_mul(&f32_bld, a_const, x0375);
  280.       tmp2 = lp_build_mul(&f32_bld, b_const, x05);
  281.       tmp2 = lp_build_add(&f32_bld, tmp2, c_const);
  282.       pow_final = lp_build_add(&f32_bld, tmp, tmp2);
  283.    }
  284.  
  285.    /* linear part is easy */
  286.    lin_const = lp_build_const_vec(gallivm, src_type, 12.92f * 255.0f);
  287.    lin = lp_build_mul(&f32_bld, src, lin_const);
  288.  
  289.    lin_thresh = lp_build_const_vec(gallivm, src_type, 0.0031308f);
  290.    is_linear = lp_build_compare(gallivm, src_type, PIPE_FUNC_LEQUAL, src, lin_thresh);
  291.    tmp = lp_build_select(&f32_bld, is_linear, lin, pow_final);
  292.  
  293.    f32_bld.type.sign = 0;
  294.    return lp_build_iround(&f32_bld, tmp);
  295. }
  296.  
  297.  
  298. /**
  299.  * Convert linear float soa values to packed srgb AoS values.
  300.  * This only handles packed formats which are 4x8bit in size
  301.  * (rgba and rgbx plus swizzles).
  302.  *
  303.  * @param src   float SoA (vector) values to convert.
  304.  */
  305. LLVMValueRef
  306. lp_build_float_to_srgb_packed(struct gallivm_state *gallivm,
  307.                               const struct util_format_description *dst_fmt,
  308.                               struct lp_type src_type,
  309.                               LLVMValueRef *src)
  310. {
  311.    LLVMBuilderRef builder = gallivm->builder;
  312.    unsigned chan;
  313.    struct lp_build_context f32_bld;
  314.    struct lp_type int32_type = lp_int_type(src_type);
  315.    LLVMValueRef tmpsrgb[4], alpha, dst;
  316.  
  317.    lp_build_context_init(&f32_bld, gallivm, src_type);
  318.  
  319.    /* rgb is subject to linear->srgb conversion, alpha is not */
  320.    for (chan = 0; chan < 3; chan++) {
  321.       tmpsrgb[chan] = lp_build_linear_to_srgb(gallivm, src_type, src[chan]);
  322.    }
  323.    /*
  324.     * can't use lp_build_conv since we want to keep values as 32bit
  325.     * here so we can interleave with rgb to go from SoA->AoS.
  326.     */
  327.    alpha = lp_build_clamp(&f32_bld, src[3], f32_bld.zero, f32_bld.one);
  328.    alpha = lp_build_mul(&f32_bld, alpha,
  329.                         lp_build_const_vec(gallivm, src_type, 255.0f));
  330.    tmpsrgb[3] = lp_build_iround(&f32_bld, alpha);
  331.  
  332.    dst = lp_build_zero(gallivm, int32_type);
  333.    for (chan = 0; chan < dst_fmt->nr_channels; chan++) {
  334.       if (dst_fmt->swizzle[chan] <= UTIL_FORMAT_SWIZZLE_W) {
  335.          unsigned ls;
  336.          LLVMValueRef shifted, shift_val;
  337.          ls = dst_fmt->channel[dst_fmt->swizzle[chan]].shift;
  338.          shift_val = lp_build_const_int_vec(gallivm, int32_type, ls);
  339.          shifted = LLVMBuildShl(builder, tmpsrgb[chan], shift_val, "");
  340.          dst = LLVMBuildOr(builder, dst, shifted, "");
  341.       }
  342.    }
  343.    return dst;
  344. }
  345.