Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
5563 | serge | 1 | /************************************************************************** |
2 | * |
||
3 | * Copyright 2013 VMware, Inc. |
||
4 | * All Rights Reserved. |
||
5 | * |
||
6 | * Permission is hereby granted, free of charge, to any person obtaining a |
||
7 | * copy of this software and associated documentation files (the |
||
8 | * "Software"), to deal in the Software without restriction, including |
||
9 | * without limitation the rights to use, copy, modify, merge, publish, |
||
10 | * distribute, sub license, and/or sell copies of the Software, and to |
||
11 | * permit persons to whom the Software is furnished to do so, subject to |
||
12 | * the following conditions: |
||
13 | * |
||
14 | * The above copyright notice and this permission notice (including the |
||
15 | * next paragraph) shall be included in all copies or substantial portions |
||
16 | * of the Software. |
||
17 | * |
||
18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS |
||
19 | * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
||
20 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. |
||
21 | * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR |
||
22 | * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, |
||
23 | * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE |
||
24 | * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
||
25 | * |
||
26 | **************************************************************************/ |
||
27 | |||
28 | |||
29 | /** |
||
30 | * @file |
||
31 | * Format conversion code for srgb formats. |
||
32 | * |
||
33 | * Functions for converting from srgb to linear and vice versa. |
||
34 | * From http://www.opengl.org/registry/specs/EXT/texture_sRGB.txt: |
||
35 | * |
||
36 | * srgb->linear: |
||
37 | * cl = cs / 12.92, cs <= 0.04045 |
||
38 | * cl = ((cs + 0.055)/1.055)^2.4, cs > 0.04045 |
||
39 | * |
||
40 | * linear->srgb: |
||
41 | * if (isnan(cl)) { |
||
42 | * Map IEEE-754 Not-a-number to zero. |
||
43 | * cs = 0.0; |
||
44 | * } else if (cl > 1.0) { |
||
45 | * cs = 1.0; |
||
46 | * } else if (cl < 0.0) { |
||
47 | * cs = 0.0; |
||
48 | * } else if (cl < 0.0031308) { |
||
49 | * cs = 12.92 * cl; |
||
50 | * } else { |
||
51 | * cs = 1.055 * pow(cl, 0.41666) - 0.055; |
||
52 | * } |
||
53 | * |
||
54 | * This does not need to be accurate, however at least for d3d10 |
||
55 | * (http://msdn.microsoft.com/en-us/library/windows/desktop/dd607323%28v=vs.85%29.aspx): |
||
56 | * 1) For srgb->linear, it is required that the error on the srgb side is |
||
57 | * not larger than 0.5f, which I interpret that if you map the value back |
||
58 | * to srgb from linear using the ideal conversion, it would not be off by |
||
59 | * more than 0.5f (that is, it would map to the same 8-bit integer value |
||
60 | * as it was before conversion to linear). |
||
61 | * 2) linear->srgb is permitted 0.6f which luckily looks like quite a large |
||
62 | * error is allowed. |
||
63 | * 3) Additionally, all srgb values converted to linear and back must result |
||
64 | * in the same value as they were originally. |
||
65 | * |
||
66 | * @author Roland Scheidegger |
||
67 | */ |
||
68 | |||
69 | |||
70 | #include "util/u_debug.h" |
||
71 | |||
72 | #include "lp_bld_type.h" |
||
73 | #include "lp_bld_const.h" |
||
74 | #include "lp_bld_arit.h" |
||
75 | #include "lp_bld_bitarit.h" |
||
76 | #include "lp_bld_logic.h" |
||
77 | #include "lp_bld_format.h" |
||
78 | |||
79 | |||
80 | |||
81 | /** |
||
82 | * Convert srgb int values to linear float values. |
||
83 | * Several possibilities how to do this, e.g. |
||
84 | * - table |
||
85 | * - doing the pow() with int-to-float and float-to-int tricks |
||
86 | * (http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent) |
||
87 | * - just using standard polynomial approximation |
||
88 | * (3rd order polynomial is required for crappy but just sufficient accuracy) |
||
89 | * |
||
90 | * @param src integer (vector) value(s) to convert |
||
91 | * (8 bit values unpacked to 32 bit already). |
||
92 | */ |
||
93 | LLVMValueRef |
||
94 | lp_build_srgb_to_linear(struct gallivm_state *gallivm, |
||
95 | struct lp_type src_type, |
||
96 | LLVMValueRef src) |
||
97 | { |
||
98 | struct lp_type f32_type = lp_type_float_vec(32, src_type.length * 32); |
||
99 | struct lp_build_context f32_bld; |
||
100 | LLVMValueRef srcf, part_lin, part_pow, is_linear, lin_const, lin_thresh; |
||
101 | double coeffs[4] = {0.0023f, |
||
102 | 0.0030f / 255.0f, |
||
103 | 0.6935f / (255.0f * 255.0f), |
||
104 | 0.3012f / (255.0f * 255.0f * 255.0f) |
||
105 | }; |
||
106 | |||
107 | assert(src_type.width == 32); |
||
108 | |||
109 | lp_build_context_init(&f32_bld, gallivm, f32_type); |
||
110 | |||
111 | /* |
||
112 | * using polynomial: (src * (src * (src * 0.3012 + 0.6935) + 0.0030) + 0.0023) |
||
113 | * ( poly = 0.3012*x^3 + 0.6935*x^2 + 0.0030*x + 0.0023) |
||
114 | * (found with octave polyfit and some magic as I couldn't get the error |
||
115 | * function right). Using the above mentioned error function, the values stay |
||
116 | * within +-0.35, except for the lowest values - hence tweaking linear segment |
||
117 | * to cover the first 16 instead of the first 11 values (the error stays |
||
118 | * just about acceptable there too). |
||
119 | * Hence: lin = src > 15 ? poly : src / 12.6 |
||
120 | * This function really only makes sense for vectors, should use LUT otherwise. |
||
121 | * All in all (including float conversion) 11 instructions (with sse4.1), |
||
122 | * 6 constants (polynomial could be done with 1 instruction less at the cost |
||
123 | * of slightly worse dependency chain, fma should also help). |
||
124 | */ |
||
125 | /* doing the 1/255 mul as part of the approximation */ |
||
126 | srcf = lp_build_int_to_float(&f32_bld, src); |
||
127 | lin_const = lp_build_const_vec(gallivm, f32_type, 1.0f / (12.6f * 255.0f)); |
||
128 | part_lin = lp_build_mul(&f32_bld, srcf, lin_const); |
||
129 | |||
130 | part_pow = lp_build_polynomial(&f32_bld, srcf, coeffs, 4); |
||
131 | |||
132 | lin_thresh = lp_build_const_vec(gallivm, f32_type, 15.0f); |
||
133 | is_linear = lp_build_compare(gallivm, f32_type, PIPE_FUNC_LEQUAL, srcf, lin_thresh); |
||
134 | return lp_build_select(&f32_bld, is_linear, part_lin, part_pow); |
||
135 | } |
||
136 | |||
137 | |||
138 | /** |
||
139 | * Convert linear float values to srgb int values. |
||
140 | * Several possibilities how to do this, e.g. |
||
141 | * - use table (based on exponent/highest order mantissa bits) and do |
||
142 | * linear interpolation (https://gist.github.com/rygorous/2203834) |
||
143 | * - Chebyshev polynomial |
||
144 | * - Approximation using reciprocals |
||
145 | * - using int-to-float and float-to-int tricks for pow() |
||
146 | * (http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent) |
||
147 | * |
||
148 | * @param src float (vector) value(s) to convert. |
||
149 | */ |
||
150 | static LLVMValueRef |
||
151 | lp_build_linear_to_srgb(struct gallivm_state *gallivm, |
||
152 | struct lp_type src_type, |
||
153 | LLVMValueRef src) |
||
154 | { |
||
155 | LLVMBuilderRef builder = gallivm->builder; |
||
156 | struct lp_build_context f32_bld; |
||
157 | LLVMValueRef lin_thresh, lin, lin_const, is_linear, tmp, pow_final; |
||
158 | |||
159 | lp_build_context_init(&f32_bld, gallivm, src_type); |
||
160 | |||
161 | src = lp_build_clamp(&f32_bld, src, f32_bld.zero, f32_bld.one); |
||
162 | |||
163 | if (0) { |
||
164 | /* |
||
165 | * using int-to-float and float-to-int trick for pow(). |
||
166 | * This is much more accurate than necessary thanks to the correction, |
||
167 | * but it most certainly makes no sense without rsqrt available. |
||
168 | * Bonus points if you understand how this works... |
||
169 | * All in all (including min/max clamp, conversion) 19 instructions. |
||
170 | */ |
||
171 | |||
172 | float exp_f = 2.0f / 3.0f; |
||
173 | /* some compilers can't do exp2f, so this is exp2f(127.0f/exp_f - 127.0f) */ |
||
174 | float exp2f_c = 1.30438178253e+19f; |
||
175 | float coeff_f = 0.62996f; |
||
176 | LLVMValueRef pow_approx, coeff, x2, exponent, pow_1, pow_2; |
||
177 | struct lp_type int_type = lp_int_type(src_type); |
||
178 | |||
179 | /* |
||
180 | * First calculate approx x^8/12 |
||
181 | */ |
||
182 | exponent = lp_build_const_vec(gallivm, src_type, exp_f); |
||
183 | coeff = lp_build_const_vec(gallivm, src_type, |
||
184 | exp2f_c * powf(coeff_f, 1.0f / exp_f)); |
||
185 | |||
186 | /* premultiply src */ |
||
187 | tmp = lp_build_mul(&f32_bld, coeff, src); |
||
188 | /* "log2" */ |
||
189 | tmp = LLVMBuildBitCast(builder, tmp, lp_build_vec_type(gallivm, int_type), ""); |
||
190 | tmp = lp_build_int_to_float(&f32_bld, tmp); |
||
191 | /* multiply for pow */ |
||
192 | tmp = lp_build_mul(&f32_bld, tmp, exponent); |
||
193 | /* "exp2" */ |
||
194 | pow_approx = lp_build_itrunc(&f32_bld, tmp); |
||
195 | pow_approx = LLVMBuildBitCast(builder, pow_approx, |
||
196 | lp_build_vec_type(gallivm, src_type), ""); |
||
197 | |||
198 | /* |
||
199 | * Since that pow was inaccurate (like 3 bits, though each sqrt step would |
||
200 | * give another bit), compensate the error (which is why we chose another |
||
201 | * exponent in the first place). |
||
202 | */ |
||
203 | /* x * x^(8/12) = x^(20/12) */ |
||
204 | pow_1 = lp_build_mul(&f32_bld, pow_approx, src); |
||
205 | |||
206 | /* x * x * x^(-4/12) = x^(20/12) */ |
||
207 | /* Should avoid using rsqrt if it's not available, but |
||
208 | * using x * x^(4/12) * x^(4/12) instead will change error weight */ |
||
209 | tmp = lp_build_fast_rsqrt(&f32_bld, pow_approx); |
||
210 | x2 = lp_build_mul(&f32_bld, src, src); |
||
211 | pow_2 = lp_build_mul(&f32_bld, x2, tmp); |
||
212 | |||
213 | /* average the values so the errors cancel out, compensate bias, |
||
214 | * we also squeeze the 1.055 mul of the srgb conversion plus the 255.0 mul |
||
215 | * for conversion to int in here */ |
||
216 | tmp = lp_build_add(&f32_bld, pow_1, pow_2); |
||
217 | coeff = lp_build_const_vec(gallivm, src_type, |
||
218 | 1.0f / (3.0f * coeff_f) * 0.999852f * |
||
219 | powf(1.055f * 255.0f, 4.0f)); |
||
220 | pow_final = lp_build_mul(&f32_bld, tmp, coeff); |
||
221 | |||
222 | /* x^(5/12) = rsqrt(rsqrt(x^20/12)) */ |
||
223 | if (lp_build_fast_rsqrt_available(src_type)) { |
||
224 | pow_final = lp_build_fast_rsqrt(&f32_bld, |
||
225 | lp_build_fast_rsqrt(&f32_bld, pow_final)); |
||
226 | } |
||
227 | else { |
||
228 | pow_final = lp_build_sqrt(&f32_bld, lp_build_sqrt(&f32_bld, pow_final)); |
||
229 | } |
||
230 | pow_final = lp_build_add(&f32_bld, pow_final, |
||
231 | lp_build_const_vec(gallivm, src_type, -0.055f * 255.0f)); |
||
232 | } |
||
233 | |||
234 | else { |
||
235 | /* |
||
236 | * using "rational polynomial" approximation here. |
||
237 | * Essentially y = a*x^0.375 + b*x^0.5 + c, with also |
||
238 | * factoring in the 255.0 mul and the scaling mul. |
||
239 | * (a is closer to actual value so has higher weight than b.) |
||
240 | * Note: the constants are magic values. They were found empirically, |
||
241 | * possibly could be improved but good enough (be VERY careful with |
||
242 | * error metric if you'd want to tweak them, they also MUST fit with |
||
243 | * the crappy polynomial above for srgb->linear since it is required |
||
244 | * that each srgb value maps back to the same value). |
||
245 | * This function has an error of max +-0.17 (and we'd only require +-0.6), |
||
246 | * for the approximated srgb->linear values the error is naturally larger |
||
247 | * (+-0.42) but still accurate enough (required +-0.5 essentially). |
||
248 | * All in all (including min/max clamp, conversion) 15 instructions. |
||
249 | * FMA would help (minus 2 instructions). |
||
250 | */ |
||
251 | |||
252 | LLVMValueRef x05, x0375, a_const, b_const, c_const, tmp2; |
||
253 | |||
254 | if (lp_build_fast_rsqrt_available(src_type)) { |
||
255 | tmp = lp_build_fast_rsqrt(&f32_bld, src); |
||
256 | x05 = lp_build_mul(&f32_bld, src, tmp); |
||
257 | } |
||
258 | else { |
||
259 | /* |
||
260 | * I don't really expect this to be practical without rsqrt |
||
261 | * but there's no reason for triple punishment so at least |
||
262 | * save the otherwise resulting division and unnecessary mul... |
||
263 | */ |
||
264 | x05 = lp_build_sqrt(&f32_bld, src); |
||
265 | } |
||
266 | |||
267 | tmp = lp_build_mul(&f32_bld, x05, src); |
||
268 | if (lp_build_fast_rsqrt_available(src_type)) { |
||
269 | x0375 = lp_build_fast_rsqrt(&f32_bld, lp_build_fast_rsqrt(&f32_bld, tmp)); |
||
270 | } |
||
271 | else { |
||
272 | x0375 = lp_build_sqrt(&f32_bld, lp_build_sqrt(&f32_bld, tmp)); |
||
273 | } |
||
274 | |||
275 | a_const = lp_build_const_vec(gallivm, src_type, 0.675f * 1.0622 * 255.0f); |
||
276 | b_const = lp_build_const_vec(gallivm, src_type, 0.325f * 1.0622 * 255.0f); |
||
277 | c_const = lp_build_const_vec(gallivm, src_type, -0.0620f * 255.0f); |
||
278 | |||
279 | tmp = lp_build_mul(&f32_bld, a_const, x0375); |
||
280 | tmp2 = lp_build_mul(&f32_bld, b_const, x05); |
||
281 | tmp2 = lp_build_add(&f32_bld, tmp2, c_const); |
||
282 | pow_final = lp_build_add(&f32_bld, tmp, tmp2); |
||
283 | } |
||
284 | |||
285 | /* linear part is easy */ |
||
286 | lin_const = lp_build_const_vec(gallivm, src_type, 12.92f * 255.0f); |
||
287 | lin = lp_build_mul(&f32_bld, src, lin_const); |
||
288 | |||
289 | lin_thresh = lp_build_const_vec(gallivm, src_type, 0.0031308f); |
||
290 | is_linear = lp_build_compare(gallivm, src_type, PIPE_FUNC_LEQUAL, src, lin_thresh); |
||
291 | tmp = lp_build_select(&f32_bld, is_linear, lin, pow_final); |
||
292 | |||
293 | f32_bld.type.sign = 0; |
||
294 | return lp_build_iround(&f32_bld, tmp); |
||
295 | } |
||
296 | |||
297 | |||
298 | /** |
||
299 | * Convert linear float soa values to packed srgb AoS values. |
||
300 | * This only handles packed formats which are 4x8bit in size |
||
301 | * (rgba and rgbx plus swizzles). |
||
302 | * |
||
303 | * @param src float SoA (vector) values to convert. |
||
304 | */ |
||
305 | LLVMValueRef |
||
306 | lp_build_float_to_srgb_packed(struct gallivm_state *gallivm, |
||
307 | const struct util_format_description *dst_fmt, |
||
308 | struct lp_type src_type, |
||
309 | LLVMValueRef *src) |
||
310 | { |
||
311 | LLVMBuilderRef builder = gallivm->builder; |
||
312 | unsigned chan; |
||
313 | struct lp_build_context f32_bld; |
||
314 | struct lp_type int32_type = lp_int_type(src_type); |
||
315 | LLVMValueRef tmpsrgb[4], alpha, dst; |
||
316 | |||
317 | lp_build_context_init(&f32_bld, gallivm, src_type); |
||
318 | |||
319 | /* rgb is subject to linear->srgb conversion, alpha is not */ |
||
320 | for (chan = 0; chan < 3; chan++) { |
||
321 | tmpsrgb[chan] = lp_build_linear_to_srgb(gallivm, src_type, src[chan]); |
||
322 | } |
||
323 | /* |
||
324 | * can't use lp_build_conv since we want to keep values as 32bit |
||
325 | * here so we can interleave with rgb to go from SoA->AoS. |
||
326 | */ |
||
327 | alpha = lp_build_clamp(&f32_bld, src[3], f32_bld.zero, f32_bld.one); |
||
328 | alpha = lp_build_mul(&f32_bld, alpha, |
||
329 | lp_build_const_vec(gallivm, src_type, 255.0f)); |
||
330 | tmpsrgb[3] = lp_build_iround(&f32_bld, alpha); |
||
331 | |||
332 | dst = lp_build_zero(gallivm, int32_type); |
||
333 | for (chan = 0; chan < dst_fmt->nr_channels; chan++) { |
||
334 | if (dst_fmt->swizzle[chan] <= UTIL_FORMAT_SWIZZLE_W) { |
||
335 | unsigned ls; |
||
336 | LLVMValueRef shifted, shift_val; |
||
337 | ls = dst_fmt->channel[dst_fmt->swizzle[chan]].shift; |
||
338 | shift_val = lp_build_const_int_vec(gallivm, int32_type, ls); |
||
339 | shifted = LLVMBuildShl(builder, tmpsrgb[chan], shift_val, ""); |
||
340 | dst = LLVMBuildOr(builder, dst, shifted, ""); |
||
341 | } |
||
342 | } |
||
343 | return dst; |
||
344 | }=>>>>>=> |