/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper functions for type conversions.
 *
 * We want to use the fastest type for a given computation whenever feasible.
 * The other side of this is that we need to be able to convert between
 * several types accurately and efficiently.
 *
 * Conversion between types of different bit width is quite complex, since a
 * register of fixed width holds a different number of elements for each type.
 *
 * There are a few invariants to keep in mind in type conversions:
 *
 * - register width must remain constant:
 *
 *     src_type.width * src_type.length == dst_type.width * dst_type.length
 *
 * - total number of elements must remain constant:
 *
 *     src_type.length * num_srcs == dst_type.length * num_dsts
 *
 * It is not always possible to do the conversion both accurately and
 * efficiently, usually due to lack of adequate machine instructions. In these
 * cases it is important not to take shortcuts here and sacrifice accuracy, as
 * these functions can be used anywhere. In the future we might have a
 * precision parameter to gauge the accuracy vs. efficiency compromise, but
 * for now, if the data conversion between two stages happens to be the
 * bottleneck, then most likely one should just avoid converting at all and
 * run both stages with the same type.
 *
 * Make sure to run the lp_test_conv unit test after any change to this file.
 *
 * @author Jose Fonseca
 */


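/*
 * Illustrative example of the invariants above (not from the original
 * sources): converting four 4 x float32 vectors to unorm8 yields a single
 * 16 x uint8 vector, since
 *
 *     src_type.width * src_type.length == 32 * 4 == 128 == 8 * 16
 *     src_type.length * num_srcs       ==  4 * 4 ==  16 == 16 * 1
 *
 * This is exactly the 4x4f --> 1x16ub special case handled below.
 */
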
#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_half.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_arit.h"
#include "lp_bld_bitarit.h"
#include "lp_bld_pack.h"
#include "lp_bld_conv.h"
#include "lp_bld_logic.h"
#include "lp_bld_intr.h"
#include "lp_bld_printf.h"
#include "lp_bld_format.h"


/**
 * Converts an int16 half-float to float32.
 * Note this can be performed in 1 instruction if vcvtph2ps exists (F16C/CVT16)
 * [llvm.x86.vcvtph2ps / _mm_cvtph_ps]
 *
 * @param src           value to convert
 */
LLVMValueRef
lp_build_half_to_float(struct gallivm_state *gallivm,
                       LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef src_type = LLVMTypeOf(src);
   unsigned src_length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
                            LLVMGetVectorSize(src_type) : 1;

   struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length);
   struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);
   LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
   LLVMValueRef h;

   if (util_cpu_caps.has_f16c && HAVE_LLVM >= 0x0301 &&
       (src_length == 4 || src_length == 8)) {
      const char *intrinsic = NULL;
      if (src_length == 4) {
         src = lp_build_pad_vector(gallivm, src, 8);
         intrinsic = "llvm.x86.vcvtph2ps.128";
      }
      else {
         intrinsic = "llvm.x86.vcvtph2ps.256";
      }
      return lp_build_intrinsic_unary(builder, intrinsic,
                                      lp_build_vec_type(gallivm, f32_type), src);
   }

   /* Convert int16 vector to int32 vector by zero ext (might generate bad code) */
   h = LLVMBuildZExt(builder, src, int_vec_type, "");
   /* half-float layout: 10 mantissa bits, 5 exponent bits, with sign */
   return lp_build_smallfloat_to_float(gallivm, f32_type, h, 10, 5, 0, true);
}


/**
 * Converts float32 to an int16 half-float.
 * Note this can be performed in 1 instruction if vcvtps2ph exists (F16C/CVT16)
 * [llvm.x86.vcvtps2ph / _mm_cvtps_ph]
 *
 * @param src           value to convert
 *
 * Converts float32 to half-floats, preserving Infs and NaNs,
 * with rounding towards zero (trunc).
 */
LLVMValueRef
lp_build_float_to_half(struct gallivm_state *gallivm,
                       LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef f32_vec_type = LLVMTypeOf(src);
   unsigned length = LLVMGetTypeKind(f32_vec_type) == LLVMVectorTypeKind
                   ? LLVMGetVectorSize(f32_vec_type) : 1;
   struct lp_type i32_type = lp_type_int_vec(32, 32 * length);
   struct lp_type i16_type = lp_type_int_vec(16, 16 * length);
   LLVMValueRef result;

   if (util_cpu_caps.has_f16c && HAVE_LLVM >= 0x0301 &&
       (length == 4 || length == 8)) {
      struct lp_type i168_type = lp_type_int_vec(16, 16 * 8);
      unsigned mode = 3; /* same as LP_BUILD_ROUND_TRUNCATE */
      LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
      const char *intrinsic = NULL;
      if (length == 4) {
         intrinsic = "llvm.x86.vcvtps2ph.128";
      }
      else {
         intrinsic = "llvm.x86.vcvtps2ph.256";
      }
      result = lp_build_intrinsic_binary(builder, intrinsic,
                                         lp_build_vec_type(gallivm, i168_type),
                                         src, LLVMConstInt(i32t, mode, 0));
      if (length == 4) {
         result = lp_build_extract_range(gallivm, result, 0, 4);
      }
   }
   else {
      /* half-float layout: 10 mantissa bits, 5 exponent bits, with sign */
      result = lp_build_float_to_smallfloat(gallivm, i32_type, src, 10, 5, 0, true);
      /* Convert int32 vector to int16 vector by trunc (might generate bad code) */
      result = LLVMBuildTrunc(builder, result, lp_build_vec_type(gallivm, i16_type), "");
   }

   /*
    * Debugging code.
    */
   if (0) {
      LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
      LLVMTypeRef i16t = LLVMInt16TypeInContext(gallivm->context);
      LLVMTypeRef f32t = LLVMFloatTypeInContext(gallivm->context);
      LLVMValueRef ref_result = LLVMGetUndef(LLVMVectorType(i16t, length));
      unsigned i;

      LLVMTypeRef func_type = LLVMFunctionType(i16t, &f32t, 1, 0);
      LLVMValueRef func = lp_build_const_int_pointer(gallivm, func_to_pointer((func_pointer)util_float_to_half));
      func = LLVMBuildBitCast(builder, func, LLVMPointerType(func_type, 0), "util_float_to_half");

      for (i = 0; i < length; ++i) {
         LLVMValueRef index = LLVMConstInt(i32t, i, 0);
         LLVMValueRef f32 = LLVMBuildExtractElement(builder, src, index, "");
#if 0
         /* XXX: not really supported by backends */
         LLVMValueRef f16 = lp_build_intrinsic_unary(builder, "llvm.convert.to.fp16", i16t, f32);
#else
         LLVMValueRef f16 = LLVMBuildCall(builder, func, &f32, 1, "");
#endif
         ref_result = LLVMBuildInsertElement(builder, ref_result, f16, index, "");
      }

      lp_build_print_value(gallivm, "src  = ", src);
      lp_build_print_value(gallivm, "llvm = ", result);
      lp_build_print_value(gallivm, "util = ", ref_result);
      lp_build_printf(gallivm, "\n");
   }

   return result;
}


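#if 0
/*
 * Illustrative usage sketch, not part of the original file: a float32 ->
 * half -> float32 round trip over a vector, exercising the two helpers
 * above.  Assumes `f32` is a 4 x float32 (or 8 x float32) LLVM value.
 */
static LLVMValueRef
example_half_roundtrip(struct gallivm_state *gallivm, LLVMValueRef f32)
{
   LLVMValueRef f16 = lp_build_float_to_half(gallivm, f32); /* n x i16 */
   return lp_build_half_to_float(gallivm, f16);             /* n x float32 */
}
#endif

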
/**
 * Special case for converting clamped IEEE-754 floats to unsigned norms.
 *
 * The mathematical voodoo below may seem excessive but it is actually
 * paramount we do it this way for several reasons. First, there is no
 * single-precision FP to unsigned integer conversion Intel SSE instruction.
 * Second, even if there were, since the FP mantissa takes only a fraction of
 * the register bits, the typical scale-and-cast approach would require double
 * precision for accurate results, and therefore half the throughput.
 *
 * Although the result values can be scaled to an arbitrary bit width specified
 * by dst_width, the actual result type will have the same width.
 *
 * Ex: src = { float, float, float, float }
 * return { i32, i32, i32, i32 } where each value is in [0, 2^dst_width-1].
 */
LLVMValueRef
lp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm,
                                        struct lp_type src_type,
                                        unsigned dst_width,
                                        LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, src_type);
   LLVMValueRef res;
   unsigned mantissa;

   assert(src_type.floating);
   assert(dst_width <= src_type.width);
   src_type.sign = FALSE;

   mantissa = lp_mantissa(src_type);

   if (dst_width <= mantissa) {
      /*
       * Apply magic coefficients that will make the desired result appear in
       * the least significant bits of the mantissa, with correct rounding.
       *
       * This only works if the destination width fits in the mantissa.
       */

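      /*
       * Worked example (illustrative, not from the original sources): for
       * dst_width = 8 and a 23-bit float32 mantissa, scale = 255/256 and
       * bias = 2^15.  For x in [0, 1], x*scale + bias lies in
       * [2^15, 2^15 + 255/256], where the float ULP is exactly 2^-8, so
       * round(x*255) lands in the low 8 mantissa bits and the final mask
       * extracts exactly those bits.
       */
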
      unsigned long long ubound;
      unsigned long long mask;
      double scale;
      double bias;

      ubound = (1ULL << dst_width);
      mask = ubound - 1;
      scale = (double)mask/ubound;
      bias = (double)(1ULL << (mantissa - dst_width));

      res = LLVMBuildFMul(builder, src, lp_build_const_vec(gallivm, src_type, scale), "");
      res = LLVMBuildFAdd(builder, res, lp_build_const_vec(gallivm, src_type, bias), "");
      res = LLVMBuildBitCast(builder, res, int_vec_type, "");
      res = LLVMBuildAnd(builder, res,
                         lp_build_const_int_vec(gallivm, src_type, mask), "");
   }
   else if (dst_width == (mantissa + 1)) {
      /*
       * The destination width matches exactly what can be represented in
       * floating point (i.e., mantissa + 1 bits). So do a straight
       * multiplication followed by casting. No further rounding is necessary.
       */

      double scale;

      scale = (double)((1ULL << dst_width) - 1);

      res = LLVMBuildFMul(builder, src,
                          lp_build_const_vec(gallivm, src_type, scale), "");
      res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
   }
   else {
      /*
       * The destination exceeds what can be represented in the floating point.
       * So multiply by the largest power of two we can get away with, and then
       * subtract the most significant bit to rescale to normalized values.
       *
       * The largest power of two factor we can get away with is
       * (1 << (src_type.width - 1)), because we need to use a signed
       * conversion. In theory it should be (1 << (src_type.width - 2)), but
       * IEEE 754 states that INT_MIN should be returned by FPToSI, which is
       * the correct result for values near 1.0!
       *
       * This means we get (src_type.width - 1) correct bits for values near
       * 0.0, and (mantissa + 1) correct bits for values near 1.0. Equally or
       * more important, we also get exact results for 0.0 and 1.0.
       */

      unsigned n = MIN2(src_type.width - 1, dst_width);

      double scale = (double)(1ULL << n);
      unsigned lshift = dst_width - n;
      unsigned rshift = n;
      LLVMValueRef lshifted;
      LLVMValueRef rshifted;

      res = LLVMBuildFMul(builder, src,
                          lp_build_const_vec(gallivm, src_type, scale), "");
      res = LLVMBuildFPToSI(builder, res, int_vec_type, "");

      /*
       * Align the most significant bit to its final place.
       *
       * This will cause 1.0 to overflow to 0, but the later adjustment will
       * get it right.
       */
      if (lshift) {
         lshifted = LLVMBuildShl(builder, res,
                                 lp_build_const_int_vec(gallivm, src_type,
                                                        lshift), "");
      } else {
         lshifted = res;
      }

      /*
       * Align the most significant bit to the right.
       */
      rshifted = LLVMBuildLShr(builder, res,
                               lp_build_const_int_vec(gallivm, src_type, rshift),
                               "");

      /*
       * Subtract the MSB (re-aligned at the LSB) from the left-shifted value,
       * thereby rescaling from (1 << dst_width) to ((1 << dst_width) - 1).
       */

      res = LLVMBuildSub(builder, lshifted, rshifted, "");
   }

   return res;
}


/**
 * Inverse of lp_build_clamped_float_to_unsigned_norm above.
 * Ex: src = { i32, i32, i32, i32 } with values in range [0, 2^src_width-1]
 * return {float, float, float, float} with values in range [0, 1].
 */
LLVMValueRef
lp_build_unsigned_norm_to_float(struct gallivm_state *gallivm,
                                unsigned src_width,
                                struct lp_type dst_type,
                                LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef vec_type = lp_build_vec_type(gallivm, dst_type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, dst_type);
   LLVMValueRef bias_;
   LLVMValueRef res;
   unsigned mantissa;
   unsigned n;
   unsigned long long ubound;
   unsigned long long mask;
   double scale;
   double bias;

   assert(dst_type.floating);

   mantissa = lp_mantissa(dst_type);

   if (src_width <= (mantissa + 1)) {
      /*
       * The source width fits what can be represented in floating point
       * (i.e., mantissa + 1 bits). So do a straight multiplication followed
       * by casting. No further rounding is necessary.
       */

      scale = 1.0/(double)((1ULL << src_width) - 1);
      res = LLVMBuildSIToFP(builder, src, vec_type, "");
      res = LLVMBuildFMul(builder, res,
                          lp_build_const_vec(gallivm, dst_type, scale), "");
      return res;
   }
   else {
      /*
       * The source width exceeds what can be represented in floating
       * point. So truncate the incoming values.
       */

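      /*
       * Note (illustrative, not from the original sources): the OR/subtract
       * sequence below is the classic integer-to-float bias trick.  The
       * truncated integer is OR'ed into the mantissa of the constant `bias`,
       * the result is re-interpreted as a float, and the bias is subtracted
       * off again, avoiding an expensive wide int-to-float conversion.
       */
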
      n = MIN2(mantissa, src_width);

      ubound = ((unsigned long long)1 << n);
      mask = ubound - 1;
      scale = (double)ubound/mask;
      bias = (double)((unsigned long long)1 << (mantissa - n));

      res = src;

      if (src_width > mantissa) {
         int shift = src_width - mantissa;
         res = LLVMBuildLShr(builder, res,
                             lp_build_const_int_vec(gallivm, dst_type, shift), "");
      }

      bias_ = lp_build_const_vec(gallivm, dst_type, bias);

      res = LLVMBuildOr(builder,
                        res,
                        LLVMBuildBitCast(builder, bias_, int_vec_type, ""), "");

      res = LLVMBuildBitCast(builder, res, vec_type, "");

      res = LLVMBuildFSub(builder, res, bias_, "");
      res = LLVMBuildFMul(builder, res, lp_build_const_vec(gallivm, dst_type, scale), "");
   }

   return res;
}


/**
 * Pick a suitable num_dsts for lp_build_conv to ensure optimal cases are used.
 *
 * Returns the number of dsts created from the given srcs.
 */
int lp_build_conv_auto(struct gallivm_state *gallivm,
                       struct lp_type src_type,
                       struct lp_type* dst_type,
                       const LLVMValueRef *src,
                       unsigned num_srcs,
                       LLVMValueRef *dst)
{
   int i;
   int num_dsts = num_srcs;

   if (src_type.floating == dst_type->floating &&
       src_type.width == dst_type->width &&
       src_type.length == dst_type->length &&
       src_type.fixed == dst_type->fixed &&
       src_type.norm == dst_type->norm &&
       src_type.sign == dst_type->sign)
      return num_dsts;

   /* Special case 4x4f -> 1x16ub or 2x8f -> 1x16ub
    */
   if (src_type.floating == 1 &&
       src_type.fixed    == 0 &&
       src_type.sign     == 1 &&
       src_type.norm     == 0 &&
       src_type.width    == 32 &&

       dst_type->floating == 0 &&
       dst_type->fixed    == 0 &&
       dst_type->sign     == 0 &&
       dst_type->norm     == 1 &&
       dst_type->width    == 8)
   {
      /* Special case 4x4f --> 1x16ub */
      if (src_type.length == 4 &&
          util_cpu_caps.has_sse2)
      {
         num_dsts = (num_srcs + 3) / 4;
         dst_type->length = num_srcs * 4 >= 16 ? 16 : num_srcs * 4;

         lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
         return num_dsts;
      }

      /* Special case 2x8f --> 1x16ub */
      if (src_type.length == 8 &&
          util_cpu_caps.has_avx)
      {
         num_dsts = (num_srcs + 1) / 2;
         dst_type->length = num_srcs * 8 >= 16 ? 16 : num_srcs * 8;

         lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
         return num_dsts;
      }
   }

   /* lp_build_resize does not support M:N */
   if (src_type.width == dst_type->width) {
      lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
   } else {
      for (i = 0; i < num_srcs; ++i) {
         lp_build_conv(gallivm, src_type, *dst_type, &src[i], 1, &dst[i], 1);
      }
   }

   return num_dsts;
}


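#if 0
/*
 * Illustrative usage sketch, not part of the original file: let
 * lp_build_conv_auto() pick the packed destination layout for eight
 * 4 x float32 inputs going to unorm8.  On an SSE2-capable host this yields
 * two 16 x unorm8 vectors, since num_dsts = (8 + 3) / 4 = 2 and the callee
 * widens dst_type.length to 16.
 */
static int
example_conv_auto(struct gallivm_state *gallivm, LLVMValueRef src[8],
                  LLVMValueRef dst[2])
{
   struct lp_type src_type = lp_type_float_vec(32, 128); /* 4 x float32 */
   struct lp_type dst_type = src_type;

   dst_type.floating = 0;
   dst_type.sign     = 0;
   dst_type.norm     = 1;  /* unsigned normalized */
   dst_type.width    = 8;  /* 8-bit channels */
   dst_type.length   = 4;  /* may be widened to 16 by the special case */

   return lp_build_conv_auto(gallivm, src_type, &dst_type, src, 8, dst);
}
#endif

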
/**
 * Generic type conversion.
 *
 * TODO: Take a precision argument, or even better, add a new precision member
 * to the lp_type union.
 */
void
lp_build_conv(struct gallivm_state *gallivm,
              struct lp_type src_type,
              struct lp_type dst_type,
              const LLVMValueRef *src, unsigned num_srcs,
              LLVMValueRef *dst, unsigned num_dsts)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type tmp_type;
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
   unsigned num_tmps;
   unsigned i;

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
   assert(num_dsts <= LP_MAX_VECTOR_LENGTH);

   tmp_type = src_type;
   for(i = 0; i < num_srcs; ++i) {
      assert(lp_check_value(src_type, src[i]));
      tmp[i] = src[i];
   }
   num_tmps = num_srcs;


   /* Special case 4x4f --> 1x16ub, 2x4f -> 1x8ub, 1x4f -> 1x4ub
    */
   if (src_type.floating == 1 &&
       src_type.fixed    == 0 &&
       src_type.sign     == 1 &&
       src_type.norm     == 0 &&
       src_type.width    == 32 &&
       src_type.length   == 4 &&

       dst_type.floating == 0 &&
       dst_type.fixed    == 0 &&
       dst_type.sign     == 0 &&
       dst_type.norm     == 1 &&
       dst_type.width    == 8 &&

       ((dst_type.length == 16 && 4 * num_dsts == num_srcs) ||
        (num_dsts == 1 && dst_type.length * num_srcs == 16 && num_srcs != 3)) &&

       util_cpu_caps.has_sse2)
   {
      struct lp_build_context bld;
      struct lp_type int16_type, int32_type;
      struct lp_type dst_type_ext = dst_type;
      LLVMValueRef const_255f;
      unsigned i, j;

      lp_build_context_init(&bld, gallivm, src_type);

      dst_type_ext.length = 16;
      int16_type = int32_type = dst_type_ext;

      int16_type.width *= 2;
      int16_type.length /= 2;
      int16_type.sign = 1;

      int32_type.width *= 4;
      int32_type.length /= 4;
      int32_type.sign = 1;

      const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);

      for (i = 0; i < num_dsts; ++i, src += 4) {
         LLVMValueRef lo, hi;

         for (j = 0; j < dst_type.length / 4; ++j) {
            tmp[j] = LLVMBuildFMul(builder, src[j], const_255f, "");
            tmp[j] = lp_build_iround(&bld, tmp[j]);
         }

         if (num_srcs == 1) {
            tmp[1] = tmp[0];
         }

         /* relying on clamping behavior of sse2 intrinsics here */
         lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);

         if (num_srcs < 4) {
            hi = lo;
         }
         else {
            hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
         }
         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, lo, hi);
      }
      if (num_srcs < 4) {
         dst[0] = lp_build_extract_range(gallivm, dst[0], 0, dst_type.length);
      }

      return;
   }

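   /*
    * Note (illustrative, not from the original sources): the fast path above
    * is FMul by 255, round to integer, then two saturating packs --
    * int32 -> int16 (packssdw) and int16 -> uint8 (packuswb) on SSE2 --
    * so the packs provide the [0, 255] clamp for free.
    */
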
   /* Special case 2x8f --> 1x16ub, 1x8f -> 1x8ub
    */
   else if (src_type.floating == 1 &&
      src_type.fixed    == 0 &&
      src_type.sign     == 1 &&
      src_type.norm     == 0 &&
      src_type.width    == 32 &&
      src_type.length   == 8 &&

      dst_type.floating == 0 &&
      dst_type.fixed    == 0 &&
      dst_type.sign     == 0 &&
      dst_type.norm     == 1 &&
      dst_type.width    == 8 &&

      ((dst_type.length == 16 && 2 * num_dsts == num_srcs) ||
       (num_dsts == 1 && dst_type.length * num_srcs == 8)) &&

      util_cpu_caps.has_avx) {

      struct lp_build_context bld;
      struct lp_type int16_type, int32_type;
      struct lp_type dst_type_ext = dst_type;
      LLVMValueRef const_255f;
      unsigned i;

      lp_build_context_init(&bld, gallivm, src_type);

      dst_type_ext.length = 16;
      int16_type = int32_type = dst_type_ext;

      int16_type.width *= 2;
      int16_type.length /= 2;
      int16_type.sign = 1;

      int32_type.width *= 4;
      int32_type.length /= 4;
      int32_type.sign = 1;

      const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);

      for (i = 0; i < num_dsts; ++i, src += 2) {
         LLVMValueRef lo, hi, a, b;

         a = LLVMBuildFMul(builder, src[0], const_255f, "");
         a = lp_build_iround(&bld, a);
         tmp[0] = lp_build_extract_range(gallivm, a, 0, 4);
         tmp[1] = lp_build_extract_range(gallivm, a, 4, 4);
         /* relying on clamping behavior of sse2 intrinsics here */
         lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);

         if (num_srcs == 1) {
            hi = lo;
         }
         else {
            b = LLVMBuildFMul(builder, src[1], const_255f, "");
            b = lp_build_iround(&bld, b);
            tmp[2] = lp_build_extract_range(gallivm, b, 0, 4);
            tmp[3] = lp_build_extract_range(gallivm, b, 4, 4);
            hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
         }
         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, lo, hi);
      }

      if (num_srcs == 1) {
         dst[0] = lp_build_extract_range(gallivm, dst[0], 0, dst_type.length);
      }

      return;
   }

   /* Special case -> 16-bit half-float
    */
   else if (dst_type.floating && dst_type.width == 16)
   {
      /* Only support src as 32-bit float currently */
      assert(src_type.floating && src_type.width == 32);

      for(i = 0; i < num_tmps; ++i)
         dst[i] = lp_build_float_to_half(gallivm, tmp[i]);

      return;
   }

   /* Pre-convert half-floats to floats
    */
   else if (src_type.floating && src_type.width == 16)
   {
      for(i = 0; i < num_tmps; ++i)
         tmp[i] = lp_build_half_to_float(gallivm, tmp[i]);

      tmp_type.width = 32;
   }

   /*
    * Clamp if necessary
    */

   if(memcmp(&src_type, &dst_type, sizeof src_type) != 0) {
      struct lp_build_context bld;
      double src_min = lp_const_min(src_type);
      double dst_min = lp_const_min(dst_type);
      double src_max = lp_const_max(src_type);
      double dst_max = lp_const_max(dst_type);
      LLVMValueRef thres;

      lp_build_context_init(&bld, gallivm, tmp_type);

      if(src_min < dst_min) {
         if(dst_min == 0.0)
            thres = bld.zero;
         else
            thres = lp_build_const_vec(gallivm, src_type, dst_min);
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_max(&bld, tmp[i], thres);
      }

      if(src_max > dst_max) {
         if(dst_max == 1.0)
            thres = bld.one;
         else
            thres = lp_build_const_vec(gallivm, src_type, dst_max);
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_min(&bld, tmp[i], thres);
      }
   }

   /*
    * Scale to the narrowest range
    */

   if(dst_type.floating) {
      /* Nothing to do */
   }
   else if(tmp_type.floating) {
      if(!dst_type.fixed && !dst_type.sign && dst_type.norm) {
         for(i = 0; i < num_tmps; ++i) {
            tmp[i] = lp_build_clamped_float_to_unsigned_norm(gallivm,
                                                             tmp_type,
                                                             dst_type.width,
                                                             tmp[i]);
         }
         tmp_type.floating = FALSE;
      }
      else {
         double dst_scale = lp_const_scale(dst_type);
         LLVMTypeRef tmp_vec_type;

         if (dst_scale != 1.0) {
            LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, dst_scale);
            for(i = 0; i < num_tmps; ++i)
               tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
         }

         /* Use an equally sized integer for intermediate computations */
         tmp_type.floating = FALSE;
         tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
         for(i = 0; i < num_tmps; ++i) {
#if 0
            if(dst_type.sign)
               tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
            else
               tmp[i] = LLVMBuildFPToUI(builder, tmp[i], tmp_vec_type, "");
#else
            /* FIXME: there is no SSE counterpart for LLVMBuildFPToUI */
            tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
#endif
         }
      }
   }
   else {
      unsigned src_shift = lp_const_shift(src_type);
      unsigned dst_shift = lp_const_shift(dst_type);
      unsigned src_offset = lp_const_offset(src_type);
      unsigned dst_offset = lp_const_offset(dst_type);

      /* Compensate for different offsets */
      if (dst_offset > src_offset && src_type.width > dst_type.width) {
         for (i = 0; i < num_tmps; ++i) {
            LLVMValueRef shifted;
            LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type, src_shift - 1);
            if(src_type.sign)
               shifted = LLVMBuildAShr(builder, tmp[i], shift, "");
            else
               shifted = LLVMBuildLShr(builder, tmp[i], shift, "");

            tmp[i] = LLVMBuildSub(builder, tmp[i], shifted, "");
         }
      }

      if(src_shift > dst_shift) {
         LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type,
                                                     src_shift - dst_shift);
         for(i = 0; i < num_tmps; ++i)
            if(src_type.sign)
               tmp[i] = LLVMBuildAShr(builder, tmp[i], shift, "");
            else
               tmp[i] = LLVMBuildLShr(builder, tmp[i], shift, "");
      }
   }

   /*
    * Truncate or expand bit width
    *
    * No data conversion should happen here, although the sign bits are
    * crucial to avoid bad clamping.
    */

   {
      struct lp_type new_type;

      new_type = tmp_type;
      new_type.sign   = dst_type.sign;
      new_type.width  = dst_type.width;
      new_type.length = dst_type.length;

      lp_build_resize(gallivm, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts);

      tmp_type = new_type;
      num_tmps = num_dsts;
   }

   /*
    * Scale to the widest range
    */

   if(src_type.floating) {
      /* Nothing to do */
   }
   else if(!src_type.floating && dst_type.floating) {
      if(!src_type.fixed && !src_type.sign && src_type.norm) {
         for(i = 0; i < num_tmps; ++i) {
            tmp[i] = lp_build_unsigned_norm_to_float(gallivm,
                                                     src_type.width,
                                                     dst_type,
                                                     tmp[i]);
         }
         tmp_type.floating = TRUE;
      }
      else {
         double src_scale = lp_const_scale(src_type);
         LLVMTypeRef tmp_vec_type;

         /* Use an equally sized float for intermediate computations */
         tmp_type.floating = TRUE;
         tmp_type.sign = TRUE;
         tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
         for(i = 0; i < num_tmps; ++i) {
#if 0
            if(dst_type.sign)
               tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
            else
               tmp[i] = LLVMBuildUIToFP(builder, tmp[i], tmp_vec_type, "");
#else
            /* FIXME: there is no SSE counterpart for LLVMBuildUIToFP */
            tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
#endif
         }

         if (src_scale != 1.0) {
            LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, 1.0/src_scale);
            for(i = 0; i < num_tmps; ++i)
               tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
         }
      }
   }
   else {
      unsigned src_shift = lp_const_shift(src_type);
      unsigned dst_shift = lp_const_shift(dst_type);
      unsigned src_offset = lp_const_offset(src_type);
      unsigned dst_offset = lp_const_offset(dst_type);

      if (src_shift < dst_shift) {
         LLVMValueRef pre_shift[LP_MAX_VECTOR_LENGTH];
         LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type, dst_shift - src_shift);

         for (i = 0; i < num_tmps; ++i) {
            pre_shift[i] = tmp[i];
            tmp[i] = LLVMBuildShl(builder, tmp[i], shift, "");
         }

         /* Compensate for different offsets */
         if (dst_offset > src_offset) {
            for (i = 0; i < num_tmps; ++i) {
               tmp[i] = LLVMBuildSub(builder, tmp[i], pre_shift[i], "");
            }
         }
      }
   }

   for(i = 0; i < num_dsts; ++i) {
      dst[i] = tmp[i];
      assert(lp_check_value(dst_type, dst[i]));
   }
}


/**
 * Bit mask conversion.
 *
 * This will convert the integer masks that match the given types.
 *
 * The mask values should be 0 or -1, i.e., all bits either set to zero or one.
 * Any other value will likely cause unpredictable results.
 *
 * This is basically a very trimmed down version of lp_build_conv.
 */
void
lp_build_conv_mask(struct gallivm_state *gallivm,
                   struct lp_type src_type,
                   struct lp_type dst_type,
                   const LLVMValueRef *src, unsigned num_srcs,
                   LLVMValueRef *dst, unsigned num_dsts)
{
   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   /*
    * Drop the type attributes and treat everything as plain signed integers;
    * we assume all values are 0 or -1.
    */

   src_type.floating = FALSE;
   src_type.fixed = FALSE;
   src_type.sign = TRUE;
   src_type.norm = FALSE;

   dst_type.floating = FALSE;
   dst_type.fixed = FALSE;
   dst_type.sign = TRUE;
   dst_type.norm = FALSE;

   /*
    * Truncate or expand bit width
    */

   lp_build_resize(gallivm, src_type, dst_type, src, num_srcs, dst, num_dsts);
}
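

#if 0
/*
 * Illustrative usage sketch, not part of the original file: narrow four
 * 4 x i32 coverage masks into a single 16 x i8 mask, mirroring the
 * 4x4f -> 1x16ub data conversion above.  All-zeros/all-ones lanes are
 * preserved by lp_build_resize().
 */
static void
example_conv_mask(struct gallivm_state *gallivm,
                  const LLVMValueRef src[4], LLVMValueRef *dst)
{
   struct lp_type src_type = lp_type_int_vec(32, 128); /* 4 x i32 */
   struct lp_type dst_type = lp_type_int_vec(8, 128);  /* 16 x i8 */

   lp_build_conv_mask(gallivm, src_type, dst_type, src, 4, dst, 1);
}
#endif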