/* Copyright (C) 2007-2015 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

#ifndef _SQRT_MACROS_H_
#define _SQRT_MACROS_H_

#define FENCE __fence
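// FENCE is used below as an evaluation barrier: each fenced addition must
// round exactly as written, because the error-recovery steps reconstruct
// those rounding errors.  __fence is presumably defined elsewhere in the
// library as a compiler-specific barrier that blocks reassociation and
// reordering of the floating-point operations.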

#if DOUBLE_EXTENDED_ON

extern BINARY80 SQRT80 (BINARY80);

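// short_sqrt128: approximate integer square root of a 128-bit value.
// A10 = w[1]*2^64 + w[0] is rounded to BINARY80 (64-bit significand),
// the hardware square root is taken, and the result is truncated to an
// integer.  Since A10 < 2^128, the root is below 2^64 and fits a UINT64.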
__BID_INLINE__ UINT64
short_sqrt128 (UINT128 A10) {
  BINARY80 lx, ly, l64;
  int_float f64;

  // 2^64
  f64.i = 0x5f800000;
  l64 = (BINARY80) f64.d;
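  // (0x5f800000 is the binary32 pattern for 2^64: biased exponent
  // 0xbf = 191, 191 - 127 = 64, zero fraction.  Going through the
  // int_float union avoids spelling the constant as a literal.)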
  lx = (BINARY80) A10.w[1] * l64 + (BINARY80) A10.w[0];
  ly = SQRT80 (lx);
  return (UINT64) ly;
}

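// long_sqrt128: square root of a 256-bit value, returned as a 128-bit
// coefficient in *pCS.  Sketch of the scheme, as read from the code below:
// C256 is summed into a BINARY80 under FENCE so its rounding error can be
// recovered, SQRT80 gives a first approximation S, the residual
// C256 - S^2 is rebuilt with a Dekker-style split of S, and one
// Newton-Raphson correction E = (C256 - S^2) / (2S) is folded into the
// 128-bit integer result.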
__BID_INLINE__ void
long_sqrt128 (UINT128 * pCS, UINT256 C256) {
  UINT256 C4;
  UINT128 CS;
  UINT64 X;
  SINT64 SE;
  BINARY80 l64, lm64, l128, lxL, lx, ly, lS, lSH, lSL, lE, l3, l2,
    l1, l0, lp, lCl;
  int_float fx, f64, fm64;
  int *ple = (int *) &lx;

  // 2^64
  f64.i = 0x5f800000;
  l64 = (BINARY80) f64.d;

  l128 = l64 * l64;
  lx = l3 = (BINARY80) C256.w[3] * l64 * l128;
  l2 = (BINARY80) C256.w[2] * l128;
  lx = FENCE (lx + l2);
  l1 = (BINARY80) C256.w[1] * l64;
  lx = FENCE (lx + l1);
  l0 = (BINARY80) C256.w[0];
  lx = FENCE (lx + l0);
  // sqrt(C256)
  lS = SQRT80 (lx);
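  // lx now holds C256 rounded to BINARY80 (the words were accumulated
  // high to low, and FENCE keeps that order), so the rounding error of
  // lx can be reconstructed below as lCl.  lS is a first approximation
  // of sqrt(C256).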

  // get coefficient
  // 2^(-64)
  fm64.i = 0x1f800000;
  lm64 = (BINARY80) fm64.d;
  CS.w[1] = (UINT64) (lS * lm64);
  CS.w[0] = (UINT64) (lS - (BINARY80) CS.w[1] * l64);

  ///////////////////////////////////////
  //  CAUTION!
  //  little endian code only
  //  add solution for big endian
  //////////////////////////////////////
  lSH = lS;
  *((UINT64 *) & lSH) &= 0xffffffff00000000ull;
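  // Dekker-style split: on x86 the low 8 bytes of a BINARY80 hold its
  // 64-bit significand, so clearing the low 32 bits leaves lSH with at
  // most 32 significant bits.  Products of such halves (lSH*lSH, etc.)
  // are then exact in BINARY80, which the residual computation below
  // relies on.  The in-memory masking is what ties this to little-endian
  // layouts (hence the caution above).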

  // correction for C256 rounding
  lCl = FENCE (l3 - lx);
  lCl = FENCE (lCl + l2);
  lCl = FENCE (lCl + l1);
  lCl = FENCE (lCl + l0);
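  // lCl ~ C256 - lx: re-running the fenced sum against lx recovers the
  // rounding error of the conversion, in the spirit of a compensated
  // (error-free) summation.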

  lSL = lS - lSH;

  //////////////////////////////////////////
  //   Watch for compiler re-ordering
  //
  /////////////////////////////////////////
  // C256-S^2
  lxL = FENCE (lx - lSH * lSH);
  lp = lSH * lSL;
  lp += lp;
  lxL = FENCE (lxL - lp);
  lSL *= lSL;
  lxL = FENCE (lxL - lSL);
  lCl += lxL;
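  // lxL = lx - lS^2, computed in effect exactly: S^2 expands to
  // lSH^2 + 2*lSH*lSL + lSL^2, and each partial product is exact thanks
  // to the split.  Adding lCl gives the full residual C256 - lS^2.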

  // correction term
  lE = lCl / (lS + lS);
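  // One Newton-Raphson step: writing C256 = S^2 + r, we have
  // sqrt(C256) ~ S + r / (2S), so lE is the signed correction to fold
  // into the integer coefficient CS.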

  // get low part of coefficient
  X = CS.w[0];
  if (lCl >= 0) {
    SE = (SINT64) (lE);
    CS.w[0] += SE;
    if (CS.w[0] < X)
      CS.w[1]++;
  } else {
    SE = (SINT64) (-lE);
    CS.w[0] -= SE;
    if (CS.w[0] > X)
      CS.w[1]--;
  }
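  // 128-bit add/subtract of the correction: the carry (or borrow) into
  // the high word is detected by comparing the updated low word against
  // its saved value X.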

  pCS->w[0] = CS.w[0];
  pCS->w[1] = CS.w[1];
}

#else

extern double sqrt (double);

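// Without an 80-bit type, both square roots are built from the binary64
// reciprocal square root.  Reading of the method: RS = 1/sqrt(A) is good
// to about 52 bits; with e = (1 - A*RS^2) / 2, the identity
// sqrt(A) = A*RS*(1 - 2e)^(-1/2) = A*RS*(1 + e + (3/2)e^2 + ...) lets the
// estimate be refined in fixed-point integer arithmetic to the needed
// width.  short_sqrt128 keeps the first-order term only.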
__BID_INLINE__ UINT64
short_sqrt128 (UINT128 A10) {
  UINT256 ARS, ARS0, AE0, AE, S;

  UINT64 MY, ES, CY;
  double lx, l64;
  int_double f64, ly;
  int ey, k;

  // 2^64
  f64.i = 0x43f0000000000000ull;
  l64 = f64.d;
  lx = (double) A10.w[1] * l64 + (double) A10.w[0];
  ly.d = 1.0 / sqrt (lx);

  MY = (ly.i & 0x000fffffffffffffull) | 0x0010000000000000ull;
  ey = 0x3ff - (ly.i >> 52);
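  // MY is the 53-bit significand of ly (hidden bit restored) and ey its
  // negated unbiased exponent, so RS = 1/sqrt(lx) = MY * 2^(-52-ey).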

  // A10*RS^2
  __mul_64x128_to_192 (ARS0, MY, A10);
  __mul_64x192_to_256 (ARS, MY, ARS0);

  // shr by 2*ey+40, to get a 64-bit value
  k = (ey << 1) + 104 - 64;
  if (k >= 128) {
    if (k > 128)
      ES = (ARS.w[2] >> (k - 128)) | (ARS.w[3] << (192 - k));
    else
      ES = ARS.w[2];
  } else {
    if (k >= 64) {
      ARS.w[0] = ARS.w[1];
      ARS.w[1] = ARS.w[2];
      k -= 64;
    }
    if (k) {
      __shr_128 (ARS, ARS, k);
    }
    ES = ARS.w[0];
  }

  ES = ((SINT64) ES) >> 1;
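  // ARS = A10*RS^2 ~ 1, scaled by 2^(2*ey+104); after the shift, the
  // leading 1 wraps out of the low 64 bits, so reinterpreting as signed
  // leaves ES = (A10*RS^2 - 1) / 2 scaled by 2^64, i.e. -e in the series
  // above.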

  if (((SINT64) ES) < 0) {
    ES = -ES;

    // A*RS*eps (scaled by 2^64)
    __mul_64x192_to_256 (AE0, ES, ARS0);

    AE.w[0] = AE0.w[1];
    AE.w[1] = AE0.w[2];
    AE.w[2] = AE0.w[3];

    __add_carry_out (S.w[0], CY, ARS0.w[0], AE.w[0]);
    __add_carry_in_out (S.w[1], CY, ARS0.w[1], AE.w[1], CY);
    S.w[2] = ARS0.w[2] + AE.w[2] + CY;
  } else {
    // A*RS*eps (scaled by 2^64)
    __mul_64x192_to_256 (AE0, ES, ARS0);

    AE.w[0] = AE0.w[1];
    AE.w[1] = AE0.w[2];
    AE.w[2] = AE0.w[3];

    __sub_borrow_out (S.w[0], CY, ARS0.w[0], AE.w[0]);
    __sub_borrow_in_out (S.w[1], CY, ARS0.w[1], AE.w[1], CY);
    S.w[2] = ARS0.w[2] - AE.w[2] - CY;
  }
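  // S = A10*RS*(1 + e) to first order: since ES carries -e, a negative
  // ES means e > 0 and the correction is added, otherwise subtracted,
  // all on the 192-bit fixed-point value ARS0 = A10*RS.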

  k = ey + 51;

  if (k >= 64) {
    if (k >= 128) {
      S.w[0] = S.w[2];
      S.w[1] = 0;
      k -= 128;
    } else {
      S.w[0] = S.w[1];
      S.w[1] = S.w[2];
    }
    k -= 64;
  }
  if (k) {
    __shr_128 (S, S, k);
  }
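  // ARS0 ~ sqrt(A10) * 2^(ey+52), so shifting by ey+51 leaves the root
  // with one extra guard bit; adding 1 and dropping that bit rounds the
  // result to nearest.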


  return (UINT64) ((S.w[0] + 1) >> 1);

}


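// long_sqrt128, binary64 path: the same reciprocal-square-root scheme as
// above, carried out to a 128-bit coefficient, so the series is taken one
// term further: sqrt(A) = A*RS*(1 + e + (3/2)e^2 + O(e^3)).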
__BID_INLINE__ void
long_sqrt128 (UINT128 * pCS, UINT256 C256) {
  UINT512 ARS0, ARS;
  UINT256 ARS00, AE, AE2, S;
  UINT128 ES, ES2, ARS1;
  UINT64 ES32, CY, MY;
  double l64, l128, lx, l2, l1, l0;
  int_double f64, ly;
  int ey, k, k2;

  // 2^64
  f64.i = 0x43f0000000000000ull;
  l64 = f64.d;

  l128 = l64 * l64;
  lx = (double) C256.w[3] * l64 * l128;
  l2 = (double) C256.w[2] * l128;
  lx = FENCE (lx + l2);
  l1 = (double) C256.w[1] * l64;
  lx = FENCE (lx + l1);
  l0 = (double) C256.w[0];
  lx = FENCE (lx + l0);
  // sqrt(C256)
  ly.d = 1.0 / sqrt (lx);

  MY = (ly.i & 0x000fffffffffffffull) | 0x0010000000000000ull;
  ey = 0x3ff - (ly.i >> 52);

  // C256*RS^2, scaled by 2^(2*ey+104)
  __mul_64x256_to_320 (ARS0, MY, C256);
  __mul_64x320_to_384 (ARS, MY, ARS0);

  // shr by k=(2*ey+104)-128
  // expect k is in the range (192, 256) if result in [10^33, 10^34)
  // apply an additional signed shift by 1 at the same time (to get eps=eps0/2)
  k = (ey << 1) + 104 - 128 - 192;
  k2 = 64 - k;
  ES.w[0] = (ARS.w[3] >> (k + 1)) | (ARS.w[4] << (k2 - 1));
  ES.w[1] = (ARS.w[4] >> k) | (ARS.w[5] << k2);
  ES.w[1] = ((SINT64) ES.w[1]) >> 1;
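  // As in short_sqrt128 above: the leading 1 of C256*RS^2 ~ 1 wraps out
  // of the extracted bits, so ES = (C256*RS^2 - 1) / 2 = -e as a signed
  // 128-bit fixed-point value.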

  // A*RS >> 192 (for error term computation)
  ARS1.w[0] = ARS0.w[3];
  ARS1.w[1] = ARS0.w[4];

  // A*RS>>64
  ARS00.w[0] = ARS0.w[1];
  ARS00.w[1] = ARS0.w[2];
  ARS00.w[2] = ARS0.w[3];
  ARS00.w[3] = ARS0.w[4];

  if (((SINT64) ES.w[1]) < 0) {
    ES.w[0] = -ES.w[0];
    ES.w[1] = -ES.w[1];
    if (ES.w[0])
      ES.w[1]--;

    // A*RS*eps
    __mul_128x128_to_256 (AE, ES, ARS1);

    __add_carry_out (S.w[0], CY, ARS00.w[0], AE.w[0]);
    __add_carry_in_out (S.w[1], CY, ARS00.w[1], AE.w[1], CY);
    __add_carry_in_out (S.w[2], CY, ARS00.w[2], AE.w[2], CY);
    S.w[3] = ARS00.w[3] + AE.w[3] + CY;
  } else {
    // A*RS*eps
    __mul_128x128_to_256 (AE, ES, ARS1);

    __sub_borrow_out (S.w[0], CY, ARS00.w[0], AE.w[0]);
    __sub_borrow_in_out (S.w[1], CY, ARS00.w[1], AE.w[1], CY);
    __sub_borrow_in_out (S.w[2], CY, ARS00.w[2], AE.w[2], CY);
    S.w[3] = ARS00.w[3] - AE.w[3] - CY;
  }

  // 3/2*eps^2, scaled by 2^128
  ES32 = ES.w[1] + (ES.w[1] >> 1);
  __mul_64x64_to_128 (ES2, ES32, ES.w[1]);
  // A*RS*3/2*eps^2
  __mul_128x128_to_256 (AE2, ES2, ARS1);

  // result, scaled by 2^(ey+52-64)
  __add_carry_out (S.w[0], CY, S.w[0], AE2.w[0]);
  __add_carry_in_out (S.w[1], CY, S.w[1], AE2.w[1], CY);
  __add_carry_in_out (S.w[2], CY, S.w[2], AE2.w[2], CY);
  S.w[3] = S.w[3] + AE2.w[3] + CY;
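  // Adding A*RS*(3/2)e^2 completes S = C256*RS*(1 + e + (3/2)e^2); the
  // neglected O(e^3) term (e ~ 2^-52) should fall well below the final
  // rounding of the 128-bit result.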

  // k in (0, 64)
  k = ey + 51 - 128;
  k2 = 64 - k;
  S.w[0] = (S.w[1] >> k) | (S.w[2] << k2);
  S.w[1] = (S.w[2] >> k) | (S.w[3] << k2);

  // round to nearest
  S.w[0]++;
  if (!S.w[0])
    S.w[1]++;

  pCS->w[0] = (S.w[1] << 63) | (S.w[0] >> 1);
  pCS->w[1] = S.w[1] >> 1;
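  // As in the 64-bit path: the shift leaves one guard bit, and the
  // increment plus the funnel shift right by 1 round the 128-bit
  // coefficient to nearest.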

}

#endif
#endif