Subversion Repositories Kolibri OS

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
6515 serge 1
/* Copyright (C) 2007-2015 Free Software Foundation, Inc.
2
 
3
This file is part of GCC.
4
 
5
GCC is free software; you can redistribute it and/or modify it under
6
the terms of the GNU General Public License as published by the Free
7
Software Foundation; either version 3, or (at your option) any later
8
version.
9
 
10
GCC is distributed in the hope that it will be useful, but WITHOUT ANY
11
WARRANTY; without even the implied warranty of MERCHANTABILITY or
12
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
13
for more details.
14
 
15
Under Section 7 of GPL version 3, you are granted additional
16
permissions described in the GCC Runtime Library Exception, version
17
3.1, as published by the Free Software Foundation.
18
 
19
You should have received a copy of the GNU General Public License and
20
a copy of the GCC Runtime Library Exception along with this program;
21
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
22
.  */
23
 
24
#include "bid_internal.h"
25
 
26
 
27
#if DECIMAL_CALL_BY_REFERENCE
28
void
29
bid64dq_add (UINT64 * pres, UINT64 * px, UINT128 * py
30
	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
31
	     _EXC_INFO_PARAM) {
32
  UINT64 x = *px;
33
#if !DECIMAL_GLOBAL_ROUNDING
34
  unsigned int rnd_mode = *prnd_mode;
35
#endif
36
#else
37
UINT64
38
bid64dq_add (UINT64 x, UINT128 y
39
	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
40
	     _EXC_INFO_PARAM) {
41
#endif
42
  UINT64 res = 0xbaddbaddbaddbaddull;
43
  UINT128 x1;
44
 
45
#if DECIMAL_CALL_BY_REFERENCE
46
  bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
47
  bid64qq_add (&res, &x1, py
48
	       _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
49
	       _EXC_INFO_ARG);
50
#else
51
  x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
52
  res = bid64qq_add (x1, y
53
		     _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
54
		     _EXC_INFO_ARG);
55
#endif
56
  BID_RETURN (res);
57
}
58
 
59
 
60
#if DECIMAL_CALL_BY_REFERENCE
61
void
62
bid64qd_add (UINT64 * pres, UINT128 * px, UINT64 * py
63
	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
64
	     _EXC_INFO_PARAM) {
65
  UINT64 y = *py;
66
#if !DECIMAL_GLOBAL_ROUNDING
67
  unsigned int rnd_mode = *prnd_mode;
68
#endif
69
#else
70
UINT64
71
bid64qd_add (UINT128 x, UINT64 y
72
	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
73
	     _EXC_INFO_PARAM) {
74
#endif
75
  UINT64 res = 0xbaddbaddbaddbaddull;
76
  UINT128 y1;
77
 
78
#if DECIMAL_CALL_BY_REFERENCE
79
  bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
80
  bid64qq_add (&res, px, &y1
81
	       _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
82
	       _EXC_INFO_ARG);
83
#else
84
  y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
85
  res = bid64qq_add (x, y1
86
		     _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
87
		     _EXC_INFO_ARG);
88
#endif
89
  BID_RETURN (res);
90
}
91
 
92
 
93
#if DECIMAL_CALL_BY_REFERENCE
94
void
95
bid64qq_add (UINT64 * pres, UINT128 * px, UINT128 * py
96
	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
97
	     _EXC_INFO_PARAM) {
98
  UINT128 x = *px, y = *py;
99
#if !DECIMAL_GLOBAL_ROUNDING
100
  unsigned int rnd_mode = *prnd_mode;
101
#endif
102
#else
103
UINT64
104
bid64qq_add (UINT128 x, UINT128 y
105
	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
106
	     _EXC_INFO_PARAM) {
107
#endif
108
 
109
  UINT128 one = { {0x0000000000000001ull, 0x3040000000000000ull}
110
  };
111
  UINT64 res = 0xbaddbaddbaddbaddull;
112
 
113
  BID_SWAP128 (one);
114
#if DECIMAL_CALL_BY_REFERENCE
115
  bid64qqq_fma (&res, &one, &x, &y
116
		_RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
117
		_EXC_INFO_ARG);
118
#else
119
  res = bid64qqq_fma (one, x, y
120
		      _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
121
		      _EXC_INFO_ARG);
122
#endif
123
  BID_RETURN (res);
124
}
125
 
126
 
127
#if DECIMAL_CALL_BY_REFERENCE
128
void
129
bid128dd_add (UINT128 * pres, UINT64 * px, UINT64 * py
130
	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
131
	      _EXC_INFO_PARAM) {
132
  UINT64 x = *px, y = *py;
133
#if !DECIMAL_GLOBAL_ROUNDING
134
  unsigned int rnd_mode = *prnd_mode;
135
#endif
136
#else
137
UINT128
138
bid128dd_add (UINT64 x, UINT64 y
139
	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
140
	      _EXC_INFO_PARAM) {
141
#endif
142
  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
143
  };
144
  UINT128 x1, y1;
145
 
146
#if DECIMAL_CALL_BY_REFERENCE
147
  bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
148
  bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
149
  bid128_add (&res, &x1, &y1
150
	      _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
151
	      _EXC_INFO_ARG);
152
#else
153
  x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
154
  y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
155
  res = bid128_add (x1, y1
156
		    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
157
		    _EXC_INFO_ARG);
158
#endif
159
  BID_RETURN (res);
160
}
161
 
162
 
163
#if DECIMAL_CALL_BY_REFERENCE
164
void
165
bid128dq_add (UINT128 * pres, UINT64 * px, UINT128 * py
166
	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
167
	      _EXC_INFO_PARAM) {
168
  UINT64 x = *px;
169
#if !DECIMAL_GLOBAL_ROUNDING
170
  unsigned int rnd_mode = *prnd_mode;
171
#endif
172
#else
173
UINT128
174
bid128dq_add (UINT64 x, UINT128 y
175
	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
176
	      _EXC_INFO_PARAM) {
177
#endif
178
  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
179
  };
180
  UINT128 x1;
181
 
182
#if DECIMAL_CALL_BY_REFERENCE
183
  bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
184
  bid128_add (&res, &x1, py
185
	      _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
186
	      _EXC_INFO_ARG);
187
#else
188
  x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
189
  res = bid128_add (x1, y
190
		    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
191
		    _EXC_INFO_ARG);
192
#endif
193
  BID_RETURN (res);
194
}
195
 
196
 
197
#if DECIMAL_CALL_BY_REFERENCE
198
void
199
bid128qd_add (UINT128 * pres, UINT128 * px, UINT64 * py
200
	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
201
	      _EXC_INFO_PARAM) {
202
  UINT64 y = *py;
203
#if !DECIMAL_GLOBAL_ROUNDING
204
  unsigned int rnd_mode = *prnd_mode;
205
#endif
206
#else
207
UINT128
208
bid128qd_add (UINT128 x, UINT64 y
209
	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
210
	      _EXC_INFO_PARAM) {
211
#endif
212
  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
213
  };
214
  UINT128 y1;
215
 
216
#if DECIMAL_CALL_BY_REFERENCE
217
  bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
218
  bid128_add (&res, px, &y1
219
	      _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
220
	      _EXC_INFO_ARG);
221
#else
222
  y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
223
  res = bid128_add (x, y1
224
		    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
225
		    _EXC_INFO_ARG);
226
#endif
227
  BID_RETURN (res);
228
}
229
 
230
 
231
// bid128_add stands for bid128qq_add
232
 
233
 
234
/*****************************************************************************
235
 *  BID64/BID128 sub
236
 ****************************************************************************/
237
 
238
#if DECIMAL_CALL_BY_REFERENCE
239
void
240
bid64dq_sub (UINT64 * pres, UINT64 * px, UINT128 * py
241
	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
242
	     _EXC_INFO_PARAM) {
243
  UINT64 x = *px;
244
#if !DECIMAL_GLOBAL_ROUNDING
245
  unsigned int rnd_mode = *prnd_mode;
246
#endif
247
#else
248
UINT64
249
bid64dq_sub (UINT64 x, UINT128 y
250
	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
251
	     _EXC_INFO_PARAM) {
252
#endif
253
  UINT64 res = 0xbaddbaddbaddbaddull;
254
  UINT128 x1;
255
 
256
#if DECIMAL_CALL_BY_REFERENCE
257
  bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
258
  bid64qq_sub (&res, &x1, py
259
	       _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
260
	       _EXC_INFO_ARG);
261
#else
262
  x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
263
  res = bid64qq_sub (x1, y
264
		     _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
265
		     _EXC_INFO_ARG);
266
#endif
267
  BID_RETURN (res);
268
}
269
 
270
 
271
#if DECIMAL_CALL_BY_REFERENCE
272
void
273
bid64qd_sub (UINT64 * pres, UINT128 * px, UINT64 * py
274
	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
275
	     _EXC_INFO_PARAM) {
276
  UINT64 y = *py;
277
#if !DECIMAL_GLOBAL_ROUNDING
278
  unsigned int rnd_mode = *prnd_mode;
279
#endif
280
#else
281
UINT64
282
bid64qd_sub (UINT128 x, UINT64 y
283
	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
284
	     _EXC_INFO_PARAM) {
285
#endif
286
  UINT64 res = 0xbaddbaddbaddbaddull;
287
  UINT128 y1;
288
 
289
#if DECIMAL_CALL_BY_REFERENCE
290
  bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
291
  bid64qq_sub (&res, px, &y1
292
	       _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
293
	       _EXC_INFO_ARG);
294
#else
295
  y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
296
  res = bid64qq_sub (x, y1
297
		     _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
298
		     _EXC_INFO_ARG);
299
#endif
300
  BID_RETURN (res);
301
}
302
 
303
 
304
#if DECIMAL_CALL_BY_REFERENCE
305
void
306
bid64qq_sub (UINT64 * pres, UINT128 * px, UINT128 * py
307
	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
308
	     _EXC_INFO_PARAM) {
309
  UINT128 x = *px, y = *py;
310
#if !DECIMAL_GLOBAL_ROUNDING
311
  unsigned int rnd_mode = *prnd_mode;
312
#endif
313
#else
314
UINT64
315
bid64qq_sub (UINT128 x, UINT128 y
316
	     _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
317
	     _EXC_INFO_PARAM) {
318
#endif
319
 
320
  UINT128 one = { {0x0000000000000001ull, 0x3040000000000000ull}
321
  };
322
  UINT64 res = 0xbaddbaddbaddbaddull;
323
  UINT64 y_sign;
324
 
325
  BID_SWAP128 (one);
326
  if ((y.w[HIGH_128W] & MASK_NAN) != MASK_NAN) {	// y is not NAN
327
    // change its sign
328
    y_sign = y.w[HIGH_128W] & MASK_SIGN;	// 0 for positive, MASK_SIGN for negative
329
    if (y_sign)
330
      y.w[HIGH_128W] = y.w[HIGH_128W] & 0x7fffffffffffffffull;
331
    else
332
      y.w[HIGH_128W] = y.w[HIGH_128W] | 0x8000000000000000ull;
333
  }
334
#if DECIMAL_CALL_BY_REFERENCE
335
  bid64qqq_fma (&res, &one, &x, &y
336
		_RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
337
		_EXC_INFO_ARG);
338
#else
339
  res = bid64qqq_fma (one, x, y
340
		      _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
341
		      _EXC_INFO_ARG);
342
#endif
343
  BID_RETURN (res);
344
}
345
 
346
 
347
#if DECIMAL_CALL_BY_REFERENCE
348
void
349
bid128dd_sub (UINT128 * pres, UINT64 * px, UINT64 * py
350
	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
351
	      _EXC_INFO_PARAM) {
352
  UINT64 x = *px, y = *py;
353
#if !DECIMAL_GLOBAL_ROUNDING
354
  unsigned int rnd_mode = *prnd_mode;
355
#endif
356
#else
357
UINT128
358
bid128dd_sub (UINT64 x, UINT64 y
359
	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
360
	      _EXC_INFO_PARAM) {
361
#endif
362
  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
363
  };
364
  UINT128 x1, y1;
365
 
366
#if DECIMAL_CALL_BY_REFERENCE
367
  bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
368
  bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
369
  bid128_sub (&res, &x1, &y1
370
	      _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
371
	      _EXC_INFO_ARG);
372
#else
373
  x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
374
  y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
375
  res = bid128_sub (x1, y1
376
		    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
377
		    _EXC_INFO_ARG);
378
#endif
379
  BID_RETURN (res);
380
}
381
 
382
 
383
#if DECIMAL_CALL_BY_REFERENCE
384
void
385
bid128dq_sub (UINT128 * pres, UINT64 * px, UINT128 * py
386
	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
387
	      _EXC_INFO_PARAM) {
388
  UINT64 x = *px;
389
#if !DECIMAL_GLOBAL_ROUNDING
390
  unsigned int rnd_mode = *prnd_mode;
391
#endif
392
#else
393
UINT128
394
bid128dq_sub (UINT64 x, UINT128 y
395
	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
396
	      _EXC_INFO_PARAM) {
397
#endif
398
  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
399
  };
400
  UINT128 x1;
401
 
402
#if DECIMAL_CALL_BY_REFERENCE
403
  bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
404
  bid128_sub (&res, &x1, py
405
	      _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
406
	      _EXC_INFO_ARG);
407
#else
408
  x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
409
  res = bid128_sub (x1, y
410
		    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
411
		    _EXC_INFO_ARG);
412
#endif
413
  BID_RETURN (res);
414
}
415
 
416
 
417
#if DECIMAL_CALL_BY_REFERENCE
418
void
419
bid128qd_sub (UINT128 * pres, UINT128 * px, UINT64 * py
420
	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
421
	      _EXC_INFO_PARAM) {
422
  UINT64 y = *py;
423
#if !DECIMAL_GLOBAL_ROUNDING
424
  unsigned int rnd_mode = *prnd_mode;
425
#endif
426
#else
427
UINT128
428
bid128qd_sub (UINT128 x, UINT64 y
429
	      _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
430
	      _EXC_INFO_PARAM) {
431
#endif
432
  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
433
  };
434
  UINT128 y1;
435
 
436
#if DECIMAL_CALL_BY_REFERENCE
437
  bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
438
  bid128_sub (&res, px, &y1
439
	      _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
440
	      _EXC_INFO_ARG);
441
#else
442
  y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
443
  res = bid128_sub (x, y1
444
		    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
445
		    _EXC_INFO_ARG);
446
#endif
447
  BID_RETURN (res);
448
}
449
 
450
#if DECIMAL_CALL_BY_REFERENCE
451
void
452
bid128_add (UINT128 * pres, UINT128 * px, UINT128 * py
453
	    _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
454
	    _EXC_INFO_PARAM) {
455
  UINT128 x = *px, y = *py;
456
#if !DECIMAL_GLOBAL_ROUNDING
457
  unsigned int rnd_mode = *prnd_mode;
458
#endif
459
#else
460
UINT128
461
bid128_add (UINT128 x, UINT128 y
462
	    _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
463
	    _EXC_INFO_PARAM) {
464
#endif
465
  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
466
  };
467
  UINT64 x_sign, y_sign, tmp_sign;
468
  UINT64 x_exp, y_exp, tmp_exp;	// e1 = x_exp, e2 = y_exp
469
  UINT64 C1_hi, C2_hi, tmp_signif_hi;
470
  UINT64 C1_lo, C2_lo, tmp_signif_lo;
471
  // Note: C1.w[1], C1.w[0] represent C1_hi, C1_lo (all UINT64)
472
  // Note: C2.w[1], C2.w[0] represent C2_hi, C2_lo (all UINT64)
473
  UINT64 tmp64, tmp64A, tmp64B;
474
  BID_UI64DOUBLE tmp1, tmp2;
475
  int x_nr_bits, y_nr_bits;
476
  int q1, q2, delta, scale, x1, ind, shift, tmp_inexact = 0;
477
  UINT64 halfulp64;
478
  UINT128 halfulp128;
479
  UINT128 C1, C2;
480
  UINT128 ten2m1;
481
  UINT128 highf2star;		// top 128 bits in f2*; low 128 bits in R256[1], R256[0]
482
  UINT256 P256, Q256, R256;
483
  int is_inexact = 0, is_midpoint_lt_even = 0, is_midpoint_gt_even = 0;
484
  int is_inexact_lt_midpoint = 0, is_inexact_gt_midpoint = 0;
485
  int second_pass = 0;
486
 
487
  BID_SWAP128 (x);
488
  BID_SWAP128 (y);
489
  x_sign = x.w[1] & MASK_SIGN;	// 0 for positive, MASK_SIGN for negative
490
  y_sign = y.w[1] & MASK_SIGN;	// 0 for positive, MASK_SIGN for negative
491
 
492
  // check for NaN or Infinity
493
  if (((x.w[1] & MASK_SPECIAL) == MASK_SPECIAL)
494
      || ((y.w[1] & MASK_SPECIAL) == MASK_SPECIAL)) {
495
    // x is special or y is special
496
    if ((x.w[1] & MASK_NAN) == MASK_NAN) {	// x is NAN
497
      // check first for non-canonical NaN payload
498
      if (((x.w[1] & 0x00003fffffffffffull) > 0x0000314dc6448d93ull) ||
499
	  (((x.w[1] & 0x00003fffffffffffull) == 0x0000314dc6448d93ull)
500
	   && (x.w[0] > 0x38c15b09ffffffffull))) {
501
	x.w[1] = x.w[1] & 0xffffc00000000000ull;
502
	x.w[0] = 0x0ull;
503
      }
504
      if ((x.w[1] & MASK_SNAN) == MASK_SNAN) {	// x is SNAN
505
	// set invalid flag
506
	*pfpsf |= INVALID_EXCEPTION;
507
	// return quiet (x)
508
	res.w[1] = x.w[1] & 0xfc003fffffffffffull;
509
	// clear out also G[6]-G[16]
510
	res.w[0] = x.w[0];
511
      } else {	// x is QNaN
512
	// return x
513
	res.w[1] = x.w[1] & 0xfc003fffffffffffull;
514
	// clear out G[6]-G[16]
515
	res.w[0] = x.w[0];
516
	// if y = SNaN signal invalid exception
517
	if ((y.w[1] & MASK_SNAN) == MASK_SNAN) {
518
	  // set invalid flag
519
	  *pfpsf |= INVALID_EXCEPTION;
520
	}
521
      }
522
      BID_SWAP128 (res);
523
      BID_RETURN (res);
524
    } else if ((y.w[1] & MASK_NAN) == MASK_NAN) {	// y is NAN
525
      // check first for non-canonical NaN payload
526
      if (((y.w[1] & 0x00003fffffffffffull) > 0x0000314dc6448d93ull) ||
527
	  (((y.w[1] & 0x00003fffffffffffull) == 0x0000314dc6448d93ull)
528
	   && (y.w[0] > 0x38c15b09ffffffffull))) {
529
	y.w[1] = y.w[1] & 0xffffc00000000000ull;
530
	y.w[0] = 0x0ull;
531
      }
532
      if ((y.w[1] & MASK_SNAN) == MASK_SNAN) {	// y is SNAN
533
	// set invalid flag
534
	*pfpsf |= INVALID_EXCEPTION;
535
	// return quiet (y)
536
	res.w[1] = y.w[1] & 0xfc003fffffffffffull;
537
	// clear out also G[6]-G[16]
538
	res.w[0] = y.w[0];
539
      } else {	// y is QNaN
540
	// return y
541
	res.w[1] = y.w[1] & 0xfc003fffffffffffull;
542
	// clear out G[6]-G[16]
543
	res.w[0] = y.w[0];
544
      }
545
      BID_SWAP128 (res);
546
      BID_RETURN (res);
547
    } else {	// neither x not y is NaN; at least one is infinity
548
      if ((x.w[1] & MASK_ANY_INF) == MASK_INF) {	// x is infinity
549
	if ((y.w[1] & MASK_ANY_INF) == MASK_INF) {	// y is infinity
550
	  // if same sign, return either of them
551
	  if ((x.w[1] & MASK_SIGN) == (y.w[1] & MASK_SIGN)) {
552
	    res.w[1] = x_sign | MASK_INF;
553
	    res.w[0] = 0x0ull;
554
	  } else {	// x and y are infinities of opposite signs
555
	    // set invalid flag
556
	    *pfpsf |= INVALID_EXCEPTION;
557
	    // return QNaN Indefinite
558
	    res.w[1] = 0x7c00000000000000ull;
559
	    res.w[0] = 0x0000000000000000ull;
560
	  }
561
	} else {	// y is 0 or finite
562
	  // return x
563
	  res.w[1] = x_sign | MASK_INF;
564
	  res.w[0] = 0x0ull;
565
	}
566
      } else {	// x is not NaN or infinity, so y must be infinity
567
	res.w[1] = y_sign | MASK_INF;
568
	res.w[0] = 0x0ull;
569
      }
570
      BID_SWAP128 (res);
571
      BID_RETURN (res);
572
    }
573
  }
574
  // unpack the arguments
575
 
576
  // unpack x
577
  C1_hi = x.w[1] & MASK_COEFF;
578
  C1_lo = x.w[0];
579
  // test for non-canonical values:
580
  // - values whose encoding begins with x00, x01, or x10 and whose
581
  //   coefficient is larger than 10^34 -1, or
582
  // - values whose encoding begins with x1100, x1101, x1110 (if NaNs
583
  //   and infinitis were eliminated already this test is reduced to
584
  //   checking for x10x)
585
 
586
  // x is not infinity; check for non-canonical values - treated as zero
587
  if ((x.w[1] & 0x6000000000000000ull) == 0x6000000000000000ull) {
588
    // G0_G1=11; non-canonical
589
    x_exp = (x.w[1] << 2) & MASK_EXP;	// biased and shifted left 49 bits
590
    C1_hi = 0;	// significand high
591
    C1_lo = 0;	// significand low
592
  } else {	// G0_G1 != 11
593
    x_exp = x.w[1] & MASK_EXP;	// biased and shifted left 49 bits
594
    if (C1_hi > 0x0001ed09bead87c0ull ||
595
	(C1_hi == 0x0001ed09bead87c0ull
596
	 && C1_lo > 0x378d8e63ffffffffull)) {
597
      // x is non-canonical if coefficient is larger than 10^34 -1
598
      C1_hi = 0;
599
      C1_lo = 0;
600
    } else {	// canonical
601
      ;
602
    }
603
  }
604
 
605
  // unpack y
606
  C2_hi = y.w[1] & MASK_COEFF;
607
  C2_lo = y.w[0];
608
  // y is not infinity; check for non-canonical values - treated as zero
609
  if ((y.w[1] & 0x6000000000000000ull) == 0x6000000000000000ull) {
610
    // G0_G1=11; non-canonical
611
    y_exp = (y.w[1] << 2) & MASK_EXP;	// biased and shifted left 49 bits
612
    C2_hi = 0;	// significand high
613
    C2_lo = 0;	// significand low
614
  } else {	// G0_G1 != 11
615
    y_exp = y.w[1] & MASK_EXP;	// biased and shifted left 49 bits
616
    if (C2_hi > 0x0001ed09bead87c0ull ||
617
	(C2_hi == 0x0001ed09bead87c0ull
618
	 && C2_lo > 0x378d8e63ffffffffull)) {
619
      // y is non-canonical if coefficient is larger than 10^34 -1
620
      C2_hi = 0;
621
      C2_lo = 0;
622
    } else {	// canonical
623
      ;
624
    }
625
  }
626
 
627
  if ((C1_hi == 0x0ull) && (C1_lo == 0x0ull)) {
628
    // x is 0 and y is not special
629
    // if y is 0 return 0 with the smaller exponent
630
    if ((C2_hi == 0x0ull) && (C2_lo == 0x0ull)) {
631
      if (x_exp < y_exp)
632
	res.w[1] = x_exp;
633
      else
634
	res.w[1] = y_exp;
635
      if (x_sign && y_sign)
636
	res.w[1] = res.w[1] | x_sign;	// both negative
637
      else if (rnd_mode == ROUNDING_DOWN && x_sign != y_sign)
638
	res.w[1] = res.w[1] | 0x8000000000000000ull;	// -0
639
      // else; // res = +0
640
      res.w[0] = 0;
641
    } else {
642
      // for 0 + y return y, with the preferred exponent
643
      if (y_exp <= x_exp) {
644
	res.w[1] = y.w[1];
645
	res.w[0] = y.w[0];
646
      } else {	// if y_exp > x_exp
647
	// return (C2 * 10^scale) * 10^(y_exp - scale)
648
	// where scale = min (P34-q2, y_exp-x_exp)
649
	// determine q2 = nr. of decimal digits in y
650
	//  determine first the nr. of bits in y (y_nr_bits)
651
 
652
	if (C2_hi == 0) {	// y_bits is the nr. of bits in C2_lo
653
	  if (C2_lo >= 0x0020000000000000ull) {	// y >= 2^53
654
	    // split the 64-bit value in two 32-bit halves to avoid
655
	    // rounding errors
656
	    if (C2_lo >= 0x0000000100000000ull) {	// y >= 2^32
657
	      tmp2.d = (double) (C2_lo >> 32);	// exact conversion
658
	      y_nr_bits =
659
		32 +
660
		((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
661
	    } else {	// y < 2^32
662
	      tmp2.d = (double) (C2_lo);	// exact conversion
663
	      y_nr_bits =
664
		((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
665
	    }
666
	  } else {	// if y < 2^53
667
	    tmp2.d = (double) C2_lo;	// exact conversion
668
	    y_nr_bits =
669
	      ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
670
	  }
671
	} else {	// C2_hi != 0 => nr. bits = 64 + nr_bits (C2_hi)
672
	  tmp2.d = (double) C2_hi;	// exact conversion
673
	  y_nr_bits =
674
	    64 + ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
675
	}
676
	q2 = nr_digits[y_nr_bits].digits;
677
	if (q2 == 0) {
678
	  q2 = nr_digits[y_nr_bits].digits1;
679
	  if (C2_hi > nr_digits[y_nr_bits].threshold_hi ||
680
	      (C2_hi == nr_digits[y_nr_bits].threshold_hi &&
681
	       C2_lo >= nr_digits[y_nr_bits].threshold_lo))
682
	    q2++;
683
	}
684
	// return (C2 * 10^scale) * 10^(y_exp - scale)
685
	// where scale = min (P34-q2, y_exp-x_exp)
686
	scale = P34 - q2;
687
	ind = (y_exp - x_exp) >> 49;
688
	if (ind < scale)
689
	  scale = ind;
690
	if (scale == 0) {
691
	  res.w[1] = y.w[1];
692
	  res.w[0] = y.w[0];
693
	} else if (q2 <= 19) {	// y fits in 64 bits
694
	  if (scale <= 19) {	// 10^scale fits in 64 bits
695
	    // 64 x 64 C2_lo * ten2k64[scale]
696
	    __mul_64x64_to_128MACH (res, C2_lo, ten2k64[scale]);
697
	  } else {	// 10^scale fits in 128 bits
698
	    // 64 x 128 C2_lo * ten2k128[scale - 20]
699
	    __mul_128x64_to_128 (res, C2_lo, ten2k128[scale - 20]);
700
	  }
701
	} else {	// y fits in 128 bits, but 10^scale must fit in 64 bits
702
	  // 64 x 128 ten2k64[scale] * C2
703
	  C2.w[1] = C2_hi;
704
	  C2.w[0] = C2_lo;
705
	  __mul_128x64_to_128 (res, ten2k64[scale], C2);
706
	}
707
	// subtract scale from the exponent
708
	y_exp = y_exp - ((UINT64) scale << 49);
709
	res.w[1] = res.w[1] | y_sign | y_exp;
710
      }
711
    }
712
    BID_SWAP128 (res);
713
    BID_RETURN (res);
714
  } else if ((C2_hi == 0x0ull) && (C2_lo == 0x0ull)) {
715
    // y is 0 and x is not special, and not zero
716
    // for x + 0 return x, with the preferred exponent
717
    if (x_exp <= y_exp) {
718
      res.w[1] = x.w[1];
719
      res.w[0] = x.w[0];
720
    } else {	// if x_exp > y_exp
721
      // return (C1 * 10^scale) * 10^(x_exp - scale)
722
      // where scale = min (P34-q1, x_exp-y_exp)
723
      // determine q1 = nr. of decimal digits in x
724
      //  determine first the nr. of bits in x
725
      if (C1_hi == 0) {	// x_bits is the nr. of bits in C1_lo
726
	if (C1_lo >= 0x0020000000000000ull) {	// x >= 2^53
727
	  // split the 64-bit value in two 32-bit halves to avoid
728
	  // rounding errors
729
	  if (C1_lo >= 0x0000000100000000ull) {	// x >= 2^32
730
	    tmp1.d = (double) (C1_lo >> 32);	// exact conversion
731
	    x_nr_bits =
732
	      32 + ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) -
733
		    0x3ff);
734
	  } else {	// x < 2^32
735
	    tmp1.d = (double) (C1_lo);	// exact conversion
736
	    x_nr_bits =
737
	      ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
738
	  }
739
	} else {	// if x < 2^53
740
	  tmp1.d = (double) C1_lo;	// exact conversion
741
	  x_nr_bits =
742
	    ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
743
	}
744
      } else {	// C1_hi != 0 => nr. bits = 64 + nr_bits (C1_hi)
745
	tmp1.d = (double) C1_hi;	// exact conversion
746
	x_nr_bits =
747
	  64 + ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
748
      }
749
      q1 = nr_digits[x_nr_bits].digits;
750
      if (q1 == 0) {
751
	q1 = nr_digits[x_nr_bits].digits1;
752
	if (C1_hi > nr_digits[x_nr_bits].threshold_hi ||
753
	    (C1_hi == nr_digits[x_nr_bits].threshold_hi &&
754
	     C1_lo >= nr_digits[x_nr_bits].threshold_lo))
755
	  q1++;
756
      }
757
      // return (C1 * 10^scale) * 10^(x_exp - scale)
758
      // where scale = min (P34-q1, x_exp-y_exp)
759
      scale = P34 - q1;
760
      ind = (x_exp - y_exp) >> 49;
761
      if (ind < scale)
762
	scale = ind;
763
      if (scale == 0) {
764
	res.w[1] = x.w[1];
765
	res.w[0] = x.w[0];
766
      } else if (q1 <= 19) {	// x fits in 64 bits
767
	if (scale <= 19) {	// 10^scale fits in 64 bits
768
	  // 64 x 64 C1_lo * ten2k64[scale]
769
	  __mul_64x64_to_128MACH (res, C1_lo, ten2k64[scale]);
770
	} else {	// 10^scale fits in 128 bits
771
	  // 64 x 128 C1_lo * ten2k128[scale - 20]
772
	  __mul_128x64_to_128 (res, C1_lo, ten2k128[scale - 20]);
773
	}
774
      } else {	// x fits in 128 bits, but 10^scale must fit in 64 bits
775
	// 64 x 128 ten2k64[scale] * C1
776
	C1.w[1] = C1_hi;
777
	C1.w[0] = C1_lo;
778
	__mul_128x64_to_128 (res, ten2k64[scale], C1);
779
      }
780
      // subtract scale from the exponent
781
      x_exp = x_exp - ((UINT64) scale << 49);
782
      res.w[1] = res.w[1] | x_sign | x_exp;
783
    }
784
    BID_SWAP128 (res);
785
    BID_RETURN (res);
786
  } else {	// x and y are not canonical, not special, and are not zero
787
    // note that the result may still be zero, and then it has to have the
788
    // preferred exponent
789
    if (x_exp < y_exp) {	// if exp_x < exp_y then swap x and y
790
      tmp_sign = x_sign;
791
      tmp_exp = x_exp;
792
      tmp_signif_hi = C1_hi;
793
      tmp_signif_lo = C1_lo;
794
      x_sign = y_sign;
795
      x_exp = y_exp;
796
      C1_hi = C2_hi;
797
      C1_lo = C2_lo;
798
      y_sign = tmp_sign;
799
      y_exp = tmp_exp;
800
      C2_hi = tmp_signif_hi;
801
      C2_lo = tmp_signif_lo;
802
    }
803
    // q1 = nr. of decimal digits in x
804
    //  determine first the nr. of bits in x
805
    if (C1_hi == 0) {	// x_bits is the nr. of bits in C1_lo
806
      if (C1_lo >= 0x0020000000000000ull) {	// x >= 2^53
807
	//split the 64-bit value in two 32-bit halves to avoid rounding errors
808
	if (C1_lo >= 0x0000000100000000ull) {	// x >= 2^32
809
	  tmp1.d = (double) (C1_lo >> 32);	// exact conversion
810
	  x_nr_bits =
811
	    32 + ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
812
	} else {	// x < 2^32
813
	  tmp1.d = (double) (C1_lo);	// exact conversion
814
	  x_nr_bits =
815
	    ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
816
	}
817
      } else {	// if x < 2^53
818
	tmp1.d = (double) C1_lo;	// exact conversion
819
	x_nr_bits =
820
	  ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
821
      }
822
    } else {	// C1_hi != 0 => nr. bits = 64 + nr_bits (C1_hi)
823
      tmp1.d = (double) C1_hi;	// exact conversion
824
      x_nr_bits =
825
	64 + ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
826
    }
827
 
828
    q1 = nr_digits[x_nr_bits].digits;
829
    if (q1 == 0) {
830
      q1 = nr_digits[x_nr_bits].digits1;
831
      if (C1_hi > nr_digits[x_nr_bits].threshold_hi ||
832
	  (C1_hi == nr_digits[x_nr_bits].threshold_hi &&
833
	   C1_lo >= nr_digits[x_nr_bits].threshold_lo))
834
	q1++;
835
    }
836
    // q2 = nr. of decimal digits in y
837
    //  determine first the nr. of bits in y (y_nr_bits)
838
    if (C2_hi == 0) {	// y_bits is the nr. of bits in C2_lo
839
      if (C2_lo >= 0x0020000000000000ull) {	// y >= 2^53
840
	//split the 64-bit value in two 32-bit halves to avoid rounding errors
841
	if (C2_lo >= 0x0000000100000000ull) {	// y >= 2^32
842
	  tmp2.d = (double) (C2_lo >> 32);	// exact conversion
843
	  y_nr_bits =
844
	    32 + ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
845
	} else {	// y < 2^32
846
	  tmp2.d = (double) (C2_lo);	// exact conversion
847
	  y_nr_bits =
848
	    ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
849
	}
850
      } else {	// if y < 2^53
851
	tmp2.d = (double) C2_lo;	// exact conversion
852
	y_nr_bits =
853
	  ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
854
      }
855
    } else {	// C2_hi != 0 => nr. bits = 64 + nr_bits (C2_hi)
856
      tmp2.d = (double) C2_hi;	// exact conversion
857
      y_nr_bits =
858
	64 + ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
859
    }
860
 
861
    q2 = nr_digits[y_nr_bits].digits;
862
    if (q2 == 0) {
863
      q2 = nr_digits[y_nr_bits].digits1;
864
      if (C2_hi > nr_digits[y_nr_bits].threshold_hi ||
865
	  (C2_hi == nr_digits[y_nr_bits].threshold_hi &&
866
	   C2_lo >= nr_digits[y_nr_bits].threshold_lo))
867
	q2++;
868
    }
869
 
870
    delta = q1 + (int) (x_exp >> 49) - q2 - (int) (y_exp >> 49);
871
 
872
    if (delta >= P34) {
873
      // round the result directly because 0 < C2 < ulp (C1 * 10^(x_exp-e2))
874
      // n = C1 * 10^e1 or n = C1 +/- 10^(q1-P34)) * 10^e1
875
      // the result is inexact; the preferred exponent is the least possible
876
 
877
      if (delta >= P34 + 1) {
878
	// for RN the result is the operand with the larger magnitude,
879
	// possibly scaled up by 10^(P34-q1)
880
	// an overflow cannot occur in this case (rounding to nearest)
881
	if (q1 < P34) {	// scale C1 up by 10^(P34-q1)
882
	  // Note: because delta >= P34+1 it is certain that
883
	  //     x_exp - ((UINT64)scale << 49) will stay above e_min
884
	  scale = P34 - q1;
885
	  if (q1 <= 19) {	// C1 fits in 64 bits
886
	    // 1 <= q1 <= 19 => 15 <= scale <= 33
887
	    if (scale <= 19) {	// 10^scale fits in 64 bits
888
	      __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
889
	    } else {	// if 20 <= scale <= 33
890
	      // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
891
	      // (C1 * 10^(scale-19)) fits in 64 bits
892
	      C1_lo = C1_lo * ten2k64[scale - 19];
893
	      __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
894
	    }
895
	  } else {	//if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
896
	    // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
897
	    C1.w[1] = C1_hi;
898
	    C1.w[0] = C1_lo;
899
	    // C1 = ten2k64[P34 - q1] * C1
900
	    __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
901
	  }
902
	  x_exp = x_exp - ((UINT64) scale << 49);
903
	  C1_hi = C1.w[1];
904
	  C1_lo = C1.w[0];
905
	}
906
	// some special cases arise: if delta = P34 + 1 and C1 = 10^(P34-1)
907
	// (after scaling) and x_sign != y_sign and C2 > 5*10^(q2-1) =>
908
	// subtract 1 ulp
909
	// Note: do this only for rounding to nearest; for other rounding
910
	// modes the correction will be applied next
911
	if ((rnd_mode == ROUNDING_TO_NEAREST
912
	     || rnd_mode == ROUNDING_TIES_AWAY) && delta == (P34 + 1)
913
	    && C1_hi == 0x0000314dc6448d93ull
914
	    && C1_lo == 0x38c15b0a00000000ull && x_sign != y_sign
915
	    && ((q2 <= 19 && C2_lo > midpoint64[q2 - 1]) || (q2 >= 20
916
							     && (C2_hi >
917
								 midpoint128
918
								 [q2 -
919
								  20].
920
								 w[1]
921
								 ||
922
								 (C2_hi
923
								  ==
924
								  midpoint128
925
								  [q2 -
926
								   20].
927
								  w[1]
928
								  &&
929
								  C2_lo
930
								  >
931
								  midpoint128
932
								  [q2 -
933
								   20].
934
								  w
935
								  [0])))))
936
	{
937
	  // C1 = 10^34 - 1 and decrement x_exp by 1 (no underflow possible)
938
	  C1_hi = 0x0001ed09bead87c0ull;
939
	  C1_lo = 0x378d8e63ffffffffull;
940
	  x_exp = x_exp - EXP_P1;
941
	}
942
	if (rnd_mode != ROUNDING_TO_NEAREST) {
943
	  if ((rnd_mode == ROUNDING_DOWN && x_sign && y_sign) ||
944
	      (rnd_mode == ROUNDING_UP && !x_sign && !y_sign)) {
945
	    // add 1 ulp and then check for overflow
946
	    C1_lo = C1_lo + 1;
947
	    if (C1_lo == 0) {	// rounding overflow in the low 64 bits
948
	      C1_hi = C1_hi + 1;
949
	    }
950
	    if (C1_hi == 0x0001ed09bead87c0ull
951
		&& C1_lo == 0x378d8e6400000000ull) {
952
	      // C1 = 10^34 => rounding overflow
953
	      C1_hi = 0x0000314dc6448d93ull;
954
	      C1_lo = 0x38c15b0a00000000ull;	// 10^33
955
	      x_exp = x_exp + EXP_P1;
956
	      if (x_exp == EXP_MAX_P1) {	// overflow
957
		C1_hi = 0x7800000000000000ull;	// +inf
958
		C1_lo = 0x0ull;
959
		x_exp = 0;	// x_sign is preserved
960
		// set overflow flag (the inexact flag was set too)
961
		*pfpsf |= OVERFLOW_EXCEPTION;
962
	      }
963
	    }
964
	  } else if ((rnd_mode == ROUNDING_DOWN && !x_sign && y_sign) ||
965
		     (rnd_mode == ROUNDING_UP && x_sign && !y_sign) ||
966
		     (rnd_mode == ROUNDING_TO_ZERO
967
		      && x_sign != y_sign)) {
968
	    // subtract 1 ulp from C1
969
	    // Note: because delta >= P34 + 1 the result cannot be zero
970
	    C1_lo = C1_lo - 1;
971
	    if (C1_lo == 0xffffffffffffffffull)
972
	      C1_hi = C1_hi - 1;
973
	    // if the coefficient is 10^33 - 1 then make it 10^34 - 1 and
974
	    // decrease the exponent by 1 (because delta >= P34 + 1 the
975
	    // exponent will not become less than e_min)
976
	    // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
977
	    // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
978
	    if (C1_hi == 0x0000314dc6448d93ull
979
		&& C1_lo == 0x38c15b09ffffffffull) {
980
	      // make C1 = 10^34  - 1
981
	      C1_hi = 0x0001ed09bead87c0ull;
982
	      C1_lo = 0x378d8e63ffffffffull;
983
	      x_exp = x_exp - EXP_P1;
984
	    }
985
	  } else {
986
	    ;	// the result is already correct
987
	  }
988
	}
989
	// set the inexact flag
990
	*pfpsf |= INEXACT_EXCEPTION;
991
	// assemble the result
992
	res.w[1] = x_sign | x_exp | C1_hi;
993
	res.w[0] = C1_lo;
994
      } else {	// delta = P34
995
	// in most cases, the smaller operand may be < or = or > 1/2 ulp of the
996
	// larger operand
997
	// however, the case C1 = 10^(q1-1) and x_sign != y_sign is special due
998
	// to accuracy loss after subtraction, and will be treated separately
999
	if (x_sign == y_sign || (q1 <= 20
1000
				 && (C1_hi != 0
1001
				     || C1_lo != ten2k64[q1 - 1]))
1002
	    || (q1 >= 21 && (C1_hi != ten2k128[q1 - 21].w[1]
1003
			     || C1_lo != ten2k128[q1 - 21].w[0]))) {
1004
	  // if x_sign == y_sign or C1 != 10^(q1-1)
1005
	  // compare C2 with 1/2 ulp = 5 * 10^(q2-1), the latter read from table
1006
	  // Note: cases q1<=19 and q1>=20 can be coalesced at some latency cost
1007
	  if (q2 <= 19) {	// C2 and 5*10^(q2-1) both fit in 64 bits
1008
	    halfulp64 = midpoint64[q2 - 1];	// 5 * 10^(q2-1)
1009
	    if (C2_lo < halfulp64) {	// n2 < 1/2 ulp (n1)
1010
	      // for RN the result is the operand with the larger magnitude,
1011
	      // possibly scaled up by 10^(P34-q1)
1012
	      // an overflow cannot occur in this case (rounding to nearest)
1013
	      if (q1 < P34) {	// scale C1 up by 10^(P34-q1)
1014
		// Note: because delta = P34 it is certain that
1015
		//     x_exp - ((UINT64)scale << 49) will stay above e_min
1016
		scale = P34 - q1;
1017
		if (q1 <= 19) {	// C1 fits in 64 bits
1018
		  // 1 <= q1 <= 19 => 15 <= scale <= 33
1019
		  if (scale <= 19) {	// 10^scale fits in 64 bits
1020
		    __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
1021
		  } else {	// if 20 <= scale <= 33
1022
		    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
1023
		    // (C1 * 10^(scale-19)) fits in 64 bits
1024
		    C1_lo = C1_lo * ten2k64[scale - 19];
1025
		    __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
1026
		  }
1027
		} else {	//if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
1028
		  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
1029
		  C1.w[1] = C1_hi;
1030
		  C1.w[0] = C1_lo;
1031
		  // C1 = ten2k64[P34 - q1] * C1
1032
		  __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
1033
		}
1034
		x_exp = x_exp - ((UINT64) scale << 49);
1035
		C1_hi = C1.w[1];
1036
		C1_lo = C1.w[0];
1037
	      }
1038
	      if (rnd_mode != ROUNDING_TO_NEAREST) {
1039
		if ((rnd_mode == ROUNDING_DOWN && x_sign && y_sign) ||
1040
		    (rnd_mode == ROUNDING_UP && !x_sign && !y_sign)) {
1041
		  // add 1 ulp and then check for overflow
1042
		  C1_lo = C1_lo + 1;
1043
		  if (C1_lo == 0) {	// rounding overflow in the low 64 bits
1044
		    C1_hi = C1_hi + 1;
1045
		  }
1046
		  if (C1_hi == 0x0001ed09bead87c0ull
1047
		      && C1_lo == 0x378d8e6400000000ull) {
1048
		    // C1 = 10^34 => rounding overflow
1049
		    C1_hi = 0x0000314dc6448d93ull;
1050
		    C1_lo = 0x38c15b0a00000000ull;	// 10^33
1051
		    x_exp = x_exp + EXP_P1;
1052
		    if (x_exp == EXP_MAX_P1) {	// overflow
1053
		      C1_hi = 0x7800000000000000ull;	// +inf
1054
		      C1_lo = 0x0ull;
1055
		      x_exp = 0;	// x_sign is preserved
1056
		      // set overflow flag (the inexact flag was set too)
1057
		      *pfpsf |= OVERFLOW_EXCEPTION;
1058
		    }
1059
		  }
1060
		} else
1061
		  if ((rnd_mode == ROUNDING_DOWN && !x_sign && y_sign)
1062
		      || (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
1063
		      || (rnd_mode == ROUNDING_TO_ZERO
1064
			  && x_sign != y_sign)) {
1065
		  // subtract 1 ulp from C1
1066
		  // Note: because delta >= P34 + 1 the result cannot be zero
1067
		  C1_lo = C1_lo - 1;
1068
		  if (C1_lo == 0xffffffffffffffffull)
1069
		    C1_hi = C1_hi - 1;
1070
		  // if the coefficient is 10^33-1 then make it 10^34-1 and
1071
		  // decrease the exponent by 1 (because delta >= P34 + 1 the
1072
		  // exponent will not become less than e_min)
1073
		  // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
1074
		  // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
1075
		  if (C1_hi == 0x0000314dc6448d93ull
1076
		      && C1_lo == 0x38c15b09ffffffffull) {
1077
		    // make C1 = 10^34  - 1
1078
		    C1_hi = 0x0001ed09bead87c0ull;
1079
		    C1_lo = 0x378d8e63ffffffffull;
1080
		    x_exp = x_exp - EXP_P1;
1081
		  }
1082
		} else {
1083
		  ;	// the result is already correct
1084
		}
1085
	      }
1086
	      // set the inexact flag
1087
	      *pfpsf |= INEXACT_EXCEPTION;
1088
	      // assemble the result
1089
	      res.w[1] = x_sign | x_exp | C1_hi;
1090
	      res.w[0] = C1_lo;
1091
	    } else if ((C2_lo == halfulp64)
1092
		       && (q1 < P34 || ((C1_lo & 0x1) == 0))) {
1093
	      // n2 = 1/2 ulp (n1) and C1 is even
1094
	      // the result is the operand with the larger magnitude,
1095
	      // possibly scaled up by 10^(P34-q1)
1096
	      // an overflow cannot occur in this case (rounding to nearest)
1097
	      if (q1 < P34) {	// scale C1 up by 10^(P34-q1)
1098
		// Note: because delta = P34 it is certain that
1099
		//     x_exp - ((UINT64)scale << 49) will stay above e_min
1100
		scale = P34 - q1;
1101
		if (q1 <= 19) {	// C1 fits in 64 bits
1102
		  // 1 <= q1 <= 19 => 15 <= scale <= 33
1103
		  if (scale <= 19) {	// 10^scale fits in 64 bits
1104
		    __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
1105
		  } else {	// if 20 <= scale <= 33
1106
		    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
1107
		    // (C1 * 10^(scale-19)) fits in 64 bits
1108
		    C1_lo = C1_lo * ten2k64[scale - 19];
1109
		    __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
1110
		  }
1111
		} else {	//if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
1112
		  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
1113
		  C1.w[1] = C1_hi;
1114
		  C1.w[0] = C1_lo;
1115
		  // C1 = ten2k64[P34 - q1] * C1
1116
		  __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
1117
		}
1118
		x_exp = x_exp - ((UINT64) scale << 49);
1119
		C1_hi = C1.w[1];
1120
		C1_lo = C1.w[0];
1121
	      }
1122
	      if ((rnd_mode == ROUNDING_TO_NEAREST && x_sign == y_sign
1123
		   && (C1_lo & 0x01)) || (rnd_mode == ROUNDING_TIES_AWAY
1124
					  && x_sign == y_sign)
1125
		  || (rnd_mode == ROUNDING_UP && !x_sign && !y_sign)
1126
		  || (rnd_mode == ROUNDING_DOWN && x_sign && y_sign)) {
1127
		// add 1 ulp and then check for overflow
1128
		C1_lo = C1_lo + 1;
1129
		if (C1_lo == 0) {	// rounding overflow in the low 64 bits
1130
		  C1_hi = C1_hi + 1;
1131
		}
1132
		if (C1_hi == 0x0001ed09bead87c0ull
1133
		    && C1_lo == 0x378d8e6400000000ull) {
1134
		  // C1 = 10^34 => rounding overflow
1135
		  C1_hi = 0x0000314dc6448d93ull;
1136
		  C1_lo = 0x38c15b0a00000000ull;	// 10^33
1137
		  x_exp = x_exp + EXP_P1;
1138
		  if (x_exp == EXP_MAX_P1) {	// overflow
1139
		    C1_hi = 0x7800000000000000ull;	// +inf
1140
		    C1_lo = 0x0ull;
1141
		    x_exp = 0;	// x_sign is preserved
1142
		    // set overflow flag (the inexact flag was set too)
1143
		    *pfpsf |= OVERFLOW_EXCEPTION;
1144
		  }
1145
		}
1146
	      } else
1147
		if ((rnd_mode == ROUNDING_TO_NEAREST && x_sign != y_sign
1148
		     && (C1_lo & 0x01)) || (rnd_mode == ROUNDING_DOWN
1149
					    && !x_sign && y_sign)
1150
		    || (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
1151
		    || (rnd_mode == ROUNDING_TO_ZERO
1152
			&& x_sign != y_sign)) {
1153
		// subtract 1 ulp from C1
1154
		// Note: because delta >= P34 + 1 the result cannot be zero
1155
		C1_lo = C1_lo - 1;
1156
		if (C1_lo == 0xffffffffffffffffull)
1157
		  C1_hi = C1_hi - 1;
1158
		// if the coefficient is 10^33 - 1 then make it 10^34 - 1
1159
		// and decrease the exponent by 1 (because delta >= P34 + 1
1160
		// the exponent will not become less than e_min)
1161
		// 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
1162
		// 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
1163
		if (C1_hi == 0x0000314dc6448d93ull
1164
		    && C1_lo == 0x38c15b09ffffffffull) {
1165
		  // make C1 = 10^34  - 1
1166
		  C1_hi = 0x0001ed09bead87c0ull;
1167
		  C1_lo = 0x378d8e63ffffffffull;
1168
		  x_exp = x_exp - EXP_P1;
1169
		}
1170
	      } else {
1171
		;	// the result is already correct
1172
	      }
1173
	      // set the inexact flag
1174
	      *pfpsf |= INEXACT_EXCEPTION;
1175
	      // assemble the result
1176
	      res.w[1] = x_sign | x_exp | C1_hi;
1177
	      res.w[0] = C1_lo;
1178
	    } else {	// if C2_lo > halfulp64 ||
1179
	      // (C2_lo == halfulp64 && q1 == P34 && ((C1_lo & 0x1) == 1)), i.e.
1180
	      // 1/2 ulp(n1) < n2 < 1 ulp(n1) or n2 = 1/2 ulp(n1) and C1 odd
1181
	      // res = x+1 ulp if n1*n2 > 0 and res = x-1 ulp if n1*n2 < 0
1182
	      if (q1 < P34) {	// then 1 ulp = 10^(e1+q1-P34) < 10^e1
1183
		// Note: if (q1 == P34) then 1 ulp = 10^(e1+q1-P34) = 10^e1
1184
		// because q1 < P34 we must first replace C1 by
1185
		// C1 * 10^(P34-q1), and must decrease the exponent by
1186
		// (P34-q1) (it will still be at least e_min)
1187
		scale = P34 - q1;
1188
		if (q1 <= 19) {	// C1 fits in 64 bits
1189
		  // 1 <= q1 <= 19 => 15 <= scale <= 33
1190
		  if (scale <= 19) {	// 10^scale fits in 64 bits
1191
		    __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
1192
		  } else {	// if 20 <= scale <= 33
1193
		    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
1194
		    // (C1 * 10^(scale-19)) fits in 64 bits
1195
		    C1_lo = C1_lo * ten2k64[scale - 19];
1196
		    __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
1197
		  }
1198
		} else {	//if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
1199
		  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
1200
		  C1.w[1] = C1_hi;
1201
		  C1.w[0] = C1_lo;
1202
		  // C1 = ten2k64[P34 - q1] * C1
1203
		  __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
1204
		}
1205
		x_exp = x_exp - ((UINT64) scale << 49);
1206
		C1_hi = C1.w[1];
1207
		C1_lo = C1.w[0];
1208
		// check for rounding overflow
1209
		if (C1_hi == 0x0001ed09bead87c0ull
1210
		    && C1_lo == 0x378d8e6400000000ull) {
1211
		  // C1 = 10^34 => rounding overflow
1212
		  C1_hi = 0x0000314dc6448d93ull;
1213
		  C1_lo = 0x38c15b0a00000000ull;	// 10^33
1214
		  x_exp = x_exp + EXP_P1;
1215
		}
1216
	      }
1217
	      if ((rnd_mode == ROUNDING_TO_NEAREST && x_sign != y_sign)
1218
		  || (rnd_mode == ROUNDING_TIES_AWAY && x_sign != y_sign
1219
		      && C2_lo != halfulp64)
1220
		  || (rnd_mode == ROUNDING_DOWN && !x_sign && y_sign)
1221
		  || (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
1222
		  || (rnd_mode == ROUNDING_TO_ZERO
1223
		      && x_sign != y_sign)) {
1224
		// the result is x - 1
1225
		// for RN n1 * n2 < 0; underflow not possible
1226
		C1_lo = C1_lo - 1;
1227
		if (C1_lo == 0xffffffffffffffffull)
1228
		  C1_hi--;
1229
		// check if we crossed into the lower decade
1230
		if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) {	// 10^33 - 1
1231
		  C1_hi = 0x0001ed09bead87c0ull;	// 10^34 - 1
1232
		  C1_lo = 0x378d8e63ffffffffull;
1233
		  x_exp = x_exp - EXP_P1;	// no underflow, because n1 >> n2
1234
		}
1235
	      } else
1236
		if ((rnd_mode == ROUNDING_TO_NEAREST
1237
		     && x_sign == y_sign)
1238
		    || (rnd_mode == ROUNDING_TIES_AWAY
1239
			&& x_sign == y_sign)
1240
		    || (rnd_mode == ROUNDING_DOWN && x_sign && y_sign)
1241
		    || (rnd_mode == ROUNDING_UP && !x_sign
1242
			&& !y_sign)) {
1243
		// the result is x + 1
1244
		// for RN x_sign = y_sign, i.e. n1*n2 > 0
1245
		C1_lo = C1_lo + 1;
1246
		if (C1_lo == 0) {	// rounding overflow in the low 64 bits
1247
		  C1_hi = C1_hi + 1;
1248
		}
1249
		if (C1_hi == 0x0001ed09bead87c0ull
1250
		    && C1_lo == 0x378d8e6400000000ull) {
1251
		  // C1 = 10^34 => rounding overflow
1252
		  C1_hi = 0x0000314dc6448d93ull;
1253
		  C1_lo = 0x38c15b0a00000000ull;	// 10^33
1254
		  x_exp = x_exp + EXP_P1;
1255
		  if (x_exp == EXP_MAX_P1) {	// overflow
1256
		    C1_hi = 0x7800000000000000ull;	// +inf
1257
		    C1_lo = 0x0ull;
1258
		    x_exp = 0;	// x_sign is preserved
1259
		    // set the overflow flag
1260
		    *pfpsf |= OVERFLOW_EXCEPTION;
1261
		  }
1262
		}
1263
	      } else {
1264
		;	// the result is x
1265
	      }
1266
	      // set the inexact flag
1267
	      *pfpsf |= INEXACT_EXCEPTION;
1268
	      // assemble the result
1269
	      res.w[1] = x_sign | x_exp | C1_hi;
1270
	      res.w[0] = C1_lo;
1271
	    }
1272
	  } else {	// if q2 >= 20 then 5*10^(q2-1) and C2 (the latter in
1273
	    // most cases) fit only in more than 64 bits
1274
	    halfulp128 = midpoint128[q2 - 20];	// 5 * 10^(q2-1)
1275
	    if ((C2_hi < halfulp128.w[1])
1276
		|| (C2_hi == halfulp128.w[1]
1277
		    && C2_lo < halfulp128.w[0])) {
1278
	      // n2 < 1/2 ulp (n1)
1279
	      // the result is the operand with the larger magnitude,
1280
	      // possibly scaled up by 10^(P34-q1)
1281
	      // an overflow cannot occur in this case (rounding to nearest)
1282
	      if (q1 < P34) {	// scale C1 up by 10^(P34-q1)
1283
		// Note: because delta = P34 it is certain that
1284
		//     x_exp - ((UINT64)scale << 49) will stay above e_min
1285
		scale = P34 - q1;
1286
		if (q1 <= 19) {	// C1 fits in 64 bits
1287
		  // 1 <= q1 <= 19 => 15 <= scale <= 33
1288
		  if (scale <= 19) {	// 10^scale fits in 64 bits
1289
		    __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
1290
		  } else {	// if 20 <= scale <= 33
1291
		    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
1292
		    // (C1 * 10^(scale-19)) fits in 64 bits
1293
		    C1_lo = C1_lo * ten2k64[scale - 19];
1294
		    __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
1295
		  }
1296
		} else {	//if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
1297
		  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
1298
		  C1.w[1] = C1_hi;
1299
		  C1.w[0] = C1_lo;
1300
		  // C1 = ten2k64[P34 - q1] * C1
1301
		  __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
1302
		}
1303
		C1_hi = C1.w[1];
1304
		C1_lo = C1.w[0];
1305
		x_exp = x_exp - ((UINT64) scale << 49);
1306
	      }
1307
	      if (rnd_mode != ROUNDING_TO_NEAREST) {
1308
		if ((rnd_mode == ROUNDING_DOWN && x_sign && y_sign) ||
1309
		    (rnd_mode == ROUNDING_UP && !x_sign && !y_sign)) {
1310
		  // add 1 ulp and then check for overflow
1311
		  C1_lo = C1_lo + 1;
1312
		  if (C1_lo == 0) {	// rounding overflow in the low 64 bits
1313
		    C1_hi = C1_hi + 1;
1314
		  }
1315
		  if (C1_hi == 0x0001ed09bead87c0ull
1316
		      && C1_lo == 0x378d8e6400000000ull) {
1317
		    // C1 = 10^34 => rounding overflow
1318
		    C1_hi = 0x0000314dc6448d93ull;
1319
		    C1_lo = 0x38c15b0a00000000ull;	// 10^33
1320
		    x_exp = x_exp + EXP_P1;
1321
		    if (x_exp == EXP_MAX_P1) {	// overflow
1322
		      C1_hi = 0x7800000000000000ull;	// +inf
1323
		      C1_lo = 0x0ull;
1324
		      x_exp = 0;	// x_sign is preserved
1325
		      // set overflow flag (the inexact flag was set too)
1326
		      *pfpsf |= OVERFLOW_EXCEPTION;
1327
		    }
1328
		  }
1329
		} else
1330
		  if ((rnd_mode == ROUNDING_DOWN && !x_sign && y_sign)
1331
		      || (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
1332
		      || (rnd_mode == ROUNDING_TO_ZERO
1333
			  && x_sign != y_sign)) {
1334
		  // subtract 1 ulp from C1
1335
		  // Note: because delta >= P34 + 1 the result cannot be zero
1336
		  C1_lo = C1_lo - 1;
1337
		  if (C1_lo == 0xffffffffffffffffull)
1338
		    C1_hi = C1_hi - 1;
1339
		  // if the coefficient is 10^33-1 then make it 10^34-1 and
1340
		  // decrease the exponent by 1 (because delta >= P34 + 1 the
1341
		  // exponent will not become less than e_min)
1342
		  // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
1343
		  // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
1344
		  if (C1_hi == 0x0000314dc6448d93ull
1345
		      && C1_lo == 0x38c15b09ffffffffull) {
1346
		    // make C1 = 10^34  - 1
1347
		    C1_hi = 0x0001ed09bead87c0ull;
1348
		    C1_lo = 0x378d8e63ffffffffull;
1349
		    x_exp = x_exp - EXP_P1;
1350
		  }
1351
		} else {
1352
		  ;	// the result is already correct
1353
		}
1354
	      }
1355
	      // set the inexact flag
1356
	      *pfpsf |= INEXACT_EXCEPTION;
1357
	      // assemble the result
1358
	      res.w[1] = x_sign | x_exp | C1_hi;
1359
	      res.w[0] = C1_lo;
1360
	    } else if ((C2_hi == halfulp128.w[1]
1361
			&& C2_lo == halfulp128.w[0])
1362
		       && (q1 < P34 || ((C1_lo & 0x1) == 0))) {
1363
	      // midpoint & lsb in C1 is 0
1364
	      // n2 = 1/2 ulp (n1) and C1 is even
1365
	      // the result is the operand with the larger magnitude,
1366
	      // possibly scaled up by 10^(P34-q1)
1367
	      // an overflow cannot occur in this case (rounding to nearest)
1368
	      if (q1 < P34) {	// scale C1 up by 10^(P34-q1)
1369
		// Note: because delta = P34 it is certain that
1370
		//     x_exp - ((UINT64)scale << 49) will stay above e_min
1371
		scale = P34 - q1;
1372
		if (q1 <= 19) {	// C1 fits in 64 bits
1373
		  // 1 <= q1 <= 19 => 15 <= scale <= 33
1374
		  if (scale <= 19) {	// 10^scale fits in 64 bits
1375
		    __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
1376
		  } else {	// if 20 <= scale <= 33
1377
		    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
1378
		    // (C1 * 10^(scale-19)) fits in 64 bits
1379
		    C1_lo = C1_lo * ten2k64[scale - 19];
1380
		    __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
1381
		  }
1382
		} else {	//if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
1383
		  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
1384
		  C1.w[1] = C1_hi;
1385
		  C1.w[0] = C1_lo;
1386
		  // C1 = ten2k64[P34 - q1] * C1
1387
		  __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
1388
		}
1389
		x_exp = x_exp - ((UINT64) scale << 49);
1390
		C1_hi = C1.w[1];
1391
		C1_lo = C1.w[0];
1392
	      }
1393
	      if (rnd_mode != ROUNDING_TO_NEAREST) {
1394
		if ((rnd_mode == ROUNDING_TIES_AWAY && x_sign == y_sign)
1395
		    || (rnd_mode == ROUNDING_UP && !y_sign)) {
1396
		  // add 1 ulp and then check for overflow
1397
		  C1_lo = C1_lo + 1;
1398
		  if (C1_lo == 0) {	// rounding overflow in the low 64 bits
1399
		    C1_hi = C1_hi + 1;
1400
		  }
1401
		  if (C1_hi == 0x0001ed09bead87c0ull
1402
		      && C1_lo == 0x378d8e6400000000ull) {
1403
		    // C1 = 10^34 => rounding overflow
1404
		    C1_hi = 0x0000314dc6448d93ull;
1405
		    C1_lo = 0x38c15b0a00000000ull;	// 10^33
1406
		    x_exp = x_exp + EXP_P1;
1407
		    if (x_exp == EXP_MAX_P1) {	// overflow
1408
		      C1_hi = 0x7800000000000000ull;	// +inf
1409
		      C1_lo = 0x0ull;
1410
		      x_exp = 0;	// x_sign is preserved
1411
		      // set overflow flag (the inexact flag was set too)
1412
		      *pfpsf |= OVERFLOW_EXCEPTION;
1413
		    }
1414
		  }
1415
		} else if ((rnd_mode == ROUNDING_DOWN && y_sign)
1416
			   || (rnd_mode == ROUNDING_TO_ZERO
1417
			       && x_sign != y_sign)) {
1418
		  // subtract 1 ulp from C1
1419
		  // Note: because delta >= P34 + 1 the result cannot be zero
1420
		  C1_lo = C1_lo - 1;
1421
		  if (C1_lo == 0xffffffffffffffffull)
1422
		    C1_hi = C1_hi - 1;
1423
		  // if the coefficient is 10^33 - 1 then make it 10^34 - 1
1424
		  // and decrease the exponent by 1 (because delta >= P34 + 1
1425
		  // the exponent will not become less than e_min)
1426
		  // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
1427
		  // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
1428
		  if (C1_hi == 0x0000314dc6448d93ull
1429
		      && C1_lo == 0x38c15b09ffffffffull) {
1430
		    // make C1 = 10^34  - 1
1431
		    C1_hi = 0x0001ed09bead87c0ull;
1432
		    C1_lo = 0x378d8e63ffffffffull;
1433
		    x_exp = x_exp - EXP_P1;
1434
		  }
1435
		} else {
1436
		  ;	// the result is already correct
1437
		}
1438
	      }
1439
	      // set the inexact flag
1440
	      *pfpsf |= INEXACT_EXCEPTION;
1441
	      // assemble the result
1442
	      res.w[1] = x_sign | x_exp | C1_hi;
1443
	      res.w[0] = C1_lo;
1444
	    } else {	// if C2 > halfulp128 ||
1445
	      // (C2 == halfulp128 && q1 == P34 && ((C1 & 0x1) == 1)), i.e.
1446
	      // 1/2 ulp(n1) < n2 < 1 ulp(n1) or n2 = 1/2 ulp(n1) and C1 odd
1447
	      // res = x+1 ulp if n1*n2 > 0 and res = x-1 ulp if n1*n2 < 0
1448
	      if (q1 < P34) {	// then 1 ulp = 10^(e1+q1-P34) < 10^e1
1449
		// Note: if (q1 == P34) then 1 ulp = 10^(e1+q1-P34) = 10^e1
1450
		// because q1 < P34 we must first replace C1 by C1*10^(P34-q1),
1451
		// and must decrease the exponent by (P34-q1) (it will still be
1452
		// at least e_min)
1453
		scale = P34 - q1;
1454
		if (q1 <= 19) {	// C1 fits in 64 bits
1455
		  // 1 <= q1 <= 19 => 15 <= scale <= 33
1456
		  if (scale <= 19) {	// 10^scale fits in 64 bits
1457
		    __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
1458
		  } else {	// if 20 <= scale <= 33
1459
		    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
1460
		    // (C1 * 10^(scale-19)) fits in 64 bits
1461
		    C1_lo = C1_lo * ten2k64[scale - 19];
1462
		    __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
1463
		  }
1464
		} else {	//if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
1465
		  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
1466
		  C1.w[1] = C1_hi;
1467
		  C1.w[0] = C1_lo;
1468
		  // C1 = ten2k64[P34 - q1] * C1
1469
		  __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
1470
		}
1471
		C1_hi = C1.w[1];
1472
		C1_lo = C1.w[0];
1473
		x_exp = x_exp - ((UINT64) scale << 49);
1474
	      }
1475
	      if ((rnd_mode == ROUNDING_TO_NEAREST && x_sign != y_sign)
1476
		  || (rnd_mode == ROUNDING_TIES_AWAY && x_sign != y_sign
1477
		      && (C2_hi != halfulp128.w[1]
1478
			  || C2_lo != halfulp128.w[0]))
1479
		  || (rnd_mode == ROUNDING_DOWN && !x_sign && y_sign)
1480
		  || (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
1481
		  || (rnd_mode == ROUNDING_TO_ZERO
1482
		      && x_sign != y_sign)) {
1483
		// the result is x - 1
1484
		// for RN n1 * n2 < 0; underflow not possible
1485
		C1_lo = C1_lo - 1;
1486
		if (C1_lo == 0xffffffffffffffffull)
1487
		  C1_hi--;
1488
		// check if we crossed into the lower decade
1489
		if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) {	// 10^33 - 1
1490
		  C1_hi = 0x0001ed09bead87c0ull;	// 10^34 - 1
1491
		  C1_lo = 0x378d8e63ffffffffull;
1492
		  x_exp = x_exp - EXP_P1;	// no underflow, because n1 >> n2
1493
		}
1494
	      } else
1495
		if ((rnd_mode == ROUNDING_TO_NEAREST
1496
		     && x_sign == y_sign)
1497
		    || (rnd_mode == ROUNDING_TIES_AWAY
1498
			&& x_sign == y_sign)
1499
		    || (rnd_mode == ROUNDING_DOWN && x_sign && y_sign)
1500
		    || (rnd_mode == ROUNDING_UP && !x_sign
1501
			&& !y_sign)) {
1502
		// the result is x + 1
1503
		// for RN x_sign = y_sign, i.e. n1*n2 > 0
1504
		C1_lo = C1_lo + 1;
1505
		if (C1_lo == 0) {	// rounding overflow in the low 64 bits
1506
		  C1_hi = C1_hi + 1;
1507
		}
1508
		if (C1_hi == 0x0001ed09bead87c0ull
1509
		    && C1_lo == 0x378d8e6400000000ull) {
1510
		  // C1 = 10^34 => rounding overflow
1511
		  C1_hi = 0x0000314dc6448d93ull;
1512
		  C1_lo = 0x38c15b0a00000000ull;	// 10^33
1513
		  x_exp = x_exp + EXP_P1;
1514
		  if (x_exp == EXP_MAX_P1) {	// overflow
1515
		    C1_hi = 0x7800000000000000ull;	// +inf
1516
		    C1_lo = 0x0ull;
1517
		    x_exp = 0;	// x_sign is preserved
1518
		    // set the overflow flag
1519
		    *pfpsf |= OVERFLOW_EXCEPTION;
1520
		  }
1521
		}
1522
	      } else {
1523
		;	// the result is x
1524
	      }
1525
	      // set the inexact flag
1526
	      *pfpsf |= INEXACT_EXCEPTION;
1527
	      // assemble the result
1528
	      res.w[1] = x_sign | x_exp | C1_hi;
1529
	      res.w[0] = C1_lo;
1530
	    }
1531
	  }	// end q1 >= 20
1532
	  // end case where C1 != 10^(q1-1)
1533
	} else {	// C1 = 10^(q1-1) and x_sign != y_sign
1534
	  // instead of C' = (C1 * 10^(e1-e2) + C2)rnd,P34
1535
	  // calculate C' = C1 * 10^(e1-e2-x1) + (C2 * 10^(-x1))rnd,P34
1536
	  // where x1 = q2 - 1, 0 <= x1 <= P34 - 1
1537
	  // Because C1 = 10^(q1-1) and x_sign != y_sign, C' will have P34
1538
	  // digits and n = C' * 10^(e2+x1)
1539
	  // If the result has P34+1 digits, redo the steps above with x1+1
1540
	  // If the result has P34-1 digits or less, redo the steps above with
1541
	  // x1-1 but only if initially x1 >= 1
1542
	  // NOTE: these two steps can be improved, e.g we could guess if
1543
	  // P34+1 or P34-1 digits will be obtained by adding/subtracting
1544
	  // just the top 64 bits of the two operands
1545
	  // The result cannot be zero, and it cannot overflow
1546
	  x1 = q2 - 1;	// 0 <= x1 <= P34-1
1547
	  // Calculate C1 * 10^(e1-e2-x1) where 1 <= e1-e2-x1 <= P34
1548
	  // scale = (int)(e1 >> 49) - (int)(e2 >> 49) - x1; 0 <= scale <= P34-1
1549
	  scale = P34 - q1 + 1;	// scale=e1-e2-x1 = P34+1-q1; 1<=scale<=P34
1550
	  // either C1 or 10^(e1-e2-x1) may not fit is 64 bits,
1551
	  // but their product fits with certainty in 128 bits
1552
	  if (scale >= 20) {	//10^(e1-e2-x1) doesn't fit in 64 bits, but C1 does
1553
	    __mul_128x64_to_128 (C1, C1_lo, ten2k128[scale - 20]);
1554
	  } else {	// if (scale >= 1
1555
	    // if 1 <= scale <= 19 then 10^(e1-e2-x1) fits in 64 bits
1556
	    if (q1 <= 19) {	// C1 fits in 64 bits
1557
	      __mul_64x64_to_128MACH (C1, C1_lo, ten2k64[scale]);
1558
	    } else {	// q1 >= 20
1559
	      C1.w[1] = C1_hi;
1560
	      C1.w[0] = C1_lo;
1561
	      __mul_128x64_to_128 (C1, ten2k64[scale], C1);
1562
	    }
1563
	  }
1564
	  tmp64 = C1.w[0];	// C1.w[1], C1.w[0] contains C1 * 10^(e1-e2-x1)
1565
 
1566
	  // now round C2 to q2-x1 = 1 decimal digit
1567
	  // C2' = C2 + 1/2 * 10^x1 = C2 + 5 * 10^(x1-1)
1568
	  ind = x1 - 1;	// -1 <= ind <= P34 - 2
1569
	  if (ind >= 0) {	// if (x1 >= 1)
1570
	    C2.w[0] = C2_lo;
1571
	    C2.w[1] = C2_hi;
1572
	    if (ind <= 18) {
1573
	      C2.w[0] = C2.w[0] + midpoint64[ind];
1574
	      if (C2.w[0] < C2_lo)
1575
		C2.w[1]++;
1576
	    } else {	// 19 <= ind <= 32
1577
	      C2.w[0] = C2.w[0] + midpoint128[ind - 19].w[0];
1578
	      C2.w[1] = C2.w[1] + midpoint128[ind - 19].w[1];
1579
	      if (C2.w[0] < C2_lo)
1580
		C2.w[1]++;
1581
	    }
1582
	    // the approximation of 10^(-x1) was rounded up to 118 bits
1583
	    __mul_128x128_to_256 (R256, C2, ten2mk128[ind]);	// R256 = C2*, f2*
1584
	    // calculate C2* and f2*
1585
	    // C2* is actually floor(C2*) in this case
1586
	    // C2* and f2* need shifting and masking, as shown by
1587
	    // shiftright128[] and maskhigh128[]
1588
	    // the top Ex bits of 10^(-x1) are T* = ten2mk128trunc[ind], e.g.
1589
	    // if x1=1, T*=ten2mk128trunc[0]=0x19999999999999999999999999999999
1590
	    // if (0 < f2* < 10^(-x1)) then
1591
	    //   if floor(C1+C2*) is even then C2* = floor(C2*) - logical right
1592
	    //       shift; C2* has p decimal digits, correct by Prop. 1)
1593
	    //   else if floor(C1+C2*) is odd C2* = floor(C2*)-1 (logical right
1594
	    //       shift; C2* has p decimal digits, correct by Pr. 1)
1595
	    // else
1596
	    //   C2* = floor(C2*) (logical right shift; C has p decimal digits,
1597
	    //       correct by Property 1)
1598
	    // n = C2* * 10^(e2+x1)
1599
 
1600
	    if (ind <= 2) {
1601
	      highf2star.w[1] = 0x0;
1602
	      highf2star.w[0] = 0x0;	// low f2* ok
1603
	    } else if (ind <= 21) {
1604
	      highf2star.w[1] = 0x0;
1605
	      highf2star.w[0] = R256.w[2] & maskhigh128[ind];	// low f2* ok
1606
	    } else {
1607
	      highf2star.w[1] = R256.w[3] & maskhigh128[ind];
1608
	      highf2star.w[0] = R256.w[2];	// low f2* is ok
1609
	    }
1610
	    // shift right C2* by Ex-128 = shiftright128[ind]
1611
	    if (ind >= 3) {
1612
	      shift = shiftright128[ind];
1613
	      if (shift < 64) {	// 3 <= shift <= 63
1614
		R256.w[2] =
1615
		  (R256.w[2] >> shift) | (R256.w[3] << (64 - shift));
1616
		R256.w[3] = (R256.w[3] >> shift);
1617
	      } else {	// 66 <= shift <= 102
1618
		R256.w[2] = (R256.w[3] >> (shift - 64));
1619
		R256.w[3] = 0x0ULL;
1620
	      }
1621
	    }
1622
	    // redundant
1623
	    is_inexact_lt_midpoint = 0;
1624
	    is_inexact_gt_midpoint = 0;
1625
	    is_midpoint_lt_even = 0;
1626
	    is_midpoint_gt_even = 0;
1627
	    // determine inexactness of the rounding of C2*
1628
	    // (cannot be followed by a second rounding)
1629
	    // if (0 < f2* - 1/2 < 10^(-x1)) then
1630
	    //   the result is exact
1631
	    // else (if f2* - 1/2 > T* then)
1632
	    //   the result of is inexact
1633
	    if (ind <= 2) {
1634
	      if (R256.w[1] > 0x8000000000000000ull ||
1635
		  (R256.w[1] == 0x8000000000000000ull
1636
		   && R256.w[0] > 0x0ull)) {
1637
		// f2* > 1/2 and the result may be exact
1638
		tmp64A = R256.w[1] - 0x8000000000000000ull;	// f* - 1/2
1639
		if ((tmp64A > ten2mk128trunc[ind].w[1]
1640
		     || (tmp64A == ten2mk128trunc[ind].w[1]
1641
			 && R256.w[0] >= ten2mk128trunc[ind].w[0]))) {
1642
		  // set the inexact flag
1643
		  *pfpsf |= INEXACT_EXCEPTION;
1644
		  // this rounding is applied to C2 only!
1645
		  // x_sign != y_sign
1646
		  is_inexact_gt_midpoint = 1;
1647
		}	// else the result is exact
1648
		// rounding down, unless a midpoint in [ODD, EVEN]
1649
	      } else {	// the result is inexact; f2* <= 1/2
1650
		// set the inexact flag
1651
		*pfpsf |= INEXACT_EXCEPTION;
1652
		// this rounding is applied to C2 only!
1653
		// x_sign != y_sign
1654
		is_inexact_lt_midpoint = 1;
1655
	      }
1656
	    } else if (ind <= 21) {	// if 3 <= ind <= 21
1657
	      if (highf2star.w[1] > 0x0 || (highf2star.w[1] == 0x0
1658
					    && highf2star.w[0] >
1659
					    onehalf128[ind])
1660
		  || (highf2star.w[1] == 0x0
1661
		      && highf2star.w[0] == onehalf128[ind]
1662
		      && (R256.w[1] || R256.w[0]))) {
1663
		// f2* > 1/2 and the result may be exact
1664
		// Calculate f2* - 1/2
1665
		tmp64A = highf2star.w[0] - onehalf128[ind];
1666
		tmp64B = highf2star.w[1];
1667
		if (tmp64A > highf2star.w[0])
1668
		  tmp64B--;
1669
		if (tmp64B || tmp64A
1670
		    || R256.w[1] > ten2mk128trunc[ind].w[1]
1671
		    || (R256.w[1] == ten2mk128trunc[ind].w[1]
1672
			&& R256.w[0] > ten2mk128trunc[ind].w[0])) {
1673
		  // set the inexact flag
1674
		  *pfpsf |= INEXACT_EXCEPTION;
1675
		  // this rounding is applied to C2 only!
1676
		  // x_sign != y_sign
1677
		  is_inexact_gt_midpoint = 1;
1678
		}	// else the result is exact
1679
	      } else {	// the result is inexact; f2* <= 1/2
1680
		// set the inexact flag
1681
		*pfpsf |= INEXACT_EXCEPTION;
1682
		// this rounding is applied to C2 only!
1683
		// x_sign != y_sign
1684
		is_inexact_lt_midpoint = 1;
1685
	      }
1686
	    } else {	// if 22 <= ind <= 33
1687
	      if (highf2star.w[1] > onehalf128[ind]
1688
		  || (highf2star.w[1] == onehalf128[ind]
1689
		      && (highf2star.w[0] || R256.w[1]
1690
			  || R256.w[0]))) {
1691
		// f2* > 1/2 and the result may be exact
1692
		// Calculate f2* - 1/2
1693
		// tmp64A = highf2star.w[0];
1694
		tmp64B = highf2star.w[1] - onehalf128[ind];
1695
		if (tmp64B || highf2star.w[0]
1696
		    || R256.w[1] > ten2mk128trunc[ind].w[1]
1697
		    || (R256.w[1] == ten2mk128trunc[ind].w[1]
1698
			&& R256.w[0] > ten2mk128trunc[ind].w[0])) {
1699
		  // set the inexact flag
1700
		  *pfpsf |= INEXACT_EXCEPTION;
1701
		  // this rounding is applied to C2 only!
1702
		  // x_sign != y_sign
1703
		  is_inexact_gt_midpoint = 1;
1704
		}	// else the result is exact
1705
	      } else {	// the result is inexact; f2* <= 1/2
1706
		// set the inexact flag
1707
		*pfpsf |= INEXACT_EXCEPTION;
1708
		// this rounding is applied to C2 only!
1709
		// x_sign != y_sign
1710
		is_inexact_lt_midpoint = 1;
1711
	      }
1712
	    }
1713
	    // check for midpoints after determining inexactness
1714
	    if ((R256.w[1] || R256.w[0]) && (highf2star.w[1] == 0)
1715
		&& (highf2star.w[0] == 0)
1716
		&& (R256.w[1] < ten2mk128trunc[ind].w[1]
1717
		    || (R256.w[1] == ten2mk128trunc[ind].w[1]
1718
			&& R256.w[0] <= ten2mk128trunc[ind].w[0]))) {
1719
	      // the result is a midpoint
1720
	      if ((tmp64 + R256.w[2]) & 0x01) {	// MP in [EVEN, ODD]
1721
		// if floor(C2*) is odd C = floor(C2*) - 1; the result may be 0
1722
		R256.w[2]--;
1723
		if (R256.w[2] == 0xffffffffffffffffull)
1724
		  R256.w[3]--;
1725
		// this rounding is applied to C2 only!
1726
		// x_sign != y_sign
1727
		is_midpoint_lt_even = 1;
1728
		is_inexact_lt_midpoint = 0;
1729
		is_inexact_gt_midpoint = 0;
1730
	      } else {
1731
		// else MP in [ODD, EVEN]
1732
		// this rounding is applied to C2 only!
1733
		// x_sign != y_sign
1734
		is_midpoint_gt_even = 1;
1735
		is_inexact_lt_midpoint = 0;
1736
		is_inexact_gt_midpoint = 0;
1737
	      }
1738
	    }
1739
	  } else {	// if (ind == -1) only when x1 = 0
1740
	    R256.w[2] = C2_lo;
1741
	    R256.w[3] = C2_hi;
1742
	    is_midpoint_lt_even = 0;
1743
	    is_midpoint_gt_even = 0;
1744
	    is_inexact_lt_midpoint = 0;
1745
	    is_inexact_gt_midpoint = 0;
1746
	  }
1747
	  // and now subtract C1 * 10^(e1-e2-x1) - (C2 * 10^(-x1))rnd,P34
1748
	  // because x_sign != y_sign this last operation is exact
1749
	  C1.w[0] = C1.w[0] - R256.w[2];
1750
	  C1.w[1] = C1.w[1] - R256.w[3];
1751
	  if (C1.w[0] > tmp64)
1752
	    C1.w[1]--;	// borrow
1753
	  if (C1.w[1] >= 0x8000000000000000ull) {	// negative coefficient!
1754
	    C1.w[0] = ~C1.w[0];
1755
	    C1.w[0]++;
1756
	    C1.w[1] = ~C1.w[1];
1757
	    if (C1.w[0] == 0x0)
1758
	      C1.w[1]++;
1759
	    tmp_sign = y_sign;	// the result will have the sign of y
1760
	  } else {
1761
	    tmp_sign = x_sign;
1762
	  }
1763
	  // the difference has exactly P34 digits
1764
	  x_sign = tmp_sign;
1765
	  if (x1 >= 1)
1766
	    y_exp = y_exp + ((UINT64) x1 << 49);
1767
	  C1_hi = C1.w[1];
1768
	  C1_lo = C1.w[0];
1769
	  // general correction from RN to RA, RM, RP, RZ; result uses y_exp
1770
	  if (rnd_mode != ROUNDING_TO_NEAREST) {
1771
	    if ((!x_sign
1772
		 && ((rnd_mode == ROUNDING_UP && is_inexact_lt_midpoint)
1773
		     ||
1774
		     ((rnd_mode == ROUNDING_TIES_AWAY
1775
		       || rnd_mode == ROUNDING_UP)
1776
		      && is_midpoint_gt_even))) || (x_sign
1777
						    &&
1778
						    ((rnd_mode ==
1779
						      ROUNDING_DOWN
1780
						      &&
1781
						      is_inexact_lt_midpoint)
1782
						     ||
1783
						     ((rnd_mode ==
1784
						       ROUNDING_TIES_AWAY
1785
						       || rnd_mode ==
1786
						       ROUNDING_DOWN)
1787
						      &&
1788
						      is_midpoint_gt_even))))
1789
	    {
1790
	      // C1 = C1 + 1
1791
	      C1_lo = C1_lo + 1;
1792
	      if (C1_lo == 0) {	// rounding overflow in the low 64 bits
1793
		C1_hi = C1_hi + 1;
1794
	      }
1795
	      if (C1_hi == 0x0001ed09bead87c0ull
1796
		  && C1_lo == 0x378d8e6400000000ull) {
1797
		// C1 = 10^34 => rounding overflow
1798
		C1_hi = 0x0000314dc6448d93ull;
1799
		C1_lo = 0x38c15b0a00000000ull;	// 10^33
1800
		y_exp = y_exp + EXP_P1;
1801
	      }
1802
	    } else if ((is_midpoint_lt_even || is_inexact_gt_midpoint)
1803
		       &&
1804
		       ((x_sign
1805
			 && (rnd_mode == ROUNDING_UP
1806
			     || rnd_mode == ROUNDING_TO_ZERO))
1807
			|| (!x_sign
1808
			    && (rnd_mode == ROUNDING_DOWN
1809
				|| rnd_mode == ROUNDING_TO_ZERO)))) {
1810
	      // C1 = C1 - 1
1811
	      C1_lo = C1_lo - 1;
1812
	      if (C1_lo == 0xffffffffffffffffull)
1813
		C1_hi--;
1814
	      // check if we crossed into the lower decade
1815
	      if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) {	// 10^33 - 1
1816
		C1_hi = 0x0001ed09bead87c0ull;	// 10^34 - 1
1817
		C1_lo = 0x378d8e63ffffffffull;
1818
		y_exp = y_exp - EXP_P1;
1819
		// no underflow, because delta + q2 >= P34 + 1
1820
	      }
1821
	    } else {
1822
	      ;	// exact, the result is already correct
1823
	    }
1824
	  }
1825
	  // assemble the result
1826
	  res.w[1] = x_sign | y_exp | C1_hi;
1827
	  res.w[0] = C1_lo;
1828
	}
1829
      }	// end delta = P34
1830
    } else {	// if (|delta| <= P34 - 1)
1831
      if (delta >= 0) {	// if (0 <= delta <= P34 - 1)
1832
	if (delta <= P34 - 1 - q2) {
1833
	  // calculate C' directly; the result is exact
1834
	  // in this case 1<=q1<=P34-1, 1<=q2<=P34-1 and 0 <= e1-e2 <= P34-2
1835
	  // The coefficient of the result is C1 * 10^(e1-e2) + C2 and the
1836
	  // exponent is e2; either C1 or 10^(e1-e2) may not fit is 64 bits,
1837
	  // but their product fits with certainty in 128 bits (actually in 113)
1838
	  scale = delta - q1 + q2;	// scale = (int)(e1 >> 49) - (int)(e2 >> 49)
1839
 
1840
	  if (scale >= 20) {	// 10^(e1-e2) does not fit in 64 bits, but C1 does
1841
	    __mul_128x64_to_128 (C1, C1_lo, ten2k128[scale - 20]);
1842
	    C1_hi = C1.w[1];
1843
	    C1_lo = C1.w[0];
1844
	  } else if (scale >= 1) {
1845
	    // if 1 <= scale <= 19 then 10^(e1-e2) fits in 64 bits
1846
	    if (q1 <= 19) {	// C1 fits in 64 bits
1847
	      __mul_64x64_to_128MACH (C1, C1_lo, ten2k64[scale]);
1848
	    } else {	// q1 >= 20
1849
	      C1.w[1] = C1_hi;
1850
	      C1.w[0] = C1_lo;
1851
	      __mul_128x64_to_128 (C1, ten2k64[scale], C1);
1852
	    }
1853
	    C1_hi = C1.w[1];
1854
	    C1_lo = C1.w[0];
1855
	  } else {	// if (scale == 0) C1 is unchanged
1856
	    C1.w[0] = C1_lo;	// C1.w[1] = C1_hi;
1857
	  }
1858
	  // now add C2
1859
	  if (x_sign == y_sign) {
1860
	    // the result cannot overflow
1861
	    C1_lo = C1_lo + C2_lo;
1862
	    C1_hi = C1_hi + C2_hi;
1863
	    if (C1_lo < C1.w[0])
1864
	      C1_hi++;
1865
	  } else {	// if x_sign != y_sign
1866
	    C1_lo = C1_lo - C2_lo;
1867
	    C1_hi = C1_hi - C2_hi;
1868
	    if (C1_lo > C1.w[0])
1869
	      C1_hi--;
1870
	    // the result can be zero, but it cannot overflow
1871
	    if (C1_lo == 0 && C1_hi == 0) {
1872
	      // assemble the result
1873
	      if (x_exp < y_exp)
1874
		res.w[1] = x_exp;
1875
	      else
1876
		res.w[1] = y_exp;
1877
	      res.w[0] = 0;
1878
	      if (rnd_mode == ROUNDING_DOWN) {
1879
		res.w[1] |= 0x8000000000000000ull;
1880
	      }
1881
	      BID_SWAP128 (res);
1882
	      BID_RETURN (res);
1883
	    }
1884
	    if (C1_hi >= 0x8000000000000000ull) {	// negative coefficient!
1885
	      C1_lo = ~C1_lo;
1886
	      C1_lo++;
1887
	      C1_hi = ~C1_hi;
1888
	      if (C1_lo == 0x0)
1889
		C1_hi++;
1890
	      x_sign = y_sign;	// the result will have the sign of y
1891
	    }
1892
	  }
1893
	  // assemble the result
1894
	  res.w[1] = x_sign | y_exp | C1_hi;
1895
	  res.w[0] = C1_lo;
1896
	} else if (delta == P34 - q2) {
1897
	  // calculate C' directly; the result may be inexact if it requires
1898
	  // P34+1 decimal digits; in this case the 'cutoff' point for addition
1899
	  // is at the position of the lsb of C2, so 0 <= e1-e2 <= P34-1
1900
	  // The coefficient of the result is C1 * 10^(e1-e2) + C2 and the
1901
	  // exponent is e2; either C1 or 10^(e1-e2) may not fit is 64 bits,
1902
	  // but their product fits with certainty in 128 bits (actually in 113)
1903
	  scale = delta - q1 + q2;	// scale = (int)(e1 >> 49) - (int)(e2 >> 49)
1904
	  if (scale >= 20) {	// 10^(e1-e2) does not fit in 64 bits, but C1 does
1905
	    __mul_128x64_to_128 (C1, C1_lo, ten2k128[scale - 20]);
1906
	  } else if (scale >= 1) {
1907
	    // if 1 <= scale <= 19 then 10^(e1-e2) fits in 64 bits
1908
	    if (q1 <= 19) {	// C1 fits in 64 bits
1909
	      __mul_64x64_to_128MACH (C1, C1_lo, ten2k64[scale]);
1910
	    } else {	// q1 >= 20
1911
	      C1.w[1] = C1_hi;
1912
	      C1.w[0] = C1_lo;
1913
	      __mul_128x64_to_128 (C1, ten2k64[scale], C1);
1914
	    }
1915
	  } else {	// if (scale == 0) C1 is unchanged
1916
	    C1.w[1] = C1_hi;
1917
	    C1.w[0] = C1_lo;	// only the low part is necessary
1918
	  }
1919
	  C1_hi = C1.w[1];
1920
	  C1_lo = C1.w[0];
1921
	  // now add C2
1922
	  if (x_sign == y_sign) {
1923
	    // the result can overflow!
1924
	    C1_lo = C1_lo + C2_lo;
1925
	    C1_hi = C1_hi + C2_hi;
1926
	    if (C1_lo < C1.w[0])
1927
	      C1_hi++;
1928
	    // test for overflow, possible only when C1 >= 10^34
1929
	    if (C1_hi > 0x0001ed09bead87c0ull || (C1_hi == 0x0001ed09bead87c0ull && C1_lo >= 0x378d8e6400000000ull)) {	// C1 >= 10^34
1930
	      // in this case q = P34 + 1 and x = q - P34 = 1, so multiply
1931
	      // C'' = C'+ 5 = C1 + 5 by k1 ~ 10^(-1) calculated for P34 + 1
1932
	      // decimal digits
1933
	      // Calculate C'' = C' + 1/2 * 10^x
1934
	      if (C1_lo >= 0xfffffffffffffffbull) {	// low half add has carry
1935
		C1_lo = C1_lo + 5;
1936
		C1_hi = C1_hi + 1;
1937
	      } else {
1938
		C1_lo = C1_lo + 5;
1939
	      }
1940
	      // the approximation of 10^(-1) was rounded up to 118 bits
1941
	      // 10^(-1) =~ 33333333333333333333333333333400 * 2^-129
1942
	      // 10^(-1) =~ 19999999999999999999999999999a00 * 2^-128
1943
	      C1.w[1] = C1_hi;
1944
	      C1.w[0] = C1_lo;	// C''
1945
	      ten2m1.w[1] = 0x1999999999999999ull;
1946
	      ten2m1.w[0] = 0x9999999999999a00ull;
1947
	      __mul_128x128_to_256 (P256, C1, ten2m1);	// P256 = C*, f*
1948
	      // C* is actually floor(C*) in this case
1949
	      // the top Ex = 128 bits of 10^(-1) are
1950
	      // T* = 0x00199999999999999999999999999999
1951
	      // if (0 < f* < 10^(-x)) then
1952
	      //   if floor(C*) is even then C = floor(C*) - logical right
1953
	      //       shift; C has p decimal digits, correct by Prop. 1)
1954
	      //   else if floor(C*) is odd C = floor(C*) - 1 (logical right
1955
	      //       shift; C has p decimal digits, correct by Pr. 1)
1956
	      // else
1957
	      //   C = floor(C*) (logical right shift; C has p decimal digits,
1958
	      //       correct by Property 1)
1959
	      // n = C * 10^(e2+x)
1960
	      if ((P256.w[1] || P256.w[0])
1961
		  && (P256.w[1] < 0x1999999999999999ull
1962
		      || (P256.w[1] == 0x1999999999999999ull
1963
			  && P256.w[0] <= 0x9999999999999999ull))) {
1964
		// the result is a midpoint
1965
		if (P256.w[2] & 0x01) {
1966
		  is_midpoint_gt_even = 1;
1967
		  // if floor(C*) is odd C = floor(C*) - 1; the result is not 0
1968
		  P256.w[2]--;
1969
		  if (P256.w[2] == 0xffffffffffffffffull)
1970
		    P256.w[3]--;
1971
		} else {
1972
		  is_midpoint_lt_even = 1;
1973
		}
1974
	      }
1975
	      // n = Cstar * 10^(e2+1)
1976
	      y_exp = y_exp + EXP_P1;
1977
	      // C* != 10^P because C* has P34 digits
1978
	      // check for overflow
1979
	      if (y_exp == EXP_MAX_P1
1980
		  && (rnd_mode == ROUNDING_TO_NEAREST
1981
		      || rnd_mode == ROUNDING_TIES_AWAY)) {
1982
		// overflow for RN
1983
		res.w[1] = x_sign | 0x7800000000000000ull;	// +/-inf
1984
		res.w[0] = 0x0ull;
1985
		// set the inexact flag
1986
		*pfpsf |= INEXACT_EXCEPTION;
1987
		// set the overflow flag
1988
		*pfpsf |= OVERFLOW_EXCEPTION;
1989
		BID_SWAP128 (res);
1990
		BID_RETURN (res);
1991
	      }
1992
	      // if (0 < f* - 1/2 < 10^(-x)) then
1993
	      //   the result of the addition is exact
1994
	      // else
1995
	      //   the result of the addition is inexact
1996
	      if (P256.w[1] > 0x8000000000000000ull || (P256.w[1] == 0x8000000000000000ull && P256.w[0] > 0x0ull)) {	// the result may be exact
1997
		tmp64 = P256.w[1] - 0x8000000000000000ull;	// f* - 1/2
1998
		if ((tmp64 > 0x1999999999999999ull
1999
		     || (tmp64 == 0x1999999999999999ull
2000
			 && P256.w[0] >= 0x9999999999999999ull))) {
2001
		  // set the inexact flag
2002
		  *pfpsf |= INEXACT_EXCEPTION;
2003
		  is_inexact = 1;
2004
		}	// else the result is exact
2005
	      } else {	// the result is inexact
2006
		// set the inexact flag
2007
		*pfpsf |= INEXACT_EXCEPTION;
2008
		is_inexact = 1;
2009
	      }
2010
	      C1_hi = P256.w[3];
2011
	      C1_lo = P256.w[2];
2012
	      if (!is_midpoint_gt_even && !is_midpoint_lt_even) {
2013
		is_inexact_lt_midpoint = is_inexact
2014
		  && (P256.w[1] & 0x8000000000000000ull);
2015
		is_inexact_gt_midpoint = is_inexact
2016
		  && !(P256.w[1] & 0x8000000000000000ull);
2017
	      }
2018
	      // general correction from RN to RA, RM, RP, RZ;
2019
	      // result uses y_exp
2020
	      if (rnd_mode != ROUNDING_TO_NEAREST) {
2021
		if ((!x_sign
2022
		     &&
2023
		     ((rnd_mode == ROUNDING_UP
2024
		       && is_inexact_lt_midpoint)
2025
		      ||
2026
		      ((rnd_mode == ROUNDING_TIES_AWAY
2027
			|| rnd_mode == ROUNDING_UP)
2028
		       && is_midpoint_gt_even))) || (x_sign
2029
						     &&
2030
						     ((rnd_mode ==
2031
						       ROUNDING_DOWN
2032
						       &&
2033
						       is_inexact_lt_midpoint)
2034
						      ||
2035
						      ((rnd_mode ==
2036
							ROUNDING_TIES_AWAY
2037
							|| rnd_mode ==
2038
							ROUNDING_DOWN)
2039
						       &&
2040
						       is_midpoint_gt_even))))
2041
		{
2042
		  // C1 = C1 + 1
2043
		  C1_lo = C1_lo + 1;
2044
		  if (C1_lo == 0) {	// rounding overflow in the low 64 bits
2045
		    C1_hi = C1_hi + 1;
2046
		  }
2047
		  if (C1_hi == 0x0001ed09bead87c0ull
2048
		      && C1_lo == 0x378d8e6400000000ull) {
2049
		    // C1 = 10^34 => rounding overflow
2050
		    C1_hi = 0x0000314dc6448d93ull;
2051
		    C1_lo = 0x38c15b0a00000000ull;	// 10^33
2052
		    y_exp = y_exp + EXP_P1;
2053
		  }
2054
		} else
2055
		  if ((is_midpoint_lt_even || is_inexact_gt_midpoint)
2056
		      &&
2057
		      ((x_sign
2058
			&& (rnd_mode == ROUNDING_UP
2059
			    || rnd_mode == ROUNDING_TO_ZERO))
2060
		       || (!x_sign
2061
			   && (rnd_mode == ROUNDING_DOWN
2062
			       || rnd_mode == ROUNDING_TO_ZERO)))) {
2063
		  // C1 = C1 - 1
2064
		  C1_lo = C1_lo - 1;
2065
		  if (C1_lo == 0xffffffffffffffffull)
2066
		    C1_hi--;
2067
		  // check if we crossed into the lower decade
2068
		  if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) {	// 10^33 - 1
2069
		    C1_hi = 0x0001ed09bead87c0ull;	// 10^34 - 1
2070
		    C1_lo = 0x378d8e63ffffffffull;
2071
		    y_exp = y_exp - EXP_P1;
2072
		    // no underflow, because delta + q2 >= P34 + 1
2073
		  }
2074
		} else {
2075
		  ;	// exact, the result is already correct
2076
		}
2077
		// in all cases check for overflow (RN and RA solved already)
2078
		if (y_exp == EXP_MAX_P1) {	// overflow
2079
		  if ((rnd_mode == ROUNDING_DOWN && x_sign) ||	// RM and res < 0
2080
		      (rnd_mode == ROUNDING_UP && !x_sign)) {	// RP and res > 0
2081
		    C1_hi = 0x7800000000000000ull;	// +inf
2082
		    C1_lo = 0x0ull;
2083
		  } else {	// RM and res > 0, RP and res < 0, or RZ
2084
		    C1_hi = 0x5fffed09bead87c0ull;
2085
		    C1_lo = 0x378d8e63ffffffffull;
2086
		  }
2087
		  y_exp = 0;	// x_sign is preserved
2088
		  // set the inexact flag (in case the exact addition was exact)
2089
		  *pfpsf |= INEXACT_EXCEPTION;
2090
		  // set the overflow flag
2091
		  *pfpsf |= OVERFLOW_EXCEPTION;
2092
		}
2093
	      }
2094
	    }	// else if (C1 < 10^34) then C1 is the coeff.; the result is exact
2095
	  } else {	// if x_sign != y_sign the result is exact
2096
	    C1_lo = C1_lo - C2_lo;
2097
	    C1_hi = C1_hi - C2_hi;
2098
	    if (C1_lo > C1.w[0])
2099
	      C1_hi--;
2100
	    // the result can be zero, but it cannot overflow
2101
	    if (C1_lo == 0 && C1_hi == 0) {
2102
	      // assemble the result
2103
	      if (x_exp < y_exp)
2104
		res.w[1] = x_exp;
2105
	      else
2106
		res.w[1] = y_exp;
2107
	      res.w[0] = 0;
2108
	      if (rnd_mode == ROUNDING_DOWN) {
2109
		res.w[1] |= 0x8000000000000000ull;
2110
	      }
2111
	      BID_SWAP128 (res);
2112
	      BID_RETURN (res);
2113
	    }
2114
	    if (C1_hi >= 0x8000000000000000ull) {	// negative coefficient!
2115
	      C1_lo = ~C1_lo;
2116
	      C1_lo++;
2117
	      C1_hi = ~C1_hi;
2118
	      if (C1_lo == 0x0)
2119
		C1_hi++;
2120
	      x_sign = y_sign;	// the result will have the sign of y
2121
	    }
2122
	  }
2123
	  // assemble the result
2124
	  res.w[1] = x_sign | y_exp | C1_hi;
2125
	  res.w[0] = C1_lo;
2126
	} else {	// if (delta >= P34 + 1 - q2)
2127
	  // instead of C' = (C1 * 10^(e1-e2) + C2)rnd,P34
2128
	  // calculate C' = C1 * 10^(e1-e2-x1) + (C2 * 10^(-x1))rnd,P34
2129
	  // where x1 = q1 + e1 - e2 - P34, 1 <= x1 <= P34 - 1
2130
	  // In most cases C' will have P34 digits, and n = C' * 10^(e2+x1)
2131
	  // If the result has P34+1 digits, redo the steps above with x1+1
2132
	  // If the result has P34-1 digits or less, redo the steps above with
2133
	  // x1-1 but only if initially x1 >= 1
2134
	  // NOTE: these two steps can be improved, e.g we could guess if
2135
	  // P34+1 or P34-1 digits will be obtained by adding/subtracting just
2136
	  // the top 64 bits of the two operands
2137
	  // The result cannot be zero, but it can overflow
2138
	  x1 = delta + q2 - P34;	// 1 <= x1 <= P34-1
2139
	roundC2:
2140
	  // Calculate C1 * 10^(e1-e2-x1) where 0 <= e1-e2-x1 <= P34 - 1
2141
	  // scale = (int)(e1 >> 49) - (int)(e2 >> 49) - x1; 0 <= scale <= P34-1
2142
	  scale = delta - q1 + q2 - x1;	// scale = e1 - e2 - x1 = P34 - q1
2143
	  // either C1 or 10^(e1-e2-x1) may not fit is 64 bits,
2144
	  // but their product fits with certainty in 128 bits (actually in 113)
2145
	  if (scale >= 20) {	//10^(e1-e2-x1) doesn't fit in 64 bits, but C1 does
2146
	    __mul_128x64_to_128 (C1, C1_lo, ten2k128[scale - 20]);
2147
	  } else if (scale >= 1) {
2148
	    // if 1 <= scale <= 19 then 10^(e1-e2-x1) fits in 64 bits
2149
	    if (q1 <= 19) {	// C1 fits in 64 bits
2150
	      __mul_64x64_to_128MACH (C1, C1_lo, ten2k64[scale]);
2151
	    } else {	// q1 >= 20
2152
	      C1.w[1] = C1_hi;
2153
	      C1.w[0] = C1_lo;
2154
	      __mul_128x64_to_128 (C1, ten2k64[scale], C1);
2155
	    }
2156
	  } else {	// if (scale == 0) C1 is unchanged
2157
	    C1.w[1] = C1_hi;
2158
	    C1.w[0] = C1_lo;
2159
	  }
2160
	  tmp64 = C1.w[0];	// C1.w[1], C1.w[0] contains C1 * 10^(e1-e2-x1)
2161
 
2162
	  // now round C2 to q2-x1 decimal digits, where 1<=x1<=q2-1<=P34-1
2163
	  // (but if we got here a second time after x1 = x1 - 1, then
2164
	  // x1 >= 0; note that for x1 = 0 C2 is unchanged)
2165
	  // C2' = C2 + 1/2 * 10^x1 = C2 + 5 * 10^(x1-1)
2166
	  ind = x1 - 1;	// 0 <= ind <= q2-2<=P34-2=32; but note that if x1 = 0
2167
	  // during a second pass, then ind = -1
2168
	  if (ind >= 0) {	// if (x1 >= 1)
2169
	    C2.w[0] = C2_lo;
2170
	    C2.w[1] = C2_hi;
2171
	    if (ind <= 18) {
2172
	      C2.w[0] = C2.w[0] + midpoint64[ind];
2173
	      if (C2.w[0] < C2_lo)
2174
		C2.w[1]++;
2175
	    } else {	// 19 <= ind <= 32
2176
	      C2.w[0] = C2.w[0] + midpoint128[ind - 19].w[0];
2177
	      C2.w[1] = C2.w[1] + midpoint128[ind - 19].w[1];
2178
	      if (C2.w[0] < C2_lo)
2179
		C2.w[1]++;
2180
	    }
2181
	    // the approximation of 10^(-x1) was rounded up to 118 bits
2182
	    __mul_128x128_to_256 (R256, C2, ten2mk128[ind]);	// R256 = C2*, f2*
2183
	    // calculate C2* and f2*
2184
	    // C2* is actually floor(C2*) in this case
2185
	    // C2* and f2* need shifting and masking, as shown by
2186
	    // shiftright128[] and maskhigh128[]
2187
	    // the top Ex bits of 10^(-x1) are T* = ten2mk128trunc[ind], e.g.
2188
	    // if x1=1, T*=ten2mk128trunc[0]=0x19999999999999999999999999999999
2189
	    // if (0 < f2* < 10^(-x1)) then
2190
	    //   if floor(C1+C2*) is even then C2* = floor(C2*) - logical right
2191
	    //       shift; C2* has p decimal digits, correct by Prop. 1)
2192
	    //   else if floor(C1+C2*) is odd C2* = floor(C2*)-1 (logical right
2193
	    //       shift; C2* has p decimal digits, correct by Pr. 1)
2194
	    // else
2195
	    //   C2* = floor(C2*) (logical right shift; C has p decimal digits,
2196
	    //       correct by Property 1)
2197
	    // n = C2* * 10^(e2+x1)
2198
 
2199
	    if (ind <= 2) {
2200
	      highf2star.w[1] = 0x0;
2201
	      highf2star.w[0] = 0x0;	// low f2* ok
2202
	    } else if (ind <= 21) {
2203
	      highf2star.w[1] = 0x0;
2204
	      highf2star.w[0] = R256.w[2] & maskhigh128[ind];	// low f2* ok
2205
	    } else {
2206
	      highf2star.w[1] = R256.w[3] & maskhigh128[ind];
2207
	      highf2star.w[0] = R256.w[2];	// low f2* is ok
2208
	    }
2209
	    // shift right C2* by Ex-128 = shiftright128[ind]
2210
	    if (ind >= 3) {
2211
	      shift = shiftright128[ind];
2212
	      if (shift < 64) {	// 3 <= shift <= 63
2213
		R256.w[2] =
2214
		  (R256.w[2] >> shift) | (R256.w[3] << (64 - shift));
2215
		R256.w[3] = (R256.w[3] >> shift);
2216
	      } else {	// 66 <= shift <= 102
2217
		R256.w[2] = (R256.w[3] >> (shift - 64));
2218
		R256.w[3] = 0x0ULL;
2219
	      }
2220
	    }
2221
	    if (second_pass) {
2222
	      is_inexact_lt_midpoint = 0;
2223
	      is_inexact_gt_midpoint = 0;
2224
	      is_midpoint_lt_even = 0;
2225
	      is_midpoint_gt_even = 0;
2226
	    }
2227
	    // determine inexactness of the rounding of C2* (this may be
2228
	    // followed by a second rounding only if we get P34+1
2229
	    // decimal digits)
2230
	    // if (0 < f2* - 1/2 < 10^(-x1)) then
2231
	    //   the result is exact
2232
	    // else (if f2* - 1/2 > T* then)
2233
	    //   the result of is inexact
2234
	    if (ind <= 2) {
2235
	      if (R256.w[1] > 0x8000000000000000ull ||
2236
		  (R256.w[1] == 0x8000000000000000ull
2237
		   && R256.w[0] > 0x0ull)) {
2238
		// f2* > 1/2 and the result may be exact
2239
		tmp64A = R256.w[1] - 0x8000000000000000ull;	// f* - 1/2
2240
		if ((tmp64A > ten2mk128trunc[ind].w[1]
2241
		     || (tmp64A == ten2mk128trunc[ind].w[1]
2242
			 && R256.w[0] >= ten2mk128trunc[ind].w[0]))) {
2243
		  // set the inexact flag
2244
		  // *pfpsf |= INEXACT_EXCEPTION;
2245
		  tmp_inexact = 1;	// may be set again during a second pass
2246
		  // this rounding is applied to C2 only!
2247
		  if (x_sign == y_sign)
2248
		    is_inexact_lt_midpoint = 1;
2249
		  else	// if (x_sign != y_sign)
2250
		    is_inexact_gt_midpoint = 1;
2251
		}	// else the result is exact
2252
		// rounding down, unless a midpoint in [ODD, EVEN]
2253
	      } else {	// the result is inexact; f2* <= 1/2
2254
		// set the inexact flag
2255
		// *pfpsf |= INEXACT_EXCEPTION;
2256
		tmp_inexact = 1;	// just in case we will round a second time
2257
		// rounding up, unless a midpoint in [EVEN, ODD]
2258
		// this rounding is applied to C2 only!
2259
		if (x_sign == y_sign)
2260
		  is_inexact_gt_midpoint = 1;
2261
		else	// if (x_sign != y_sign)
2262
		  is_inexact_lt_midpoint = 1;
2263
	      }
2264
	    } else if (ind <= 21) {	// if 3 <= ind <= 21
2265
	      if (highf2star.w[1] > 0x0 || (highf2star.w[1] == 0x0
2266
					    && highf2star.w[0] >
2267
					    onehalf128[ind])
2268
		  || (highf2star.w[1] == 0x0
2269
		      && highf2star.w[0] == onehalf128[ind]
2270
		      && (R256.w[1] || R256.w[0]))) {
2271
		// f2* > 1/2 and the result may be exact
2272
		// Calculate f2* - 1/2
2273
		tmp64A = highf2star.w[0] - onehalf128[ind];
2274
		tmp64B = highf2star.w[1];
2275
		if (tmp64A > highf2star.w[0])
2276
		  tmp64B--;
2277
		if (tmp64B || tmp64A
2278
		    || R256.w[1] > ten2mk128trunc[ind].w[1]
2279
		    || (R256.w[1] == ten2mk128trunc[ind].w[1]
2280
			&& R256.w[0] > ten2mk128trunc[ind].w[0])) {
2281
		  // set the inexact flag
2282
		  // *pfpsf |= INEXACT_EXCEPTION;
2283
		  tmp_inexact = 1;	// may be set again during a second pass
2284
		  // this rounding is applied to C2 only!
2285
		  if (x_sign == y_sign)
2286
		    is_inexact_lt_midpoint = 1;
2287
		  else	// if (x_sign != y_sign)
2288
		    is_inexact_gt_midpoint = 1;
2289
		}	// else the result is exact
2290
	      } else {	// the result is inexact; f2* <= 1/2
2291
		// set the inexact flag
2292
		// *pfpsf |= INEXACT_EXCEPTION;
2293
		tmp_inexact = 1;	// may be set again during a second pass
2294
		// rounding up, unless a midpoint in [EVEN, ODD]
2295
		// this rounding is applied to C2 only!
2296
		if (x_sign == y_sign)
2297
		  is_inexact_gt_midpoint = 1;
2298
		else	// if (x_sign != y_sign)
2299
		  is_inexact_lt_midpoint = 1;
2300
	      }
2301
	    } else {	// if 22 <= ind <= 33
2302
	      if (highf2star.w[1] > onehalf128[ind]
2303
		  || (highf2star.w[1] == onehalf128[ind]
2304
		      && (highf2star.w[0] || R256.w[1]
2305
			  || R256.w[0]))) {
2306
		// f2* > 1/2 and the result may be exact
2307
		// Calculate f2* - 1/2
2308
		// tmp64A = highf2star.w[0];
2309
		tmp64B = highf2star.w[1] - onehalf128[ind];
2310
		if (tmp64B || highf2star.w[0]
2311
		    || R256.w[1] > ten2mk128trunc[ind].w[1]
2312
		    || (R256.w[1] == ten2mk128trunc[ind].w[1]
2313
			&& R256.w[0] > ten2mk128trunc[ind].w[0])) {
2314
		  // set the inexact flag
2315
		  // *pfpsf |= INEXACT_EXCEPTION;
2316
		  tmp_inexact = 1;	// may be set again during a second pass
2317
		  // this rounding is applied to C2 only!
2318
		  if (x_sign == y_sign)
2319
		    is_inexact_lt_midpoint = 1;
2320
		  else	// if (x_sign != y_sign)
2321
		    is_inexact_gt_midpoint = 1;
2322
		}	// else the result is exact
2323
	      } else {	// the result is inexact; f2* <= 1/2
2324
		// set the inexact flag
2325
		// *pfpsf |= INEXACT_EXCEPTION;
2326
		tmp_inexact = 1;	// may be set again during a second pass
2327
		// rounding up, unless a midpoint in [EVEN, ODD]
2328
		// this rounding is applied to C2 only!
2329
		if (x_sign == y_sign)
2330
		  is_inexact_gt_midpoint = 1;
2331
		else	// if (x_sign != y_sign)
2332
		  is_inexact_lt_midpoint = 1;
2333
	      }
2334
	    }
2335
	    // check for midpoints
2336
	    if ((R256.w[1] || R256.w[0]) && (highf2star.w[1] == 0)
2337
		&& (highf2star.w[0] == 0)
2338
		&& (R256.w[1] < ten2mk128trunc[ind].w[1]
2339
		    || (R256.w[1] == ten2mk128trunc[ind].w[1]
2340
			&& R256.w[0] <= ten2mk128trunc[ind].w[0]))) {
2341
	      // the result is a midpoint
2342
	      if ((tmp64 + R256.w[2]) & 0x01) {	// MP in [EVEN, ODD]
2343
		// if floor(C2*) is odd C = floor(C2*) - 1; the result may be 0
2344
		R256.w[2]--;
2345
		if (R256.w[2] == 0xffffffffffffffffull)
2346
		  R256.w[3]--;
2347
		// this rounding is applied to C2 only!
2348
		if (x_sign == y_sign)
2349
		  is_midpoint_gt_even = 1;
2350
		else	// if (x_sign != y_sign)
2351
		  is_midpoint_lt_even = 1;
2352
		is_inexact_lt_midpoint = 0;
2353
		is_inexact_gt_midpoint = 0;
2354
	      } else {
2355
		// else MP in [ODD, EVEN]
2356
		// this rounding is applied to C2 only!
2357
		if (x_sign == y_sign)
2358
		  is_midpoint_lt_even = 1;
2359
		else	// if (x_sign != y_sign)
2360
		  is_midpoint_gt_even = 1;
2361
		is_inexact_lt_midpoint = 0;
2362
		is_inexact_gt_midpoint = 0;
2363
	      }
2364
	    }
2365
	    // end if (ind >= 0)
2366
	  } else {	// if (ind == -1); only during a 2nd pass, and when x1 = 0
2367
	    R256.w[2] = C2_lo;
2368
	    R256.w[3] = C2_hi;
2369
	    tmp_inexact = 0;
2370
	    // to correct a possible setting to 1 from 1st pass
2371
	    if (second_pass) {
2372
	      is_midpoint_lt_even = 0;
2373
	      is_midpoint_gt_even = 0;
2374
	      is_inexact_lt_midpoint = 0;
2375
	      is_inexact_gt_midpoint = 0;
2376
	    }
2377
	  }
2378
	  // and now add/subtract C1 * 10^(e1-e2-x1) +/- (C2 * 10^(-x1))rnd,P34
2379
	  if (x_sign == y_sign) {	// addition; could overflow
2380
	    // no second pass is possible this way (only for x_sign != y_sign)
2381
	    C1.w[0] = C1.w[0] + R256.w[2];
2382
	    C1.w[1] = C1.w[1] + R256.w[3];
2383
	    if (C1.w[0] < tmp64)
2384
	      C1.w[1]++;	// carry
2385
	    // if the sum has P34+1 digits, i.e. C1>=10^34 redo the calculation
2386
	    // with x1=x1+1
2387
	    if (C1.w[1] > 0x0001ed09bead87c0ull || (C1.w[1] == 0x0001ed09bead87c0ull && C1.w[0] >= 0x378d8e6400000000ull)) {	// C1 >= 10^34
2388
	      // chop off one more digit from the sum, but make sure there is
2389
	      // no double-rounding error (see table - double rounding logic)
2390
	      // now round C1 from P34+1 to P34 decimal digits
2391
	      // C1' = C1 + 1/2 * 10 = C1 + 5
2392
	      if (C1.w[0] >= 0xfffffffffffffffbull) {	// low half add has carry
2393
		C1.w[0] = C1.w[0] + 5;
2394
		C1.w[1] = C1.w[1] + 1;
2395
	      } else {
2396
		C1.w[0] = C1.w[0] + 5;
2397
	      }
2398
	      // the approximation of 10^(-1) was rounded up to 118 bits
2399
	      __mul_128x128_to_256 (Q256, C1, ten2mk128[0]);	// Q256 = C1*, f1*
2400
	      // C1* is actually floor(C1*) in this case
2401
	      // the top 128 bits of 10^(-1) are
2402
	      // T* = ten2mk128trunc[0]=0x19999999999999999999999999999999
2403
	      // if (0 < f1* < 10^(-1)) then
2404
	      //   if floor(C1*) is even then C1* = floor(C1*) - logical right
2405
	      //       shift; C1* has p decimal digits, correct by Prop. 1)
2406
	      //   else if floor(C1*) is odd C1* = floor(C1*) - 1 (logical right
2407
	      //       shift; C1* has p decimal digits, correct by Pr. 1)
2408
	      // else
2409
	      //   C1* = floor(C1*) (logical right shift; C has p decimal digits
2410
	      //       correct by Property 1)
2411
	      // n = C1* * 10^(e2+x1+1)
2412
	      if ((Q256.w[1] || Q256.w[0])
2413
		  && (Q256.w[1] < ten2mk128trunc[0].w[1]
2414
		      || (Q256.w[1] == ten2mk128trunc[0].w[1]
2415
			  && Q256.w[0] <= ten2mk128trunc[0].w[0]))) {
2416
		// the result is a midpoint
2417
		if (is_inexact_lt_midpoint) {	// for the 1st rounding
2418
		  is_inexact_gt_midpoint = 1;
2419
		  is_inexact_lt_midpoint = 0;
2420
		  is_midpoint_gt_even = 0;
2421
		  is_midpoint_lt_even = 0;
2422
		} else if (is_inexact_gt_midpoint) {	// for the 1st rounding
2423
		  Q256.w[2]--;
2424
		  if (Q256.w[2] == 0xffffffffffffffffull)
2425
		    Q256.w[3]--;
2426
		  is_inexact_gt_midpoint = 0;
2427
		  is_inexact_lt_midpoint = 1;
2428
		  is_midpoint_gt_even = 0;
2429
		  is_midpoint_lt_even = 0;
2430
		} else if (is_midpoint_gt_even) {	// for the 1st rounding
2431
		  // Note: cannot have is_midpoint_lt_even
2432
		  is_inexact_gt_midpoint = 0;
2433
		  is_inexact_lt_midpoint = 1;
2434
		  is_midpoint_gt_even = 0;
2435
		  is_midpoint_lt_even = 0;
2436
		} else {	// the first rounding must have been exact
2437
		  if (Q256.w[2] & 0x01) {	// MP in [EVEN, ODD]
2438
		    // the truncated result is correct
2439
		    Q256.w[2]--;
2440
		    if (Q256.w[2] == 0xffffffffffffffffull)
2441
		      Q256.w[3]--;
2442
		    is_inexact_gt_midpoint = 0;
2443
		    is_inexact_lt_midpoint = 0;
2444
		    is_midpoint_gt_even = 1;
2445
		    is_midpoint_lt_even = 0;
2446
		  } else {	// MP in [ODD, EVEN]
2447
		    is_inexact_gt_midpoint = 0;
2448
		    is_inexact_lt_midpoint = 0;
2449
		    is_midpoint_gt_even = 0;
2450
		    is_midpoint_lt_even = 1;
2451
		  }
2452
		}
2453
		tmp_inexact = 1;	// in all cases
2454
	      } else {	// the result is not a midpoint
2455
		// determine inexactness of the rounding of C1 (the sum C1+C2*)
2456
		// if (0 < f1* - 1/2 < 10^(-1)) then
2457
		//   the result is exact
2458
		// else (if f1* - 1/2 > T* then)
2459
		//   the result of is inexact
2460
		// ind = 0
2461
		if (Q256.w[1] > 0x8000000000000000ull
2462
		    || (Q256.w[1] == 0x8000000000000000ull
2463
			&& Q256.w[0] > 0x0ull)) {
2464
		  // f1* > 1/2 and the result may be exact
2465
		  Q256.w[1] = Q256.w[1] - 0x8000000000000000ull;	// f1* - 1/2
2466
		  if ((Q256.w[1] > ten2mk128trunc[0].w[1]
2467
		       || (Q256.w[1] == ten2mk128trunc[0].w[1]
2468
			   && Q256.w[0] > ten2mk128trunc[0].w[0]))) {
2469
		    is_inexact_gt_midpoint = 0;
2470
		    is_inexact_lt_midpoint = 1;
2471
		    is_midpoint_gt_even = 0;
2472
		    is_midpoint_lt_even = 0;
2473
		    // set the inexact flag
2474
		    tmp_inexact = 1;
2475
		    // *pfpsf |= INEXACT_EXCEPTION;
2476
		  } else {	// else the result is exact for the 2nd rounding
2477
		    if (tmp_inexact) {	// if the previous rounding was inexact
2478
		      if (is_midpoint_lt_even) {
2479
			is_inexact_gt_midpoint = 1;
2480
			is_midpoint_lt_even = 0;
2481
		      } else if (is_midpoint_gt_even) {
2482
			is_inexact_lt_midpoint = 1;
2483
			is_midpoint_gt_even = 0;
2484
		      } else {
2485
			;	// no change
2486
		      }
2487
		    }
2488
		  }
2489
		  // rounding down, unless a midpoint in [ODD, EVEN]
2490
		} else {	// the result is inexact; f1* <= 1/2
2491
		  is_inexact_gt_midpoint = 1;
2492
		  is_inexact_lt_midpoint = 0;
2493
		  is_midpoint_gt_even = 0;
2494
		  is_midpoint_lt_even = 0;
2495
		  // set the inexact flag
2496
		  tmp_inexact = 1;
2497
		  // *pfpsf |= INEXACT_EXCEPTION;
2498
		}
2499
	      }	// end 'the result is not a midpoint'
2500
	      // n = C1 * 10^(e2+x1)
2501
	      C1.w[1] = Q256.w[3];
2502
	      C1.w[0] = Q256.w[2];
2503
	      y_exp = y_exp + ((UINT64) (x1 + 1) << 49);
2504
	    } else {	// C1 < 10^34
2505
	      // C1.w[1] and C1.w[0] already set
2506
	      // n = C1 * 10^(e2+x1)
2507
	      y_exp = y_exp + ((UINT64) x1 << 49);
2508
	    }
2509
	    // check for overflow
2510
	    if (y_exp == EXP_MAX_P1
2511
		&& (rnd_mode == ROUNDING_TO_NEAREST
2512
		    || rnd_mode == ROUNDING_TIES_AWAY)) {
2513
	      res.w[1] = 0x7800000000000000ull | x_sign;	// +/-inf
2514
	      res.w[0] = 0x0ull;
2515
	      // set the inexact flag
2516
	      *pfpsf |= INEXACT_EXCEPTION;
2517
	      // set the overflow flag
2518
	      *pfpsf |= OVERFLOW_EXCEPTION;
2519
	      BID_SWAP128 (res);
2520
	      BID_RETURN (res);
2521
	    }	// else no overflow
2522
	  } else {	// if x_sign != y_sign the result of this subtract. is exact
2523
	    C1.w[0] = C1.w[0] - R256.w[2];
2524
	    C1.w[1] = C1.w[1] - R256.w[3];
2525
	    if (C1.w[0] > tmp64)
2526
	      C1.w[1]--;	// borrow
2527
	    if (C1.w[1] >= 0x8000000000000000ull) {	// negative coefficient!
2528
	      C1.w[0] = ~C1.w[0];
2529
	      C1.w[0]++;
2530
	      C1.w[1] = ~C1.w[1];
2531
	      if (C1.w[0] == 0x0)
2532
		C1.w[1]++;
2533
	      tmp_sign = y_sign;
2534
	      // the result will have the sign of y if last rnd
2535
	    } else {
2536
	      tmp_sign = x_sign;
2537
	    }
2538
	    // if the difference has P34-1 digits or less, i.e. C1 < 10^33 then
2539
	    //   redo the calculation with x1=x1-1;
2540
	    // redo the calculation also if C1 = 10^33 and
2541
	    //   (is_inexact_gt_midpoint or is_midpoint_lt_even);
2542
	    //   (the last part should have really been
2543
	    //   (is_inexact_lt_midpoint or is_midpoint_gt_even) from
2544
	    //    the rounding of C2, but the position flags have been reversed)
2545
	    // 10^33 = 0x0000314dc6448d93 0x38c15b0a00000000
2546
	    if ((C1.w[1] < 0x0000314dc6448d93ull || (C1.w[1] == 0x0000314dc6448d93ull && C1.w[0] < 0x38c15b0a00000000ull)) || (C1.w[1] == 0x0000314dc6448d93ull && C1.w[0] == 0x38c15b0a00000000ull && (is_inexact_gt_midpoint || is_midpoint_lt_even))) {	// C1=10^33
2547
	      x1 = x1 - 1;	// x1 >= 0
2548
	      if (x1 >= 0) {
2549
		// clear position flags and tmp_inexact
2550
		is_midpoint_lt_even = 0;
2551
		is_midpoint_gt_even = 0;
2552
		is_inexact_lt_midpoint = 0;
2553
		is_inexact_gt_midpoint = 0;
2554
		tmp_inexact = 0;
2555
		second_pass = 1;
2556
		goto roundC2;	// else result has less than P34 digits
2557
	      }
2558
	    }
2559
	    // if the coefficient of the result is 10^34 it means that this
2560
	    // must be the second pass, and we are done
2561
	    if (C1.w[1] == 0x0001ed09bead87c0ull && C1.w[0] == 0x378d8e6400000000ull) {	// if  C1 = 10^34
2562
	      C1.w[1] = 0x0000314dc6448d93ull;	// C1 = 10^33
2563
	      C1.w[0] = 0x38c15b0a00000000ull;
2564
	      y_exp = y_exp + ((UINT64) 1 << 49);
2565
	    }
2566
	    x_sign = tmp_sign;
2567
	    if (x1 >= 1)
2568
	      y_exp = y_exp + ((UINT64) x1 << 49);
2569
	    // x1 = -1 is possible at the end of a second pass when the
2570
	    // first pass started with x1 = 1
2571
	  }
2572
	  C1_hi = C1.w[1];
2573
	  C1_lo = C1.w[0];
2574
	  // general correction from RN to RA, RM, RP, RZ; result uses y_exp
2575
	  if (rnd_mode != ROUNDING_TO_NEAREST) {
2576
	    if ((!x_sign
2577
		 && ((rnd_mode == ROUNDING_UP && is_inexact_lt_midpoint)
2578
		     ||
2579
		     ((rnd_mode == ROUNDING_TIES_AWAY
2580
		       || rnd_mode == ROUNDING_UP)
2581
		      && is_midpoint_gt_even))) || (x_sign
2582
						    &&
2583
						    ((rnd_mode ==
2584
						      ROUNDING_DOWN
2585
						      &&
2586
						      is_inexact_lt_midpoint)
2587
						     ||
2588
						     ((rnd_mode ==
2589
						       ROUNDING_TIES_AWAY
2590
						       || rnd_mode ==
2591
						       ROUNDING_DOWN)
2592
						      &&
2593
						      is_midpoint_gt_even))))
2594
	    {
2595
	      // C1 = C1 + 1
2596
	      C1_lo = C1_lo + 1;
2597
	      if (C1_lo == 0) {	// rounding overflow in the low 64 bits
2598
		C1_hi = C1_hi + 1;
2599
	      }
2600
	      if (C1_hi == 0x0001ed09bead87c0ull
2601
		  && C1_lo == 0x378d8e6400000000ull) {
2602
		// C1 = 10^34 => rounding overflow
2603
		C1_hi = 0x0000314dc6448d93ull;
2604
		C1_lo = 0x38c15b0a00000000ull;	// 10^33
2605
		y_exp = y_exp + EXP_P1;
2606
	      }
2607
	    } else if ((is_midpoint_lt_even || is_inexact_gt_midpoint)
2608
		       &&
2609
		       ((x_sign
2610
			 && (rnd_mode == ROUNDING_UP
2611
			     || rnd_mode == ROUNDING_TO_ZERO))
2612
			|| (!x_sign
2613
			    && (rnd_mode == ROUNDING_DOWN
2614
				|| rnd_mode == ROUNDING_TO_ZERO)))) {
2615
	      // C1 = C1 - 1
2616
	      C1_lo = C1_lo - 1;
2617
	      if (C1_lo == 0xffffffffffffffffull)
2618
		C1_hi--;
2619
	      // check if we crossed into the lower decade
2620
	      if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) {	// 10^33 - 1
2621
		C1_hi = 0x0001ed09bead87c0ull;	// 10^34 - 1
2622
		C1_lo = 0x378d8e63ffffffffull;
2623
		y_exp = y_exp - EXP_P1;
2624
		// no underflow, because delta + q2 >= P34 + 1
2625
	      }
2626
	    } else {
2627
	      ;	// exact, the result is already correct
2628
	    }
2629
	    // in all cases check for overflow (RN and RA solved already)
2630
	    if (y_exp == EXP_MAX_P1) {	// overflow
2631
	      if ((rnd_mode == ROUNDING_DOWN && x_sign) ||	// RM and res < 0
2632
		  (rnd_mode == ROUNDING_UP && !x_sign)) {	// RP and res > 0
2633
		C1_hi = 0x7800000000000000ull;	// +inf
2634
		C1_lo = 0x0ull;
2635
	      } else {	// RM and res > 0, RP and res < 0, or RZ
2636
		C1_hi = 0x5fffed09bead87c0ull;
2637
		C1_lo = 0x378d8e63ffffffffull;
2638
	      }
2639
	      y_exp = 0;	// x_sign is preserved
2640
	      // set the inexact flag (in case the exact addition was exact)
2641
	      *pfpsf |= INEXACT_EXCEPTION;
2642
	      // set the overflow flag
2643
	      *pfpsf |= OVERFLOW_EXCEPTION;
2644
	    }
2645
	  }
2646
	  // assemble the result
2647
	  res.w[1] = x_sign | y_exp | C1_hi;
2648
	  res.w[0] = C1_lo;
2649
	  if (tmp_inexact)
2650
	    *pfpsf |= INEXACT_EXCEPTION;
2651
	}
2652
      } else {	// if (-P34 + 1 <= delta <= -1) <=> 1 <= -delta <= P34 - 1
2653
	// NOTE: the following, up to "} else { // if x_sign != y_sign
2654
	// the result is exact" is identical to "else if (delta == P34 - q2) {"
2655
	// from above; also, the code is not symmetric: a+b and b+a may take
2656
	// different paths (need to unify eventually!)
2657
	// calculate C' = C2 + C1 * 10^(e1-e2) directly; the result may be
2658
	// inexact if it requires P34 + 1 decimal digits; in either case the
2659
	// 'cutoff' point for addition is at the position of the lsb of C2
2660
	// The coefficient of the result is C1 * 10^(e1-e2) + C2 and the
2661
	// exponent is e2; either C1 or 10^(e1-e2) may not fit is 64 bits,
2662
	// but their product fits with certainty in 128 bits (actually in 113)
2663
	// Note that 0 <= e1 - e2 <= P34 - 2
2664
	//   -P34 + 1 <= delta <= -1 <=> -P34 + 1 <= delta <= -1 <=>
2665
	//   -P34 + 1 <= q1 + e1 - q2 - e2 <= -1 <=>
2666
	//   q2 - q1 - P34 + 1 <= e1 - e2 <= q2 - q1 - 1 <=>
2667
	//   1 - P34 - P34 + 1 <= e1-e2 <= P34 - 1 - 1 => 0 <= e1-e2 <= P34 - 2
2668
	scale = delta - q1 + q2;	// scale = (int)(e1 >> 49) - (int)(e2 >> 49)
2669
	if (scale >= 20) {	// 10^(e1-e2) does not fit in 64 bits, but C1 does
2670
	  __mul_128x64_to_128 (C1, C1_lo, ten2k128[scale - 20]);
2671
	} else if (scale >= 1) {
2672
	  // if 1 <= scale <= 19 then 10^(e1-e2) fits in 64 bits
2673
	  if (q1 <= 19) {	// C1 fits in 64 bits
2674
	    __mul_64x64_to_128MACH (C1, C1_lo, ten2k64[scale]);
2675
	  } else {	// q1 >= 20
2676
	    C1.w[1] = C1_hi;
2677
	    C1.w[0] = C1_lo;
2678
	    __mul_128x64_to_128 (C1, ten2k64[scale], C1);
2679
	  }
2680
	} else {	// if (scale == 0) C1 is unchanged
2681
	  C1.w[1] = C1_hi;
2682
	  C1.w[0] = C1_lo;	// only the low part is necessary
2683
	}
2684
	C1_hi = C1.w[1];
2685
	C1_lo = C1.w[0];
2686
	// now add C2
2687
	if (x_sign == y_sign) {
2688
	  // the result can overflow!
2689
	  C1_lo = C1_lo + C2_lo;
2690
	  C1_hi = C1_hi + C2_hi;
2691
	  if (C1_lo < C1.w[0])
2692
	    C1_hi++;
2693
	  // test for overflow, possible only when C1 >= 10^34
2694
	  if (C1_hi > 0x0001ed09bead87c0ull || (C1_hi == 0x0001ed09bead87c0ull && C1_lo >= 0x378d8e6400000000ull)) {	// C1 >= 10^34
2695
	    // in this case q = P34 + 1 and x = q - P34 = 1, so multiply
2696
	    // C'' = C'+ 5 = C1 + 5 by k1 ~ 10^(-1) calculated for P34 + 1
2697
	    // decimal digits
2698
	    // Calculate C'' = C' + 1/2 * 10^x
2699
	    if (C1_lo >= 0xfffffffffffffffbull) {	// low half add has carry
2700
	      C1_lo = C1_lo + 5;
2701
	      C1_hi = C1_hi + 1;
2702
	    } else {
2703
	      C1_lo = C1_lo + 5;
2704
	    }
2705
	    // the approximation of 10^(-1) was rounded up to 118 bits
2706
	    // 10^(-1) =~ 33333333333333333333333333333400 * 2^-129
2707
	    // 10^(-1) =~ 19999999999999999999999999999a00 * 2^-128
2708
	    C1.w[1] = C1_hi;
2709
	    C1.w[0] = C1_lo;	// C''
2710
	    ten2m1.w[1] = 0x1999999999999999ull;
2711
	    ten2m1.w[0] = 0x9999999999999a00ull;
2712
	    __mul_128x128_to_256 (P256, C1, ten2m1);	// P256 = C*, f*
2713
	    // C* is actually floor(C*) in this case
2714
	    // the top Ex = 128 bits of 10^(-1) are
2715
	    // T* = 0x00199999999999999999999999999999
2716
	    // if (0 < f* < 10^(-x)) then
2717
	    //   if floor(C*) is even then C = floor(C*) - logical right
2718
	    //       shift; C has p decimal digits, correct by Prop. 1)
2719
	    //   else if floor(C*) is odd C = floor(C*) - 1 (logical right
2720
	    //       shift; C has p decimal digits, correct by Pr. 1)
2721
	    // else
2722
	    //   C = floor(C*) (logical right shift; C has p decimal digits,
2723
	    //       correct by Property 1)
2724
	    // n = C * 10^(e2+x)
2725
	    if ((P256.w[1] || P256.w[0])
2726
		&& (P256.w[1] < 0x1999999999999999ull
2727
		    || (P256.w[1] == 0x1999999999999999ull
2728
			&& P256.w[0] <= 0x9999999999999999ull))) {
2729
	      // the result is a midpoint
2730
	      if (P256.w[2] & 0x01) {
2731
		is_midpoint_gt_even = 1;
2732
		// if floor(C*) is odd C = floor(C*) - 1; the result is not 0
2733
		P256.w[2]--;
2734
		if (P256.w[2] == 0xffffffffffffffffull)
2735
		  P256.w[3]--;
2736
	      } else {
2737
		is_midpoint_lt_even = 1;
2738
	      }
2739
	    }
2740
	    // n = Cstar * 10^(e2+1)
2741
	    y_exp = y_exp + EXP_P1;
2742
	    // C* != 10^P34 because C* has P34 digits
2743
	    // check for overflow
2744
	    if (y_exp == EXP_MAX_P1
2745
		&& (rnd_mode == ROUNDING_TO_NEAREST
2746
		    || rnd_mode == ROUNDING_TIES_AWAY)) {
2747
	      // overflow for RN
2748
	      res.w[1] = x_sign | 0x7800000000000000ull;	// +/-inf
2749
	      res.w[0] = 0x0ull;
2750
	      // set the inexact flag
2751
	      *pfpsf |= INEXACT_EXCEPTION;
2752
	      // set the overflow flag
2753
	      *pfpsf |= OVERFLOW_EXCEPTION;
2754
	      BID_SWAP128 (res);
2755
	      BID_RETURN (res);
2756
	    }
2757
	    // if (0 < f* - 1/2 < 10^(-x)) then
2758
	    //   the result of the addition is exact
2759
	    // else
2760
	    //   the result of the addition is inexact
2761
	    if (P256.w[1] > 0x8000000000000000ull || (P256.w[1] == 0x8000000000000000ull && P256.w[0] > 0x0ull)) {	// the result may be exact
2762
	      tmp64 = P256.w[1] - 0x8000000000000000ull;	// f* - 1/2
2763
	      if ((tmp64 > 0x1999999999999999ull
2764
		   || (tmp64 == 0x1999999999999999ull
2765
		       && P256.w[0] >= 0x9999999999999999ull))) {
2766
		// set the inexact flag
2767
		*pfpsf |= INEXACT_EXCEPTION;
2768
		is_inexact = 1;
2769
	      }	// else the result is exact
2770
	    } else {	// the result is inexact
2771
	      // set the inexact flag
2772
	      *pfpsf |= INEXACT_EXCEPTION;
2773
	      is_inexact = 1;
2774
	    }
2775
	    C1_hi = P256.w[3];
2776
	    C1_lo = P256.w[2];
2777
	    if (!is_midpoint_gt_even && !is_midpoint_lt_even) {
2778
	      is_inexact_lt_midpoint = is_inexact
2779
		&& (P256.w[1] & 0x8000000000000000ull);
2780
	      is_inexact_gt_midpoint = is_inexact
2781
		&& !(P256.w[1] & 0x8000000000000000ull);
2782
	    }
2783
	    // general correction from RN to RA, RM, RP, RZ; result uses y_exp
2784
	    if (rnd_mode != ROUNDING_TO_NEAREST) {
2785
	      if ((!x_sign
2786
		   && ((rnd_mode == ROUNDING_UP
2787
			&& is_inexact_lt_midpoint)
2788
		       || ((rnd_mode == ROUNDING_TIES_AWAY
2789
			    || rnd_mode == ROUNDING_UP)
2790
			   && is_midpoint_gt_even))) || (x_sign
2791
							 &&
2792
							 ((rnd_mode ==
2793
							   ROUNDING_DOWN
2794
							   &&
2795
							   is_inexact_lt_midpoint)
2796
							  ||
2797
							  ((rnd_mode ==
2798
							    ROUNDING_TIES_AWAY
2799
							    || rnd_mode
2800
							    ==
2801
							    ROUNDING_DOWN)
2802
							   &&
2803
							   is_midpoint_gt_even))))
2804
	      {
2805
		// C1 = C1 + 1
2806
		C1_lo = C1_lo + 1;
2807
		if (C1_lo == 0) {	// rounding overflow in the low 64 bits
2808
		  C1_hi = C1_hi + 1;
2809
		}
2810
		if (C1_hi == 0x0001ed09bead87c0ull
2811
		    && C1_lo == 0x378d8e6400000000ull) {
2812
		  // C1 = 10^34 => rounding overflow
2813
		  C1_hi = 0x0000314dc6448d93ull;
2814
		  C1_lo = 0x38c15b0a00000000ull;	// 10^33
2815
		  y_exp = y_exp + EXP_P1;
2816
		}
2817
	      } else
2818
		if ((is_midpoint_lt_even || is_inexact_gt_midpoint) &&
2819
		    ((x_sign && (rnd_mode == ROUNDING_UP ||
2820
				 rnd_mode == ROUNDING_TO_ZERO)) ||
2821
		     (!x_sign && (rnd_mode == ROUNDING_DOWN ||
2822
				  rnd_mode == ROUNDING_TO_ZERO)))) {
2823
		// C1 = C1 - 1
2824
		C1_lo = C1_lo - 1;
2825
		if (C1_lo == 0xffffffffffffffffull)
2826
		  C1_hi--;
2827
		// check if we crossed into the lower decade
2828
		if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) {	// 10^33 - 1
2829
		  C1_hi = 0x0001ed09bead87c0ull;	// 10^34 - 1
2830
		  C1_lo = 0x378d8e63ffffffffull;
2831
		  y_exp = y_exp - EXP_P1;
2832
		  // no underflow, because delta + q2 >= P34 + 1
2833
		}
2834
	      } else {
2835
		;	// exact, the result is already correct
2836
	      }
2837
	      // in all cases check for overflow (RN and RA solved already)
2838
	      if (y_exp == EXP_MAX_P1) {	// overflow
2839
		if ((rnd_mode == ROUNDING_DOWN && x_sign) ||	// RM and res < 0
2840
		    (rnd_mode == ROUNDING_UP && !x_sign)) {	// RP and res > 0
2841
		  C1_hi = 0x7800000000000000ull;	// +inf
2842
		  C1_lo = 0x0ull;
2843
		} else {	// RM and res > 0, RP and res < 0, or RZ
2844
		  C1_hi = 0x5fffed09bead87c0ull;
2845
		  C1_lo = 0x378d8e63ffffffffull;
2846
		}
2847
		y_exp = 0;	// x_sign is preserved
2848
		// set the inexact flag (in case the exact addition was exact)
2849
		*pfpsf |= INEXACT_EXCEPTION;
2850
		// set the overflow flag
2851
		*pfpsf |= OVERFLOW_EXCEPTION;
2852
	      }
2853
	    }
2854
	  }	// else if (C1 < 10^34) then C1 is the coeff.; the result is exact
2855
	  // assemble the result
2856
	  res.w[1] = x_sign | y_exp | C1_hi;
2857
	  res.w[0] = C1_lo;
2858
	} else {	// if x_sign != y_sign the result is exact
2859
	  C1_lo = C2_lo - C1_lo;
2860
	  C1_hi = C2_hi - C1_hi;
2861
	  if (C1_lo > C2_lo)
2862
	    C1_hi--;
2863
	  if (C1_hi >= 0x8000000000000000ull) {	// negative coefficient!
2864
	    C1_lo = ~C1_lo;
2865
	    C1_lo++;
2866
	    C1_hi = ~C1_hi;
2867
	    if (C1_lo == 0x0)
2868
	      C1_hi++;
2869
	    x_sign = y_sign;	// the result will have the sign of y
2870
	  }
2871
	  // the result can be zero, but it cannot overflow
2872
	  if (C1_lo == 0 && C1_hi == 0) {
2873
	    // assemble the result
2874
	    if (x_exp < y_exp)
2875
	      res.w[1] = x_exp;
2876
	    else
2877
	      res.w[1] = y_exp;
2878
	    res.w[0] = 0;
2879
	    if (rnd_mode == ROUNDING_DOWN) {
2880
	      res.w[1] |= 0x8000000000000000ull;
2881
	    }
2882
	    BID_SWAP128 (res);
2883
	    BID_RETURN (res);
2884
	  }
2885
	  // assemble the result
2886
	  res.w[1] = y_sign | y_exp | C1_hi;
2887
	  res.w[0] = C1_lo;
2888
	}
2889
      }
2890
    }
2891
    BID_SWAP128 (res);
2892
    BID_RETURN (res)
2893
  }
2894
}
2895
 
2896
 
2897
 
2898
// bid128_sub stands for bid128qq_sub
2899
 
2900
/*****************************************************************************
2901
 *  BID128 sub
2902
 ****************************************************************************/
2903
 
2904
#if DECIMAL_CALL_BY_REFERENCE
2905
void
2906
bid128_sub (UINT128 * pres, UINT128 * px, UINT128 * py
2907
	    _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
2908
	    _EXC_INFO_PARAM) {
2909
  UINT128 x = *px, y = *py;
2910
#if !DECIMAL_GLOBAL_ROUNDING
2911
  unsigned int rnd_mode = *prnd_mode;
2912
#endif
2913
#else
2914
UINT128
2915
bid128_sub (UINT128 x, UINT128 y
2916
	    _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
2917
	    _EXC_INFO_PARAM) {
2918
#endif
2919
 
2920
  UINT128 res;
2921
  UINT64 y_sign;
2922
 
2923
  if ((y.w[HIGH_128W] & MASK_NAN) != MASK_NAN) {	// y is not NAN
2924
    // change its sign
2925
    y_sign = y.w[HIGH_128W] & MASK_SIGN;	// 0 for positive, MASK_SIGN for negative
2926
    if (y_sign)
2927
      y.w[HIGH_128W] = y.w[HIGH_128W] & 0x7fffffffffffffffull;
2928
    else
2929
      y.w[HIGH_128W] = y.w[HIGH_128W] | 0x8000000000000000ull;
2930
  }
2931
#if DECIMAL_CALL_BY_REFERENCE
2932
  bid128_add (&res, &x, &y
2933
	      _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
2934
	      _EXC_INFO_ARG);
2935
#else
2936
  res = bid128_add (x, y
2937
		    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
2938
		    _EXC_INFO_ARG);
2939
#endif
2940
  BID_RETURN (res);
2941
}