4
* Derived from SoftFloat.
7
/*============================================================================
9
This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
12
Written by John R. Hauser. This work was made possible in part by the
13
International Computer Science Institute, located at Suite 600, 1947 Center
14
Street, Berkeley, California 94704. Funding was partially provided by the
15
National Science Foundation under grant MIP-9311980. The original version
16
of this code was written as part of a project to build a fixed-point vector
17
processor in collaboration with the University of California at Berkeley,
18
overseen by Profs. Nelson Morgan and John Wawrzynek. More information
19
is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
20
arithmetic/SoftFloat.html'.
22
THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
23
been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
24
RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
25
AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
26
COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
27
EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
28
INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
29
OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
31
Derivative works are acceptable, even for commercial purposes, so long as
32
(1) the source code for the derivative work includes prominent notice that
33
the work is derivative, and (2) the source code includes prominent notice with
34
these four paragraphs for those parts of this code that are retained.
36
=============================================================================*/
38
/* softfloat (and in particular the code in softfloat-specialize.h) is
39
* target-dependent and needs the TARGET_* macros.
43
#include "fpu/softfloat.h"
45
/* We only need stdlib for abort() */
48
/*----------------------------------------------------------------------------
49
| Primitive arithmetic functions, including multi-word arithmetic, and
50
| division and square root approximations. (Can be specialized to target if
52
*----------------------------------------------------------------------------*/
53
#include "softfloat-macros.h"
55
/*----------------------------------------------------------------------------
56
| Functions and definitions to determine: (1) whether tininess for underflow
57
| is detected before or after rounding by default, (2) what (if anything)
58
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
59
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
60
| are propagated from function inputs to output. These details are target-
62
*----------------------------------------------------------------------------*/
63
#include "softfloat-specialize.h"
65
/*----------------------------------------------------------------------------
66
| Returns the fraction bits of the half-precision floating-point value `a'.
67
*----------------------------------------------------------------------------*/
69
INLINE uint32_t extractFloat16Frac(float16 a)
71
return float16_val(a) & 0x3ff;
74
/*----------------------------------------------------------------------------
75
| Returns the exponent bits of the half-precision floating-point value `a'.
76
*----------------------------------------------------------------------------*/
78
INLINE int_fast16_t extractFloat16Exp(float16 a)
80
return (float16_val(a) >> 10) & 0x1f;
83
/*----------------------------------------------------------------------------
84
| Returns the sign bit of the single-precision floating-point value `a'.
85
*----------------------------------------------------------------------------*/
87
INLINE flag extractFloat16Sign(float16 a)
89
return float16_val(a)>>15;
92
/*----------------------------------------------------------------------------
93
| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
94
| and 7, and returns the properly rounded 32-bit integer corresponding to the
95
| input. If `zSign' is 1, the input is negated before being converted to an
96
| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
97
| is simply rounded to an integer, with the inexact exception raised if the
98
| input cannot be represented exactly as an integer. However, if the fixed-
99
| point input is too large, the invalid exception is raised and the largest
100
| positive or negative integer is returned.
101
*----------------------------------------------------------------------------*/
103
static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
106
flag roundNearestEven;
107
int8 roundIncrement, roundBits;
110
roundingMode = STATUS(float_rounding_mode);
111
roundNearestEven = ( roundingMode == float_round_nearest_even );
112
switch (roundingMode) {
113
case float_round_nearest_even:
114
roundIncrement = 0x40;
116
case float_round_to_zero:
120
roundIncrement = zSign ? 0 : 0x7f;
122
case float_round_down:
123
roundIncrement = zSign ? 0x7f : 0;
128
roundBits = absZ & 0x7F;
129
absZ = ( absZ + roundIncrement )>>7;
130
absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
132
if ( zSign ) z = - z;
133
if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
134
float_raise( float_flag_invalid STATUS_VAR);
135
return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
137
if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
142
/*----------------------------------------------------------------------------
143
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
144
| `absZ1', with binary point between bits 63 and 64 (between the input words),
145
| and returns the properly rounded 64-bit integer corresponding to the input.
146
| If `zSign' is 1, the input is negated before being converted to an integer.
147
| Ordinarily, the fixed-point input is simply rounded to an integer, with
148
| the inexact exception raised if the input cannot be represented exactly as
149
| an integer. However, if the fixed-point input is too large, the invalid
150
| exception is raised and the largest positive or negative integer is
152
*----------------------------------------------------------------------------*/
154
static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
157
flag roundNearestEven, increment;
160
roundingMode = STATUS(float_rounding_mode);
161
roundNearestEven = ( roundingMode == float_round_nearest_even );
162
switch (roundingMode) {
163
case float_round_nearest_even:
164
increment = ((int64_t) absZ1 < 0);
166
case float_round_to_zero:
170
increment = !zSign && absZ1;
172
case float_round_down:
173
increment = zSign && absZ1;
180
if ( absZ0 == 0 ) goto overflow;
181
absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
184
if ( zSign ) z = - z;
185
if ( z && ( ( z < 0 ) ^ zSign ) ) {
187
float_raise( float_flag_invalid STATUS_VAR);
189
zSign ? (int64_t) LIT64( 0x8000000000000000 )
190
: LIT64( 0x7FFFFFFFFFFFFFFF );
192
if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
197
/*----------------------------------------------------------------------------
198
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
199
| `absZ1', with binary point between bits 63 and 64 (between the input words),
200
| and returns the properly rounded 64-bit unsigned integer corresponding to the
201
| input. Ordinarily, the fixed-point input is simply rounded to an integer,
202
| with the inexact exception raised if the input cannot be represented exactly
203
| as an integer. However, if the fixed-point input is too large, the invalid
204
| exception is raised and the largest unsigned integer is returned.
205
*----------------------------------------------------------------------------*/
207
static int64 roundAndPackUint64(flag zSign, uint64_t absZ0,
208
uint64_t absZ1 STATUS_PARAM)
211
flag roundNearestEven, increment;
213
roundingMode = STATUS(float_rounding_mode);
214
roundNearestEven = (roundingMode == float_round_nearest_even);
215
switch (roundingMode) {
216
case float_round_nearest_even:
217
increment = ((int64_t)absZ1 < 0);
219
case float_round_to_zero:
223
increment = !zSign && absZ1;
225
case float_round_down:
226
increment = zSign && absZ1;
234
float_raise(float_flag_invalid STATUS_VAR);
235
return LIT64(0xFFFFFFFFFFFFFFFF);
237
absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
240
if (zSign && absZ0) {
241
float_raise(float_flag_invalid STATUS_VAR);
246
STATUS(float_exception_flags) |= float_flag_inexact;
251
/*----------------------------------------------------------------------------
252
| Returns the fraction bits of the single-precision floating-point value `a'.
253
*----------------------------------------------------------------------------*/
255
INLINE uint32_t extractFloat32Frac( float32 a )
258
return float32_val(a) & 0x007FFFFF;
262
/*----------------------------------------------------------------------------
263
| Returns the exponent bits of the single-precision floating-point value `a'.
264
*----------------------------------------------------------------------------*/
266
INLINE int_fast16_t extractFloat32Exp(float32 a)
269
return ( float32_val(a)>>23 ) & 0xFF;
273
/*----------------------------------------------------------------------------
274
| Returns the sign bit of the single-precision floating-point value `a'.
275
*----------------------------------------------------------------------------*/
277
INLINE flag extractFloat32Sign( float32 a )
280
return float32_val(a)>>31;
284
/*----------------------------------------------------------------------------
285
| If `a' is denormal and we are in flush-to-zero mode then set the
286
| input-denormal exception and return zero. Otherwise just return the value.
287
*----------------------------------------------------------------------------*/
288
static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
290
if (STATUS(flush_inputs_to_zero)) {
291
if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
292
float_raise(float_flag_input_denormal STATUS_VAR);
293
return make_float32(float32_val(a) & 0x80000000);
299
/*----------------------------------------------------------------------------
300
| Normalizes the subnormal single-precision floating-point value represented
301
| by the denormalized significand `aSig'. The normalized exponent and
302
| significand are stored at the locations pointed to by `zExpPtr' and
303
| `zSigPtr', respectively.
304
*----------------------------------------------------------------------------*/
307
normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
311
shiftCount = countLeadingZeros32( aSig ) - 8;
312
*zSigPtr = aSig<<shiftCount;
313
*zExpPtr = 1 - shiftCount;
317
/*----------------------------------------------------------------------------
318
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
319
| single-precision floating-point value, returning the result. After being
320
| shifted into the proper positions, the three fields are simply added
321
| together to form the result. This means that any integer portion of `zSig'
322
| will be added into the exponent. Since a properly normalized significand
323
| will have an integer portion equal to 1, the `zExp' input should be 1 less
324
| than the desired result exponent whenever `zSig' is a complete, normalized
326
*----------------------------------------------------------------------------*/
328
INLINE float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
332
( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
336
/*----------------------------------------------------------------------------
337
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
338
| and significand `zSig', and returns the proper single-precision floating-
339
| point value corresponding to the abstract input. Ordinarily, the abstract
340
| value is simply rounded and packed into the single-precision format, with
341
| the inexact exception raised if the abstract input cannot be represented
342
| exactly. However, if the abstract value is too large, the overflow and
343
| inexact exceptions are raised and an infinity or maximal finite value is
344
| returned. If the abstract value is too small, the input value is rounded to
345
| a subnormal number, and the underflow and inexact exceptions are raised if
346
| the abstract input cannot be represented exactly as a subnormal single-
347
| precision floating-point number.
348
| The input significand `zSig' has its binary point between bits 30
349
| and 29, which is 7 bits to the left of the usual location. This shifted
350
| significand must be normalized or smaller. If `zSig' is not normalized,
351
| `zExp' must be 0; in that case, the result returned is a subnormal number,
352
| and it must not require rounding. In the usual case that `zSig' is
353
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
354
| The handling of underflow and overflow follows the IEC/IEEE Standard for
355
| Binary Floating-Point Arithmetic.
356
*----------------------------------------------------------------------------*/
358
static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
361
flag roundNearestEven;
362
int8 roundIncrement, roundBits;
365
roundingMode = STATUS(float_rounding_mode);
366
roundNearestEven = ( roundingMode == float_round_nearest_even );
367
switch (roundingMode) {
368
case float_round_nearest_even:
369
roundIncrement = 0x40;
371
case float_round_to_zero:
375
roundIncrement = zSign ? 0 : 0x7f;
377
case float_round_down:
378
roundIncrement = zSign ? 0x7f : 0;
384
roundBits = zSig & 0x7F;
385
if ( 0xFD <= (uint16_t) zExp ) {
387
|| ( ( zExp == 0xFD )
388
&& ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
390
float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
391
return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
394
if (STATUS(flush_to_zero)) {
395
float_raise(float_flag_output_denormal STATUS_VAR);
396
return packFloat32(zSign, 0, 0);
399
( STATUS(float_detect_tininess) == float_tininess_before_rounding )
401
|| ( zSig + roundIncrement < 0x80000000 );
402
shift32RightJamming( zSig, - zExp, &zSig );
404
roundBits = zSig & 0x7F;
405
if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
408
if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
409
zSig = ( zSig + roundIncrement )>>7;
410
zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
411
if ( zSig == 0 ) zExp = 0;
412
return packFloat32( zSign, zExp, zSig );
416
/*----------------------------------------------------------------------------
417
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
418
| and significand `zSig', and returns the proper single-precision floating-
419
| point value corresponding to the abstract input. This routine is just like
420
| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
421
| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
422
| floating-point exponent.
423
*----------------------------------------------------------------------------*/
426
normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
430
shiftCount = countLeadingZeros32( zSig ) - 1;
431
return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
435
/*----------------------------------------------------------------------------
436
| Returns the fraction bits of the double-precision floating-point value `a'.
437
*----------------------------------------------------------------------------*/
439
INLINE uint64_t extractFloat64Frac( float64 a )
442
return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
446
/*----------------------------------------------------------------------------
447
| Returns the exponent bits of the double-precision floating-point value `a'.
448
*----------------------------------------------------------------------------*/
450
INLINE int_fast16_t extractFloat64Exp(float64 a)
453
return ( float64_val(a)>>52 ) & 0x7FF;
457
/*----------------------------------------------------------------------------
458
| Returns the sign bit of the double-precision floating-point value `a'.
459
*----------------------------------------------------------------------------*/
461
INLINE flag extractFloat64Sign( float64 a )
464
return float64_val(a)>>63;
468
/*----------------------------------------------------------------------------
469
| If `a' is denormal and we are in flush-to-zero mode then set the
470
| input-denormal exception and return zero. Otherwise just return the value.
471
*----------------------------------------------------------------------------*/
472
static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
474
if (STATUS(flush_inputs_to_zero)) {
475
if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
476
float_raise(float_flag_input_denormal STATUS_VAR);
477
return make_float64(float64_val(a) & (1ULL << 63));
483
/*----------------------------------------------------------------------------
484
| Normalizes the subnormal double-precision floating-point value represented
485
| by the denormalized significand `aSig'. The normalized exponent and
486
| significand are stored at the locations pointed to by `zExpPtr' and
487
| `zSigPtr', respectively.
488
*----------------------------------------------------------------------------*/
491
normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
495
shiftCount = countLeadingZeros64( aSig ) - 11;
496
*zSigPtr = aSig<<shiftCount;
497
*zExpPtr = 1 - shiftCount;
501
/*----------------------------------------------------------------------------
502
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
503
| double-precision floating-point value, returning the result. After being
504
| shifted into the proper positions, the three fields are simply added
505
| together to form the result. This means that any integer portion of `zSig'
506
| will be added into the exponent. Since a properly normalized significand
507
| will have an integer portion equal to 1, the `zExp' input should be 1 less
508
| than the desired result exponent whenever `zSig' is a complete, normalized
510
*----------------------------------------------------------------------------*/
512
INLINE float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
516
( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
520
/*----------------------------------------------------------------------------
521
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
522
| and significand `zSig', and returns the proper double-precision floating-
523
| point value corresponding to the abstract input. Ordinarily, the abstract
524
| value is simply rounded and packed into the double-precision format, with
525
| the inexact exception raised if the abstract input cannot be represented
526
| exactly. However, if the abstract value is too large, the overflow and
527
| inexact exceptions are raised and an infinity or maximal finite value is
528
| returned. If the abstract value is too small, the input value is rounded
529
| to a subnormal number, and the underflow and inexact exceptions are raised
530
| if the abstract input cannot be represented exactly as a subnormal double-
531
| precision floating-point number.
532
| The input significand `zSig' has its binary point between bits 62
533
| and 61, which is 10 bits to the left of the usual location. This shifted
534
| significand must be normalized or smaller. If `zSig' is not normalized,
535
| `zExp' must be 0; in that case, the result returned is a subnormal number,
536
| and it must not require rounding. In the usual case that `zSig' is
537
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
538
| The handling of underflow and overflow follows the IEC/IEEE Standard for
539
| Binary Floating-Point Arithmetic.
540
*----------------------------------------------------------------------------*/
542
static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
545
flag roundNearestEven;
546
int_fast16_t roundIncrement, roundBits;
549
roundingMode = STATUS(float_rounding_mode);
550
roundNearestEven = ( roundingMode == float_round_nearest_even );
551
switch (roundingMode) {
552
case float_round_nearest_even:
553
roundIncrement = 0x200;
555
case float_round_to_zero:
559
roundIncrement = zSign ? 0 : 0x3ff;
561
case float_round_down:
562
roundIncrement = zSign ? 0x3ff : 0;
567
roundBits = zSig & 0x3FF;
568
if ( 0x7FD <= (uint16_t) zExp ) {
569
if ( ( 0x7FD < zExp )
570
|| ( ( zExp == 0x7FD )
571
&& ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
573
float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
574
return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
577
if (STATUS(flush_to_zero)) {
578
float_raise(float_flag_output_denormal STATUS_VAR);
579
return packFloat64(zSign, 0, 0);
582
( STATUS(float_detect_tininess) == float_tininess_before_rounding )
584
|| ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
585
shift64RightJamming( zSig, - zExp, &zSig );
587
roundBits = zSig & 0x3FF;
588
if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
591
if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
592
zSig = ( zSig + roundIncrement )>>10;
593
zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
594
if ( zSig == 0 ) zExp = 0;
595
return packFloat64( zSign, zExp, zSig );
599
/*----------------------------------------------------------------------------
600
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
601
| and significand `zSig', and returns the proper double-precision floating-
602
| point value corresponding to the abstract input. This routine is just like
603
| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
604
| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
605
| floating-point exponent.
606
*----------------------------------------------------------------------------*/
609
normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
613
shiftCount = countLeadingZeros64( zSig ) - 1;
614
return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
618
/*----------------------------------------------------------------------------
619
| Returns the fraction bits of the extended double-precision floating-point
621
*----------------------------------------------------------------------------*/
623
INLINE uint64_t extractFloatx80Frac( floatx80 a )
630
/*----------------------------------------------------------------------------
631
| Returns the exponent bits of the extended double-precision floating-point
633
*----------------------------------------------------------------------------*/
635
INLINE int32 extractFloatx80Exp( floatx80 a )
638
return a.high & 0x7FFF;
642
/*----------------------------------------------------------------------------
643
| Returns the sign bit of the extended double-precision floating-point value
645
*----------------------------------------------------------------------------*/
647
INLINE flag extractFloatx80Sign( floatx80 a )
654
/*----------------------------------------------------------------------------
655
| Normalizes the subnormal extended double-precision floating-point value
656
| represented by the denormalized significand `aSig'. The normalized exponent
657
| and significand are stored at the locations pointed to by `zExpPtr' and
658
| `zSigPtr', respectively.
659
*----------------------------------------------------------------------------*/
662
normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
666
shiftCount = countLeadingZeros64( aSig );
667
*zSigPtr = aSig<<shiftCount;
668
*zExpPtr = 1 - shiftCount;
672
/*----------------------------------------------------------------------------
673
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
674
| extended double-precision floating-point value, returning the result.
675
*----------------------------------------------------------------------------*/
677
INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
682
z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
687
/*----------------------------------------------------------------------------
688
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
689
| and extended significand formed by the concatenation of `zSig0' and `zSig1',
690
| and returns the proper extended double-precision floating-point value
691
| corresponding to the abstract input. Ordinarily, the abstract value is
692
| rounded and packed into the extended double-precision format, with the
693
| inexact exception raised if the abstract input cannot be represented
694
| exactly. However, if the abstract value is too large, the overflow and
695
| inexact exceptions are raised and an infinity or maximal finite value is
696
| returned. If the abstract value is too small, the input value is rounded to
697
| a subnormal number, and the underflow and inexact exceptions are raised if
698
| the abstract input cannot be represented exactly as a subnormal extended
699
| double-precision floating-point number.
700
| If `roundingPrecision' is 32 or 64, the result is rounded to the same
701
| number of bits as single or double precision, respectively. Otherwise, the
702
| result is rounded to the full precision of the extended double-precision
704
| The input significand must be normalized or smaller. If the input
705
| significand is not normalized, `zExp' must be 0; in that case, the result
706
| returned is a subnormal number, and it must not require rounding. The
707
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
708
| Floating-Point Arithmetic.
709
*----------------------------------------------------------------------------*/
712
roundAndPackFloatx80(
713
int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
717
flag roundNearestEven, increment, isTiny;
718
int64 roundIncrement, roundMask, roundBits;
720
roundingMode = STATUS(float_rounding_mode);
721
roundNearestEven = ( roundingMode == float_round_nearest_even );
722
if ( roundingPrecision == 80 ) goto precision80;
723
if ( roundingPrecision == 64 ) {
724
roundIncrement = LIT64( 0x0000000000000400 );
725
roundMask = LIT64( 0x00000000000007FF );
727
else if ( roundingPrecision == 32 ) {
728
roundIncrement = LIT64( 0x0000008000000000 );
729
roundMask = LIT64( 0x000000FFFFFFFFFF );
734
zSig0 |= ( zSig1 != 0 );
735
switch (roundingMode) {
736
case float_round_nearest_even:
738
case float_round_to_zero:
742
roundIncrement = zSign ? 0 : roundMask;
744
case float_round_down:
745
roundIncrement = zSign ? roundMask : 0;
750
roundBits = zSig0 & roundMask;
751
if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
752
if ( ( 0x7FFE < zExp )
753
|| ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
758
if (STATUS(flush_to_zero)) {
759
float_raise(float_flag_output_denormal STATUS_VAR);
760
return packFloatx80(zSign, 0, 0);
763
( STATUS(float_detect_tininess) == float_tininess_before_rounding )
765
|| ( zSig0 <= zSig0 + roundIncrement );
766
shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
768
roundBits = zSig0 & roundMask;
769
if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
770
if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
771
zSig0 += roundIncrement;
772
if ( (int64_t) zSig0 < 0 ) zExp = 1;
773
roundIncrement = roundMask + 1;
774
if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
775
roundMask |= roundIncrement;
777
zSig0 &= ~ roundMask;
778
return packFloatx80( zSign, zExp, zSig0 );
781
if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
782
zSig0 += roundIncrement;
783
if ( zSig0 < roundIncrement ) {
785
zSig0 = LIT64( 0x8000000000000000 );
787
roundIncrement = roundMask + 1;
788
if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
789
roundMask |= roundIncrement;
791
zSig0 &= ~ roundMask;
792
if ( zSig0 == 0 ) zExp = 0;
793
return packFloatx80( zSign, zExp, zSig0 );
795
switch (roundingMode) {
796
case float_round_nearest_even:
797
increment = ((int64_t)zSig1 < 0);
799
case float_round_to_zero:
803
increment = !zSign && zSig1;
805
case float_round_down:
806
increment = zSign && zSig1;
811
if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
812
if ( ( 0x7FFE < zExp )
813
|| ( ( zExp == 0x7FFE )
814
&& ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
820
float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
821
if ( ( roundingMode == float_round_to_zero )
822
|| ( zSign && ( roundingMode == float_round_up ) )
823
|| ( ! zSign && ( roundingMode == float_round_down ) )
825
return packFloatx80( zSign, 0x7FFE, ~ roundMask );
827
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
831
( STATUS(float_detect_tininess) == float_tininess_before_rounding )
834
|| ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
835
shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
837
if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
838
if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
839
switch (roundingMode) {
840
case float_round_nearest_even:
841
increment = ((int64_t)zSig1 < 0);
843
case float_round_to_zero:
847
increment = !zSign && zSig1;
849
case float_round_down:
850
increment = zSign && zSig1;
858
~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
859
if ( (int64_t) zSig0 < 0 ) zExp = 1;
861
return packFloatx80( zSign, zExp, zSig0 );
864
if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
869
zSig0 = LIT64( 0x8000000000000000 );
872
zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
876
if ( zSig0 == 0 ) zExp = 0;
878
return packFloatx80( zSign, zExp, zSig0 );
882
/*----------------------------------------------------------------------------
883
| Takes an abstract floating-point value having sign `zSign', exponent
884
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
885
| and returns the proper extended double-precision floating-point value
886
| corresponding to the abstract input. This routine is just like
887
| `roundAndPackFloatx80' except that the input significand does not have to be
889
*----------------------------------------------------------------------------*/
892
normalizeRoundAndPackFloatx80(
893
int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
903
shiftCount = countLeadingZeros64( zSig0 );
904
shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
907
roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
911
/*----------------------------------------------------------------------------
912
| Returns the least-significant 64 fraction bits of the quadruple-precision
913
| floating-point value `a'.
914
*----------------------------------------------------------------------------*/
916
INLINE uint64_t extractFloat128Frac1( float128 a )
923
/*----------------------------------------------------------------------------
924
| Returns the most-significant 48 fraction bits of the quadruple-precision
925
| floating-point value `a'.
926
*----------------------------------------------------------------------------*/
928
INLINE uint64_t extractFloat128Frac0( float128 a )
931
return a.high & LIT64( 0x0000FFFFFFFFFFFF );
935
/*----------------------------------------------------------------------------
936
| Returns the exponent bits of the quadruple-precision floating-point value
938
*----------------------------------------------------------------------------*/
940
INLINE int32 extractFloat128Exp( float128 a )
943
return ( a.high>>48 ) & 0x7FFF;
947
/*----------------------------------------------------------------------------
948
| Returns the sign bit of the quadruple-precision floating-point value `a'.
949
*----------------------------------------------------------------------------*/
951
INLINE flag extractFloat128Sign( float128 a )
958
/*----------------------------------------------------------------------------
959
| Normalizes the subnormal quadruple-precision floating-point value
960
| represented by the denormalized significand formed by the concatenation of
961
| `aSig0' and `aSig1'. The normalized exponent is stored at the location
962
| pointed to by `zExpPtr'. The most significant 49 bits of the normalized
963
| significand are stored at the location pointed to by `zSig0Ptr', and the
964
| least significant 64 bits of the normalized significand are stored at the
965
| location pointed to by `zSig1Ptr'.
966
*----------------------------------------------------------------------------*/
969
normalizeFloat128Subnormal(
980
shiftCount = countLeadingZeros64( aSig1 ) - 15;
981
if ( shiftCount < 0 ) {
982
*zSig0Ptr = aSig1>>( - shiftCount );
983
*zSig1Ptr = aSig1<<( shiftCount & 63 );
986
*zSig0Ptr = aSig1<<shiftCount;
989
*zExpPtr = - shiftCount - 63;
992
shiftCount = countLeadingZeros64( aSig0 ) - 15;
993
shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
994
*zExpPtr = 1 - shiftCount;
999
/*----------------------------------------------------------------------------
1000
| Packs the sign `zSign', the exponent `zExp', and the significand formed
1001
| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1002
| floating-point value, returning the result. After being shifted into the
1003
| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1004
| added together to form the most significant 32 bits of the result. This
1005
| means that any integer portion of `zSig0' will be added into the exponent.
1006
| Since a properly normalized significand will have an integer portion equal
1007
| to 1, the `zExp' input should be 1 less than the desired result exponent
1008
| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1010
*----------------------------------------------------------------------------*/
1013
packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
1018
z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
1023
/*----------------------------------------------------------------------------
1024
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1025
| and extended significand formed by the concatenation of `zSig0', `zSig1',
1026
| and `zSig2', and returns the proper quadruple-precision floating-point value
1027
| corresponding to the abstract input. Ordinarily, the abstract value is
1028
| simply rounded and packed into the quadruple-precision format, with the
1029
| inexact exception raised if the abstract input cannot be represented
1030
| exactly. However, if the abstract value is too large, the overflow and
1031
| inexact exceptions are raised and an infinity or maximal finite value is
1032
| returned. If the abstract value is too small, the input value is rounded to
1033
| a subnormal number, and the underflow and inexact exceptions are raised if
1034
| the abstract input cannot be represented exactly as a subnormal quadruple-
1035
| precision floating-point number.
1036
| The input significand must be normalized or smaller. If the input
1037
| significand is not normalized, `zExp' must be 0; in that case, the result
1038
| returned is a subnormal number, and it must not require rounding. In the
1039
| usual case that the input significand is normalized, `zExp' must be 1 less
1040
| than the ``true'' floating-point exponent. The handling of underflow and
1041
| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1042
*----------------------------------------------------------------------------*/
1045
roundAndPackFloat128(
1046
flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
1049
flag roundNearestEven, increment, isTiny;
1051
roundingMode = STATUS(float_rounding_mode);
1052
roundNearestEven = ( roundingMode == float_round_nearest_even );
1053
switch (roundingMode) {
1054
case float_round_nearest_even:
1055
increment = ((int64_t)zSig2 < 0);
1057
case float_round_to_zero:
1060
case float_round_up:
1061
increment = !zSign && zSig2;
1063
case float_round_down:
1064
increment = zSign && zSig2;
1069
if ( 0x7FFD <= (uint32_t) zExp ) {
1070
if ( ( 0x7FFD < zExp )
1071
|| ( ( zExp == 0x7FFD )
1073
LIT64( 0x0001FFFFFFFFFFFF ),
1074
LIT64( 0xFFFFFFFFFFFFFFFF ),
1081
float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1082
if ( ( roundingMode == float_round_to_zero )
1083
|| ( zSign && ( roundingMode == float_round_up ) )
1084
|| ( ! zSign && ( roundingMode == float_round_down ) )
1090
LIT64( 0x0000FFFFFFFFFFFF ),
1091
LIT64( 0xFFFFFFFFFFFFFFFF )
1094
return packFloat128( zSign, 0x7FFF, 0, 0 );
1097
if (STATUS(flush_to_zero)) {
1098
float_raise(float_flag_output_denormal STATUS_VAR);
1099
return packFloat128(zSign, 0, 0, 0);
1102
( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1108
LIT64( 0x0001FFFFFFFFFFFF ),
1109
LIT64( 0xFFFFFFFFFFFFFFFF )
1111
shift128ExtraRightJamming(
1112
zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1114
if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1115
switch (roundingMode) {
1116
case float_round_nearest_even:
1117
increment = ((int64_t)zSig2 < 0);
1119
case float_round_to_zero:
1122
case float_round_up:
1123
increment = !zSign && zSig2;
1125
case float_round_down:
1126
increment = zSign && zSig2;
1133
if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1135
add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1136
zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1139
if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1141
return packFloat128( zSign, zExp, zSig0, zSig1 );
1145
/*----------------------------------------------------------------------------
1146
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1147
| and significand formed by the concatenation of `zSig0' and `zSig1', and
1148
| returns the proper quadruple-precision floating-point value corresponding
1149
| to the abstract input. This routine is just like `roundAndPackFloat128'
1150
| except that the input significand has fewer bits and does not have to be
1151
| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1153
*----------------------------------------------------------------------------*/
1156
normalizeRoundAndPackFloat128(
1157
flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1167
shiftCount = countLeadingZeros64( zSig0 ) - 15;
1168
if ( 0 <= shiftCount ) {
1170
shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1173
shift128ExtraRightJamming(
1174
zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1177
return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1181
/*----------------------------------------------------------------------------
1182
| Returns the result of converting the 32-bit two's complement integer `a'
1183
| to the single-precision floating-point format. The conversion is performed
1184
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1185
*----------------------------------------------------------------------------*/
1187
float32 int32_to_float32(int32_t a STATUS_PARAM)
1191
if ( a == 0 ) return float32_zero;
1192
if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1194
return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1198
/*----------------------------------------------------------------------------
1199
| Returns the result of converting the 32-bit two's complement integer `a'
1200
| to the double-precision floating-point format. The conversion is performed
1201
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1202
*----------------------------------------------------------------------------*/
1204
float64 int32_to_float64(int32_t a STATUS_PARAM)
1211
if ( a == 0 ) return float64_zero;
1213
absA = zSign ? - a : a;
1214
shiftCount = countLeadingZeros32( absA ) + 21;
1216
return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1220
/*----------------------------------------------------------------------------
1221
| Returns the result of converting the 32-bit two's complement integer `a'
1222
| to the extended double-precision floating-point format. The conversion
1223
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1225
*----------------------------------------------------------------------------*/
1227
floatx80 int32_to_floatx80(int32_t a STATUS_PARAM)
1234
if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1236
absA = zSign ? - a : a;
1237
shiftCount = countLeadingZeros32( absA ) + 32;
1239
return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1243
/*----------------------------------------------------------------------------
1244
| Returns the result of converting the 32-bit two's complement integer `a' to
1245
| the quadruple-precision floating-point format. The conversion is performed
1246
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1247
*----------------------------------------------------------------------------*/
1249
float128 int32_to_float128(int32_t a STATUS_PARAM)
1256
if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1258
absA = zSign ? - a : a;
1259
shiftCount = countLeadingZeros32( absA ) + 17;
1261
return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1265
/*----------------------------------------------------------------------------
1266
| Returns the result of converting the 64-bit two's complement integer `a'
1267
| to the single-precision floating-point format. The conversion is performed
1268
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1269
*----------------------------------------------------------------------------*/
1271
float32 int64_to_float32(int64_t a STATUS_PARAM)
1277
if ( a == 0 ) return float32_zero;
1279
absA = zSign ? - a : a;
1280
shiftCount = countLeadingZeros64( absA ) - 40;
1281
if ( 0 <= shiftCount ) {
1282
return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1286
if ( shiftCount < 0 ) {
1287
shift64RightJamming( absA, - shiftCount, &absA );
1290
absA <<= shiftCount;
1292
return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1297
float32 uint64_to_float32(uint64_t a STATUS_PARAM)
1301
if ( a == 0 ) return float32_zero;
1302
shiftCount = countLeadingZeros64( a ) - 40;
1303
if ( 0 <= shiftCount ) {
1304
return packFloat32(0, 0x95 - shiftCount, a<<shiftCount);
1308
if ( shiftCount < 0 ) {
1309
shift64RightJamming( a, - shiftCount, &a );
1314
return roundAndPackFloat32(0, 0x9C - shiftCount, a STATUS_VAR);
1318
/*----------------------------------------------------------------------------
1319
| Returns the result of converting the 64-bit two's complement integer `a'
1320
| to the double-precision floating-point format. The conversion is performed
1321
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1322
*----------------------------------------------------------------------------*/
1324
float64 int64_to_float64(int64_t a STATUS_PARAM)
1328
if ( a == 0 ) return float64_zero;
1329
if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1330
return packFloat64( 1, 0x43E, 0 );
1333
return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1337
float64 uint64_to_float64(uint64_t a STATUS_PARAM)
1342
return float64_zero;
1344
if ((int64_t)a < 0) {
1345
shift64RightJamming(a, 1, &a);
1348
return normalizeRoundAndPackFloat64(0, exp, a STATUS_VAR);
1351
/*----------------------------------------------------------------------------
1352
| Returns the result of converting the 64-bit two's complement integer `a'
1353
| to the extended double-precision floating-point format. The conversion
1354
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1356
*----------------------------------------------------------------------------*/
1358
floatx80 int64_to_floatx80(int64_t a STATUS_PARAM)
1364
if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1366
absA = zSign ? - a : a;
1367
shiftCount = countLeadingZeros64( absA );
1368
return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1372
/*----------------------------------------------------------------------------
1373
| Returns the result of converting the 64-bit two's complement integer `a' to
1374
| the quadruple-precision floating-point format. The conversion is performed
1375
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1376
*----------------------------------------------------------------------------*/
1378
float128 int64_to_float128(int64_t a STATUS_PARAM)
1384
uint64_t zSig0, zSig1;
1386
if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1388
absA = zSign ? - a : a;
1389
shiftCount = countLeadingZeros64( absA ) + 49;
1390
zExp = 0x406E - shiftCount;
1391
if ( 64 <= shiftCount ) {
1400
shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1401
return packFloat128( zSign, zExp, zSig0, zSig1 );
1405
float128 uint64_to_float128(uint64_t a STATUS_PARAM)
1408
return float128_zero;
1410
return normalizeRoundAndPackFloat128(0, 0x406E, a, 0 STATUS_VAR);
1413
/*----------------------------------------------------------------------------
1414
| Returns the result of converting the single-precision floating-point value
1415
| `a' to the 32-bit two's complement integer format. The conversion is
1416
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1417
| Arithmetic---which means in particular that the conversion is rounded
1418
| according to the current rounding mode. If `a' is a NaN, the largest
1419
| positive integer is returned. Otherwise, if the conversion overflows, the
1420
| largest integer with the same sign as `a' is returned.
1421
*----------------------------------------------------------------------------*/
1423
int32 float32_to_int32( float32 a STATUS_PARAM )
1426
int_fast16_t aExp, shiftCount;
1430
a = float32_squash_input_denormal(a STATUS_VAR);
1431
aSig = extractFloat32Frac( a );
1432
aExp = extractFloat32Exp( a );
1433
aSign = extractFloat32Sign( a );
1434
if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1435
if ( aExp ) aSig |= 0x00800000;
1436
shiftCount = 0xAF - aExp;
1439
if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1440
return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1444
/*----------------------------------------------------------------------------
1445
| Returns the result of converting the single-precision floating-point value
1446
| `a' to the 32-bit two's complement integer format. The conversion is
1447
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1448
| Arithmetic, except that the conversion is always rounded toward zero.
1449
| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1450
| the conversion overflows, the largest integer with the same sign as `a' is
1452
*----------------------------------------------------------------------------*/
1454
int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1457
int_fast16_t aExp, shiftCount;
1460
a = float32_squash_input_denormal(a STATUS_VAR);
1462
aSig = extractFloat32Frac( a );
1463
aExp = extractFloat32Exp( a );
1464
aSign = extractFloat32Sign( a );
1465
shiftCount = aExp - 0x9E;
1466
if ( 0 <= shiftCount ) {
1467
if ( float32_val(a) != 0xCF000000 ) {
1468
float_raise( float_flag_invalid STATUS_VAR);
1469
if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1471
return (int32_t) 0x80000000;
1473
else if ( aExp <= 0x7E ) {
1474
if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1477
aSig = ( aSig | 0x00800000 )<<8;
1478
z = aSig>>( - shiftCount );
1479
if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1480
STATUS(float_exception_flags) |= float_flag_inexact;
1482
if ( aSign ) z = - z;
1487
/*----------------------------------------------------------------------------
1488
| Returns the result of converting the single-precision floating-point value
1489
| `a' to the 16-bit two's complement integer format. The conversion is
1490
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1491
| Arithmetic, except that the conversion is always rounded toward zero.
1492
| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1493
| the conversion overflows, the largest integer with the same sign as `a' is
1495
*----------------------------------------------------------------------------*/
1497
int_fast16_t float32_to_int16_round_to_zero(float32 a STATUS_PARAM)
1500
int_fast16_t aExp, shiftCount;
1504
aSig = extractFloat32Frac( a );
1505
aExp = extractFloat32Exp( a );
1506
aSign = extractFloat32Sign( a );
1507
shiftCount = aExp - 0x8E;
1508
if ( 0 <= shiftCount ) {
1509
if ( float32_val(a) != 0xC7000000 ) {
1510
float_raise( float_flag_invalid STATUS_VAR);
1511
if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1515
return (int32_t) 0xffff8000;
1517
else if ( aExp <= 0x7E ) {
1518
if ( aExp | aSig ) {
1519
STATUS(float_exception_flags) |= float_flag_inexact;
1524
aSig = ( aSig | 0x00800000 )<<8;
1525
z = aSig>>( - shiftCount );
1526
if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1527
STATUS(float_exception_flags) |= float_flag_inexact;
1536
/*----------------------------------------------------------------------------
1537
| Returns the result of converting the single-precision floating-point value
1538
| `a' to the 64-bit two's complement integer format. The conversion is
1539
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1540
| Arithmetic---which means in particular that the conversion is rounded
1541
| according to the current rounding mode. If `a' is a NaN, the largest
1542
| positive integer is returned. Otherwise, if the conversion overflows, the
1543
| largest integer with the same sign as `a' is returned.
1544
*----------------------------------------------------------------------------*/
1546
int64 float32_to_int64( float32 a STATUS_PARAM )
1549
int_fast16_t aExp, shiftCount;
1551
uint64_t aSig64, aSigExtra;
1552
a = float32_squash_input_denormal(a STATUS_VAR);
1554
aSig = extractFloat32Frac( a );
1555
aExp = extractFloat32Exp( a );
1556
aSign = extractFloat32Sign( a );
1557
shiftCount = 0xBE - aExp;
1558
if ( shiftCount < 0 ) {
1559
float_raise( float_flag_invalid STATUS_VAR);
1560
if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1561
return LIT64( 0x7FFFFFFFFFFFFFFF );
1563
return (int64_t) LIT64( 0x8000000000000000 );
1565
if ( aExp ) aSig |= 0x00800000;
1568
shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1569
return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1573
/*----------------------------------------------------------------------------
1574
| Returns the result of converting the single-precision floating-point value
1575
| `a' to the 64-bit unsigned integer format. The conversion is
1576
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1577
| Arithmetic---which means in particular that the conversion is rounded
1578
| according to the current rounding mode. If `a' is a NaN, the largest
1579
| unsigned integer is returned. Otherwise, if the conversion overflows, the
1580
| largest unsigned integer is returned. If the 'a' is negative, the result
1581
| is rounded and zero is returned; values that do not round to zero will
1582
| raise the inexact exception flag.
1583
*----------------------------------------------------------------------------*/
1585
uint64 float32_to_uint64(float32 a STATUS_PARAM)
1588
int_fast16_t aExp, shiftCount;
1590
uint64_t aSig64, aSigExtra;
1591
a = float32_squash_input_denormal(a STATUS_VAR);
1593
aSig = extractFloat32Frac(a);
1594
aExp = extractFloat32Exp(a);
1595
aSign = extractFloat32Sign(a);
1596
if ((aSign) && (aExp > 126)) {
1597
float_raise(float_flag_invalid STATUS_VAR);
1598
if (float32_is_any_nan(a)) {
1599
return LIT64(0xFFFFFFFFFFFFFFFF);
1604
shiftCount = 0xBE - aExp;
1608
if (shiftCount < 0) {
1609
float_raise(float_flag_invalid STATUS_VAR);
1610
return LIT64(0xFFFFFFFFFFFFFFFF);
1615
shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
1616
return roundAndPackUint64(aSign, aSig64, aSigExtra STATUS_VAR);
1619
/*----------------------------------------------------------------------------
1620
| Returns the result of converting the single-precision floating-point value
1621
| `a' to the 64-bit two's complement integer format. The conversion is
1622
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1623
| Arithmetic, except that the conversion is always rounded toward zero. If
1624
| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1625
| conversion overflows, the largest integer with the same sign as `a' is
1627
*----------------------------------------------------------------------------*/
1629
int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1632
int_fast16_t aExp, shiftCount;
1636
a = float32_squash_input_denormal(a STATUS_VAR);
1638
aSig = extractFloat32Frac( a );
1639
aExp = extractFloat32Exp( a );
1640
aSign = extractFloat32Sign( a );
1641
shiftCount = aExp - 0xBE;
1642
if ( 0 <= shiftCount ) {
1643
if ( float32_val(a) != 0xDF000000 ) {
1644
float_raise( float_flag_invalid STATUS_VAR);
1645
if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1646
return LIT64( 0x7FFFFFFFFFFFFFFF );
1649
return (int64_t) LIT64( 0x8000000000000000 );
1651
else if ( aExp <= 0x7E ) {
1652
if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1655
aSig64 = aSig | 0x00800000;
1657
z = aSig64>>( - shiftCount );
1658
if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1659
STATUS(float_exception_flags) |= float_flag_inexact;
1661
if ( aSign ) z = - z;
1666
/*----------------------------------------------------------------------------
1667
| Returns the result of converting the single-precision floating-point value
1668
| `a' to the double-precision floating-point format. The conversion is
1669
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1671
*----------------------------------------------------------------------------*/
1673
float64 float32_to_float64( float32 a STATUS_PARAM )
1678
a = float32_squash_input_denormal(a STATUS_VAR);
1680
aSig = extractFloat32Frac( a );
1681
aExp = extractFloat32Exp( a );
1682
aSign = extractFloat32Sign( a );
1683
if ( aExp == 0xFF ) {
1684
if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1685
return packFloat64( aSign, 0x7FF, 0 );
1688
if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1689
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1692
return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1696
/*----------------------------------------------------------------------------
1697
| Returns the result of converting the single-precision floating-point value
1698
| `a' to the extended double-precision floating-point format. The conversion
1699
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1701
*----------------------------------------------------------------------------*/
1703
floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1709
a = float32_squash_input_denormal(a STATUS_VAR);
1710
aSig = extractFloat32Frac( a );
1711
aExp = extractFloat32Exp( a );
1712
aSign = extractFloat32Sign( a );
1713
if ( aExp == 0xFF ) {
1714
if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1715
return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1718
if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1719
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1722
return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1726
/*----------------------------------------------------------------------------
1727
| Returns the result of converting the single-precision floating-point value
1728
| `a' to the double-precision floating-point format. The conversion is
1729
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1731
*----------------------------------------------------------------------------*/
1733
float128 float32_to_float128( float32 a STATUS_PARAM )
1739
a = float32_squash_input_denormal(a STATUS_VAR);
1740
aSig = extractFloat32Frac( a );
1741
aExp = extractFloat32Exp( a );
1742
aSign = extractFloat32Sign( a );
1743
if ( aExp == 0xFF ) {
1744
if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1745
return packFloat128( aSign, 0x7FFF, 0, 0 );
1748
if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1749
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1752
return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1756
/*----------------------------------------------------------------------------
1757
| Rounds the single-precision floating-point value `a' to an integer, and
1758
| returns the result as a single-precision floating-point value. The
1759
| operation is performed according to the IEC/IEEE Standard for Binary
1760
| Floating-Point Arithmetic.
1761
*----------------------------------------------------------------------------*/
1763
float32 float32_round_to_int( float32 a STATUS_PARAM)
1767
uint32_t lastBitMask, roundBitsMask;
1769
a = float32_squash_input_denormal(a STATUS_VAR);
1771
aExp = extractFloat32Exp( a );
1772
if ( 0x96 <= aExp ) {
1773
if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1774
return propagateFloat32NaN( a, a STATUS_VAR );
1778
if ( aExp <= 0x7E ) {
1779
if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1780
STATUS(float_exception_flags) |= float_flag_inexact;
1781
aSign = extractFloat32Sign( a );
1782
switch ( STATUS(float_rounding_mode) ) {
1783
case float_round_nearest_even:
1784
if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1785
return packFloat32( aSign, 0x7F, 0 );
1788
case float_round_down:
1789
return make_float32(aSign ? 0xBF800000 : 0);
1790
case float_round_up:
1791
return make_float32(aSign ? 0x80000000 : 0x3F800000);
1793
return packFloat32( aSign, 0, 0 );
1796
lastBitMask <<= 0x96 - aExp;
1797
roundBitsMask = lastBitMask - 1;
1799
switch (STATUS(float_rounding_mode)) {
1800
case float_round_nearest_even:
1801
z += lastBitMask>>1;
1802
if ((z & roundBitsMask) == 0) {
1806
case float_round_to_zero:
1808
case float_round_up:
1809
if (!extractFloat32Sign(make_float32(z))) {
1813
case float_round_down:
1814
if (extractFloat32Sign(make_float32(z))) {
1821
z &= ~ roundBitsMask;
1822
if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1823
return make_float32(z);
1827
/*----------------------------------------------------------------------------
1828
| Returns the result of adding the absolute values of the single-precision
1829
| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1830
| before being returned. `zSign' is ignored if the result is a NaN.
1831
| The addition is performed according to the IEC/IEEE Standard for Binary
1832
| Floating-Point Arithmetic.
1833
*----------------------------------------------------------------------------*/
1835
static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1837
int_fast16_t aExp, bExp, zExp;
1838
uint32_t aSig, bSig, zSig;
1839
int_fast16_t expDiff;
1841
aSig = extractFloat32Frac( a );
1842
aExp = extractFloat32Exp( a );
1843
bSig = extractFloat32Frac( b );
1844
bExp = extractFloat32Exp( b );
1845
expDiff = aExp - bExp;
1848
if ( 0 < expDiff ) {
1849
if ( aExp == 0xFF ) {
1850
if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1859
shift32RightJamming( bSig, expDiff, &bSig );
1862
else if ( expDiff < 0 ) {
1863
if ( bExp == 0xFF ) {
1864
if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1865
return packFloat32( zSign, 0xFF, 0 );
1873
shift32RightJamming( aSig, - expDiff, &aSig );
1877
if ( aExp == 0xFF ) {
1878
if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1882
if (STATUS(flush_to_zero)) {
1884
float_raise(float_flag_output_denormal STATUS_VAR);
1886
return packFloat32(zSign, 0, 0);
1888
return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1890
zSig = 0x40000000 + aSig + bSig;
1895
zSig = ( aSig + bSig )<<1;
1897
if ( (int32_t) zSig < 0 ) {
1902
return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1906
/*----------------------------------------------------------------------------
1907
| Returns the result of subtracting the absolute values of the single-
1908
| precision floating-point values `a' and `b'. If `zSign' is 1, the
1909
| difference is negated before being returned. `zSign' is ignored if the
1910
| result is a NaN. The subtraction is performed according to the IEC/IEEE
1911
| Standard for Binary Floating-Point Arithmetic.
1912
*----------------------------------------------------------------------------*/
1914
static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1916
int_fast16_t aExp, bExp, zExp;
1917
uint32_t aSig, bSig, zSig;
1918
int_fast16_t expDiff;
1920
aSig = extractFloat32Frac( a );
1921
aExp = extractFloat32Exp( a );
1922
bSig = extractFloat32Frac( b );
1923
bExp = extractFloat32Exp( b );
1924
expDiff = aExp - bExp;
1927
if ( 0 < expDiff ) goto aExpBigger;
1928
if ( expDiff < 0 ) goto bExpBigger;
1929
if ( aExp == 0xFF ) {
1930
if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1931
float_raise( float_flag_invalid STATUS_VAR);
1932
return float32_default_nan;
1938
if ( bSig < aSig ) goto aBigger;
1939
if ( aSig < bSig ) goto bBigger;
1940
return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1942
if ( bExp == 0xFF ) {
1943
if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1944
return packFloat32( zSign ^ 1, 0xFF, 0 );
1952
shift32RightJamming( aSig, - expDiff, &aSig );
1958
goto normalizeRoundAndPack;
1960
if ( aExp == 0xFF ) {
1961
if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1970
shift32RightJamming( bSig, expDiff, &bSig );
1975
normalizeRoundAndPack:
1977
return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1981
/*----------------------------------------------------------------------------
1982
| Returns the result of adding the single-precision floating-point values `a'
1983
| and `b'. The operation is performed according to the IEC/IEEE Standard for
1984
| Binary Floating-Point Arithmetic.
1985
*----------------------------------------------------------------------------*/
1987
float32 float32_add( float32 a, float32 b STATUS_PARAM )
1990
a = float32_squash_input_denormal(a STATUS_VAR);
1991
b = float32_squash_input_denormal(b STATUS_VAR);
1993
aSign = extractFloat32Sign( a );
1994
bSign = extractFloat32Sign( b );
1995
if ( aSign == bSign ) {
1996
return addFloat32Sigs( a, b, aSign STATUS_VAR);
1999
return subFloat32Sigs( a, b, aSign STATUS_VAR );
2004
/*----------------------------------------------------------------------------
2005
| Returns the result of subtracting the single-precision floating-point values
2006
| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2007
| for Binary Floating-Point Arithmetic.
2008
*----------------------------------------------------------------------------*/
2010
float32 float32_sub( float32 a, float32 b STATUS_PARAM )
2013
a = float32_squash_input_denormal(a STATUS_VAR);
2014
b = float32_squash_input_denormal(b STATUS_VAR);
2016
aSign = extractFloat32Sign( a );
2017
bSign = extractFloat32Sign( b );
2018
if ( aSign == bSign ) {
2019
return subFloat32Sigs( a, b, aSign STATUS_VAR );
2022
return addFloat32Sigs( a, b, aSign STATUS_VAR );
2027
/*----------------------------------------------------------------------------
2028
| Returns the result of multiplying the single-precision floating-point values
2029
| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2030
| for Binary Floating-Point Arithmetic.
2031
*----------------------------------------------------------------------------*/
2033
float32 float32_mul( float32 a, float32 b STATUS_PARAM )
2035
flag aSign, bSign, zSign;
2036
int_fast16_t aExp, bExp, zExp;
2037
uint32_t aSig, bSig;
2041
a = float32_squash_input_denormal(a STATUS_VAR);
2042
b = float32_squash_input_denormal(b STATUS_VAR);
2044
aSig = extractFloat32Frac( a );
2045
aExp = extractFloat32Exp( a );
2046
aSign = extractFloat32Sign( a );
2047
bSig = extractFloat32Frac( b );
2048
bExp = extractFloat32Exp( b );
2049
bSign = extractFloat32Sign( b );
2050
zSign = aSign ^ bSign;
2051
if ( aExp == 0xFF ) {
2052
if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2053
return propagateFloat32NaN( a, b STATUS_VAR );
2055
if ( ( bExp | bSig ) == 0 ) {
2056
float_raise( float_flag_invalid STATUS_VAR);
2057
return float32_default_nan;
2059
return packFloat32( zSign, 0xFF, 0 );
2061
if ( bExp == 0xFF ) {
2062
if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2063
if ( ( aExp | aSig ) == 0 ) {
2064
float_raise( float_flag_invalid STATUS_VAR);
2065
return float32_default_nan;
2067
return packFloat32( zSign, 0xFF, 0 );
2070
if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2071
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2074
if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2075
normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2077
zExp = aExp + bExp - 0x7F;
2078
aSig = ( aSig | 0x00800000 )<<7;
2079
bSig = ( bSig | 0x00800000 )<<8;
2080
shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
2082
if ( 0 <= (int32_t) ( zSig<<1 ) ) {
2086
return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2090
/*----------------------------------------------------------------------------
2091
| Returns the result of dividing the single-precision floating-point value `a'
2092
| by the corresponding value `b'. The operation is performed according to the
2093
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2094
*----------------------------------------------------------------------------*/
2096
float32 float32_div( float32 a, float32 b STATUS_PARAM )
2098
flag aSign, bSign, zSign;
2099
int_fast16_t aExp, bExp, zExp;
2100
uint32_t aSig, bSig, zSig;
2101
a = float32_squash_input_denormal(a STATUS_VAR);
2102
b = float32_squash_input_denormal(b STATUS_VAR);
2104
aSig = extractFloat32Frac( a );
2105
aExp = extractFloat32Exp( a );
2106
aSign = extractFloat32Sign( a );
2107
bSig = extractFloat32Frac( b );
2108
bExp = extractFloat32Exp( b );
2109
bSign = extractFloat32Sign( b );
2110
zSign = aSign ^ bSign;
2111
if ( aExp == 0xFF ) {
2112
if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2113
if ( bExp == 0xFF ) {
2114
if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2115
float_raise( float_flag_invalid STATUS_VAR);
2116
return float32_default_nan;
2118
return packFloat32( zSign, 0xFF, 0 );
2120
if ( bExp == 0xFF ) {
2121
if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2122
return packFloat32( zSign, 0, 0 );
2126
if ( ( aExp | aSig ) == 0 ) {
2127
float_raise( float_flag_invalid STATUS_VAR);
2128
return float32_default_nan;
2130
float_raise( float_flag_divbyzero STATUS_VAR);
2131
return packFloat32( zSign, 0xFF, 0 );
2133
normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2136
if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2137
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2139
zExp = aExp - bExp + 0x7D;
2140
aSig = ( aSig | 0x00800000 )<<7;
2141
bSig = ( bSig | 0x00800000 )<<8;
2142
if ( bSig <= ( aSig + aSig ) ) {
2146
zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2147
if ( ( zSig & 0x3F ) == 0 ) {
2148
zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2150
return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2154
/*----------------------------------------------------------------------------
2155
| Returns the remainder of the single-precision floating-point value `a'
2156
| with respect to the corresponding value `b'. The operation is performed
2157
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2158
*----------------------------------------------------------------------------*/
2160
float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2163
int_fast16_t aExp, bExp, expDiff;
2164
uint32_t aSig, bSig;
2166
uint64_t aSig64, bSig64, q64;
2167
uint32_t alternateASig;
2169
a = float32_squash_input_denormal(a STATUS_VAR);
2170
b = float32_squash_input_denormal(b STATUS_VAR);
2172
aSig = extractFloat32Frac( a );
2173
aExp = extractFloat32Exp( a );
2174
aSign = extractFloat32Sign( a );
2175
bSig = extractFloat32Frac( b );
2176
bExp = extractFloat32Exp( b );
2177
if ( aExp == 0xFF ) {
2178
if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2179
return propagateFloat32NaN( a, b STATUS_VAR );
2181
float_raise( float_flag_invalid STATUS_VAR);
2182
return float32_default_nan;
2184
if ( bExp == 0xFF ) {
2185
if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2190
float_raise( float_flag_invalid STATUS_VAR);
2191
return float32_default_nan;
2193
normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2196
if ( aSig == 0 ) return a;
2197
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2199
expDiff = aExp - bExp;
2202
if ( expDiff < 32 ) {
2205
if ( expDiff < 0 ) {
2206
if ( expDiff < -1 ) return a;
2209
q = ( bSig <= aSig );
2210
if ( q ) aSig -= bSig;
2211
if ( 0 < expDiff ) {
2212
q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2215
aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2223
if ( bSig <= aSig ) aSig -= bSig;
2224
aSig64 = ( (uint64_t) aSig )<<40;
2225
bSig64 = ( (uint64_t) bSig )<<40;
2227
while ( 0 < expDiff ) {
2228
q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2229
q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2230
aSig64 = - ( ( bSig * q64 )<<38 );
2234
q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2235
q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2236
q = q64>>( 64 - expDiff );
2238
aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2241
alternateASig = aSig;
2244
} while ( 0 <= (int32_t) aSig );
2245
sigMean = aSig + alternateASig;
2246
if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2247
aSig = alternateASig;
2249
zSign = ( (int32_t) aSig < 0 );
2250
if ( zSign ) aSig = - aSig;
2251
return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2255
/*----------------------------------------------------------------------------
2256
| Returns the result of multiplying the single-precision floating-point values
2257
| `a' and `b' then adding 'c', with no intermediate rounding step after the
2258
| multiplication. The operation is performed according to the IEC/IEEE
2259
| Standard for Binary Floating-Point Arithmetic 754-2008.
2260
| The flags argument allows the caller to select negation of the
2261
| addend, the intermediate product, or the final result. (The difference
2262
| between this and having the caller do a separate negation is that negating
2263
| externally will flip the sign bit on NaNs.)
2264
*----------------------------------------------------------------------------*/
2266
float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)
2268
flag aSign, bSign, cSign, zSign;
2269
int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
2270
uint32_t aSig, bSig, cSig;
2271
flag pInf, pZero, pSign;
2272
uint64_t pSig64, cSig64, zSig64;
2275
flag signflip, infzero;
2277
a = float32_squash_input_denormal(a STATUS_VAR);
2278
b = float32_squash_input_denormal(b STATUS_VAR);
2279
c = float32_squash_input_denormal(c STATUS_VAR);
2280
aSig = extractFloat32Frac(a);
2281
aExp = extractFloat32Exp(a);
2282
aSign = extractFloat32Sign(a);
2283
bSig = extractFloat32Frac(b);
2284
bExp = extractFloat32Exp(b);
2285
bSign = extractFloat32Sign(b);
2286
cSig = extractFloat32Frac(c);
2287
cExp = extractFloat32Exp(c);
2288
cSign = extractFloat32Sign(c);
2290
infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2291
(aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2293
/* It is implementation-defined whether the cases of (0,inf,qnan)
2294
* and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2295
* they return if they do), so we have to hand this information
2296
* off to the target-specific pick-a-NaN routine.
2298
if (((aExp == 0xff) && aSig) ||
2299
((bExp == 0xff) && bSig) ||
2300
((cExp == 0xff) && cSig)) {
2301
return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR);
2305
float_raise(float_flag_invalid STATUS_VAR);
2306
return float32_default_nan;
2309
if (flags & float_muladd_negate_c) {
2313
signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2315
/* Work out the sign and type of the product */
2316
pSign = aSign ^ bSign;
2317
if (flags & float_muladd_negate_product) {
2320
pInf = (aExp == 0xff) || (bExp == 0xff);
2321
pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2324
if (pInf && (pSign ^ cSign)) {
2325
/* addition of opposite-signed infinities => InvalidOperation */
2326
float_raise(float_flag_invalid STATUS_VAR);
2327
return float32_default_nan;
2329
/* Otherwise generate an infinity of the same sign */
2330
return packFloat32(cSign ^ signflip, 0xff, 0);
2334
return packFloat32(pSign ^ signflip, 0xff, 0);
2340
/* Adding two exact zeroes */
2341
if (pSign == cSign) {
2343
} else if (STATUS(float_rounding_mode) == float_round_down) {
2348
return packFloat32(zSign ^ signflip, 0, 0);
2350
/* Exact zero plus a denorm */
2351
if (STATUS(flush_to_zero)) {
2352
float_raise(float_flag_output_denormal STATUS_VAR);
2353
return packFloat32(cSign ^ signflip, 0, 0);
2356
/* Zero plus something non-zero : just return the something */
2357
return packFloat32(cSign ^ signflip, cExp, cSig);
2361
normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2364
normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2367
/* Calculate the actual result a * b + c */
2369
/* Multiply first; this is easy. */
2370
/* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2371
* because we want the true exponent, not the "one-less-than"
2372
* flavour that roundAndPackFloat32() takes.
2374
pExp = aExp + bExp - 0x7e;
2375
aSig = (aSig | 0x00800000) << 7;
2376
bSig = (bSig | 0x00800000) << 8;
2377
pSig64 = (uint64_t)aSig * bSig;
2378
if ((int64_t)(pSig64 << 1) >= 0) {
2383
zSign = pSign ^ signflip;
2385
/* Now pSig64 is the significand of the multiply, with the explicit bit in
2390
/* Throw out the special case of c being an exact zero now */
2391
shift64RightJamming(pSig64, 32, &pSig64);
2393
return roundAndPackFloat32(zSign, pExp - 1,
2396
normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2399
cSig64 = (uint64_t)cSig << (62 - 23);
2400
cSig64 |= LIT64(0x4000000000000000);
2401
expDiff = pExp - cExp;
2403
if (pSign == cSign) {
2406
/* scale c to match p */
2407
shift64RightJamming(cSig64, expDiff, &cSig64);
2409
} else if (expDiff < 0) {
2410
/* scale p to match c */
2411
shift64RightJamming(pSig64, -expDiff, &pSig64);
2414
/* no scaling needed */
2417
/* Add significands and make sure explicit bit ends up in posn 62 */
2418
zSig64 = pSig64 + cSig64;
2419
if ((int64_t)zSig64 < 0) {
2420
shift64RightJamming(zSig64, 1, &zSig64);
2427
shift64RightJamming(cSig64, expDiff, &cSig64);
2428
zSig64 = pSig64 - cSig64;
2430
} else if (expDiff < 0) {
2431
shift64RightJamming(pSig64, -expDiff, &pSig64);
2432
zSig64 = cSig64 - pSig64;
2437
if (cSig64 < pSig64) {
2438
zSig64 = pSig64 - cSig64;
2439
} else if (pSig64 < cSig64) {
2440
zSig64 = cSig64 - pSig64;
2445
if (STATUS(float_rounding_mode) == float_round_down) {
2448
return packFloat32(zSign, 0, 0);
2452
/* Normalize to put the explicit bit back into bit 62. */
2453
shiftcount = countLeadingZeros64(zSig64) - 1;
2454
zSig64 <<= shiftcount;
2457
shift64RightJamming(zSig64, 32, &zSig64);
2458
return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);
2462
/*----------------------------------------------------------------------------
2463
| Returns the square root of the single-precision floating-point value `a'.
2464
| The operation is performed according to the IEC/IEEE Standard for Binary
2465
| Floating-Point Arithmetic.
2466
*----------------------------------------------------------------------------*/
2468
float32 float32_sqrt( float32 a STATUS_PARAM )
2471
int_fast16_t aExp, zExp;
2472
uint32_t aSig, zSig;
2474
a = float32_squash_input_denormal(a STATUS_VAR);
2476
aSig = extractFloat32Frac( a );
2477
aExp = extractFloat32Exp( a );
2478
aSign = extractFloat32Sign( a );
2479
if ( aExp == 0xFF ) {
2480
if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2481
if ( ! aSign ) return a;
2482
float_raise( float_flag_invalid STATUS_VAR);
2483
return float32_default_nan;
2486
if ( ( aExp | aSig ) == 0 ) return a;
2487
float_raise( float_flag_invalid STATUS_VAR);
2488
return float32_default_nan;
2491
if ( aSig == 0 ) return float32_zero;
2492
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2494
zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2495
aSig = ( aSig | 0x00800000 )<<8;
2496
zSig = estimateSqrt32( aExp, aSig ) + 2;
2497
if ( ( zSig & 0x7F ) <= 5 ) {
2503
term = ( (uint64_t) zSig ) * zSig;
2504
rem = ( ( (uint64_t) aSig )<<32 ) - term;
2505
while ( (int64_t) rem < 0 ) {
2507
rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2509
zSig |= ( rem != 0 );
2511
shift32RightJamming( zSig, 1, &zSig );
2513
return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2517
/*----------------------------------------------------------------------------
2518
| Returns the binary exponential of the single-precision floating-point value
2519
| `a'. The operation is performed according to the IEC/IEEE Standard for
2520
| Binary Floating-Point Arithmetic.
2522
| Uses the following identities:
2524
| 1. -------------------------------------------------------------------------
2528
| 2. -------------------------------------------------------------------------
2531
| e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2533
*----------------------------------------------------------------------------*/
2535
static const float64 float32_exp2_coefficients[15] =
2537
const_float64( 0x3ff0000000000000ll ), /* 1 */
2538
const_float64( 0x3fe0000000000000ll ), /* 2 */
2539
const_float64( 0x3fc5555555555555ll ), /* 3 */
2540
const_float64( 0x3fa5555555555555ll ), /* 4 */
2541
const_float64( 0x3f81111111111111ll ), /* 5 */
2542
const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2543
const_float64( 0x3f2a01a01a01a01all ), /* 7 */
2544
const_float64( 0x3efa01a01a01a01all ), /* 8 */
2545
const_float64( 0x3ec71de3a556c734ll ), /* 9 */
2546
const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2547
const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2548
const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2549
const_float64( 0x3de6124613a86d09ll ), /* 13 */
2550
const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2551
const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2554
float32 float32_exp2( float32 a STATUS_PARAM )
2561
a = float32_squash_input_denormal(a STATUS_VAR);
2563
aSig = extractFloat32Frac( a );
2564
aExp = extractFloat32Exp( a );
2565
aSign = extractFloat32Sign( a );
2567
if ( aExp == 0xFF) {
2568
if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2569
return (aSign) ? float32_zero : a;
2572
if (aSig == 0) return float32_one;
2575
float_raise( float_flag_inexact STATUS_VAR);
2577
/* ******************************* */
2578
/* using float64 for approximation */
2579
/* ******************************* */
2580
x = float32_to_float64(a STATUS_VAR);
2581
x = float64_mul(x, float64_ln2 STATUS_VAR);
2585
for (i = 0 ; i < 15 ; i++) {
2588
f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2589
r = float64_add(r, f STATUS_VAR);
2591
xn = float64_mul(xn, x STATUS_VAR);
2594
return float64_to_float32(r, status);
2597
/*----------------------------------------------------------------------------
2598
| Returns the binary log of the single-precision floating-point value `a'.
2599
| The operation is performed according to the IEC/IEEE Standard for Binary
2600
| Floating-Point Arithmetic.
2601
*----------------------------------------------------------------------------*/
2602
float32 float32_log2( float32 a STATUS_PARAM )
2606
uint32_t aSig, zSig, i;
2608
a = float32_squash_input_denormal(a STATUS_VAR);
2609
aSig = extractFloat32Frac( a );
2610
aExp = extractFloat32Exp( a );
2611
aSign = extractFloat32Sign( a );
2614
if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2615
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2618
float_raise( float_flag_invalid STATUS_VAR);
2619
return float32_default_nan;
2621
if ( aExp == 0xFF ) {
2622
if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2631
for (i = 1 << 22; i > 0; i >>= 1) {
2632
aSig = ( (uint64_t)aSig * aSig ) >> 23;
2633
if ( aSig & 0x01000000 ) {
2642
return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2645
/*----------------------------------------------------------------------------
2646
| Returns 1 if the single-precision floating-point value `a' is equal to
2647
| the corresponding value `b', and 0 otherwise. The invalid exception is
2648
| raised if either operand is a NaN. Otherwise, the comparison is performed
2649
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2650
*----------------------------------------------------------------------------*/
2652
int float32_eq( float32 a, float32 b STATUS_PARAM )
2655
a = float32_squash_input_denormal(a STATUS_VAR);
2656
b = float32_squash_input_denormal(b STATUS_VAR);
2658
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2659
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2661
float_raise( float_flag_invalid STATUS_VAR);
2664
av = float32_val(a);
2665
bv = float32_val(b);
2666
return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2669
/*----------------------------------------------------------------------------
2670
| Returns 1 if the single-precision floating-point value `a' is less than
2671
| or equal to the corresponding value `b', and 0 otherwise. The invalid
2672
| exception is raised if either operand is a NaN. The comparison is performed
2673
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2674
*----------------------------------------------------------------------------*/
2676
int float32_le( float32 a, float32 b STATUS_PARAM )
2680
a = float32_squash_input_denormal(a STATUS_VAR);
2681
b = float32_squash_input_denormal(b STATUS_VAR);
2683
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2684
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2686
float_raise( float_flag_invalid STATUS_VAR);
2689
aSign = extractFloat32Sign( a );
2690
bSign = extractFloat32Sign( b );
2691
av = float32_val(a);
2692
bv = float32_val(b);
2693
if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2694
return ( av == bv ) || ( aSign ^ ( av < bv ) );
2698
/*----------------------------------------------------------------------------
2699
| Returns 1 if the single-precision floating-point value `a' is less than
2700
| the corresponding value `b', and 0 otherwise. The invalid exception is
2701
| raised if either operand is a NaN. The comparison is performed according
2702
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2703
*----------------------------------------------------------------------------*/
2705
int float32_lt( float32 a, float32 b STATUS_PARAM )
2709
a = float32_squash_input_denormal(a STATUS_VAR);
2710
b = float32_squash_input_denormal(b STATUS_VAR);
2712
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2713
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2715
float_raise( float_flag_invalid STATUS_VAR);
2718
aSign = extractFloat32Sign( a );
2719
bSign = extractFloat32Sign( b );
2720
av = float32_val(a);
2721
bv = float32_val(b);
2722
if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2723
return ( av != bv ) && ( aSign ^ ( av < bv ) );
2727
/*----------------------------------------------------------------------------
2728
| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2729
| be compared, and 0 otherwise. The invalid exception is raised if either
2730
| operand is a NaN. The comparison is performed according to the IEC/IEEE
2731
| Standard for Binary Floating-Point Arithmetic.
2732
*----------------------------------------------------------------------------*/
2734
int float32_unordered( float32 a, float32 b STATUS_PARAM )
2736
a = float32_squash_input_denormal(a STATUS_VAR);
2737
b = float32_squash_input_denormal(b STATUS_VAR);
2739
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2740
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2742
float_raise( float_flag_invalid STATUS_VAR);
2748
/*----------------------------------------------------------------------------
2749
| Returns 1 if the single-precision floating-point value `a' is equal to
2750
| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2751
| exception. The comparison is performed according to the IEC/IEEE Standard
2752
| for Binary Floating-Point Arithmetic.
2753
*----------------------------------------------------------------------------*/
2755
int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
2757
a = float32_squash_input_denormal(a STATUS_VAR);
2758
b = float32_squash_input_denormal(b STATUS_VAR);
2760
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2761
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2763
if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2764
float_raise( float_flag_invalid STATUS_VAR);
2768
return ( float32_val(a) == float32_val(b) ) ||
2769
( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2772
/*----------------------------------------------------------------------------
2773
| Returns 1 if the single-precision floating-point value `a' is less than or
2774
| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2775
| cause an exception. Otherwise, the comparison is performed according to the
2776
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2777
*----------------------------------------------------------------------------*/
2779
int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2783
a = float32_squash_input_denormal(a STATUS_VAR);
2784
b = float32_squash_input_denormal(b STATUS_VAR);
2786
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2787
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2789
if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2790
float_raise( float_flag_invalid STATUS_VAR);
2794
aSign = extractFloat32Sign( a );
2795
bSign = extractFloat32Sign( b );
2796
av = float32_val(a);
2797
bv = float32_val(b);
2798
if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2799
return ( av == bv ) || ( aSign ^ ( av < bv ) );
2803
/*----------------------------------------------------------------------------
2804
| Returns 1 if the single-precision floating-point value `a' is less than
2805
| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2806
| exception. Otherwise, the comparison is performed according to the IEC/IEEE
2807
| Standard for Binary Floating-Point Arithmetic.
2808
*----------------------------------------------------------------------------*/
2810
int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2814
a = float32_squash_input_denormal(a STATUS_VAR);
2815
b = float32_squash_input_denormal(b STATUS_VAR);
2817
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2818
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2820
if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2821
float_raise( float_flag_invalid STATUS_VAR);
2825
aSign = extractFloat32Sign( a );
2826
bSign = extractFloat32Sign( b );
2827
av = float32_val(a);
2828
bv = float32_val(b);
2829
if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2830
return ( av != bv ) && ( aSign ^ ( av < bv ) );
2834
/*----------------------------------------------------------------------------
2835
| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2836
| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
2837
| comparison is performed according to the IEC/IEEE Standard for Binary
2838
| Floating-Point Arithmetic.
2839
*----------------------------------------------------------------------------*/
2841
int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2843
a = float32_squash_input_denormal(a STATUS_VAR);
2844
b = float32_squash_input_denormal(b STATUS_VAR);
2846
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2847
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2849
if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2850
float_raise( float_flag_invalid STATUS_VAR);
2857
/*----------------------------------------------------------------------------
2858
| Returns the result of converting the double-precision floating-point value
2859
| `a' to the 32-bit two's complement integer format. The conversion is
2860
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2861
| Arithmetic---which means in particular that the conversion is rounded
2862
| according to the current rounding mode. If `a' is a NaN, the largest
2863
| positive integer is returned. Otherwise, if the conversion overflows, the
2864
| largest integer with the same sign as `a' is returned.
2865
*----------------------------------------------------------------------------*/
2867
int32 float64_to_int32( float64 a STATUS_PARAM )
2870
int_fast16_t aExp, shiftCount;
2872
a = float64_squash_input_denormal(a STATUS_VAR);
2874
aSig = extractFloat64Frac( a );
2875
aExp = extractFloat64Exp( a );
2876
aSign = extractFloat64Sign( a );
2877
if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2878
if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2879
shiftCount = 0x42C - aExp;
2880
if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2881
return roundAndPackInt32( aSign, aSig STATUS_VAR );
2885
/*----------------------------------------------------------------------------
2886
| Returns the result of converting the double-precision floating-point value
2887
| `a' to the 32-bit two's complement integer format. The conversion is
2888
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2889
| Arithmetic, except that the conversion is always rounded toward zero.
2890
| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2891
| the conversion overflows, the largest integer with the same sign as `a' is
2893
*----------------------------------------------------------------------------*/
2895
int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2898
int_fast16_t aExp, shiftCount;
2899
uint64_t aSig, savedASig;
2901
a = float64_squash_input_denormal(a STATUS_VAR);
2903
aSig = extractFloat64Frac( a );
2904
aExp = extractFloat64Exp( a );
2905
aSign = extractFloat64Sign( a );
2906
if ( 0x41E < aExp ) {
2907
if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2910
else if ( aExp < 0x3FF ) {
2911
if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2914
aSig |= LIT64( 0x0010000000000000 );
2915
shiftCount = 0x433 - aExp;
2917
aSig >>= shiftCount;
2919
if ( aSign ) z = - z;
2920
if ( ( z < 0 ) ^ aSign ) {
2922
float_raise( float_flag_invalid STATUS_VAR);
2923
return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2925
if ( ( aSig<<shiftCount ) != savedASig ) {
2926
STATUS(float_exception_flags) |= float_flag_inexact;
2932
/*----------------------------------------------------------------------------
2933
| Returns the result of converting the double-precision floating-point value
2934
| `a' to the 16-bit two's complement integer format. The conversion is
2935
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2936
| Arithmetic, except that the conversion is always rounded toward zero.
2937
| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2938
| the conversion overflows, the largest integer with the same sign as `a' is
2940
*----------------------------------------------------------------------------*/
2942
int_fast16_t float64_to_int16_round_to_zero(float64 a STATUS_PARAM)
2945
int_fast16_t aExp, shiftCount;
2946
uint64_t aSig, savedASig;
2949
aSig = extractFloat64Frac( a );
2950
aExp = extractFloat64Exp( a );
2951
aSign = extractFloat64Sign( a );
2952
if ( 0x40E < aExp ) {
2953
if ( ( aExp == 0x7FF ) && aSig ) {
2958
else if ( aExp < 0x3FF ) {
2959
if ( aExp || aSig ) {
2960
STATUS(float_exception_flags) |= float_flag_inexact;
2964
aSig |= LIT64( 0x0010000000000000 );
2965
shiftCount = 0x433 - aExp;
2967
aSig >>= shiftCount;
2972
if ( ( (int16_t)z < 0 ) ^ aSign ) {
2974
float_raise( float_flag_invalid STATUS_VAR);
2975
return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
2977
if ( ( aSig<<shiftCount ) != savedASig ) {
2978
STATUS(float_exception_flags) |= float_flag_inexact;
2983
/*----------------------------------------------------------------------------
2984
| Returns the result of converting the double-precision floating-point value
2985
| `a' to the 64-bit two's complement integer format. The conversion is
2986
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2987
| Arithmetic---which means in particular that the conversion is rounded
2988
| according to the current rounding mode. If `a' is a NaN, the largest
2989
| positive integer is returned. Otherwise, if the conversion overflows, the
2990
| largest integer with the same sign as `a' is returned.
2991
*----------------------------------------------------------------------------*/
2993
int64 float64_to_int64( float64 a STATUS_PARAM )
2996
int_fast16_t aExp, shiftCount;
2997
uint64_t aSig, aSigExtra;
2998
a = float64_squash_input_denormal(a STATUS_VAR);
3000
aSig = extractFloat64Frac( a );
3001
aExp = extractFloat64Exp( a );
3002
aSign = extractFloat64Sign( a );
3003
if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3004
shiftCount = 0x433 - aExp;
3005
if ( shiftCount <= 0 ) {
3006
if ( 0x43E < aExp ) {
3007
float_raise( float_flag_invalid STATUS_VAR);
3009
|| ( ( aExp == 0x7FF )
3010
&& ( aSig != LIT64( 0x0010000000000000 ) ) )
3012
return LIT64( 0x7FFFFFFFFFFFFFFF );
3014
return (int64_t) LIT64( 0x8000000000000000 );
3017
aSig <<= - shiftCount;
3020
shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3022
return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3026
/*----------------------------------------------------------------------------
3027
| Returns the result of converting the double-precision floating-point value
3028
| `a' to the 64-bit two's complement integer format. The conversion is
3029
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3030
| Arithmetic, except that the conversion is always rounded toward zero.
3031
| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3032
| the conversion overflows, the largest integer with the same sign as `a' is
3034
*----------------------------------------------------------------------------*/
3036
int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
3039
int_fast16_t aExp, shiftCount;
3042
a = float64_squash_input_denormal(a STATUS_VAR);
3044
aSig = extractFloat64Frac( a );
3045
aExp = extractFloat64Exp( a );
3046
aSign = extractFloat64Sign( a );
3047
if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3048
shiftCount = aExp - 0x433;
3049
if ( 0 <= shiftCount ) {
3050
if ( 0x43E <= aExp ) {
3051
if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
3052
float_raise( float_flag_invalid STATUS_VAR);
3054
|| ( ( aExp == 0x7FF )
3055
&& ( aSig != LIT64( 0x0010000000000000 ) ) )
3057
return LIT64( 0x7FFFFFFFFFFFFFFF );
3060
return (int64_t) LIT64( 0x8000000000000000 );
3062
z = aSig<<shiftCount;
3065
if ( aExp < 0x3FE ) {
3066
if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3069
z = aSig>>( - shiftCount );
3070
if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3071
STATUS(float_exception_flags) |= float_flag_inexact;
3074
if ( aSign ) z = - z;
3079
/*----------------------------------------------------------------------------
3080
| Returns the result of converting the double-precision floating-point value
3081
| `a' to the single-precision floating-point format. The conversion is
3082
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3084
*----------------------------------------------------------------------------*/
3086
float32 float64_to_float32( float64 a STATUS_PARAM )
3092
a = float64_squash_input_denormal(a STATUS_VAR);
3094
aSig = extractFloat64Frac( a );
3095
aExp = extractFloat64Exp( a );
3096
aSign = extractFloat64Sign( a );
3097
if ( aExp == 0x7FF ) {
3098
if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3099
return packFloat32( aSign, 0xFF, 0 );
3101
shift64RightJamming( aSig, 22, &aSig );
3103
if ( aExp || zSig ) {
3107
return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
3112
/*----------------------------------------------------------------------------
3113
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3114
| half-precision floating-point value, returning the result. After being
3115
| shifted into the proper positions, the three fields are simply added
3116
| together to form the result. This means that any integer portion of `zSig'
3117
| will be added into the exponent. Since a properly normalized significand
3118
| will have an integer portion equal to 1, the `zExp' input should be 1 less
3119
| than the desired result exponent whenever `zSig' is a complete, normalized
3121
*----------------------------------------------------------------------------*/
3122
static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
3124
return make_float16(
3125
(((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3128
/*----------------------------------------------------------------------------
3129
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3130
| and significand `zSig', and returns the proper half-precision floating-
3131
| point value corresponding to the abstract input. Ordinarily, the abstract
3132
| value is simply rounded and packed into the half-precision format, with
3133
| the inexact exception raised if the abstract input cannot be represented
3134
| exactly. However, if the abstract value is too large, the overflow and
3135
| inexact exceptions are raised and an infinity or maximal finite value is
3136
| returned. If the abstract value is too small, the input value is rounded to
3137
| a subnormal number, and the underflow and inexact exceptions are raised if
3138
| the abstract input cannot be represented exactly as a subnormal half-
3139
| precision floating-point number.
3140
| The `ieee' flag indicates whether to use IEEE standard half precision, or
3141
| ARM-style "alternative representation", which omits the NaN and Inf
3142
| encodings in order to raise the maximum representable exponent by one.
3143
| The input significand `zSig' has its binary point between bits 22
3144
| and 23, which is 13 bits to the left of the usual location. This shifted
3145
| significand must be normalized or smaller. If `zSig' is not normalized,
3146
| `zExp' must be 0; in that case, the result returned is a subnormal number,
3147
| and it must not require rounding. In the usual case that `zSig' is
3148
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3149
| Note the slightly odd position of the binary point in zSig compared with the
3150
| other roundAndPackFloat functions. This should probably be fixed if we
3151
| need to implement more float16 routines than just conversion.
3152
| The handling of underflow and overflow follows the IEC/IEEE Standard for
3153
| Binary Floating-Point Arithmetic.
3154
*----------------------------------------------------------------------------*/
3156
static float32 roundAndPackFloat16(flag zSign, int_fast16_t zExp,
3157
uint32_t zSig, flag ieee STATUS_PARAM)
3159
int maxexp = ieee ? 29 : 30;
3162
bool rounding_bumps_exp;
3163
bool is_tiny = false;
3165
/* Calculate the mask of bits of the mantissa which are not
3166
* representable in half-precision and will be lost.
3169
/* Will be denormal in halfprec */
3175
/* Normal number in halfprec */
3179
switch (STATUS(float_rounding_mode)) {
3180
case float_round_nearest_even:
3181
increment = (mask + 1) >> 1;
3182
if ((zSig & mask) == increment) {
3183
increment = zSig & (increment << 1);
3186
case float_round_up:
3187
increment = zSign ? 0 : mask;
3189
case float_round_down:
3190
increment = zSign ? mask : 0;
3192
default: /* round_to_zero */
3197
rounding_bumps_exp = (zSig + increment >= 0x01000000);
3199
if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3201
float_raise(float_flag_overflow | float_flag_inexact STATUS_VAR);
3202
return packFloat16(zSign, 0x1f, 0);
3204
float_raise(float_flag_invalid STATUS_VAR);
3205
return packFloat16(zSign, 0x1f, 0x3ff);
3210
/* Note that flush-to-zero does not affect half-precision results */
3212
(STATUS(float_detect_tininess) == float_tininess_before_rounding)
3214
|| (!rounding_bumps_exp);
3217
float_raise(float_flag_inexact STATUS_VAR);
3219
float_raise(float_flag_underflow STATUS_VAR);
3224
if (rounding_bumps_exp) {
3230
return packFloat16(zSign, 0, 0);
3236
return packFloat16(zSign, zExp, zSig >> 13);
3239
static void normalizeFloat16Subnormal(uint32_t aSig, int_fast16_t *zExpPtr,
3242
int8_t shiftCount = countLeadingZeros32(aSig) - 21;
3243
*zSigPtr = aSig << shiftCount;
3244
*zExpPtr = 1 - shiftCount;
3247
/* Half precision floats come in two formats: standard IEEE and "ARM" format.
3248
The latter gains extra exponent range by omitting the NaN/Inf encodings. */
3250
float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
3256
aSign = extractFloat16Sign(a);
3257
aExp = extractFloat16Exp(a);
3258
aSig = extractFloat16Frac(a);
3260
if (aExp == 0x1f && ieee) {
3262
return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3264
return packFloat32(aSign, 0xff, 0);
3268
return packFloat32(aSign, 0, 0);
3271
normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3274
return packFloat32( aSign, aExp + 0x70, aSig << 13);
3277
float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
3283
a = float32_squash_input_denormal(a STATUS_VAR);
3285
aSig = extractFloat32Frac( a );
3286
aExp = extractFloat32Exp( a );
3287
aSign = extractFloat32Sign( a );
3288
if ( aExp == 0xFF ) {
3290
/* Input is a NaN */
3292
float_raise(float_flag_invalid STATUS_VAR);
3293
return packFloat16(aSign, 0, 0);
3295
return commonNaNToFloat16(
3296
float32ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3300
float_raise(float_flag_invalid STATUS_VAR);
3301
return packFloat16(aSign, 0x1f, 0x3ff);
3303
return packFloat16(aSign, 0x1f, 0);
3305
if (aExp == 0 && aSig == 0) {
3306
return packFloat16(aSign, 0, 0);
3308
/* Decimal point between bits 22 and 23. Note that we add the 1 bit
3309
* even if the input is denormal; however this is harmless because
3310
* the largest possible single-precision denormal is still smaller
3311
* than the smallest representable half-precision denormal, and so we
3312
* will end up ignoring aSig and returning via the "always return zero"
3318
return roundAndPackFloat16(aSign, aExp, aSig, ieee STATUS_VAR);
3321
float64 float16_to_float64(float16 a, flag ieee STATUS_PARAM)
3327
aSign = extractFloat16Sign(a);
3328
aExp = extractFloat16Exp(a);
3329
aSig = extractFloat16Frac(a);
3331
if (aExp == 0x1f && ieee) {
3333
return commonNaNToFloat64(
3334
float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3336
return packFloat64(aSign, 0x7ff, 0);
3340
return packFloat64(aSign, 0, 0);
3343
normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3346
return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3349
float16 float64_to_float16(float64 a, flag ieee STATUS_PARAM)
3356
a = float64_squash_input_denormal(a STATUS_VAR);
3358
aSig = extractFloat64Frac(a);
3359
aExp = extractFloat64Exp(a);
3360
aSign = extractFloat64Sign(a);
3361
if (aExp == 0x7FF) {
3363
/* Input is a NaN */
3365
float_raise(float_flag_invalid STATUS_VAR);
3366
return packFloat16(aSign, 0, 0);
3368
return commonNaNToFloat16(
3369
float64ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3373
float_raise(float_flag_invalid STATUS_VAR);
3374
return packFloat16(aSign, 0x1f, 0x3ff);
3376
return packFloat16(aSign, 0x1f, 0);
3378
shift64RightJamming(aSig, 29, &aSig);
3380
if (aExp == 0 && zSig == 0) {
3381
return packFloat16(aSign, 0, 0);
3383
/* Decimal point between bits 22 and 23. Note that we add the 1 bit
3384
* even if the input is denormal; however this is harmless because
3385
* the largest possible single-precision denormal is still smaller
3386
* than the smallest representable half-precision denormal, and so we
3387
* will end up ignoring aSig and returning via the "always return zero"
3393
return roundAndPackFloat16(aSign, aExp, zSig, ieee STATUS_VAR);
3396
/*----------------------------------------------------------------------------
3397
| Returns the result of converting the double-precision floating-point value
3398
| `a' to the extended double-precision floating-point format. The conversion
3399
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3401
*----------------------------------------------------------------------------*/
3403
floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
3409
a = float64_squash_input_denormal(a STATUS_VAR);
3410
aSig = extractFloat64Frac( a );
3411
aExp = extractFloat64Exp( a );
3412
aSign = extractFloat64Sign( a );
3413
if ( aExp == 0x7FF ) {
3414
if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3415
return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3418
if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3419
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3423
aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3427
/*----------------------------------------------------------------------------
3428
| Returns the result of converting the double-precision floating-point value
3429
| `a' to the quadruple-precision floating-point format. The conversion is
3430
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3432
*----------------------------------------------------------------------------*/
3434
float128 float64_to_float128( float64 a STATUS_PARAM )
3438
uint64_t aSig, zSig0, zSig1;
3440
a = float64_squash_input_denormal(a STATUS_VAR);
3441
aSig = extractFloat64Frac( a );
3442
aExp = extractFloat64Exp( a );
3443
aSign = extractFloat64Sign( a );
3444
if ( aExp == 0x7FF ) {
3445
if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3446
return packFloat128( aSign, 0x7FFF, 0, 0 );
3449
if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3450
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3453
shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3454
return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3458
/*----------------------------------------------------------------------------
3459
| Rounds the double-precision floating-point value `a' to an integer, and
3460
| returns the result as a double-precision floating-point value. The
3461
| operation is performed according to the IEC/IEEE Standard for Binary
3462
| Floating-Point Arithmetic.
3463
*----------------------------------------------------------------------------*/
3465
float64 float64_round_to_int( float64 a STATUS_PARAM )
3469
uint64_t lastBitMask, roundBitsMask;
3471
a = float64_squash_input_denormal(a STATUS_VAR);
3473
aExp = extractFloat64Exp( a );
3474
if ( 0x433 <= aExp ) {
3475
if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3476
return propagateFloat64NaN( a, a STATUS_VAR );
3480
if ( aExp < 0x3FF ) {
3481
if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3482
STATUS(float_exception_flags) |= float_flag_inexact;
3483
aSign = extractFloat64Sign( a );
3484
switch ( STATUS(float_rounding_mode) ) {
3485
case float_round_nearest_even:
3486
if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3487
return packFloat64( aSign, 0x3FF, 0 );
3490
case float_round_down:
3491
return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3492
case float_round_up:
3493
return make_float64(
3494
aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3496
return packFloat64( aSign, 0, 0 );
3499
lastBitMask <<= 0x433 - aExp;
3500
roundBitsMask = lastBitMask - 1;
3502
switch (STATUS(float_rounding_mode)) {
3503
case float_round_nearest_even:
3504
z += lastBitMask >> 1;
3505
if ((z & roundBitsMask) == 0) {
3509
case float_round_to_zero:
3511
case float_round_up:
3512
if (!extractFloat64Sign(make_float64(z))) {
3516
case float_round_down:
3517
if (extractFloat64Sign(make_float64(z))) {
3524
z &= ~ roundBitsMask;
3525
if ( z != float64_val(a) )
3526
STATUS(float_exception_flags) |= float_flag_inexact;
3527
return make_float64(z);
3531
float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3535
oldmode = STATUS(float_rounding_mode);
3536
STATUS(float_rounding_mode) = float_round_to_zero;
3537
res = float64_round_to_int(a STATUS_VAR);
3538
STATUS(float_rounding_mode) = oldmode;
3542
/*----------------------------------------------------------------------------
3543
| Returns the result of adding the absolute values of the double-precision
3544
| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3545
| before being returned. `zSign' is ignored if the result is a NaN.
3546
| The addition is performed according to the IEC/IEEE Standard for Binary
3547
| Floating-Point Arithmetic.
3548
*----------------------------------------------------------------------------*/
3550
static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3552
int_fast16_t aExp, bExp, zExp;
3553
uint64_t aSig, bSig, zSig;
3554
int_fast16_t expDiff;
3556
aSig = extractFloat64Frac( a );
3557
aExp = extractFloat64Exp( a );
3558
bSig = extractFloat64Frac( b );
3559
bExp = extractFloat64Exp( b );
3560
expDiff = aExp - bExp;
3563
if ( 0 < expDiff ) {
3564
if ( aExp == 0x7FF ) {
3565
if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3572
bSig |= LIT64( 0x2000000000000000 );
3574
shift64RightJamming( bSig, expDiff, &bSig );
3577
else if ( expDiff < 0 ) {
3578
if ( bExp == 0x7FF ) {
3579
if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3580
return packFloat64( zSign, 0x7FF, 0 );
3586
aSig |= LIT64( 0x2000000000000000 );
3588
shift64RightJamming( aSig, - expDiff, &aSig );
3592
if ( aExp == 0x7FF ) {
3593
if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3597
if (STATUS(flush_to_zero)) {
3599
float_raise(float_flag_output_denormal STATUS_VAR);
3601
return packFloat64(zSign, 0, 0);
3603
return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3605
zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3609
aSig |= LIT64( 0x2000000000000000 );
3610
zSig = ( aSig + bSig )<<1;
3612
if ( (int64_t) zSig < 0 ) {
3617
return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3621
/*----------------------------------------------------------------------------
3622
| Returns the result of subtracting the absolute values of the double-
3623
| precision floating-point values `a' and `b'. If `zSign' is 1, the
3624
| difference is negated before being returned. `zSign' is ignored if the
3625
| result is a NaN. The subtraction is performed according to the IEC/IEEE
3626
| Standard for Binary Floating-Point Arithmetic.
3627
*----------------------------------------------------------------------------*/
3629
static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3631
int_fast16_t aExp, bExp, zExp;
3632
uint64_t aSig, bSig, zSig;
3633
int_fast16_t expDiff;
3635
aSig = extractFloat64Frac( a );
3636
aExp = extractFloat64Exp( a );
3637
bSig = extractFloat64Frac( b );
3638
bExp = extractFloat64Exp( b );
3639
expDiff = aExp - bExp;
3642
if ( 0 < expDiff ) goto aExpBigger;
3643
if ( expDiff < 0 ) goto bExpBigger;
3644
if ( aExp == 0x7FF ) {
3645
if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3646
float_raise( float_flag_invalid STATUS_VAR);
3647
return float64_default_nan;
3653
if ( bSig < aSig ) goto aBigger;
3654
if ( aSig < bSig ) goto bBigger;
3655
return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3657
if ( bExp == 0x7FF ) {
3658
if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3659
return packFloat64( zSign ^ 1, 0x7FF, 0 );
3665
aSig |= LIT64( 0x4000000000000000 );
3667
shift64RightJamming( aSig, - expDiff, &aSig );
3668
bSig |= LIT64( 0x4000000000000000 );
3673
goto normalizeRoundAndPack;
3675
if ( aExp == 0x7FF ) {
3676
if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3683
bSig |= LIT64( 0x4000000000000000 );
3685
shift64RightJamming( bSig, expDiff, &bSig );
3686
aSig |= LIT64( 0x4000000000000000 );
3690
normalizeRoundAndPack:
3692
return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3696
/*----------------------------------------------------------------------------
3697
| Returns the result of adding the double-precision floating-point values `a'
3698
| and `b'. The operation is performed according to the IEC/IEEE Standard for
3699
| Binary Floating-Point Arithmetic.
3700
*----------------------------------------------------------------------------*/
3702
float64 float64_add( float64 a, float64 b STATUS_PARAM )
3705
a = float64_squash_input_denormal(a STATUS_VAR);
3706
b = float64_squash_input_denormal(b STATUS_VAR);
3708
aSign = extractFloat64Sign( a );
3709
bSign = extractFloat64Sign( b );
3710
if ( aSign == bSign ) {
3711
return addFloat64Sigs( a, b, aSign STATUS_VAR );
3714
return subFloat64Sigs( a, b, aSign STATUS_VAR );
3719
/*----------------------------------------------------------------------------
3720
| Returns the result of subtracting the double-precision floating-point values
3721
| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3722
| for Binary Floating-Point Arithmetic.
3723
*----------------------------------------------------------------------------*/
3725
float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3728
a = float64_squash_input_denormal(a STATUS_VAR);
3729
b = float64_squash_input_denormal(b STATUS_VAR);
3731
aSign = extractFloat64Sign( a );
3732
bSign = extractFloat64Sign( b );
3733
if ( aSign == bSign ) {
3734
return subFloat64Sigs( a, b, aSign STATUS_VAR );
3737
return addFloat64Sigs( a, b, aSign STATUS_VAR );
3742
/*----------------------------------------------------------------------------
3743
| Returns the result of multiplying the double-precision floating-point values
3744
| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3745
| for Binary Floating-Point Arithmetic.
3746
*----------------------------------------------------------------------------*/
3748
float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3750
flag aSign, bSign, zSign;
3751
int_fast16_t aExp, bExp, zExp;
3752
uint64_t aSig, bSig, zSig0, zSig1;
3754
a = float64_squash_input_denormal(a STATUS_VAR);
3755
b = float64_squash_input_denormal(b STATUS_VAR);
3757
aSig = extractFloat64Frac( a );
3758
aExp = extractFloat64Exp( a );
3759
aSign = extractFloat64Sign( a );
3760
bSig = extractFloat64Frac( b );
3761
bExp = extractFloat64Exp( b );
3762
bSign = extractFloat64Sign( b );
3763
zSign = aSign ^ bSign;
3764
if ( aExp == 0x7FF ) {
3765
if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3766
return propagateFloat64NaN( a, b STATUS_VAR );
3768
if ( ( bExp | bSig ) == 0 ) {
3769
float_raise( float_flag_invalid STATUS_VAR);
3770
return float64_default_nan;
3772
return packFloat64( zSign, 0x7FF, 0 );
3774
if ( bExp == 0x7FF ) {
3775
if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3776
if ( ( aExp | aSig ) == 0 ) {
3777
float_raise( float_flag_invalid STATUS_VAR);
3778
return float64_default_nan;
3780
return packFloat64( zSign, 0x7FF, 0 );
3783
if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3784
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3787
if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3788
normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3790
zExp = aExp + bExp - 0x3FF;
3791
aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3792
bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3793
mul64To128( aSig, bSig, &zSig0, &zSig1 );
3794
zSig0 |= ( zSig1 != 0 );
3795
if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
3799
return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3803
/*----------------------------------------------------------------------------
3804
| Returns the result of dividing the double-precision floating-point value `a'
3805
| by the corresponding value `b'. The operation is performed according to
3806
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3807
*----------------------------------------------------------------------------*/
3809
float64 float64_div( float64 a, float64 b STATUS_PARAM )
3811
flag aSign, bSign, zSign;
3812
int_fast16_t aExp, bExp, zExp;
3813
uint64_t aSig, bSig, zSig;
3814
uint64_t rem0, rem1;
3815
uint64_t term0, term1;
3816
a = float64_squash_input_denormal(a STATUS_VAR);
3817
b = float64_squash_input_denormal(b STATUS_VAR);
3819
aSig = extractFloat64Frac( a );
3820
aExp = extractFloat64Exp( a );
3821
aSign = extractFloat64Sign( a );
3822
bSig = extractFloat64Frac( b );
3823
bExp = extractFloat64Exp( b );
3824
bSign = extractFloat64Sign( b );
3825
zSign = aSign ^ bSign;
3826
if ( aExp == 0x7FF ) {
3827
if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3828
if ( bExp == 0x7FF ) {
3829
if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3830
float_raise( float_flag_invalid STATUS_VAR);
3831
return float64_default_nan;
3833
return packFloat64( zSign, 0x7FF, 0 );
3835
if ( bExp == 0x7FF ) {
3836
if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3837
return packFloat64( zSign, 0, 0 );
3841
if ( ( aExp | aSig ) == 0 ) {
3842
float_raise( float_flag_invalid STATUS_VAR);
3843
return float64_default_nan;
3845
float_raise( float_flag_divbyzero STATUS_VAR);
3846
return packFloat64( zSign, 0x7FF, 0 );
3848
normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3851
if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3852
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3854
zExp = aExp - bExp + 0x3FD;
3855
aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3856
bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3857
if ( bSig <= ( aSig + aSig ) ) {
3861
zSig = estimateDiv128To64( aSig, 0, bSig );
3862
if ( ( zSig & 0x1FF ) <= 2 ) {
3863
mul64To128( bSig, zSig, &term0, &term1 );
3864
sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3865
while ( (int64_t) rem0 < 0 ) {
3867
add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3869
zSig |= ( rem1 != 0 );
3871
return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3875
/*----------------------------------------------------------------------------
3876
| Returns the remainder of the double-precision floating-point value `a'
3877
| with respect to the corresponding value `b'. The operation is performed
3878
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3879
*----------------------------------------------------------------------------*/
3881
float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3884
int_fast16_t aExp, bExp, expDiff;
3885
uint64_t aSig, bSig;
3886
uint64_t q, alternateASig;
3889
a = float64_squash_input_denormal(a STATUS_VAR);
3890
b = float64_squash_input_denormal(b STATUS_VAR);
3891
aSig = extractFloat64Frac( a );
3892
aExp = extractFloat64Exp( a );
3893
aSign = extractFloat64Sign( a );
3894
bSig = extractFloat64Frac( b );
3895
bExp = extractFloat64Exp( b );
3896
if ( aExp == 0x7FF ) {
3897
if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3898
return propagateFloat64NaN( a, b STATUS_VAR );
3900
float_raise( float_flag_invalid STATUS_VAR);
3901
return float64_default_nan;
3903
if ( bExp == 0x7FF ) {
3904
if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3909
float_raise( float_flag_invalid STATUS_VAR);
3910
return float64_default_nan;
3912
normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3915
if ( aSig == 0 ) return a;
3916
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3918
expDiff = aExp - bExp;
3919
aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3920
bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3921
if ( expDiff < 0 ) {
3922
if ( expDiff < -1 ) return a;
3925
q = ( bSig <= aSig );
3926
if ( q ) aSig -= bSig;
3928
while ( 0 < expDiff ) {
3929
q = estimateDiv128To64( aSig, 0, bSig );
3930
q = ( 2 < q ) ? q - 2 : 0;
3931
aSig = - ( ( bSig>>2 ) * q );
3935
if ( 0 < expDiff ) {
3936
q = estimateDiv128To64( aSig, 0, bSig );
3937
q = ( 2 < q ) ? q - 2 : 0;
3940
aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3947
alternateASig = aSig;
3950
} while ( 0 <= (int64_t) aSig );
3951
sigMean = aSig + alternateASig;
3952
if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3953
aSig = alternateASig;
3955
zSign = ( (int64_t) aSig < 0 );
3956
if ( zSign ) aSig = - aSig;
3957
return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3961
/*----------------------------------------------------------------------------
3962
| Returns the result of multiplying the double-precision floating-point values
3963
| `a' and `b' then adding 'c', with no intermediate rounding step after the
3964
| multiplication. The operation is performed according to the IEC/IEEE
3965
| Standard for Binary Floating-Point Arithmetic 754-2008.
3966
| The flags argument allows the caller to select negation of the
3967
| addend, the intermediate product, or the final result. (The difference
3968
| between this and having the caller do a separate negation is that negating
3969
| externally will flip the sign bit on NaNs.)
3970
*----------------------------------------------------------------------------*/
3972
float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM)
3974
flag aSign, bSign, cSign, zSign;
3975
int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
3976
uint64_t aSig, bSig, cSig;
3977
flag pInf, pZero, pSign;
3978
uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
3980
flag signflip, infzero;
3982
a = float64_squash_input_denormal(a STATUS_VAR);
3983
b = float64_squash_input_denormal(b STATUS_VAR);
3984
c = float64_squash_input_denormal(c STATUS_VAR);
3985
aSig = extractFloat64Frac(a);
3986
aExp = extractFloat64Exp(a);
3987
aSign = extractFloat64Sign(a);
3988
bSig = extractFloat64Frac(b);
3989
bExp = extractFloat64Exp(b);
3990
bSign = extractFloat64Sign(b);
3991
cSig = extractFloat64Frac(c);
3992
cExp = extractFloat64Exp(c);
3993
cSign = extractFloat64Sign(c);
3995
infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
3996
(aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
3998
/* It is implementation-defined whether the cases of (0,inf,qnan)
3999
* and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
4000
* they return if they do), so we have to hand this information
4001
* off to the target-specific pick-a-NaN routine.
4003
if (((aExp == 0x7ff) && aSig) ||
4004
((bExp == 0x7ff) && bSig) ||
4005
((cExp == 0x7ff) && cSig)) {
4006
return propagateFloat64MulAddNaN(a, b, c, infzero STATUS_VAR);
4010
float_raise(float_flag_invalid STATUS_VAR);
4011
return float64_default_nan;
4014
if (flags & float_muladd_negate_c) {
4018
signflip = (flags & float_muladd_negate_result) ? 1 : 0;
4020
/* Work out the sign and type of the product */
4021
pSign = aSign ^ bSign;
4022
if (flags & float_muladd_negate_product) {
4025
pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
4026
pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
4028
if (cExp == 0x7ff) {
4029
if (pInf && (pSign ^ cSign)) {
4030
/* addition of opposite-signed infinities => InvalidOperation */
4031
float_raise(float_flag_invalid STATUS_VAR);
4032
return float64_default_nan;
4034
/* Otherwise generate an infinity of the same sign */
4035
return packFloat64(cSign ^ signflip, 0x7ff, 0);
4039
return packFloat64(pSign ^ signflip, 0x7ff, 0);
4045
/* Adding two exact zeroes */
4046
if (pSign == cSign) {
4048
} else if (STATUS(float_rounding_mode) == float_round_down) {
4053
return packFloat64(zSign ^ signflip, 0, 0);
4055
/* Exact zero plus a denorm */
4056
if (STATUS(flush_to_zero)) {
4057
float_raise(float_flag_output_denormal STATUS_VAR);
4058
return packFloat64(cSign ^ signflip, 0, 0);
4061
/* Zero plus something non-zero : just return the something */
4062
return packFloat64(cSign ^ signflip, cExp, cSig);
4066
normalizeFloat64Subnormal(aSig, &aExp, &aSig);
4069
normalizeFloat64Subnormal(bSig, &bExp, &bSig);
4072
/* Calculate the actual result a * b + c */
4074
/* Multiply first; this is easy. */
4075
/* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4076
* because we want the true exponent, not the "one-less-than"
4077
* flavour that roundAndPackFloat64() takes.
4079
pExp = aExp + bExp - 0x3fe;
4080
aSig = (aSig | LIT64(0x0010000000000000))<<10;
4081
bSig = (bSig | LIT64(0x0010000000000000))<<11;
4082
mul64To128(aSig, bSig, &pSig0, &pSig1);
4083
if ((int64_t)(pSig0 << 1) >= 0) {
4084
shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
4088
zSign = pSign ^ signflip;
4090
/* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4091
* bit in position 126.
4095
/* Throw out the special case of c being an exact zero now */
4096
shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
4097
return roundAndPackFloat64(zSign, pExp - 1,
4100
normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4103
/* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4104
* significand of the addend, with the explicit bit in position 126.
4106
cSig0 = cSig << (126 - 64 - 52);
4108
cSig0 |= LIT64(0x4000000000000000);
4109
expDiff = pExp - cExp;
4111
if (pSign == cSign) {
4114
/* scale c to match p */
4115
shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4117
} else if (expDiff < 0) {
4118
/* scale p to match c */
4119
shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4122
/* no scaling needed */
4125
/* Add significands and make sure explicit bit ends up in posn 126 */
4126
add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4127
if ((int64_t)zSig0 < 0) {
4128
shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
4132
shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
4133
return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR);
4137
shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4138
sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4140
} else if (expDiff < 0) {
4141
shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4142
sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4147
if (lt128(cSig0, cSig1, pSig0, pSig1)) {
4148
sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4149
} else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
4150
sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4155
if (STATUS(float_rounding_mode) == float_round_down) {
4158
return packFloat64(zSign, 0, 0);
4162
/* Do the equivalent of normalizeRoundAndPackFloat64() but
4163
* starting with the significand in a pair of uint64_t.
4166
shiftcount = countLeadingZeros64(zSig0) - 1;
4167
shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
4173
shiftcount = countLeadingZeros64(zSig1);
4174
if (shiftcount == 0) {
4175
zSig0 = (zSig1 >> 1) | (zSig1 & 1);
4179
zSig0 = zSig1 << shiftcount;
4180
zExp -= (shiftcount + 64);
4183
return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR);
4187
/*----------------------------------------------------------------------------
4188
| Returns the square root of the double-precision floating-point value `a'.
4189
| The operation is performed according to the IEC/IEEE Standard for Binary
4190
| Floating-Point Arithmetic.
4191
*----------------------------------------------------------------------------*/
4193
float64 float64_sqrt( float64 a STATUS_PARAM )
4196
int_fast16_t aExp, zExp;
4197
uint64_t aSig, zSig, doubleZSig;
4198
uint64_t rem0, rem1, term0, term1;
4199
a = float64_squash_input_denormal(a STATUS_VAR);
4201
aSig = extractFloat64Frac( a );
4202
aExp = extractFloat64Exp( a );
4203
aSign = extractFloat64Sign( a );
4204
if ( aExp == 0x7FF ) {
4205
if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
4206
if ( ! aSign ) return a;
4207
float_raise( float_flag_invalid STATUS_VAR);
4208
return float64_default_nan;
4211
if ( ( aExp | aSig ) == 0 ) return a;
4212
float_raise( float_flag_invalid STATUS_VAR);
4213
return float64_default_nan;
4216
if ( aSig == 0 ) return float64_zero;
4217
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4219
zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4220
aSig |= LIT64( 0x0010000000000000 );
4221
zSig = estimateSqrt32( aExp, aSig>>21 );
4222
aSig <<= 9 - ( aExp & 1 );
4223
zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4224
if ( ( zSig & 0x1FF ) <= 5 ) {
4225
doubleZSig = zSig<<1;
4226
mul64To128( zSig, zSig, &term0, &term1 );
4227
sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4228
while ( (int64_t) rem0 < 0 ) {
4231
add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4233
zSig |= ( ( rem0 | rem1 ) != 0 );
4235
return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
4239
/*----------------------------------------------------------------------------
4240
| Returns the binary log of the double-precision floating-point value `a'.
4241
| The operation is performed according to the IEC/IEEE Standard for Binary
4242
| Floating-Point Arithmetic.
4243
*----------------------------------------------------------------------------*/
4244
float64 float64_log2( float64 a STATUS_PARAM )
4248
uint64_t aSig, aSig0, aSig1, zSig, i;
4249
a = float64_squash_input_denormal(a STATUS_VAR);
4251
aSig = extractFloat64Frac( a );
4252
aExp = extractFloat64Exp( a );
4253
aSign = extractFloat64Sign( a );
4256
if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4257
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4260
float_raise( float_flag_invalid STATUS_VAR);
4261
return float64_default_nan;
4263
if ( aExp == 0x7FF ) {
4264
if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
4269
aSig |= LIT64( 0x0010000000000000 );
4271
zSig = (uint64_t)aExp << 52;
4272
for (i = 1LL << 51; i > 0; i >>= 1) {
4273
mul64To128( aSig, aSig, &aSig0, &aSig1 );
4274
aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4275
if ( aSig & LIT64( 0x0020000000000000 ) ) {
4283
return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
4286
/*----------------------------------------------------------------------------
4287
| Returns 1 if the double-precision floating-point value `a' is equal to the
4288
| corresponding value `b', and 0 otherwise. The invalid exception is raised
4289
| if either operand is a NaN. Otherwise, the comparison is performed
4290
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4291
*----------------------------------------------------------------------------*/
4293
int float64_eq( float64 a, float64 b STATUS_PARAM )
4296
a = float64_squash_input_denormal(a STATUS_VAR);
4297
b = float64_squash_input_denormal(b STATUS_VAR);
4299
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4300
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4302
float_raise( float_flag_invalid STATUS_VAR);
4305
av = float64_val(a);
4306
bv = float64_val(b);
4307
return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4311
/*----------------------------------------------------------------------------
4312
| Returns 1 if the double-precision floating-point value `a' is less than or
4313
| equal to the corresponding value `b', and 0 otherwise. The invalid
4314
| exception is raised if either operand is a NaN. The comparison is performed
4315
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4316
*----------------------------------------------------------------------------*/
4318
int float64_le( float64 a, float64 b STATUS_PARAM )
4322
a = float64_squash_input_denormal(a STATUS_VAR);
4323
b = float64_squash_input_denormal(b STATUS_VAR);
4325
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4326
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4328
float_raise( float_flag_invalid STATUS_VAR);
4331
aSign = extractFloat64Sign( a );
4332
bSign = extractFloat64Sign( b );
4333
av = float64_val(a);
4334
bv = float64_val(b);
4335
if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4336
return ( av == bv ) || ( aSign ^ ( av < bv ) );
4340
/*----------------------------------------------------------------------------
4341
| Returns 1 if the double-precision floating-point value `a' is less than
4342
| the corresponding value `b', and 0 otherwise. The invalid exception is
4343
| raised if either operand is a NaN. The comparison is performed according
4344
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4345
*----------------------------------------------------------------------------*/
4347
int float64_lt( float64 a, float64 b STATUS_PARAM )
4352
a = float64_squash_input_denormal(a STATUS_VAR);
4353
b = float64_squash_input_denormal(b STATUS_VAR);
4354
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4355
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4357
float_raise( float_flag_invalid STATUS_VAR);
4360
aSign = extractFloat64Sign( a );
4361
bSign = extractFloat64Sign( b );
4362
av = float64_val(a);
4363
bv = float64_val(b);
4364
if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4365
return ( av != bv ) && ( aSign ^ ( av < bv ) );
4369
/*----------------------------------------------------------------------------
4370
| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4371
| be compared, and 0 otherwise. The invalid exception is raised if either
4372
| operand is a NaN. The comparison is performed according to the IEC/IEEE
4373
| Standard for Binary Floating-Point Arithmetic.
4374
*----------------------------------------------------------------------------*/
4376
int float64_unordered( float64 a, float64 b STATUS_PARAM )
4378
a = float64_squash_input_denormal(a STATUS_VAR);
4379
b = float64_squash_input_denormal(b STATUS_VAR);
4381
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4382
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4384
float_raise( float_flag_invalid STATUS_VAR);
4390
/*----------------------------------------------------------------------------
4391
| Returns 1 if the double-precision floating-point value `a' is equal to the
4392
| corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4393
| exception.The comparison is performed according to the IEC/IEEE Standard
4394
| for Binary Floating-Point Arithmetic.
4395
*----------------------------------------------------------------------------*/
4397
int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
4400
a = float64_squash_input_denormal(a STATUS_VAR);
4401
b = float64_squash_input_denormal(b STATUS_VAR);
4403
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4404
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4406
if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4407
float_raise( float_flag_invalid STATUS_VAR);
4411
av = float64_val(a);
4412
bv = float64_val(b);
4413
return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4417
/*----------------------------------------------------------------------------
4418
| Returns 1 if the double-precision floating-point value `a' is less than or
4419
| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4420
| cause an exception. Otherwise, the comparison is performed according to the
4421
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4422
*----------------------------------------------------------------------------*/
4424
int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
4428
a = float64_squash_input_denormal(a STATUS_VAR);
4429
b = float64_squash_input_denormal(b STATUS_VAR);
4431
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4432
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4434
if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4435
float_raise( float_flag_invalid STATUS_VAR);
4439
aSign = extractFloat64Sign( a );
4440
bSign = extractFloat64Sign( b );
4441
av = float64_val(a);
4442
bv = float64_val(b);
4443
if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4444
return ( av == bv ) || ( aSign ^ ( av < bv ) );
4448
/*----------------------------------------------------------------------------
4449
| Returns 1 if the double-precision floating-point value `a' is less than
4450
| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4451
| exception. Otherwise, the comparison is performed according to the IEC/IEEE
4452
| Standard for Binary Floating-Point Arithmetic.
4453
*----------------------------------------------------------------------------*/
4455
int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
4459
a = float64_squash_input_denormal(a STATUS_VAR);
4460
b = float64_squash_input_denormal(b STATUS_VAR);
4462
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4463
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4465
if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4466
float_raise( float_flag_invalid STATUS_VAR);
4470
aSign = extractFloat64Sign( a );
4471
bSign = extractFloat64Sign( b );
4472
av = float64_val(a);
4473
bv = float64_val(b);
4474
if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4475
return ( av != bv ) && ( aSign ^ ( av < bv ) );
4479
/*----------------------------------------------------------------------------
4480
| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4481
| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4482
| comparison is performed according to the IEC/IEEE Standard for Binary
4483
| Floating-Point Arithmetic.
4484
*----------------------------------------------------------------------------*/
4486
int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
4488
a = float64_squash_input_denormal(a STATUS_VAR);
4489
b = float64_squash_input_denormal(b STATUS_VAR);
4491
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4492
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4494
if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4495
float_raise( float_flag_invalid STATUS_VAR);
4502
/*----------------------------------------------------------------------------
4503
| Returns the result of converting the extended double-precision floating-
4504
| point value `a' to the 32-bit two's complement integer format. The
4505
| conversion is performed according to the IEC/IEEE Standard for Binary
4506
| Floating-Point Arithmetic---which means in particular that the conversion
4507
| is rounded according to the current rounding mode. If `a' is a NaN, the
4508
| largest positive integer is returned. Otherwise, if the conversion
4509
| overflows, the largest integer with the same sign as `a' is returned.
4510
*----------------------------------------------------------------------------*/
4512
int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
4515
int32 aExp, shiftCount;
4518
aSig = extractFloatx80Frac( a );
4519
aExp = extractFloatx80Exp( a );
4520
aSign = extractFloatx80Sign( a );
4521
if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4522
shiftCount = 0x4037 - aExp;
4523
if ( shiftCount <= 0 ) shiftCount = 1;
4524
shift64RightJamming( aSig, shiftCount, &aSig );
4525
return roundAndPackInt32( aSign, aSig STATUS_VAR );
4529
/*----------------------------------------------------------------------------
4530
| Returns the result of converting the extended double-precision floating-
4531
| point value `a' to the 32-bit two's complement integer format. The
4532
| conversion is performed according to the IEC/IEEE Standard for Binary
4533
| Floating-Point Arithmetic, except that the conversion is always rounded
4534
| toward zero. If `a' is a NaN, the largest positive integer is returned.
4535
| Otherwise, if the conversion overflows, the largest integer with the same
4536
| sign as `a' is returned.
4537
*----------------------------------------------------------------------------*/
4539
int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
4542
int32 aExp, shiftCount;
4543
uint64_t aSig, savedASig;
4546
aSig = extractFloatx80Frac( a );
4547
aExp = extractFloatx80Exp( a );
4548
aSign = extractFloatx80Sign( a );
4549
if ( 0x401E < aExp ) {
4550
if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4553
else if ( aExp < 0x3FFF ) {
4554
if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4557
shiftCount = 0x403E - aExp;
4559
aSig >>= shiftCount;
4561
if ( aSign ) z = - z;
4562
if ( ( z < 0 ) ^ aSign ) {
4564
float_raise( float_flag_invalid STATUS_VAR);
4565
return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4567
if ( ( aSig<<shiftCount ) != savedASig ) {
4568
STATUS(float_exception_flags) |= float_flag_inexact;
4574
/*----------------------------------------------------------------------------
4575
| Returns the result of converting the extended double-precision floating-
4576
| point value `a' to the 64-bit two's complement integer format. The
4577
| conversion is performed according to the IEC/IEEE Standard for Binary
4578
| Floating-Point Arithmetic---which means in particular that the conversion
4579
| is rounded according to the current rounding mode. If `a' is a NaN,
4580
| the largest positive integer is returned. Otherwise, if the conversion
4581
| overflows, the largest integer with the same sign as `a' is returned.
4582
*----------------------------------------------------------------------------*/
4584
int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
4587
int32 aExp, shiftCount;
4588
uint64_t aSig, aSigExtra;
4590
aSig = extractFloatx80Frac( a );
4591
aExp = extractFloatx80Exp( a );
4592
aSign = extractFloatx80Sign( a );
4593
shiftCount = 0x403E - aExp;
4594
if ( shiftCount <= 0 ) {
4596
float_raise( float_flag_invalid STATUS_VAR);
4598
|| ( ( aExp == 0x7FFF )
4599
&& ( aSig != LIT64( 0x8000000000000000 ) ) )
4601
return LIT64( 0x7FFFFFFFFFFFFFFF );
4603
return (int64_t) LIT64( 0x8000000000000000 );
4608
shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4610
return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
4614
/*----------------------------------------------------------------------------
4615
| Returns the result of converting the extended double-precision floating-
4616
| point value `a' to the 64-bit two's complement integer format. The
4617
| conversion is performed according to the IEC/IEEE Standard for Binary
4618
| Floating-Point Arithmetic, except that the conversion is always rounded
4619
| toward zero. If `a' is a NaN, the largest positive integer is returned.
4620
| Otherwise, if the conversion overflows, the largest integer with the same
4621
| sign as `a' is returned.
4622
*----------------------------------------------------------------------------*/
4624
int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
4627
int32 aExp, shiftCount;
4631
aSig = extractFloatx80Frac( a );
4632
aExp = extractFloatx80Exp( a );
4633
aSign = extractFloatx80Sign( a );
4634
shiftCount = aExp - 0x403E;
4635
if ( 0 <= shiftCount ) {
4636
aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4637
if ( ( a.high != 0xC03E ) || aSig ) {
4638
float_raise( float_flag_invalid STATUS_VAR);
4639
if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4640
return LIT64( 0x7FFFFFFFFFFFFFFF );
4643
return (int64_t) LIT64( 0x8000000000000000 );
4645
else if ( aExp < 0x3FFF ) {
4646
if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4649
z = aSig>>( - shiftCount );
4650
if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4651
STATUS(float_exception_flags) |= float_flag_inexact;
4653
if ( aSign ) z = - z;
4658
/*----------------------------------------------------------------------------
4659
| Returns the result of converting the extended double-precision floating-
4660
| point value `a' to the single-precision floating-point format. The
4661
| conversion is performed according to the IEC/IEEE Standard for Binary
4662
| Floating-Point Arithmetic.
4663
*----------------------------------------------------------------------------*/
4665
float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
4671
aSig = extractFloatx80Frac( a );
4672
aExp = extractFloatx80Exp( a );
4673
aSign = extractFloatx80Sign( a );
4674
if ( aExp == 0x7FFF ) {
4675
if ( (uint64_t) ( aSig<<1 ) ) {
4676
return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4678
return packFloat32( aSign, 0xFF, 0 );
4680
shift64RightJamming( aSig, 33, &aSig );
4681
if ( aExp || aSig ) aExp -= 0x3F81;
4682
return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
4686
/*----------------------------------------------------------------------------
4687
| Returns the result of converting the extended double-precision floating-
4688
| point value `a' to the double-precision floating-point format. The
4689
| conversion is performed according to the IEC/IEEE Standard for Binary
4690
| Floating-Point Arithmetic.
4691
*----------------------------------------------------------------------------*/
4693
float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
4697
uint64_t aSig, zSig;
4699
aSig = extractFloatx80Frac( a );
4700
aExp = extractFloatx80Exp( a );
4701
aSign = extractFloatx80Sign( a );
4702
if ( aExp == 0x7FFF ) {
4703
if ( (uint64_t) ( aSig<<1 ) ) {
4704
return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4706
return packFloat64( aSign, 0x7FF, 0 );
4708
shift64RightJamming( aSig, 1, &zSig );
4709
if ( aExp || aSig ) aExp -= 0x3C01;
4710
return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
4714
/*----------------------------------------------------------------------------
4715
| Returns the result of converting the extended double-precision floating-
4716
| point value `a' to the quadruple-precision floating-point format. The
4717
| conversion is performed according to the IEC/IEEE Standard for Binary
4718
| Floating-Point Arithmetic.
4719
*----------------------------------------------------------------------------*/
4721
float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4725
uint64_t aSig, zSig0, zSig1;
4727
aSig = extractFloatx80Frac( a );
4728
aExp = extractFloatx80Exp( a );
4729
aSign = extractFloatx80Sign( a );
4730
if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4731
return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4733
shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4734
return packFloat128( aSign, aExp, zSig0, zSig1 );
4738
/*----------------------------------------------------------------------------
4739
| Rounds the extended double-precision floating-point value `a' to an integer,
4740
| and returns the result as an extended quadruple-precision floating-point
4741
| value. The operation is performed according to the IEC/IEEE Standard for
4742
| Binary Floating-Point Arithmetic.
4743
*----------------------------------------------------------------------------*/
4745
floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4749
uint64_t lastBitMask, roundBitsMask;
4752
aExp = extractFloatx80Exp( a );
4753
if ( 0x403E <= aExp ) {
4754
if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4755
return propagateFloatx80NaN( a, a STATUS_VAR );
4759
if ( aExp < 0x3FFF ) {
4761
&& ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4764
STATUS(float_exception_flags) |= float_flag_inexact;
4765
aSign = extractFloatx80Sign( a );
4766
switch ( STATUS(float_rounding_mode) ) {
4767
case float_round_nearest_even:
4768
if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4771
packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4774
case float_round_down:
4777
packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4778
: packFloatx80( 0, 0, 0 );
4779
case float_round_up:
4781
aSign ? packFloatx80( 1, 0, 0 )
4782
: packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4784
return packFloatx80( aSign, 0, 0 );
4787
lastBitMask <<= 0x403E - aExp;
4788
roundBitsMask = lastBitMask - 1;
4790
switch (STATUS(float_rounding_mode)) {
4791
case float_round_nearest_even:
4792
z.low += lastBitMask>>1;
4793
if ((z.low & roundBitsMask) == 0) {
4794
z.low &= ~lastBitMask;
4797
case float_round_to_zero:
4799
case float_round_up:
4800
if (!extractFloatx80Sign(z)) {
4801
z.low += roundBitsMask;
4804
case float_round_down:
4805
if (extractFloatx80Sign(z)) {
4806
z.low += roundBitsMask;
4812
z.low &= ~ roundBitsMask;
4815
z.low = LIT64( 0x8000000000000000 );
4817
if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4822
/*----------------------------------------------------------------------------
4823
| Returns the result of adding the absolute values of the extended double-
4824
| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4825
| negated before being returned. `zSign' is ignored if the result is a NaN.
4826
| The addition is performed according to the IEC/IEEE Standard for Binary
4827
| Floating-Point Arithmetic.
4828
*----------------------------------------------------------------------------*/
4830
static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4832
int32 aExp, bExp, zExp;
4833
uint64_t aSig, bSig, zSig0, zSig1;
4836
aSig = extractFloatx80Frac( a );
4837
aExp = extractFloatx80Exp( a );
4838
bSig = extractFloatx80Frac( b );
4839
bExp = extractFloatx80Exp( b );
4840
expDiff = aExp - bExp;
4841
if ( 0 < expDiff ) {
4842
if ( aExp == 0x7FFF ) {
4843
if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4846
if ( bExp == 0 ) --expDiff;
4847
shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4850
else if ( expDiff < 0 ) {
4851
if ( bExp == 0x7FFF ) {
4852
if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4853
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4855
if ( aExp == 0 ) ++expDiff;
4856
shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4860
if ( aExp == 0x7FFF ) {
4861
if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4862
return propagateFloatx80NaN( a, b STATUS_VAR );
4867
zSig0 = aSig + bSig;
4869
normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4875
zSig0 = aSig + bSig;
4876
if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4878
shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4879
zSig0 |= LIT64( 0x8000000000000000 );
4883
roundAndPackFloatx80(
4884
STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4888
/*----------------------------------------------------------------------------
4889
| Returns the result of subtracting the absolute values of the extended
4890
| double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4891
| difference is negated before being returned. `zSign' is ignored if the
4892
| result is a NaN. The subtraction is performed according to the IEC/IEEE
4893
| Standard for Binary Floating-Point Arithmetic.
4894
*----------------------------------------------------------------------------*/
4896
static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4898
int32 aExp, bExp, zExp;
4899
uint64_t aSig, bSig, zSig0, zSig1;
4903
aSig = extractFloatx80Frac( a );
4904
aExp = extractFloatx80Exp( a );
4905
bSig = extractFloatx80Frac( b );
4906
bExp = extractFloatx80Exp( b );
4907
expDiff = aExp - bExp;
4908
if ( 0 < expDiff ) goto aExpBigger;
4909
if ( expDiff < 0 ) goto bExpBigger;
4910
if ( aExp == 0x7FFF ) {
4911
if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4912
return propagateFloatx80NaN( a, b STATUS_VAR );
4914
float_raise( float_flag_invalid STATUS_VAR);
4915
z.low = floatx80_default_nan_low;
4916
z.high = floatx80_default_nan_high;
4924
if ( bSig < aSig ) goto aBigger;
4925
if ( aSig < bSig ) goto bBigger;
4926
return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4928
if ( bExp == 0x7FFF ) {
4929
if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4930
return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4932
if ( aExp == 0 ) ++expDiff;
4933
shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4935
sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4938
goto normalizeRoundAndPack;
4940
if ( aExp == 0x7FFF ) {
4941
if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4944
if ( bExp == 0 ) --expDiff;
4945
shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4947
sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4949
normalizeRoundAndPack:
4951
normalizeRoundAndPackFloatx80(
4952
STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4956
/*----------------------------------------------------------------------------
4957
| Returns the result of adding the extended double-precision floating-point
4958
| values `a' and `b'. The operation is performed according to the IEC/IEEE
4959
| Standard for Binary Floating-Point Arithmetic.
4960
*----------------------------------------------------------------------------*/
4962
floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4966
aSign = extractFloatx80Sign( a );
4967
bSign = extractFloatx80Sign( b );
4968
if ( aSign == bSign ) {
4969
return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4972
return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4977
/*----------------------------------------------------------------------------
4978
| Returns the result of subtracting the extended double-precision floating-
4979
| point values `a' and `b'. The operation is performed according to the
4980
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4981
*----------------------------------------------------------------------------*/
4983
floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4987
aSign = extractFloatx80Sign( a );
4988
bSign = extractFloatx80Sign( b );
4989
if ( aSign == bSign ) {
4990
return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4993
return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4998
/*----------------------------------------------------------------------------
4999
| Returns the result of multiplying the extended double-precision floating-
5000
| point values `a' and `b'. The operation is performed according to the
5001
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5002
*----------------------------------------------------------------------------*/
5004
floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
5006
flag aSign, bSign, zSign;
5007
int32 aExp, bExp, zExp;
5008
uint64_t aSig, bSig, zSig0, zSig1;
5011
aSig = extractFloatx80Frac( a );
5012
aExp = extractFloatx80Exp( a );
5013
aSign = extractFloatx80Sign( a );
5014
bSig = extractFloatx80Frac( b );
5015
bExp = extractFloatx80Exp( b );
5016
bSign = extractFloatx80Sign( b );
5017
zSign = aSign ^ bSign;
5018
if ( aExp == 0x7FFF ) {
5019
if ( (uint64_t) ( aSig<<1 )
5020
|| ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5021
return propagateFloatx80NaN( a, b STATUS_VAR );
5023
if ( ( bExp | bSig ) == 0 ) goto invalid;
5024
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5026
if ( bExp == 0x7FFF ) {
5027
if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5028
if ( ( aExp | aSig ) == 0 ) {
5030
float_raise( float_flag_invalid STATUS_VAR);
5031
z.low = floatx80_default_nan_low;
5032
z.high = floatx80_default_nan_high;
5035
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5038
if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5039
normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5042
if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5043
normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5045
zExp = aExp + bExp - 0x3FFE;
5046
mul64To128( aSig, bSig, &zSig0, &zSig1 );
5047
if ( 0 < (int64_t) zSig0 ) {
5048
shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5052
roundAndPackFloatx80(
5053
STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5057
/*----------------------------------------------------------------------------
5058
| Returns the result of dividing the extended double-precision floating-point
5059
| value `a' by the corresponding value `b'. The operation is performed
5060
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5061
*----------------------------------------------------------------------------*/
5063
floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
5065
flag aSign, bSign, zSign;
5066
int32 aExp, bExp, zExp;
5067
uint64_t aSig, bSig, zSig0, zSig1;
5068
uint64_t rem0, rem1, rem2, term0, term1, term2;
5071
aSig = extractFloatx80Frac( a );
5072
aExp = extractFloatx80Exp( a );
5073
aSign = extractFloatx80Sign( a );
5074
bSig = extractFloatx80Frac( b );
5075
bExp = extractFloatx80Exp( b );
5076
bSign = extractFloatx80Sign( b );
5077
zSign = aSign ^ bSign;
5078
if ( aExp == 0x7FFF ) {
5079
if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5080
if ( bExp == 0x7FFF ) {
5081
if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5084
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5086
if ( bExp == 0x7FFF ) {
5087
if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5088
return packFloatx80( zSign, 0, 0 );
5092
if ( ( aExp | aSig ) == 0 ) {
5094
float_raise( float_flag_invalid STATUS_VAR);
5095
z.low = floatx80_default_nan_low;
5096
z.high = floatx80_default_nan_high;
5099
float_raise( float_flag_divbyzero STATUS_VAR);
5100
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5102
normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5105
if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5106
normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5108
zExp = aExp - bExp + 0x3FFE;
5110
if ( bSig <= aSig ) {
5111
shift128Right( aSig, 0, 1, &aSig, &rem1 );
5114
zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5115
mul64To128( bSig, zSig0, &term0, &term1 );
5116
sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5117
while ( (int64_t) rem0 < 0 ) {
5119
add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5121
zSig1 = estimateDiv128To64( rem1, 0, bSig );
5122
if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5123
mul64To128( bSig, zSig1, &term1, &term2 );
5124
sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5125
while ( (int64_t) rem1 < 0 ) {
5127
add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5129
zSig1 |= ( ( rem1 | rem2 ) != 0 );
5132
roundAndPackFloatx80(
5133
STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5137
/*----------------------------------------------------------------------------
5138
| Returns the remainder of the extended double-precision floating-point value
5139
| `a' with respect to the corresponding value `b'. The operation is performed
5140
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5141
*----------------------------------------------------------------------------*/
5143
floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
5146
int32 aExp, bExp, expDiff;
5147
uint64_t aSig0, aSig1, bSig;
5148
uint64_t q, term0, term1, alternateASig0, alternateASig1;
5151
aSig0 = extractFloatx80Frac( a );
5152
aExp = extractFloatx80Exp( a );
5153
aSign = extractFloatx80Sign( a );
5154
bSig = extractFloatx80Frac( b );
5155
bExp = extractFloatx80Exp( b );
5156
if ( aExp == 0x7FFF ) {
5157
if ( (uint64_t) ( aSig0<<1 )
5158
|| ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5159
return propagateFloatx80NaN( a, b STATUS_VAR );
5163
if ( bExp == 0x7FFF ) {
5164
if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5170
float_raise( float_flag_invalid STATUS_VAR);
5171
z.low = floatx80_default_nan_low;
5172
z.high = floatx80_default_nan_high;
5175
normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5178
if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5179
normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5181
bSig |= LIT64( 0x8000000000000000 );
5183
expDiff = aExp - bExp;
5185
if ( expDiff < 0 ) {
5186
if ( expDiff < -1 ) return a;
5187
shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5190
q = ( bSig <= aSig0 );
5191
if ( q ) aSig0 -= bSig;
5193
while ( 0 < expDiff ) {
5194
q = estimateDiv128To64( aSig0, aSig1, bSig );
5195
q = ( 2 < q ) ? q - 2 : 0;
5196
mul64To128( bSig, q, &term0, &term1 );
5197
sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5198
shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5202
if ( 0 < expDiff ) {
5203
q = estimateDiv128To64( aSig0, aSig1, bSig );
5204
q = ( 2 < q ) ? q - 2 : 0;
5206
mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5207
sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5208
shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5209
while ( le128( term0, term1, aSig0, aSig1 ) ) {
5211
sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5218
sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5219
if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5220
|| ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5223
aSig0 = alternateASig0;
5224
aSig1 = alternateASig1;
5228
normalizeRoundAndPackFloatx80(
5229
80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
5233
/*----------------------------------------------------------------------------
5234
| Returns the square root of the extended double-precision floating-point
5235
| value `a'. The operation is performed according to the IEC/IEEE Standard
5236
| for Binary Floating-Point Arithmetic.
5237
*----------------------------------------------------------------------------*/
5239
floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
5243
uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5244
uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5247
aSig0 = extractFloatx80Frac( a );
5248
aExp = extractFloatx80Exp( a );
5249
aSign = extractFloatx80Sign( a );
5250
if ( aExp == 0x7FFF ) {
5251
if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
5252
if ( ! aSign ) return a;
5256
if ( ( aExp | aSig0 ) == 0 ) return a;
5258
float_raise( float_flag_invalid STATUS_VAR);
5259
z.low = floatx80_default_nan_low;
5260
z.high = floatx80_default_nan_high;
5264
if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5265
normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5267
zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5268
zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5269
shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5270
zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5271
doubleZSig0 = zSig0<<1;
5272
mul64To128( zSig0, zSig0, &term0, &term1 );
5273
sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5274
while ( (int64_t) rem0 < 0 ) {
5277
add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5279
zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5280
if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5281
if ( zSig1 == 0 ) zSig1 = 1;
5282
mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5283
sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5284
mul64To128( zSig1, zSig1, &term2, &term3 );
5285
sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5286
while ( (int64_t) rem1 < 0 ) {
5288
shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5290
term2 |= doubleZSig0;
5291
add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5293
zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5295
shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5296
zSig0 |= doubleZSig0;
5298
roundAndPackFloatx80(
5299
STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
5303
/*----------------------------------------------------------------------------
5304
| Returns 1 if the extended double-precision floating-point value `a' is equal
5305
| to the corresponding value `b', and 0 otherwise. The invalid exception is
5306
| raised if either operand is a NaN. Otherwise, the comparison is performed
5307
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5308
*----------------------------------------------------------------------------*/
5310
int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
5313
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5314
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5315
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
5316
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5318
float_raise( float_flag_invalid STATUS_VAR);
5323
&& ( ( a.high == b.high )
5325
&& ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5330
/*----------------------------------------------------------------------------
5331
| Returns 1 if the extended double-precision floating-point value `a' is
5332
| less than or equal to the corresponding value `b', and 0 otherwise. The
5333
| invalid exception is raised if either operand is a NaN. The comparison is
5334
| performed according to the IEC/IEEE Standard for Binary Floating-Point
5336
*----------------------------------------------------------------------------*/
5338
int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
5342
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5343
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5344
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
5345
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5347
float_raise( float_flag_invalid STATUS_VAR);
5350
aSign = extractFloatx80Sign( a );
5351
bSign = extractFloatx80Sign( b );
5352
if ( aSign != bSign ) {
5355
|| ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5359
aSign ? le128( b.high, b.low, a.high, a.low )
5360
: le128( a.high, a.low, b.high, b.low );
5364
/*----------------------------------------------------------------------------
5365
| Returns 1 if the extended double-precision floating-point value `a' is
5366
| less than the corresponding value `b', and 0 otherwise. The invalid
5367
| exception is raised if either operand is a NaN. The comparison is performed
5368
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5369
*----------------------------------------------------------------------------*/
5371
int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
5375
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5376
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5377
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
5378
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5380
float_raise( float_flag_invalid STATUS_VAR);
5383
aSign = extractFloatx80Sign( a );
5384
bSign = extractFloatx80Sign( b );
5385
if ( aSign != bSign ) {
5388
&& ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5392
aSign ? lt128( b.high, b.low, a.high, a.low )
5393
: lt128( a.high, a.low, b.high, b.low );
5397
/*----------------------------------------------------------------------------
5398
| Returns 1 if the extended double-precision floating-point values `a' and `b'
5399
| cannot be compared, and 0 otherwise. The invalid exception is raised if
5400
| either operand is a NaN. The comparison is performed according to the
5401
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5402
*----------------------------------------------------------------------------*/
5403
int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
5405
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5406
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5407
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
5408
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5410
float_raise( float_flag_invalid STATUS_VAR);
5416
/*----------------------------------------------------------------------------
5417
| Returns 1 if the extended double-precision floating-point value `a' is
5418
| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5419
| cause an exception. The comparison is performed according to the IEC/IEEE
5420
| Standard for Binary Floating-Point Arithmetic.
5421
*----------------------------------------------------------------------------*/
5423
int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5426
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5427
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5428
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
5429
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5431
if ( floatx80_is_signaling_nan( a )
5432
|| floatx80_is_signaling_nan( b ) ) {
5433
float_raise( float_flag_invalid STATUS_VAR);
5439
&& ( ( a.high == b.high )
5441
&& ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5446
/*----------------------------------------------------------------------------
5447
| Returns 1 if the extended double-precision floating-point value `a' is less
5448
| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5449
| do not cause an exception. Otherwise, the comparison is performed according
5450
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5451
*----------------------------------------------------------------------------*/
5453
int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5457
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5458
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5459
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
5460
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5462
if ( floatx80_is_signaling_nan( a )
5463
|| floatx80_is_signaling_nan( b ) ) {
5464
float_raise( float_flag_invalid STATUS_VAR);
5468
aSign = extractFloatx80Sign( a );
5469
bSign = extractFloatx80Sign( b );
5470
if ( aSign != bSign ) {
5473
|| ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5477
aSign ? le128( b.high, b.low, a.high, a.low )
5478
: le128( a.high, a.low, b.high, b.low );
5482
/*----------------------------------------------------------------------------
5483
| Returns 1 if the extended double-precision floating-point value `a' is less
5484
| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5485
| an exception. Otherwise, the comparison is performed according to the
5486
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5487
*----------------------------------------------------------------------------*/
5489
int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5493
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5494
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5495
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
5496
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5498
if ( floatx80_is_signaling_nan( a )
5499
|| floatx80_is_signaling_nan( b ) ) {
5500
float_raise( float_flag_invalid STATUS_VAR);
5504
aSign = extractFloatx80Sign( a );
5505
bSign = extractFloatx80Sign( b );
5506
if ( aSign != bSign ) {
5509
&& ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5513
aSign ? lt128( b.high, b.low, a.high, a.low )
5514
: lt128( a.high, a.low, b.high, b.low );
5518
/*----------------------------------------------------------------------------
5519
| Returns 1 if the extended double-precision floating-point values `a' and `b'
5520
| cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5521
| The comparison is performed according to the IEC/IEEE Standard for Binary
5522
| Floating-Point Arithmetic.
5523
*----------------------------------------------------------------------------*/
5524
int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5526
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5527
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5528
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
5529
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5531
if ( floatx80_is_signaling_nan( a )
5532
|| floatx80_is_signaling_nan( b ) ) {
5533
float_raise( float_flag_invalid STATUS_VAR);
5540
/*----------------------------------------------------------------------------
5541
| Returns the result of converting the quadruple-precision floating-point
5542
| value `a' to the 32-bit two's complement integer format. The conversion
5543
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5544
| Arithmetic---which means in particular that the conversion is rounded
5545
| according to the current rounding mode. If `a' is a NaN, the largest
5546
| positive integer is returned. Otherwise, if the conversion overflows, the
5547
| largest integer with the same sign as `a' is returned.
5548
*----------------------------------------------------------------------------*/
5550
int32 float128_to_int32( float128 a STATUS_PARAM )
5553
int32 aExp, shiftCount;
5554
uint64_t aSig0, aSig1;
5556
aSig1 = extractFloat128Frac1( a );
5557
aSig0 = extractFloat128Frac0( a );
5558
aExp = extractFloat128Exp( a );
5559
aSign = extractFloat128Sign( a );
5560
if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5561
if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5562
aSig0 |= ( aSig1 != 0 );
5563
shiftCount = 0x4028 - aExp;
5564
if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5565
return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
5569
/*----------------------------------------------------------------------------
5570
| Returns the result of converting the quadruple-precision floating-point
5571
| value `a' to the 32-bit two's complement integer format. The conversion
5572
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5573
| Arithmetic, except that the conversion is always rounded toward zero. If
5574
| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5575
| conversion overflows, the largest integer with the same sign as `a' is
5577
*----------------------------------------------------------------------------*/
5579
int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
5582
int32 aExp, shiftCount;
5583
uint64_t aSig0, aSig1, savedASig;
5586
aSig1 = extractFloat128Frac1( a );
5587
aSig0 = extractFloat128Frac0( a );
5588
aExp = extractFloat128Exp( a );
5589
aSign = extractFloat128Sign( a );
5590
aSig0 |= ( aSig1 != 0 );
5591
if ( 0x401E < aExp ) {
5592
if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5595
else if ( aExp < 0x3FFF ) {
5596
if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
5599
aSig0 |= LIT64( 0x0001000000000000 );
5600
shiftCount = 0x402F - aExp;
5602
aSig0 >>= shiftCount;
5604
if ( aSign ) z = - z;
5605
if ( ( z < 0 ) ^ aSign ) {
5607
float_raise( float_flag_invalid STATUS_VAR);
5608
return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5610
if ( ( aSig0<<shiftCount ) != savedASig ) {
5611
STATUS(float_exception_flags) |= float_flag_inexact;
5617
/*----------------------------------------------------------------------------
5618
| Returns the result of converting the quadruple-precision floating-point
5619
| value `a' to the 64-bit two's complement integer format. The conversion
5620
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5621
| Arithmetic---which means in particular that the conversion is rounded
5622
| according to the current rounding mode. If `a' is a NaN, the largest
5623
| positive integer is returned. Otherwise, if the conversion overflows, the
5624
| largest integer with the same sign as `a' is returned.
5625
*----------------------------------------------------------------------------*/
5627
int64 float128_to_int64( float128 a STATUS_PARAM )
5630
int32 aExp, shiftCount;
5631
uint64_t aSig0, aSig1;
5633
aSig1 = extractFloat128Frac1( a );
5634
aSig0 = extractFloat128Frac0( a );
5635
aExp = extractFloat128Exp( a );
5636
aSign = extractFloat128Sign( a );
5637
if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5638
shiftCount = 0x402F - aExp;
5639
if ( shiftCount <= 0 ) {
5640
if ( 0x403E < aExp ) {
5641
float_raise( float_flag_invalid STATUS_VAR);
5643
|| ( ( aExp == 0x7FFF )
5644
&& ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5647
return LIT64( 0x7FFFFFFFFFFFFFFF );
5649
return (int64_t) LIT64( 0x8000000000000000 );
5651
shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5654
shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5656
return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
5660
/*----------------------------------------------------------------------------
5661
| Returns the result of converting the quadruple-precision floating-point
5662
| value `a' to the 64-bit two's complement integer format. The conversion
5663
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5664
| Arithmetic, except that the conversion is always rounded toward zero.
5665
| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5666
| the conversion overflows, the largest integer with the same sign as `a' is
5668
*----------------------------------------------------------------------------*/
5670
int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
5673
int32 aExp, shiftCount;
5674
uint64_t aSig0, aSig1;
5677
aSig1 = extractFloat128Frac1( a );
5678
aSig0 = extractFloat128Frac0( a );
5679
aExp = extractFloat128Exp( a );
5680
aSign = extractFloat128Sign( a );
5681
if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5682
shiftCount = aExp - 0x402F;
5683
if ( 0 < shiftCount ) {
5684
if ( 0x403E <= aExp ) {
5685
aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5686
if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5687
&& ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5688
if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
5691
float_raise( float_flag_invalid STATUS_VAR);
5692
if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5693
return LIT64( 0x7FFFFFFFFFFFFFFF );
5696
return (int64_t) LIT64( 0x8000000000000000 );
5698
z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5699
if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5700
STATUS(float_exception_flags) |= float_flag_inexact;
5704
if ( aExp < 0x3FFF ) {
5705
if ( aExp | aSig0 | aSig1 ) {
5706
STATUS(float_exception_flags) |= float_flag_inexact;
5710
z = aSig0>>( - shiftCount );
5712
|| ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5713
STATUS(float_exception_flags) |= float_flag_inexact;
5716
if ( aSign ) z = - z;
5721
/*----------------------------------------------------------------------------
5722
| Returns the result of converting the quadruple-precision floating-point
5723
| value `a' to the single-precision floating-point format. The conversion
5724
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5726
*----------------------------------------------------------------------------*/
5728
float32 float128_to_float32( float128 a STATUS_PARAM )
5732
uint64_t aSig0, aSig1;
5735
aSig1 = extractFloat128Frac1( a );
5736
aSig0 = extractFloat128Frac0( a );
5737
aExp = extractFloat128Exp( a );
5738
aSign = extractFloat128Sign( a );
5739
if ( aExp == 0x7FFF ) {
5740
if ( aSig0 | aSig1 ) {
5741
return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5743
return packFloat32( aSign, 0xFF, 0 );
5745
aSig0 |= ( aSig1 != 0 );
5746
shift64RightJamming( aSig0, 18, &aSig0 );
5748
if ( aExp || zSig ) {
5752
return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5756
/*----------------------------------------------------------------------------
5757
| Returns the result of converting the quadruple-precision floating-point
5758
| value `a' to the double-precision floating-point format. The conversion
5759
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5761
*----------------------------------------------------------------------------*/
5763
float64 float128_to_float64( float128 a STATUS_PARAM )
5767
uint64_t aSig0, aSig1;
5769
aSig1 = extractFloat128Frac1( a );
5770
aSig0 = extractFloat128Frac0( a );
5771
aExp = extractFloat128Exp( a );
5772
aSign = extractFloat128Sign( a );
5773
if ( aExp == 0x7FFF ) {
5774
if ( aSig0 | aSig1 ) {
5775
return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5777
return packFloat64( aSign, 0x7FF, 0 );
5779
shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5780
aSig0 |= ( aSig1 != 0 );
5781
if ( aExp || aSig0 ) {
5782
aSig0 |= LIT64( 0x4000000000000000 );
5785
return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5789
/*----------------------------------------------------------------------------
5790
| Returns the result of converting the quadruple-precision floating-point
5791
| value `a' to the extended double-precision floating-point format. The
5792
| conversion is performed according to the IEC/IEEE Standard for Binary
5793
| Floating-Point Arithmetic.
5794
*----------------------------------------------------------------------------*/
5796
floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5800
uint64_t aSig0, aSig1;
5802
aSig1 = extractFloat128Frac1( a );
5803
aSig0 = extractFloat128Frac0( a );
5804
aExp = extractFloat128Exp( a );
5805
aSign = extractFloat128Sign( a );
5806
if ( aExp == 0x7FFF ) {
5807
if ( aSig0 | aSig1 ) {
5808
return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5810
return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5813
if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5814
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5817
aSig0 |= LIT64( 0x0001000000000000 );
5819
shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5820
return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5824
/*----------------------------------------------------------------------------
5825
| Rounds the quadruple-precision floating-point value `a' to an integer, and
5826
| returns the result as a quadruple-precision floating-point value. The
5827
| operation is performed according to the IEC/IEEE Standard for Binary
5828
| Floating-Point Arithmetic.
5829
*----------------------------------------------------------------------------*/
5831
float128 float128_round_to_int( float128 a STATUS_PARAM )
5835
uint64_t lastBitMask, roundBitsMask;
5838
aExp = extractFloat128Exp( a );
5839
if ( 0x402F <= aExp ) {
5840
if ( 0x406F <= aExp ) {
5841
if ( ( aExp == 0x7FFF )
5842
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5844
return propagateFloat128NaN( a, a STATUS_VAR );
5849
lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5850
roundBitsMask = lastBitMask - 1;
5852
switch (STATUS(float_rounding_mode)) {
5853
case float_round_nearest_even:
5854
if ( lastBitMask ) {
5855
add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5856
if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5859
if ( (int64_t) z.low < 0 ) {
5861
if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5865
case float_round_to_zero:
5867
case float_round_up:
5868
if (!extractFloat128Sign(z)) {
5869
add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
5872
case float_round_down:
5873
if (extractFloat128Sign(z)) {
5874
add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
5880
z.low &= ~ roundBitsMask;
5883
if ( aExp < 0x3FFF ) {
5884
if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5885
STATUS(float_exception_flags) |= float_flag_inexact;
5886
aSign = extractFloat128Sign( a );
5887
switch ( STATUS(float_rounding_mode) ) {
5888
case float_round_nearest_even:
5889
if ( ( aExp == 0x3FFE )
5890
&& ( extractFloat128Frac0( a )
5891
| extractFloat128Frac1( a ) )
5893
return packFloat128( aSign, 0x3FFF, 0, 0 );
5896
case float_round_down:
5898
aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5899
: packFloat128( 0, 0, 0, 0 );
5900
case float_round_up:
5902
aSign ? packFloat128( 1, 0, 0, 0 )
5903
: packFloat128( 0, 0x3FFF, 0, 0 );
5905
return packFloat128( aSign, 0, 0, 0 );
5908
lastBitMask <<= 0x402F - aExp;
5909
roundBitsMask = lastBitMask - 1;
5912
switch (STATUS(float_rounding_mode)) {
5913
case float_round_nearest_even:
5914
z.high += lastBitMask>>1;
5915
if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5916
z.high &= ~ lastBitMask;
5919
case float_round_to_zero:
5921
case float_round_up:
5922
if (!extractFloat128Sign(z)) {
5923
z.high |= ( a.low != 0 );
5924
z.high += roundBitsMask;
5927
case float_round_down:
5928
if (extractFloat128Sign(z)) {
5929
z.high |= (a.low != 0);
5930
z.high += roundBitsMask;
5936
z.high &= ~ roundBitsMask;
5938
if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5939
STATUS(float_exception_flags) |= float_flag_inexact;
5945
/*----------------------------------------------------------------------------
5946
| Returns the result of adding the absolute values of the quadruple-precision
5947
| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
5948
| before being returned. `zSign' is ignored if the result is a NaN.
5949
| The addition is performed according to the IEC/IEEE Standard for Binary
5950
| Floating-Point Arithmetic.
5951
*----------------------------------------------------------------------------*/
5953
static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5955
int32 aExp, bExp, zExp;
5956
uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5959
aSig1 = extractFloat128Frac1( a );
5960
aSig0 = extractFloat128Frac0( a );
5961
aExp = extractFloat128Exp( a );
5962
bSig1 = extractFloat128Frac1( b );
5963
bSig0 = extractFloat128Frac0( b );
5964
bExp = extractFloat128Exp( b );
5965
expDiff = aExp - bExp;
5966
if ( 0 < expDiff ) {
5967
if ( aExp == 0x7FFF ) {
5968
if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5975
bSig0 |= LIT64( 0x0001000000000000 );
5977
shift128ExtraRightJamming(
5978
bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5981
else if ( expDiff < 0 ) {
5982
if ( bExp == 0x7FFF ) {
5983
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5984
return packFloat128( zSign, 0x7FFF, 0, 0 );
5990
aSig0 |= LIT64( 0x0001000000000000 );
5992
shift128ExtraRightJamming(
5993
aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5997
if ( aExp == 0x7FFF ) {
5998
if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5999
return propagateFloat128NaN( a, b STATUS_VAR );
6003
add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6005
if (STATUS(flush_to_zero)) {
6006
if (zSig0 | zSig1) {
6007
float_raise(float_flag_output_denormal STATUS_VAR);
6009
return packFloat128(zSign, 0, 0, 0);
6011
return packFloat128( zSign, 0, zSig0, zSig1 );
6014
zSig0 |= LIT64( 0x0002000000000000 );
6018
aSig0 |= LIT64( 0x0001000000000000 );
6019
add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6021
if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6024
shift128ExtraRightJamming(
6025
zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6027
return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6031
/*----------------------------------------------------------------------------
6032
| Returns the result of subtracting the absolute values of the quadruple-
6033
| precision floating-point values `a' and `b'. If `zSign' is 1, the
6034
| difference is negated before being returned. `zSign' is ignored if the
6035
| result is a NaN. The subtraction is performed according to the IEC/IEEE
6036
| Standard for Binary Floating-Point Arithmetic.
6037
*----------------------------------------------------------------------------*/
6039
static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
6041
int32 aExp, bExp, zExp;
6042
uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6046
aSig1 = extractFloat128Frac1( a );
6047
aSig0 = extractFloat128Frac0( a );
6048
aExp = extractFloat128Exp( a );
6049
bSig1 = extractFloat128Frac1( b );
6050
bSig0 = extractFloat128Frac0( b );
6051
bExp = extractFloat128Exp( b );
6052
expDiff = aExp - bExp;
6053
shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6054
shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6055
if ( 0 < expDiff ) goto aExpBigger;
6056
if ( expDiff < 0 ) goto bExpBigger;
6057
if ( aExp == 0x7FFF ) {
6058
if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6059
return propagateFloat128NaN( a, b STATUS_VAR );
6061
float_raise( float_flag_invalid STATUS_VAR);
6062
z.low = float128_default_nan_low;
6063
z.high = float128_default_nan_high;
6070
if ( bSig0 < aSig0 ) goto aBigger;
6071
if ( aSig0 < bSig0 ) goto bBigger;
6072
if ( bSig1 < aSig1 ) goto aBigger;
6073
if ( aSig1 < bSig1 ) goto bBigger;
6074
return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
6076
if ( bExp == 0x7FFF ) {
6077
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6078
return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6084
aSig0 |= LIT64( 0x4000000000000000 );
6086
shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6087
bSig0 |= LIT64( 0x4000000000000000 );
6089
sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6092
goto normalizeRoundAndPack;
6094
if ( aExp == 0x7FFF ) {
6095
if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6102
bSig0 |= LIT64( 0x4000000000000000 );
6104
shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6105
aSig0 |= LIT64( 0x4000000000000000 );
6107
sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6109
normalizeRoundAndPack:
6111
return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
6115
/*----------------------------------------------------------------------------
6116
| Returns the result of adding the quadruple-precision floating-point values
6117
| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6118
| for Binary Floating-Point Arithmetic.
6119
*----------------------------------------------------------------------------*/
6121
float128 float128_add( float128 a, float128 b STATUS_PARAM )
6125
aSign = extractFloat128Sign( a );
6126
bSign = extractFloat128Sign( b );
6127
if ( aSign == bSign ) {
6128
return addFloat128Sigs( a, b, aSign STATUS_VAR );
6131
return subFloat128Sigs( a, b, aSign STATUS_VAR );
6136
/*----------------------------------------------------------------------------
6137
| Returns the result of subtracting the quadruple-precision floating-point
6138
| values `a' and `b'. The operation is performed according to the IEC/IEEE
6139
| Standard for Binary Floating-Point Arithmetic.
6140
*----------------------------------------------------------------------------*/
6142
float128 float128_sub( float128 a, float128 b STATUS_PARAM )
6146
aSign = extractFloat128Sign( a );
6147
bSign = extractFloat128Sign( b );
6148
if ( aSign == bSign ) {
6149
return subFloat128Sigs( a, b, aSign STATUS_VAR );
6152
return addFloat128Sigs( a, b, aSign STATUS_VAR );
6157
/*----------------------------------------------------------------------------
6158
| Returns the result of multiplying the quadruple-precision floating-point
6159
| values `a' and `b'. The operation is performed according to the IEC/IEEE
6160
| Standard for Binary Floating-Point Arithmetic.
6161
*----------------------------------------------------------------------------*/
6163
float128 float128_mul( float128 a, float128 b STATUS_PARAM )
6165
flag aSign, bSign, zSign;
6166
int32 aExp, bExp, zExp;
6167
uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6170
aSig1 = extractFloat128Frac1( a );
6171
aSig0 = extractFloat128Frac0( a );
6172
aExp = extractFloat128Exp( a );
6173
aSign = extractFloat128Sign( a );
6174
bSig1 = extractFloat128Frac1( b );
6175
bSig0 = extractFloat128Frac0( b );
6176
bExp = extractFloat128Exp( b );
6177
bSign = extractFloat128Sign( b );
6178
zSign = aSign ^ bSign;
6179
if ( aExp == 0x7FFF ) {
6180
if ( ( aSig0 | aSig1 )
6181
|| ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6182
return propagateFloat128NaN( a, b STATUS_VAR );
6184
if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6185
return packFloat128( zSign, 0x7FFF, 0, 0 );
6187
if ( bExp == 0x7FFF ) {
6188
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6189
if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6191
float_raise( float_flag_invalid STATUS_VAR);
6192
z.low = float128_default_nan_low;
6193
z.high = float128_default_nan_high;
6196
return packFloat128( zSign, 0x7FFF, 0, 0 );
6199
if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6200
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6203
if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6204
normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6206
zExp = aExp + bExp - 0x4000;
6207
aSig0 |= LIT64( 0x0001000000000000 );
6208
shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6209
mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6210
add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6211
zSig2 |= ( zSig3 != 0 );
6212
if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6213
shift128ExtraRightJamming(
6214
zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6217
return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6221
/*----------------------------------------------------------------------------
6222
| Returns the result of dividing the quadruple-precision floating-point value
6223
| `a' by the corresponding value `b'. The operation is performed according to
6224
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6225
*----------------------------------------------------------------------------*/
6227
float128 float128_div( float128 a, float128 b STATUS_PARAM )
6229
flag aSign, bSign, zSign;
6230
int32 aExp, bExp, zExp;
6231
uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6232
uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6235
aSig1 = extractFloat128Frac1( a );
6236
aSig0 = extractFloat128Frac0( a );
6237
aExp = extractFloat128Exp( a );
6238
aSign = extractFloat128Sign( a );
6239
bSig1 = extractFloat128Frac1( b );
6240
bSig0 = extractFloat128Frac0( b );
6241
bExp = extractFloat128Exp( b );
6242
bSign = extractFloat128Sign( b );
6243
zSign = aSign ^ bSign;
6244
if ( aExp == 0x7FFF ) {
6245
if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6246
if ( bExp == 0x7FFF ) {
6247
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6250
return packFloat128( zSign, 0x7FFF, 0, 0 );
6252
if ( bExp == 0x7FFF ) {
6253
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6254
return packFloat128( zSign, 0, 0, 0 );
6257
if ( ( bSig0 | bSig1 ) == 0 ) {
6258
if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6260
float_raise( float_flag_invalid STATUS_VAR);
6261
z.low = float128_default_nan_low;
6262
z.high = float128_default_nan_high;
6265
float_raise( float_flag_divbyzero STATUS_VAR);
6266
return packFloat128( zSign, 0x7FFF, 0, 0 );
6268
normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6271
if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6272
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6274
zExp = aExp - bExp + 0x3FFD;
6276
aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6278
bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6279
if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6280
shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6283
zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6284
mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6285
sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6286
while ( (int64_t) rem0 < 0 ) {
6288
add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6290
zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6291
if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6292
mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6293
sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6294
while ( (int64_t) rem1 < 0 ) {
6296
add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6298
zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6300
shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6301
return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6305
/*----------------------------------------------------------------------------
6306
| Returns the remainder of the quadruple-precision floating-point value `a'
6307
| with respect to the corresponding value `b'. The operation is performed
6308
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6309
*----------------------------------------------------------------------------*/
6311
float128 float128_rem( float128 a, float128 b STATUS_PARAM )
6314
int32 aExp, bExp, expDiff;
6315
uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6316
uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6320
aSig1 = extractFloat128Frac1( a );
6321
aSig0 = extractFloat128Frac0( a );
6322
aExp = extractFloat128Exp( a );
6323
aSign = extractFloat128Sign( a );
6324
bSig1 = extractFloat128Frac1( b );
6325
bSig0 = extractFloat128Frac0( b );
6326
bExp = extractFloat128Exp( b );
6327
if ( aExp == 0x7FFF ) {
6328
if ( ( aSig0 | aSig1 )
6329
|| ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6330
return propagateFloat128NaN( a, b STATUS_VAR );
6334
if ( bExp == 0x7FFF ) {
6335
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6339
if ( ( bSig0 | bSig1 ) == 0 ) {
6341
float_raise( float_flag_invalid STATUS_VAR);
6342
z.low = float128_default_nan_low;
6343
z.high = float128_default_nan_high;
6346
normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6349
if ( ( aSig0 | aSig1 ) == 0 ) return a;
6350
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6352
expDiff = aExp - bExp;
6353
if ( expDiff < -1 ) return a;
6355
aSig0 | LIT64( 0x0001000000000000 ),
6357
15 - ( expDiff < 0 ),
6362
bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6363
q = le128( bSig0, bSig1, aSig0, aSig1 );
6364
if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6366
while ( 0 < expDiff ) {
6367
q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6368
q = ( 4 < q ) ? q - 4 : 0;
6369
mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6370
shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6371
shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6372
sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6375
if ( -64 < expDiff ) {
6376
q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6377
q = ( 4 < q ) ? q - 4 : 0;
6379
shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6381
if ( expDiff < 0 ) {
6382
shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6385
shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6387
mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6388
sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6391
shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6392
shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6395
alternateASig0 = aSig0;
6396
alternateASig1 = aSig1;
6398
sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6399
} while ( 0 <= (int64_t) aSig0 );
6401
aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6402
if ( ( sigMean0 < 0 )
6403
|| ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6404
aSig0 = alternateASig0;
6405
aSig1 = alternateASig1;
6407
zSign = ( (int64_t) aSig0 < 0 );
6408
if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6410
normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
6414
/*----------------------------------------------------------------------------
6415
| Returns the square root of the quadruple-precision floating-point value `a'.
6416
| The operation is performed according to the IEC/IEEE Standard for Binary
6417
| Floating-Point Arithmetic.
6418
*----------------------------------------------------------------------------*/
6420
float128 float128_sqrt( float128 a STATUS_PARAM )
6424
uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6425
uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6428
aSig1 = extractFloat128Frac1( a );
6429
aSig0 = extractFloat128Frac0( a );
6430
aExp = extractFloat128Exp( a );
6431
aSign = extractFloat128Sign( a );
6432
if ( aExp == 0x7FFF ) {
6433
if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
6434
if ( ! aSign ) return a;
6438
if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6440
float_raise( float_flag_invalid STATUS_VAR);
6441
z.low = float128_default_nan_low;
6442
z.high = float128_default_nan_high;
6446
if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6447
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6449
zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6450
aSig0 |= LIT64( 0x0001000000000000 );
6451
zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6452
shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6453
zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6454
doubleZSig0 = zSig0<<1;
6455
mul64To128( zSig0, zSig0, &term0, &term1 );
6456
sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6457
while ( (int64_t) rem0 < 0 ) {
6460
add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6462
zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6463
if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6464
if ( zSig1 == 0 ) zSig1 = 1;
6465
mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6466
sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6467
mul64To128( zSig1, zSig1, &term2, &term3 );
6468
sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6469
while ( (int64_t) rem1 < 0 ) {
6471
shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6473
term2 |= doubleZSig0;
6474
add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6476
zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6478
shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6479
return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6483
/*----------------------------------------------------------------------------
6484
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
6485
| the corresponding value `b', and 0 otherwise. The invalid exception is
6486
| raised if either operand is a NaN. Otherwise, the comparison is performed
6487
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6488
*----------------------------------------------------------------------------*/
6490
int float128_eq( float128 a, float128 b STATUS_PARAM )
6493
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6494
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6495
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
6496
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6498
float_raise( float_flag_invalid STATUS_VAR);
6503
&& ( ( a.high == b.high )
6505
&& ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6510
/*----------------------------------------------------------------------------
6511
| Returns 1 if the quadruple-precision floating-point value `a' is less than
6512
| or equal to the corresponding value `b', and 0 otherwise. The invalid
6513
| exception is raised if either operand is a NaN. The comparison is performed
6514
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6515
*----------------------------------------------------------------------------*/
6517
int float128_le( float128 a, float128 b STATUS_PARAM )
6521
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6522
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6523
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
6524
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6526
float_raise( float_flag_invalid STATUS_VAR);
6529
aSign = extractFloat128Sign( a );
6530
bSign = extractFloat128Sign( b );
6531
if ( aSign != bSign ) {
6534
|| ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6538
aSign ? le128( b.high, b.low, a.high, a.low )
6539
: le128( a.high, a.low, b.high, b.low );
6543
/*----------------------------------------------------------------------------
6544
| Returns 1 if the quadruple-precision floating-point value `a' is less than
6545
| the corresponding value `b', and 0 otherwise. The invalid exception is
6546
| raised if either operand is a NaN. The comparison is performed according
6547
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6548
*----------------------------------------------------------------------------*/
6550
int float128_lt( float128 a, float128 b STATUS_PARAM )
6554
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6555
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6556
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
6557
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6559
float_raise( float_flag_invalid STATUS_VAR);
6562
aSign = extractFloat128Sign( a );
6563
bSign = extractFloat128Sign( b );
6564
if ( aSign != bSign ) {
6567
&& ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6571
aSign ? lt128( b.high, b.low, a.high, a.low )
6572
: lt128( a.high, a.low, b.high, b.low );
6576
/*----------------------------------------------------------------------------
6577
| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6578
| be compared, and 0 otherwise. The invalid exception is raised if either
6579
| operand is a NaN. The comparison is performed according to the IEC/IEEE
6580
| Standard for Binary Floating-Point Arithmetic.
6581
*----------------------------------------------------------------------------*/
6583
int float128_unordered( float128 a, float128 b STATUS_PARAM )
6585
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6586
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6587
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
6588
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6590
float_raise( float_flag_invalid STATUS_VAR);
6596
/*----------------------------------------------------------------------------
6597
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
6598
| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6599
| exception. The comparison is performed according to the IEC/IEEE Standard
6600
| for Binary Floating-Point Arithmetic.
6601
*----------------------------------------------------------------------------*/
6603
int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
6606
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6607
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6608
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
6609
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6611
if ( float128_is_signaling_nan( a )
6612
|| float128_is_signaling_nan( b ) ) {
6613
float_raise( float_flag_invalid STATUS_VAR);
6619
&& ( ( a.high == b.high )
6621
&& ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6626
/*----------------------------------------------------------------------------
6627
| Returns 1 if the quadruple-precision floating-point value `a' is less than
6628
| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6629
| cause an exception. Otherwise, the comparison is performed according to the
6630
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6631
*----------------------------------------------------------------------------*/
6633
int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
6637
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6638
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6639
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
6640
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6642
if ( float128_is_signaling_nan( a )
6643
|| float128_is_signaling_nan( b ) ) {
6644
float_raise( float_flag_invalid STATUS_VAR);
6648
aSign = extractFloat128Sign( a );
6649
bSign = extractFloat128Sign( b );
6650
if ( aSign != bSign ) {
6653
|| ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6657
aSign ? le128( b.high, b.low, a.high, a.low )
6658
: le128( a.high, a.low, b.high, b.low );
6662
/*----------------------------------------------------------------------------
6663
| Returns 1 if the quadruple-precision floating-point value `a' is less than
6664
| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6665
| exception. Otherwise, the comparison is performed according to the IEC/IEEE
6666
| Standard for Binary Floating-Point Arithmetic.
6667
*----------------------------------------------------------------------------*/
6669
int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
6673
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6674
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6675
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
6676
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6678
if ( float128_is_signaling_nan( a )
6679
|| float128_is_signaling_nan( b ) ) {
6680
float_raise( float_flag_invalid STATUS_VAR);
6684
aSign = extractFloat128Sign( a );
6685
bSign = extractFloat128Sign( b );
6686
if ( aSign != bSign ) {
6689
&& ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6693
aSign ? lt128( b.high, b.low, a.high, a.low )
6694
: lt128( a.high, a.low, b.high, b.low );
6698
/*----------------------------------------------------------------------------
6699
| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6700
| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6701
| comparison is performed according to the IEC/IEEE Standard for Binary
6702
| Floating-Point Arithmetic.
6703
*----------------------------------------------------------------------------*/
6705
int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
6707
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6708
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6709
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
6710
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6712
if ( float128_is_signaling_nan( a )
6713
|| float128_is_signaling_nan( b ) ) {
6714
float_raise( float_flag_invalid STATUS_VAR);
6721
/* misc functions */
6722
float32 uint32_to_float32(uint32_t a STATUS_PARAM)
6724
return int64_to_float32(a STATUS_VAR);
6727
float64 uint32_to_float64(uint32_t a STATUS_PARAM)
6729
return int64_to_float64(a STATUS_VAR);
6732
uint32 float32_to_uint32( float32 a STATUS_PARAM )
6736
int old_exc_flags = get_float_exception_flags(status);
6738
v = float32_to_int64(a STATUS_VAR);
6741
} else if (v > 0xffffffff) {
6746
set_float_exception_flags(old_exc_flags, status);
6747
float_raise(float_flag_invalid STATUS_VAR);
6751
uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
6755
int old_exc_flags = get_float_exception_flags(status);
6757
v = float32_to_int64_round_to_zero(a STATUS_VAR);
6760
} else if (v > 0xffffffff) {
6765
set_float_exception_flags(old_exc_flags, status);
6766
float_raise(float_flag_invalid STATUS_VAR);
6770
int_fast16_t float32_to_int16(float32 a STATUS_PARAM)
6774
int old_exc_flags = get_float_exception_flags(status);
6776
v = float32_to_int32(a STATUS_VAR);
6779
} else if (v > 0x7fff) {
6785
set_float_exception_flags(old_exc_flags, status);
6786
float_raise(float_flag_invalid STATUS_VAR);
6790
uint_fast16_t float32_to_uint16(float32 a STATUS_PARAM)
6794
int old_exc_flags = get_float_exception_flags(status);
6796
v = float32_to_int32(a STATUS_VAR);
6799
} else if (v > 0xffff) {
6805
set_float_exception_flags(old_exc_flags, status);
6806
float_raise(float_flag_invalid STATUS_VAR);
6810
uint_fast16_t float32_to_uint16_round_to_zero(float32 a STATUS_PARAM)
6814
int old_exc_flags = get_float_exception_flags(status);
6816
v = float32_to_int64_round_to_zero(a STATUS_VAR);
6819
} else if (v > 0xffff) {
6824
set_float_exception_flags(old_exc_flags, status);
6825
float_raise(float_flag_invalid STATUS_VAR);
6829
uint32 float64_to_uint32( float64 a STATUS_PARAM )
6833
int old_exc_flags = get_float_exception_flags(status);
6835
v = float64_to_uint64(a STATUS_VAR);
6836
if (v > 0xffffffff) {
6841
set_float_exception_flags(old_exc_flags, status);
6842
float_raise(float_flag_invalid STATUS_VAR);
6846
uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
6850
int old_exc_flags = get_float_exception_flags(status);
6852
v = float64_to_uint64_round_to_zero(a STATUS_VAR);
6853
if (v > 0xffffffff) {
6858
set_float_exception_flags(old_exc_flags, status);
6859
float_raise(float_flag_invalid STATUS_VAR);
6863
int_fast16_t float64_to_int16(float64 a STATUS_PARAM)
6867
int old_exc_flags = get_float_exception_flags(status);
6869
v = float64_to_int32(a STATUS_VAR);
6872
} else if (v > 0x7fff) {
6878
set_float_exception_flags(old_exc_flags, status);
6879
float_raise(float_flag_invalid STATUS_VAR);
6883
uint_fast16_t float64_to_uint16(float64 a STATUS_PARAM)
6887
int old_exc_flags = get_float_exception_flags(status);
6889
v = float64_to_int32(a STATUS_VAR);
6892
} else if (v > 0xffff) {
6898
set_float_exception_flags(old_exc_flags, status);
6899
float_raise(float_flag_invalid STATUS_VAR);
6903
uint_fast16_t float64_to_uint16_round_to_zero(float64 a STATUS_PARAM)
6907
int old_exc_flags = get_float_exception_flags(status);
6909
v = float64_to_int64_round_to_zero(a STATUS_VAR);
6912
} else if (v > 0xffff) {
6917
set_float_exception_flags(old_exc_flags, status);
6918
float_raise(float_flag_invalid STATUS_VAR);
6922
/*----------------------------------------------------------------------------
6923
| Returns the result of converting the double-precision floating-point value
6924
| `a' to the 64-bit unsigned integer format. The conversion is
6925
| performed according to the IEC/IEEE Standard for Binary Floating-Point
6926
| Arithmetic---which means in particular that the conversion is rounded
6927
| according to the current rounding mode. If `a' is a NaN, the largest
6928
| positive integer is returned. If the conversion overflows, the
6929
| largest unsigned integer is returned. If 'a' is negative, the value is
6930
| rounded and zero is returned; negative values that do not round to zero
6931
| will raise the inexact exception.
6932
*----------------------------------------------------------------------------*/
6934
uint64_t float64_to_uint64(float64 a STATUS_PARAM)
6937
int_fast16_t aExp, shiftCount;
6938
uint64_t aSig, aSigExtra;
6939
a = float64_squash_input_denormal(a STATUS_VAR);
6941
aSig = extractFloat64Frac(a);
6942
aExp = extractFloat64Exp(a);
6943
aSign = extractFloat64Sign(a);
6944
if (aSign && (aExp > 1022)) {
6945
float_raise(float_flag_invalid STATUS_VAR);
6946
if (float64_is_any_nan(a)) {
6947
return LIT64(0xFFFFFFFFFFFFFFFF);
6953
aSig |= LIT64(0x0010000000000000);
6955
shiftCount = 0x433 - aExp;
6956
if (shiftCount <= 0) {
6958
float_raise(float_flag_invalid STATUS_VAR);
6959
return LIT64(0xFFFFFFFFFFFFFFFF);
6962
aSig <<= -shiftCount;
6964
shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
6966
return roundAndPackUint64(aSign, aSig, aSigExtra STATUS_VAR);
6969
uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
6971
signed char current_rounding_mode = STATUS(float_rounding_mode);
6972
set_float_rounding_mode(float_round_to_zero STATUS_VAR);
6973
int64_t v = float64_to_uint64(a STATUS_VAR);
6974
set_float_rounding_mode(current_rounding_mode STATUS_VAR);
6978
#define COMPARE(s, nan_exp) \
6979
INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
6980
int is_quiet STATUS_PARAM ) \
6982
flag aSign, bSign; \
6983
uint ## s ## _t av, bv; \
6984
a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6985
b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
6987
if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
6988
extractFloat ## s ## Frac( a ) ) || \
6989
( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
6990
extractFloat ## s ## Frac( b ) )) { \
6992
float ## s ## _is_signaling_nan( a ) || \
6993
float ## s ## _is_signaling_nan( b ) ) { \
6994
float_raise( float_flag_invalid STATUS_VAR); \
6996
return float_relation_unordered; \
6998
aSign = extractFloat ## s ## Sign( a ); \
6999
bSign = extractFloat ## s ## Sign( b ); \
7000
av = float ## s ## _val(a); \
7001
bv = float ## s ## _val(b); \
7002
if ( aSign != bSign ) { \
7003
if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
7005
return float_relation_equal; \
7007
return 1 - (2 * aSign); \
7011
return float_relation_equal; \
7013
return 1 - 2 * (aSign ^ ( av < bv )); \
7018
int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
7020
return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
7023
int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
7025
return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
7031
INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,
7032
int is_quiet STATUS_PARAM )
7036
if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7037
( extractFloatx80Frac( a )<<1 ) ) ||
7038
( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7039
( extractFloatx80Frac( b )<<1 ) )) {
7041
floatx80_is_signaling_nan( a ) ||
7042
floatx80_is_signaling_nan( b ) ) {
7043
float_raise( float_flag_invalid STATUS_VAR);
7045
return float_relation_unordered;
7047
aSign = extractFloatx80Sign( a );
7048
bSign = extractFloatx80Sign( b );
7049
if ( aSign != bSign ) {
7051
if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7052
( ( a.low | b.low ) == 0 ) ) {
7054
return float_relation_equal;
7056
return 1 - (2 * aSign);
7059
if (a.low == b.low && a.high == b.high) {
7060
return float_relation_equal;
7062
return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7067
int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
7069
return floatx80_compare_internal(a, b, 0 STATUS_VAR);
7072
int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
7074
return floatx80_compare_internal(a, b, 1 STATUS_VAR);
7077
INLINE int float128_compare_internal( float128 a, float128 b,
7078
int is_quiet STATUS_PARAM )
7082
if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7083
( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7084
( ( extractFloat128Exp( b ) == 0x7fff ) &&
7085
( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7087
float128_is_signaling_nan( a ) ||
7088
float128_is_signaling_nan( b ) ) {
7089
float_raise( float_flag_invalid STATUS_VAR);
7091
return float_relation_unordered;
7093
aSign = extractFloat128Sign( a );
7094
bSign = extractFloat128Sign( b );
7095
if ( aSign != bSign ) {
7096
if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7098
return float_relation_equal;
7100
return 1 - (2 * aSign);
7103
if (a.low == b.low && a.high == b.high) {
7104
return float_relation_equal;
7106
return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7111
int float128_compare( float128 a, float128 b STATUS_PARAM )
7113
return float128_compare_internal(a, b, 0 STATUS_VAR);
7116
int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
7118
return float128_compare_internal(a, b, 1 STATUS_VAR);
7121
/* min() and max() functions. These can't be implemented as
7122
* 'compare and pick one input' because that would mishandle
7123
* NaNs and +0 vs -0.
7125
* minnum() and maxnum() functions. These are similar to the min()
7126
* and max() functions but if one of the arguments is a QNaN and
7127
* the other is numerical then the numerical argument is returned.
7128
* minnum() and maxnum correspond to the IEEE 754-2008 minNum()
7129
* and maxNum() operations. min() and max() are the typical min/max
7130
* semantics provided by many CPUs which predate that specification.
7132
#define MINMAX(s, nan_exp) \
7133
INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b, \
7134
int ismin, int isieee STATUS_PARAM) \
7136
flag aSign, bSign; \
7137
uint ## s ## _t av, bv; \
7138
a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
7139
b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
7140
if (float ## s ## _is_any_nan(a) || \
7141
float ## s ## _is_any_nan(b)) { \
7143
if (float ## s ## _is_quiet_nan(a) && \
7144
!float ## s ##_is_any_nan(b)) { \
7146
} else if (float ## s ## _is_quiet_nan(b) && \
7147
!float ## s ## _is_any_nan(a)) { \
7151
return propagateFloat ## s ## NaN(a, b STATUS_VAR); \
7153
aSign = extractFloat ## s ## Sign(a); \
7154
bSign = extractFloat ## s ## Sign(b); \
7155
av = float ## s ## _val(a); \
7156
bv = float ## s ## _val(b); \
7157
if (aSign != bSign) { \
7159
return aSign ? a : b; \
7161
return aSign ? b : a; \
7165
return (aSign ^ (av < bv)) ? a : b; \
7167
return (aSign ^ (av < bv)) ? b : a; \
7172
float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM) \
7174
return float ## s ## _minmax(a, b, 1, 0 STATUS_VAR); \
7177
float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM) \
7179
return float ## s ## _minmax(a, b, 0, 0 STATUS_VAR); \
7182
float ## s float ## s ## _minnum(float ## s a, float ## s b STATUS_PARAM) \
7184
return float ## s ## _minmax(a, b, 1, 1 STATUS_VAR); \
7187
float ## s float ## s ## _maxnum(float ## s a, float ## s b STATUS_PARAM) \
7189
return float ## s ## _minmax(a, b, 0, 1 STATUS_VAR); \
7196
/* Multiply A by 2 raised to the power N. */
7197
float32 float32_scalbn( float32 a, int n STATUS_PARAM )
7203
a = float32_squash_input_denormal(a STATUS_VAR);
7204
aSig = extractFloat32Frac( a );
7205
aExp = extractFloat32Exp( a );
7206
aSign = extractFloat32Sign( a );
7208
if ( aExp == 0xFF ) {
7210
return propagateFloat32NaN( a, a STATUS_VAR );
7216
} else if (aSig == 0) {
7224
} else if (n < -0x200) {
7230
return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
7233
float64 float64_scalbn( float64 a, int n STATUS_PARAM )
7239
a = float64_squash_input_denormal(a STATUS_VAR);
7240
aSig = extractFloat64Frac( a );
7241
aExp = extractFloat64Exp( a );
7242
aSign = extractFloat64Sign( a );
7244
if ( aExp == 0x7FF ) {
7246
return propagateFloat64NaN( a, a STATUS_VAR );
7251
aSig |= LIT64( 0x0010000000000000 );
7252
} else if (aSig == 0) {
7260
} else if (n < -0x1000) {
7266
return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
7269
floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
7275
aSig = extractFloatx80Frac( a );
7276
aExp = extractFloatx80Exp( a );
7277
aSign = extractFloatx80Sign( a );
7279
if ( aExp == 0x7FFF ) {
7281
return propagateFloatx80NaN( a, a STATUS_VAR );
7295
} else if (n < -0x10000) {
7300
return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
7301
aSign, aExp, aSig, 0 STATUS_VAR );
7304
float128 float128_scalbn( float128 a, int n STATUS_PARAM )
7308
uint64_t aSig0, aSig1;
7310
aSig1 = extractFloat128Frac1( a );
7311
aSig0 = extractFloat128Frac0( a );
7312
aExp = extractFloat128Exp( a );
7313
aSign = extractFloat128Sign( a );
7314
if ( aExp == 0x7FFF ) {
7315
if ( aSig0 | aSig1 ) {
7316
return propagateFloat128NaN( a, a STATUS_VAR );
7321
aSig0 |= LIT64( 0x0001000000000000 );
7322
} else if (aSig0 == 0 && aSig1 == 0) {
7330
} else if (n < -0x10000) {
7335
return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1