4
* Derived from SoftFloat.
7
/*============================================================================
9
This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
12
Written by John R. Hauser. This work was made possible in part by the
13
International Computer Science Institute, located at Suite 600, 1947 Center
14
Street, Berkeley, California 94704. Funding was partially provided by the
15
National Science Foundation under grant MIP-9311980. The original version
16
of this code was written as part of a project to build a fixed-point vector
17
processor in collaboration with the University of California at Berkeley,
18
overseen by Profs. Nelson Morgan and John Wawrzynek. More information
19
is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
20
arithmetic/SoftFloat.html'.
22
THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
23
been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
24
RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
25
AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
26
COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
27
EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
28
INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
29
OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
31
Derivative works are acceptable, even for commercial purposes, so long as
32
(1) the source code for the derivative work includes prominent notice that
33
the work is derivative, and (2) the source code includes prominent notice with
34
these four paragraphs for those parts of this code that are retained.
36
=============================================================================*/
38
/* softfloat (and in particular the code in softfloat-specialize.h) is
39
* target-dependent and needs the TARGET_* macros.
43
#include "fpu/softfloat.h"
45
/*----------------------------------------------------------------------------
46
| Primitive arithmetic functions, including multi-word arithmetic, and
47
| division and square root approximations. (Can be specialized to target if
49
*----------------------------------------------------------------------------*/
50
#include "softfloat-macros.h"
52
/*----------------------------------------------------------------------------
53
| Functions and definitions to determine: (1) whether tininess for underflow
54
| is detected before or after rounding by default, (2) what (if anything)
55
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
56
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
57
| are propagated from function inputs to output. These details are target-
59
*----------------------------------------------------------------------------*/
60
#include "softfloat-specialize.h"
62
void set_float_rounding_mode(int val STATUS_PARAM)
64
STATUS(float_rounding_mode) = val;
67
void set_float_exception_flags(int val STATUS_PARAM)
69
STATUS(float_exception_flags) = val;
72
void set_floatx80_rounding_precision(int val STATUS_PARAM)
74
STATUS(floatx80_rounding_precision) = val;
77
/*----------------------------------------------------------------------------
78
| Returns the fraction bits of the half-precision floating-point value `a'.
79
*----------------------------------------------------------------------------*/
81
INLINE uint32_t extractFloat16Frac(float16 a)
83
return float16_val(a) & 0x3ff;
86
/*----------------------------------------------------------------------------
87
| Returns the exponent bits of the half-precision floating-point value `a'.
88
*----------------------------------------------------------------------------*/
90
INLINE int_fast16_t extractFloat16Exp(float16 a)
92
return (float16_val(a) >> 10) & 0x1f;
95
/*----------------------------------------------------------------------------
96
| Returns the sign bit of the single-precision floating-point value `a'.
97
*----------------------------------------------------------------------------*/
99
INLINE flag extractFloat16Sign(float16 a)
101
return float16_val(a)>>15;
104
/*----------------------------------------------------------------------------
105
| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
106
| and 7, and returns the properly rounded 32-bit integer corresponding to the
107
| input. If `zSign' is 1, the input is negated before being converted to an
108
| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
109
| is simply rounded to an integer, with the inexact exception raised if the
110
| input cannot be represented exactly as an integer. However, if the fixed-
111
| point input is too large, the invalid exception is raised and the largest
112
| positive or negative integer is returned.
113
*----------------------------------------------------------------------------*/
115
static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
118
flag roundNearestEven;
119
int8 roundIncrement, roundBits;
122
roundingMode = STATUS(float_rounding_mode);
123
roundNearestEven = ( roundingMode == float_round_nearest_even );
124
roundIncrement = 0x40;
125
if ( ! roundNearestEven ) {
126
if ( roundingMode == float_round_to_zero ) {
130
roundIncrement = 0x7F;
132
if ( roundingMode == float_round_up ) roundIncrement = 0;
135
if ( roundingMode == float_round_down ) roundIncrement = 0;
139
roundBits = absZ & 0x7F;
140
absZ = ( absZ + roundIncrement )>>7;
141
absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
143
if ( zSign ) z = - z;
144
if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
145
float_raise( float_flag_invalid STATUS_VAR);
146
return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
148
if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
153
/*----------------------------------------------------------------------------
154
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
155
| `absZ1', with binary point between bits 63 and 64 (between the input words),
156
| and returns the properly rounded 64-bit integer corresponding to the input.
157
| If `zSign' is 1, the input is negated before being converted to an integer.
158
| Ordinarily, the fixed-point input is simply rounded to an integer, with
159
| the inexact exception raised if the input cannot be represented exactly as
160
| an integer. However, if the fixed-point input is too large, the invalid
161
| exception is raised and the largest positive or negative integer is
163
*----------------------------------------------------------------------------*/
165
static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
168
flag roundNearestEven, increment;
171
roundingMode = STATUS(float_rounding_mode);
172
roundNearestEven = ( roundingMode == float_round_nearest_even );
173
increment = ( (int64_t) absZ1 < 0 );
174
if ( ! roundNearestEven ) {
175
if ( roundingMode == float_round_to_zero ) {
180
increment = ( roundingMode == float_round_down ) && absZ1;
183
increment = ( roundingMode == float_round_up ) && absZ1;
189
if ( absZ0 == 0 ) goto overflow;
190
absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
193
if ( zSign ) z = - z;
194
if ( z && ( ( z < 0 ) ^ zSign ) ) {
196
float_raise( float_flag_invalid STATUS_VAR);
198
zSign ? (int64_t) LIT64( 0x8000000000000000 )
199
: LIT64( 0x7FFFFFFFFFFFFFFF );
201
if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
206
/*----------------------------------------------------------------------------
207
| Returns the fraction bits of the single-precision floating-point value `a'.
208
*----------------------------------------------------------------------------*/
210
INLINE uint32_t extractFloat32Frac( float32 a )
213
return float32_val(a) & 0x007FFFFF;
217
/*----------------------------------------------------------------------------
218
| Returns the exponent bits of the single-precision floating-point value `a'.
219
*----------------------------------------------------------------------------*/
221
INLINE int_fast16_t extractFloat32Exp(float32 a)
224
return ( float32_val(a)>>23 ) & 0xFF;
228
/*----------------------------------------------------------------------------
229
| Returns the sign bit of the single-precision floating-point value `a'.
230
*----------------------------------------------------------------------------*/
232
INLINE flag extractFloat32Sign( float32 a )
235
return float32_val(a)>>31;
239
/*----------------------------------------------------------------------------
240
| If `a' is denormal and we are in flush-to-zero mode then set the
241
| input-denormal exception and return zero. Otherwise just return the value.
242
*----------------------------------------------------------------------------*/
243
static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
245
if (STATUS(flush_inputs_to_zero)) {
246
if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
247
float_raise(float_flag_input_denormal STATUS_VAR);
248
return make_float32(float32_val(a) & 0x80000000);
254
/*----------------------------------------------------------------------------
255
| Normalizes the subnormal single-precision floating-point value represented
256
| by the denormalized significand `aSig'. The normalized exponent and
257
| significand are stored at the locations pointed to by `zExpPtr' and
258
| `zSigPtr', respectively.
259
*----------------------------------------------------------------------------*/
262
normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
266
shiftCount = countLeadingZeros32( aSig ) - 8;
267
*zSigPtr = aSig<<shiftCount;
268
*zExpPtr = 1 - shiftCount;
272
/*----------------------------------------------------------------------------
273
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
274
| single-precision floating-point value, returning the result. After being
275
| shifted into the proper positions, the three fields are simply added
276
| together to form the result. This means that any integer portion of `zSig'
277
| will be added into the exponent. Since a properly normalized significand
278
| will have an integer portion equal to 1, the `zExp' input should be 1 less
279
| than the desired result exponent whenever `zSig' is a complete, normalized
281
*----------------------------------------------------------------------------*/
283
INLINE float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
287
( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
291
/*----------------------------------------------------------------------------
292
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
293
| and significand `zSig', and returns the proper single-precision floating-
294
| point value corresponding to the abstract input. Ordinarily, the abstract
295
| value is simply rounded and packed into the single-precision format, with
296
| the inexact exception raised if the abstract input cannot be represented
297
| exactly. However, if the abstract value is too large, the overflow and
298
| inexact exceptions are raised and an infinity or maximal finite value is
299
| returned. If the abstract value is too small, the input value is rounded to
300
| a subnormal number, and the underflow and inexact exceptions are raised if
301
| the abstract input cannot be represented exactly as a subnormal single-
302
| precision floating-point number.
303
| The input significand `zSig' has its binary point between bits 30
304
| and 29, which is 7 bits to the left of the usual location. This shifted
305
| significand must be normalized or smaller. If `zSig' is not normalized,
306
| `zExp' must be 0; in that case, the result returned is a subnormal number,
307
| and it must not require rounding. In the usual case that `zSig' is
308
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
309
| The handling of underflow and overflow follows the IEC/IEEE Standard for
310
| Binary Floating-Point Arithmetic.
311
*----------------------------------------------------------------------------*/
313
static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
316
flag roundNearestEven;
317
int8 roundIncrement, roundBits;
320
roundingMode = STATUS(float_rounding_mode);
321
roundNearestEven = ( roundingMode == float_round_nearest_even );
322
roundIncrement = 0x40;
323
if ( ! roundNearestEven ) {
324
if ( roundingMode == float_round_to_zero ) {
328
roundIncrement = 0x7F;
330
if ( roundingMode == float_round_up ) roundIncrement = 0;
333
if ( roundingMode == float_round_down ) roundIncrement = 0;
337
roundBits = zSig & 0x7F;
338
if ( 0xFD <= (uint16_t) zExp ) {
340
|| ( ( zExp == 0xFD )
341
&& ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
343
float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
344
return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
347
if (STATUS(flush_to_zero)) {
348
float_raise(float_flag_output_denormal STATUS_VAR);
349
return packFloat32(zSign, 0, 0);
352
( STATUS(float_detect_tininess) == float_tininess_before_rounding )
354
|| ( zSig + roundIncrement < 0x80000000 );
355
shift32RightJamming( zSig, - zExp, &zSig );
357
roundBits = zSig & 0x7F;
358
if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
361
if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
362
zSig = ( zSig + roundIncrement )>>7;
363
zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
364
if ( zSig == 0 ) zExp = 0;
365
return packFloat32( zSign, zExp, zSig );
369
/*----------------------------------------------------------------------------
370
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
371
| and significand `zSig', and returns the proper single-precision floating-
372
| point value corresponding to the abstract input. This routine is just like
373
| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
374
| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
375
| floating-point exponent.
376
*----------------------------------------------------------------------------*/
379
normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
383
shiftCount = countLeadingZeros32( zSig ) - 1;
384
return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
388
/*----------------------------------------------------------------------------
389
| Returns the fraction bits of the double-precision floating-point value `a'.
390
*----------------------------------------------------------------------------*/
392
INLINE uint64_t extractFloat64Frac( float64 a )
395
return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
399
/*----------------------------------------------------------------------------
400
| Returns the exponent bits of the double-precision floating-point value `a'.
401
*----------------------------------------------------------------------------*/
403
INLINE int_fast16_t extractFloat64Exp(float64 a)
406
return ( float64_val(a)>>52 ) & 0x7FF;
410
/*----------------------------------------------------------------------------
411
| Returns the sign bit of the double-precision floating-point value `a'.
412
*----------------------------------------------------------------------------*/
414
INLINE flag extractFloat64Sign( float64 a )
417
return float64_val(a)>>63;
421
/*----------------------------------------------------------------------------
422
| If `a' is denormal and we are in flush-to-zero mode then set the
423
| input-denormal exception and return zero. Otherwise just return the value.
424
*----------------------------------------------------------------------------*/
425
static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
427
if (STATUS(flush_inputs_to_zero)) {
428
if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
429
float_raise(float_flag_input_denormal STATUS_VAR);
430
return make_float64(float64_val(a) & (1ULL << 63));
436
/*----------------------------------------------------------------------------
437
| Normalizes the subnormal double-precision floating-point value represented
438
| by the denormalized significand `aSig'. The normalized exponent and
439
| significand are stored at the locations pointed to by `zExpPtr' and
440
| `zSigPtr', respectively.
441
*----------------------------------------------------------------------------*/
444
normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
448
shiftCount = countLeadingZeros64( aSig ) - 11;
449
*zSigPtr = aSig<<shiftCount;
450
*zExpPtr = 1 - shiftCount;
454
/*----------------------------------------------------------------------------
455
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
456
| double-precision floating-point value, returning the result. After being
457
| shifted into the proper positions, the three fields are simply added
458
| together to form the result. This means that any integer portion of `zSig'
459
| will be added into the exponent. Since a properly normalized significand
460
| will have an integer portion equal to 1, the `zExp' input should be 1 less
461
| than the desired result exponent whenever `zSig' is a complete, normalized
463
*----------------------------------------------------------------------------*/
465
INLINE float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
469
( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
473
/*----------------------------------------------------------------------------
474
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
475
| and significand `zSig', and returns the proper double-precision floating-
476
| point value corresponding to the abstract input. Ordinarily, the abstract
477
| value is simply rounded and packed into the double-precision format, with
478
| the inexact exception raised if the abstract input cannot be represented
479
| exactly. However, if the abstract value is too large, the overflow and
480
| inexact exceptions are raised and an infinity or maximal finite value is
481
| returned. If the abstract value is too small, the input value is rounded
482
| to a subnormal number, and the underflow and inexact exceptions are raised
483
| if the abstract input cannot be represented exactly as a subnormal double-
484
| precision floating-point number.
485
| The input significand `zSig' has its binary point between bits 62
486
| and 61, which is 10 bits to the left of the usual location. This shifted
487
| significand must be normalized or smaller. If `zSig' is not normalized,
488
| `zExp' must be 0; in that case, the result returned is a subnormal number,
489
| and it must not require rounding. In the usual case that `zSig' is
490
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
491
| The handling of underflow and overflow follows the IEC/IEEE Standard for
492
| Binary Floating-Point Arithmetic.
493
*----------------------------------------------------------------------------*/
495
static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
498
flag roundNearestEven;
499
int_fast16_t roundIncrement, roundBits;
502
roundingMode = STATUS(float_rounding_mode);
503
roundNearestEven = ( roundingMode == float_round_nearest_even );
504
roundIncrement = 0x200;
505
if ( ! roundNearestEven ) {
506
if ( roundingMode == float_round_to_zero ) {
510
roundIncrement = 0x3FF;
512
if ( roundingMode == float_round_up ) roundIncrement = 0;
515
if ( roundingMode == float_round_down ) roundIncrement = 0;
519
roundBits = zSig & 0x3FF;
520
if ( 0x7FD <= (uint16_t) zExp ) {
521
if ( ( 0x7FD < zExp )
522
|| ( ( zExp == 0x7FD )
523
&& ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
525
float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
526
return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
529
if (STATUS(flush_to_zero)) {
530
float_raise(float_flag_output_denormal STATUS_VAR);
531
return packFloat64(zSign, 0, 0);
534
( STATUS(float_detect_tininess) == float_tininess_before_rounding )
536
|| ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
537
shift64RightJamming( zSig, - zExp, &zSig );
539
roundBits = zSig & 0x3FF;
540
if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
543
if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
544
zSig = ( zSig + roundIncrement )>>10;
545
zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
546
if ( zSig == 0 ) zExp = 0;
547
return packFloat64( zSign, zExp, zSig );
551
/*----------------------------------------------------------------------------
552
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
553
| and significand `zSig', and returns the proper double-precision floating-
554
| point value corresponding to the abstract input. This routine is just like
555
| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
556
| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
557
| floating-point exponent.
558
*----------------------------------------------------------------------------*/
561
normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
565
shiftCount = countLeadingZeros64( zSig ) - 1;
566
return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
570
/*----------------------------------------------------------------------------
571
| Returns the fraction bits of the extended double-precision floating-point
573
*----------------------------------------------------------------------------*/
575
INLINE uint64_t extractFloatx80Frac( floatx80 a )
582
/*----------------------------------------------------------------------------
583
| Returns the exponent bits of the extended double-precision floating-point
585
*----------------------------------------------------------------------------*/
587
INLINE int32 extractFloatx80Exp( floatx80 a )
590
return a.high & 0x7FFF;
594
/*----------------------------------------------------------------------------
595
| Returns the sign bit of the extended double-precision floating-point value
597
*----------------------------------------------------------------------------*/
599
INLINE flag extractFloatx80Sign( floatx80 a )
606
/*----------------------------------------------------------------------------
607
| Normalizes the subnormal extended double-precision floating-point value
608
| represented by the denormalized significand `aSig'. The normalized exponent
609
| and significand are stored at the locations pointed to by `zExpPtr' and
610
| `zSigPtr', respectively.
611
*----------------------------------------------------------------------------*/
614
normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
618
shiftCount = countLeadingZeros64( aSig );
619
*zSigPtr = aSig<<shiftCount;
620
*zExpPtr = 1 - shiftCount;
624
/*----------------------------------------------------------------------------
625
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
626
| extended double-precision floating-point value, returning the result.
627
*----------------------------------------------------------------------------*/
629
INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
634
z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
639
/*----------------------------------------------------------------------------
640
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
641
| and extended significand formed by the concatenation of `zSig0' and `zSig1',
642
| and returns the proper extended double-precision floating-point value
643
| corresponding to the abstract input. Ordinarily, the abstract value is
644
| rounded and packed into the extended double-precision format, with the
645
| inexact exception raised if the abstract input cannot be represented
646
| exactly. However, if the abstract value is too large, the overflow and
647
| inexact exceptions are raised and an infinity or maximal finite value is
648
| returned. If the abstract value is too small, the input value is rounded to
649
| a subnormal number, and the underflow and inexact exceptions are raised if
650
| the abstract input cannot be represented exactly as a subnormal extended
651
| double-precision floating-point number.
652
| If `roundingPrecision' is 32 or 64, the result is rounded to the same
653
| number of bits as single or double precision, respectively. Otherwise, the
654
| result is rounded to the full precision of the extended double-precision
656
| The input significand must be normalized or smaller. If the input
657
| significand is not normalized, `zExp' must be 0; in that case, the result
658
| returned is a subnormal number, and it must not require rounding. The
659
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
660
| Floating-Point Arithmetic.
661
*----------------------------------------------------------------------------*/
664
roundAndPackFloatx80(
665
int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
669
flag roundNearestEven, increment, isTiny;
670
int64 roundIncrement, roundMask, roundBits;
672
roundingMode = STATUS(float_rounding_mode);
673
roundNearestEven = ( roundingMode == float_round_nearest_even );
674
if ( roundingPrecision == 80 ) goto precision80;
675
if ( roundingPrecision == 64 ) {
676
roundIncrement = LIT64( 0x0000000000000400 );
677
roundMask = LIT64( 0x00000000000007FF );
679
else if ( roundingPrecision == 32 ) {
680
roundIncrement = LIT64( 0x0000008000000000 );
681
roundMask = LIT64( 0x000000FFFFFFFFFF );
686
zSig0 |= ( zSig1 != 0 );
687
if ( ! roundNearestEven ) {
688
if ( roundingMode == float_round_to_zero ) {
692
roundIncrement = roundMask;
694
if ( roundingMode == float_round_up ) roundIncrement = 0;
697
if ( roundingMode == float_round_down ) roundIncrement = 0;
701
roundBits = zSig0 & roundMask;
702
if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
703
if ( ( 0x7FFE < zExp )
704
|| ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
709
if (STATUS(flush_to_zero)) {
710
float_raise(float_flag_output_denormal STATUS_VAR);
711
return packFloatx80(zSign, 0, 0);
714
( STATUS(float_detect_tininess) == float_tininess_before_rounding )
716
|| ( zSig0 <= zSig0 + roundIncrement );
717
shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
719
roundBits = zSig0 & roundMask;
720
if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
721
if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
722
zSig0 += roundIncrement;
723
if ( (int64_t) zSig0 < 0 ) zExp = 1;
724
roundIncrement = roundMask + 1;
725
if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
726
roundMask |= roundIncrement;
728
zSig0 &= ~ roundMask;
729
return packFloatx80( zSign, zExp, zSig0 );
732
if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
733
zSig0 += roundIncrement;
734
if ( zSig0 < roundIncrement ) {
736
zSig0 = LIT64( 0x8000000000000000 );
738
roundIncrement = roundMask + 1;
739
if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
740
roundMask |= roundIncrement;
742
zSig0 &= ~ roundMask;
743
if ( zSig0 == 0 ) zExp = 0;
744
return packFloatx80( zSign, zExp, zSig0 );
746
increment = ( (int64_t) zSig1 < 0 );
747
if ( ! roundNearestEven ) {
748
if ( roundingMode == float_round_to_zero ) {
753
increment = ( roundingMode == float_round_down ) && zSig1;
756
increment = ( roundingMode == float_round_up ) && zSig1;
760
if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
761
if ( ( 0x7FFE < zExp )
762
|| ( ( zExp == 0x7FFE )
763
&& ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
769
float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
770
if ( ( roundingMode == float_round_to_zero )
771
|| ( zSign && ( roundingMode == float_round_up ) )
772
|| ( ! zSign && ( roundingMode == float_round_down ) )
774
return packFloatx80( zSign, 0x7FFE, ~ roundMask );
776
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
780
( STATUS(float_detect_tininess) == float_tininess_before_rounding )
783
|| ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
784
shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
786
if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
787
if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
788
if ( roundNearestEven ) {
789
increment = ( (int64_t) zSig1 < 0 );
793
increment = ( roundingMode == float_round_down ) && zSig1;
796
increment = ( roundingMode == float_round_up ) && zSig1;
802
~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
803
if ( (int64_t) zSig0 < 0 ) zExp = 1;
805
return packFloatx80( zSign, zExp, zSig0 );
808
if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
813
zSig0 = LIT64( 0x8000000000000000 );
816
zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
820
if ( zSig0 == 0 ) zExp = 0;
822
return packFloatx80( zSign, zExp, zSig0 );
826
/*----------------------------------------------------------------------------
827
| Takes an abstract floating-point value having sign `zSign', exponent
828
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
829
| and returns the proper extended double-precision floating-point value
830
| corresponding to the abstract input. This routine is just like
831
| `roundAndPackFloatx80' except that the input significand does not have to be
833
*----------------------------------------------------------------------------*/
836
normalizeRoundAndPackFloatx80(
837
int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
847
shiftCount = countLeadingZeros64( zSig0 );
848
shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
851
roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
855
/*----------------------------------------------------------------------------
856
| Returns the least-significant 64 fraction bits of the quadruple-precision
857
| floating-point value `a'.
858
*----------------------------------------------------------------------------*/
860
INLINE uint64_t extractFloat128Frac1( float128 a )
867
/*----------------------------------------------------------------------------
868
| Returns the most-significant 48 fraction bits of the quadruple-precision
869
| floating-point value `a'.
870
*----------------------------------------------------------------------------*/
872
INLINE uint64_t extractFloat128Frac0( float128 a )
875
return a.high & LIT64( 0x0000FFFFFFFFFFFF );
879
/*----------------------------------------------------------------------------
880
| Returns the exponent bits of the quadruple-precision floating-point value
882
*----------------------------------------------------------------------------*/
884
INLINE int32 extractFloat128Exp( float128 a )
887
return ( a.high>>48 ) & 0x7FFF;
891
/*----------------------------------------------------------------------------
892
| Returns the sign bit of the quadruple-precision floating-point value `a'.
893
*----------------------------------------------------------------------------*/
895
INLINE flag extractFloat128Sign( float128 a )
902
/*----------------------------------------------------------------------------
903
| Normalizes the subnormal quadruple-precision floating-point value
904
| represented by the denormalized significand formed by the concatenation of
905
| `aSig0' and `aSig1'. The normalized exponent is stored at the location
906
| pointed to by `zExpPtr'. The most significant 49 bits of the normalized
907
| significand are stored at the location pointed to by `zSig0Ptr', and the
908
| least significant 64 bits of the normalized significand are stored at the
909
| location pointed to by `zSig1Ptr'.
910
*----------------------------------------------------------------------------*/
913
normalizeFloat128Subnormal(
924
shiftCount = countLeadingZeros64( aSig1 ) - 15;
925
if ( shiftCount < 0 ) {
926
*zSig0Ptr = aSig1>>( - shiftCount );
927
*zSig1Ptr = aSig1<<( shiftCount & 63 );
930
*zSig0Ptr = aSig1<<shiftCount;
933
*zExpPtr = - shiftCount - 63;
936
shiftCount = countLeadingZeros64( aSig0 ) - 15;
937
shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
938
*zExpPtr = 1 - shiftCount;
943
/*----------------------------------------------------------------------------
944
| Packs the sign `zSign', the exponent `zExp', and the significand formed
945
| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
946
| floating-point value, returning the result. After being shifted into the
947
| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
948
| added together to form the most significant 32 bits of the result. This
949
| means that any integer portion of `zSig0' will be added into the exponent.
950
| Since a properly normalized significand will have an integer portion equal
951
| to 1, the `zExp' input should be 1 less than the desired result exponent
952
| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
954
*----------------------------------------------------------------------------*/
957
packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
962
z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
967
/*----------------------------------------------------------------------------
968
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
969
| and extended significand formed by the concatenation of `zSig0', `zSig1',
970
| and `zSig2', and returns the proper quadruple-precision floating-point value
971
| corresponding to the abstract input. Ordinarily, the abstract value is
972
| simply rounded and packed into the quadruple-precision format, with the
973
| inexact exception raised if the abstract input cannot be represented
974
| exactly. However, if the abstract value is too large, the overflow and
975
| inexact exceptions are raised and an infinity or maximal finite value is
976
| returned. If the abstract value is too small, the input value is rounded to
977
| a subnormal number, and the underflow and inexact exceptions are raised if
978
| the abstract input cannot be represented exactly as a subnormal quadruple-
979
| precision floating-point number.
980
| The input significand must be normalized or smaller. If the input
981
| significand is not normalized, `zExp' must be 0; in that case, the result
982
| returned is a subnormal number, and it must not require rounding. In the
983
| usual case that the input significand is normalized, `zExp' must be 1 less
984
| than the ``true'' floating-point exponent. The handling of underflow and
985
| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
986
*----------------------------------------------------------------------------*/
989
roundAndPackFloat128(
990
flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
993
flag roundNearestEven, increment, isTiny;
995
roundingMode = STATUS(float_rounding_mode);
996
roundNearestEven = ( roundingMode == float_round_nearest_even );
997
increment = ( (int64_t) zSig2 < 0 );
998
if ( ! roundNearestEven ) {
999
if ( roundingMode == float_round_to_zero ) {
1004
increment = ( roundingMode == float_round_down ) && zSig2;
1007
increment = ( roundingMode == float_round_up ) && zSig2;
1011
if ( 0x7FFD <= (uint32_t) zExp ) {
1012
if ( ( 0x7FFD < zExp )
1013
|| ( ( zExp == 0x7FFD )
1015
LIT64( 0x0001FFFFFFFFFFFF ),
1016
LIT64( 0xFFFFFFFFFFFFFFFF ),
1023
float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1024
if ( ( roundingMode == float_round_to_zero )
1025
|| ( zSign && ( roundingMode == float_round_up ) )
1026
|| ( ! zSign && ( roundingMode == float_round_down ) )
1032
LIT64( 0x0000FFFFFFFFFFFF ),
1033
LIT64( 0xFFFFFFFFFFFFFFFF )
1036
return packFloat128( zSign, 0x7FFF, 0, 0 );
1039
if (STATUS(flush_to_zero)) {
1040
float_raise(float_flag_output_denormal STATUS_VAR);
1041
return packFloat128(zSign, 0, 0, 0);
1044
( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1050
LIT64( 0x0001FFFFFFFFFFFF ),
1051
LIT64( 0xFFFFFFFFFFFFFFFF )
1053
shift128ExtraRightJamming(
1054
zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1056
if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1057
if ( roundNearestEven ) {
1058
increment = ( (int64_t) zSig2 < 0 );
1062
increment = ( roundingMode == float_round_down ) && zSig2;
1065
increment = ( roundingMode == float_round_up ) && zSig2;
1070
if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1072
add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1073
zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1076
if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1078
return packFloat128( zSign, zExp, zSig0, zSig1 );
1082
/*----------------------------------------------------------------------------
1083
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1084
| and significand formed by the concatenation of `zSig0' and `zSig1', and
1085
| returns the proper quadruple-precision floating-point value corresponding
1086
| to the abstract input. This routine is just like `roundAndPackFloat128'
1087
| except that the input significand has fewer bits and does not have to be
1088
| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1090
*----------------------------------------------------------------------------*/
1093
normalizeRoundAndPackFloat128(
1094
flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1104
shiftCount = countLeadingZeros64( zSig0 ) - 15;
1105
if ( 0 <= shiftCount ) {
1107
shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1110
shift128ExtraRightJamming(
1111
zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1114
return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1118
/*----------------------------------------------------------------------------
1119
| Returns the result of converting the 32-bit two's complement integer `a'
1120
| to the single-precision floating-point format. The conversion is performed
1121
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1122
*----------------------------------------------------------------------------*/
1124
float32 int32_to_float32( int32 a STATUS_PARAM )
1128
if ( a == 0 ) return float32_zero;
1129
if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1131
return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1135
/*----------------------------------------------------------------------------
1136
| Returns the result of converting the 32-bit two's complement integer `a'
1137
| to the double-precision floating-point format. The conversion is performed
1138
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1139
*----------------------------------------------------------------------------*/
1141
float64 int32_to_float64( int32 a STATUS_PARAM )
1148
if ( a == 0 ) return float64_zero;
1150
absA = zSign ? - a : a;
1151
shiftCount = countLeadingZeros32( absA ) + 21;
1153
return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1157
/*----------------------------------------------------------------------------
1158
| Returns the result of converting the 32-bit two's complement integer `a'
1159
| to the extended double-precision floating-point format. The conversion
1160
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1162
*----------------------------------------------------------------------------*/
1164
floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1171
if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1173
absA = zSign ? - a : a;
1174
shiftCount = countLeadingZeros32( absA ) + 32;
1176
return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1180
/*----------------------------------------------------------------------------
1181
| Returns the result of converting the 32-bit two's complement integer `a' to
1182
| the quadruple-precision floating-point format. The conversion is performed
1183
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1184
*----------------------------------------------------------------------------*/
1186
float128 int32_to_float128( int32 a STATUS_PARAM )
1193
if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1195
absA = zSign ? - a : a;
1196
shiftCount = countLeadingZeros32( absA ) + 17;
1198
return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1202
/*----------------------------------------------------------------------------
1203
| Returns the result of converting the 64-bit two's complement integer `a'
1204
| to the single-precision floating-point format. The conversion is performed
1205
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1206
*----------------------------------------------------------------------------*/
1208
float32 int64_to_float32( int64 a STATUS_PARAM )
1214
if ( a == 0 ) return float32_zero;
1216
absA = zSign ? - a : a;
1217
shiftCount = countLeadingZeros64( absA ) - 40;
1218
if ( 0 <= shiftCount ) {
1219
return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1223
if ( shiftCount < 0 ) {
1224
shift64RightJamming( absA, - shiftCount, &absA );
1227
absA <<= shiftCount;
1229
return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1234
float32 uint64_to_float32( uint64 a STATUS_PARAM )
1238
if ( a == 0 ) return float32_zero;
1239
shiftCount = countLeadingZeros64( a ) - 40;
1240
if ( 0 <= shiftCount ) {
1241
return packFloat32(0, 0x95 - shiftCount, a<<shiftCount);
1245
if ( shiftCount < 0 ) {
1246
shift64RightJamming( a, - shiftCount, &a );
1251
return roundAndPackFloat32(0, 0x9C - shiftCount, a STATUS_VAR);
1255
/*----------------------------------------------------------------------------
1256
| Returns the result of converting the 64-bit two's complement integer `a'
1257
| to the double-precision floating-point format. The conversion is performed
1258
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1259
*----------------------------------------------------------------------------*/
1261
float64 int64_to_float64( int64 a STATUS_PARAM )
1265
if ( a == 0 ) return float64_zero;
1266
if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1267
return packFloat64( 1, 0x43E, 0 );
1270
return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1274
float64 uint64_to_float64(uint64 a STATUS_PARAM)
1279
return float64_zero;
1281
if ((int64_t)a < 0) {
1282
shift64RightJamming(a, 1, &a);
1285
return normalizeRoundAndPackFloat64(0, exp, a STATUS_VAR);
1288
/*----------------------------------------------------------------------------
1289
| Returns the result of converting the 64-bit two's complement integer `a'
1290
| to the extended double-precision floating-point format. The conversion
1291
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1293
*----------------------------------------------------------------------------*/
1295
floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1301
if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1303
absA = zSign ? - a : a;
1304
shiftCount = countLeadingZeros64( absA );
1305
return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1309
/*----------------------------------------------------------------------------
1310
| Returns the result of converting the 64-bit two's complement integer `a' to
1311
| the quadruple-precision floating-point format. The conversion is performed
1312
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1313
*----------------------------------------------------------------------------*/
1315
float128 int64_to_float128( int64 a STATUS_PARAM )
1321
uint64_t zSig0, zSig1;
1323
if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1325
absA = zSign ? - a : a;
1326
shiftCount = countLeadingZeros64( absA ) + 49;
1327
zExp = 0x406E - shiftCount;
1328
if ( 64 <= shiftCount ) {
1337
shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1338
return packFloat128( zSign, zExp, zSig0, zSig1 );
1342
float128 uint64_to_float128(uint64 a STATUS_PARAM)
1345
return float128_zero;
1347
return normalizeRoundAndPackFloat128(0, 0x406E, a, 0 STATUS_VAR);
1350
/*----------------------------------------------------------------------------
1351
| Returns the result of converting the single-precision floating-point value
1352
| `a' to the 32-bit two's complement integer format. The conversion is
1353
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1354
| Arithmetic---which means in particular that the conversion is rounded
1355
| according to the current rounding mode. If `a' is a NaN, the largest
1356
| positive integer is returned. Otherwise, if the conversion overflows, the
1357
| largest integer with the same sign as `a' is returned.
1358
*----------------------------------------------------------------------------*/
1360
int32 float32_to_int32( float32 a STATUS_PARAM )
1363
int_fast16_t aExp, shiftCount;
1367
a = float32_squash_input_denormal(a STATUS_VAR);
1368
aSig = extractFloat32Frac( a );
1369
aExp = extractFloat32Exp( a );
1370
aSign = extractFloat32Sign( a );
1371
if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1372
if ( aExp ) aSig |= 0x00800000;
1373
shiftCount = 0xAF - aExp;
1376
if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1377
return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1381
/*----------------------------------------------------------------------------
1382
| Returns the result of converting the single-precision floating-point value
1383
| `a' to the 32-bit two's complement integer format. The conversion is
1384
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1385
| Arithmetic, except that the conversion is always rounded toward zero.
1386
| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1387
| the conversion overflows, the largest integer with the same sign as `a' is
1389
*----------------------------------------------------------------------------*/
1391
int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1394
int_fast16_t aExp, shiftCount;
1397
a = float32_squash_input_denormal(a STATUS_VAR);
1399
aSig = extractFloat32Frac( a );
1400
aExp = extractFloat32Exp( a );
1401
aSign = extractFloat32Sign( a );
1402
shiftCount = aExp - 0x9E;
1403
if ( 0 <= shiftCount ) {
1404
if ( float32_val(a) != 0xCF000000 ) {
1405
float_raise( float_flag_invalid STATUS_VAR);
1406
if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1408
return (int32_t) 0x80000000;
1410
else if ( aExp <= 0x7E ) {
1411
if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1414
aSig = ( aSig | 0x00800000 )<<8;
1415
z = aSig>>( - shiftCount );
1416
if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1417
STATUS(float_exception_flags) |= float_flag_inexact;
1419
if ( aSign ) z = - z;
1424
/*----------------------------------------------------------------------------
1425
| Returns the result of converting the single-precision floating-point value
1426
| `a' to the 16-bit two's complement integer format. The conversion is
1427
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1428
| Arithmetic, except that the conversion is always rounded toward zero.
1429
| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1430
| the conversion overflows, the largest integer with the same sign as `a' is
1432
*----------------------------------------------------------------------------*/
1434
int_fast16_t float32_to_int16_round_to_zero(float32 a STATUS_PARAM)
1437
int_fast16_t aExp, shiftCount;
1441
aSig = extractFloat32Frac( a );
1442
aExp = extractFloat32Exp( a );
1443
aSign = extractFloat32Sign( a );
1444
shiftCount = aExp - 0x8E;
1445
if ( 0 <= shiftCount ) {
1446
if ( float32_val(a) != 0xC7000000 ) {
1447
float_raise( float_flag_invalid STATUS_VAR);
1448
if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1452
return (int32_t) 0xffff8000;
1454
else if ( aExp <= 0x7E ) {
1455
if ( aExp | aSig ) {
1456
STATUS(float_exception_flags) |= float_flag_inexact;
1461
aSig = ( aSig | 0x00800000 )<<8;
1462
z = aSig>>( - shiftCount );
1463
if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1464
STATUS(float_exception_flags) |= float_flag_inexact;
1473
/*----------------------------------------------------------------------------
1474
| Returns the result of converting the single-precision floating-point value
1475
| `a' to the 64-bit two's complement integer format. The conversion is
1476
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1477
| Arithmetic---which means in particular that the conversion is rounded
1478
| according to the current rounding mode. If `a' is a NaN, the largest
1479
| positive integer is returned. Otherwise, if the conversion overflows, the
1480
| largest integer with the same sign as `a' is returned.
1481
*----------------------------------------------------------------------------*/
1483
int64 float32_to_int64( float32 a STATUS_PARAM )
1486
int_fast16_t aExp, shiftCount;
1488
uint64_t aSig64, aSigExtra;
1489
a = float32_squash_input_denormal(a STATUS_VAR);
1491
aSig = extractFloat32Frac( a );
1492
aExp = extractFloat32Exp( a );
1493
aSign = extractFloat32Sign( a );
1494
shiftCount = 0xBE - aExp;
1495
if ( shiftCount < 0 ) {
1496
float_raise( float_flag_invalid STATUS_VAR);
1497
if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1498
return LIT64( 0x7FFFFFFFFFFFFFFF );
1500
return (int64_t) LIT64( 0x8000000000000000 );
1502
if ( aExp ) aSig |= 0x00800000;
1505
shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1506
return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1510
/*----------------------------------------------------------------------------
1511
| Returns the result of converting the single-precision floating-point value
1512
| `a' to the 64-bit two's complement integer format. The conversion is
1513
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1514
| Arithmetic, except that the conversion is always rounded toward zero. If
1515
| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1516
| conversion overflows, the largest integer with the same sign as `a' is
1518
*----------------------------------------------------------------------------*/
1520
int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1523
int_fast16_t aExp, shiftCount;
1527
a = float32_squash_input_denormal(a STATUS_VAR);
1529
aSig = extractFloat32Frac( a );
1530
aExp = extractFloat32Exp( a );
1531
aSign = extractFloat32Sign( a );
1532
shiftCount = aExp - 0xBE;
1533
if ( 0 <= shiftCount ) {
1534
if ( float32_val(a) != 0xDF000000 ) {
1535
float_raise( float_flag_invalid STATUS_VAR);
1536
if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1537
return LIT64( 0x7FFFFFFFFFFFFFFF );
1540
return (int64_t) LIT64( 0x8000000000000000 );
1542
else if ( aExp <= 0x7E ) {
1543
if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1546
aSig64 = aSig | 0x00800000;
1548
z = aSig64>>( - shiftCount );
1549
if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1550
STATUS(float_exception_flags) |= float_flag_inexact;
1552
if ( aSign ) z = - z;
1557
/*----------------------------------------------------------------------------
1558
| Returns the result of converting the single-precision floating-point value
1559
| `a' to the double-precision floating-point format. The conversion is
1560
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1562
*----------------------------------------------------------------------------*/
1564
float64 float32_to_float64( float32 a STATUS_PARAM )
1569
a = float32_squash_input_denormal(a STATUS_VAR);
1571
aSig = extractFloat32Frac( a );
1572
aExp = extractFloat32Exp( a );
1573
aSign = extractFloat32Sign( a );
1574
if ( aExp == 0xFF ) {
1575
if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1576
return packFloat64( aSign, 0x7FF, 0 );
1579
if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1580
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1583
return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1587
/*----------------------------------------------------------------------------
1588
| Returns the result of converting the single-precision floating-point value
1589
| `a' to the extended double-precision floating-point format. The conversion
1590
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1592
*----------------------------------------------------------------------------*/
1594
floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1600
a = float32_squash_input_denormal(a STATUS_VAR);
1601
aSig = extractFloat32Frac( a );
1602
aExp = extractFloat32Exp( a );
1603
aSign = extractFloat32Sign( a );
1604
if ( aExp == 0xFF ) {
1605
if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1606
return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1609
if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1610
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1613
return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1617
/*----------------------------------------------------------------------------
1618
| Returns the result of converting the single-precision floating-point value
1619
| `a' to the double-precision floating-point format. The conversion is
1620
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1622
*----------------------------------------------------------------------------*/
1624
float128 float32_to_float128( float32 a STATUS_PARAM )
1630
a = float32_squash_input_denormal(a STATUS_VAR);
1631
aSig = extractFloat32Frac( a );
1632
aExp = extractFloat32Exp( a );
1633
aSign = extractFloat32Sign( a );
1634
if ( aExp == 0xFF ) {
1635
if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1636
return packFloat128( aSign, 0x7FFF, 0, 0 );
1639
if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1640
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1643
return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1647
/*----------------------------------------------------------------------------
1648
| Rounds the single-precision floating-point value `a' to an integer, and
1649
| returns the result as a single-precision floating-point value. The
1650
| operation is performed according to the IEC/IEEE Standard for Binary
1651
| Floating-Point Arithmetic.
1652
*----------------------------------------------------------------------------*/
1654
float32 float32_round_to_int( float32 a STATUS_PARAM)
1658
uint32_t lastBitMask, roundBitsMask;
1661
a = float32_squash_input_denormal(a STATUS_VAR);
1663
aExp = extractFloat32Exp( a );
1664
if ( 0x96 <= aExp ) {
1665
if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1666
return propagateFloat32NaN( a, a STATUS_VAR );
1670
if ( aExp <= 0x7E ) {
1671
if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1672
STATUS(float_exception_flags) |= float_flag_inexact;
1673
aSign = extractFloat32Sign( a );
1674
switch ( STATUS(float_rounding_mode) ) {
1675
case float_round_nearest_even:
1676
if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1677
return packFloat32( aSign, 0x7F, 0 );
1680
case float_round_down:
1681
return make_float32(aSign ? 0xBF800000 : 0);
1682
case float_round_up:
1683
return make_float32(aSign ? 0x80000000 : 0x3F800000);
1685
return packFloat32( aSign, 0, 0 );
1688
lastBitMask <<= 0x96 - aExp;
1689
roundBitsMask = lastBitMask - 1;
1691
roundingMode = STATUS(float_rounding_mode);
1692
if ( roundingMode == float_round_nearest_even ) {
1693
z += lastBitMask>>1;
1694
if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1696
else if ( roundingMode != float_round_to_zero ) {
1697
if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1701
z &= ~ roundBitsMask;
1702
if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1703
return make_float32(z);
1707
/*----------------------------------------------------------------------------
1708
| Returns the result of adding the absolute values of the single-precision
1709
| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1710
| before being returned. `zSign' is ignored if the result is a NaN.
1711
| The addition is performed according to the IEC/IEEE Standard for Binary
1712
| Floating-Point Arithmetic.
1713
*----------------------------------------------------------------------------*/
1715
static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1717
int_fast16_t aExp, bExp, zExp;
1718
uint32_t aSig, bSig, zSig;
1719
int_fast16_t expDiff;
1721
aSig = extractFloat32Frac( a );
1722
aExp = extractFloat32Exp( a );
1723
bSig = extractFloat32Frac( b );
1724
bExp = extractFloat32Exp( b );
1725
expDiff = aExp - bExp;
1728
if ( 0 < expDiff ) {
1729
if ( aExp == 0xFF ) {
1730
if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1739
shift32RightJamming( bSig, expDiff, &bSig );
1742
else if ( expDiff < 0 ) {
1743
if ( bExp == 0xFF ) {
1744
if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1745
return packFloat32( zSign, 0xFF, 0 );
1753
shift32RightJamming( aSig, - expDiff, &aSig );
1757
if ( aExp == 0xFF ) {
1758
if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1762
if (STATUS(flush_to_zero)) {
1764
float_raise(float_flag_output_denormal STATUS_VAR);
1766
return packFloat32(zSign, 0, 0);
1768
return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1770
zSig = 0x40000000 + aSig + bSig;
1775
zSig = ( aSig + bSig )<<1;
1777
if ( (int32_t) zSig < 0 ) {
1782
return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1786
/*----------------------------------------------------------------------------
1787
| Returns the result of subtracting the absolute values of the single-
1788
| precision floating-point values `a' and `b'. If `zSign' is 1, the
1789
| difference is negated before being returned. `zSign' is ignored if the
1790
| result is a NaN. The subtraction is performed according to the IEC/IEEE
1791
| Standard for Binary Floating-Point Arithmetic.
1792
*----------------------------------------------------------------------------*/
1794
static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1796
int_fast16_t aExp, bExp, zExp;
1797
uint32_t aSig, bSig, zSig;
1798
int_fast16_t expDiff;
1800
aSig = extractFloat32Frac( a );
1801
aExp = extractFloat32Exp( a );
1802
bSig = extractFloat32Frac( b );
1803
bExp = extractFloat32Exp( b );
1804
expDiff = aExp - bExp;
1807
if ( 0 < expDiff ) goto aExpBigger;
1808
if ( expDiff < 0 ) goto bExpBigger;
1809
if ( aExp == 0xFF ) {
1810
if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1811
float_raise( float_flag_invalid STATUS_VAR);
1812
return float32_default_nan;
1818
if ( bSig < aSig ) goto aBigger;
1819
if ( aSig < bSig ) goto bBigger;
1820
return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1822
if ( bExp == 0xFF ) {
1823
if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1824
return packFloat32( zSign ^ 1, 0xFF, 0 );
1832
shift32RightJamming( aSig, - expDiff, &aSig );
1838
goto normalizeRoundAndPack;
1840
if ( aExp == 0xFF ) {
1841
if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1850
shift32RightJamming( bSig, expDiff, &bSig );
1855
normalizeRoundAndPack:
1857
return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1861
/*----------------------------------------------------------------------------
1862
| Returns the result of adding the single-precision floating-point values `a'
1863
| and `b'. The operation is performed according to the IEC/IEEE Standard for
1864
| Binary Floating-Point Arithmetic.
1865
*----------------------------------------------------------------------------*/
1867
float32 float32_add( float32 a, float32 b STATUS_PARAM )
1870
a = float32_squash_input_denormal(a STATUS_VAR);
1871
b = float32_squash_input_denormal(b STATUS_VAR);
1873
aSign = extractFloat32Sign( a );
1874
bSign = extractFloat32Sign( b );
1875
if ( aSign == bSign ) {
1876
return addFloat32Sigs( a, b, aSign STATUS_VAR);
1879
return subFloat32Sigs( a, b, aSign STATUS_VAR );
1884
/*----------------------------------------------------------------------------
1885
| Returns the result of subtracting the single-precision floating-point values
1886
| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1887
| for Binary Floating-Point Arithmetic.
1888
*----------------------------------------------------------------------------*/
1890
float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1893
a = float32_squash_input_denormal(a STATUS_VAR);
1894
b = float32_squash_input_denormal(b STATUS_VAR);
1896
aSign = extractFloat32Sign( a );
1897
bSign = extractFloat32Sign( b );
1898
if ( aSign == bSign ) {
1899
return subFloat32Sigs( a, b, aSign STATUS_VAR );
1902
return addFloat32Sigs( a, b, aSign STATUS_VAR );
1907
/*----------------------------------------------------------------------------
1908
| Returns the result of multiplying the single-precision floating-point values
1909
| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1910
| for Binary Floating-Point Arithmetic.
1911
*----------------------------------------------------------------------------*/
1913
float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1915
flag aSign, bSign, zSign;
1916
int_fast16_t aExp, bExp, zExp;
1917
uint32_t aSig, bSig;
1921
a = float32_squash_input_denormal(a STATUS_VAR);
1922
b = float32_squash_input_denormal(b STATUS_VAR);
1924
aSig = extractFloat32Frac( a );
1925
aExp = extractFloat32Exp( a );
1926
aSign = extractFloat32Sign( a );
1927
bSig = extractFloat32Frac( b );
1928
bExp = extractFloat32Exp( b );
1929
bSign = extractFloat32Sign( b );
1930
zSign = aSign ^ bSign;
1931
if ( aExp == 0xFF ) {
1932
if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1933
return propagateFloat32NaN( a, b STATUS_VAR );
1935
if ( ( bExp | bSig ) == 0 ) {
1936
float_raise( float_flag_invalid STATUS_VAR);
1937
return float32_default_nan;
1939
return packFloat32( zSign, 0xFF, 0 );
1941
if ( bExp == 0xFF ) {
1942
if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1943
if ( ( aExp | aSig ) == 0 ) {
1944
float_raise( float_flag_invalid STATUS_VAR);
1945
return float32_default_nan;
1947
return packFloat32( zSign, 0xFF, 0 );
1950
if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1951
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1954
if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1955
normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1957
zExp = aExp + bExp - 0x7F;
1958
aSig = ( aSig | 0x00800000 )<<7;
1959
bSig = ( bSig | 0x00800000 )<<8;
1960
shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
1962
if ( 0 <= (int32_t) ( zSig<<1 ) ) {
1966
return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1970
/*----------------------------------------------------------------------------
1971
| Returns the result of dividing the single-precision floating-point value `a'
1972
| by the corresponding value `b'. The operation is performed according to the
1973
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1974
*----------------------------------------------------------------------------*/
1976
float32 float32_div( float32 a, float32 b STATUS_PARAM )
1978
flag aSign, bSign, zSign;
1979
int_fast16_t aExp, bExp, zExp;
1980
uint32_t aSig, bSig, zSig;
1981
a = float32_squash_input_denormal(a STATUS_VAR);
1982
b = float32_squash_input_denormal(b STATUS_VAR);
1984
aSig = extractFloat32Frac( a );
1985
aExp = extractFloat32Exp( a );
1986
aSign = extractFloat32Sign( a );
1987
bSig = extractFloat32Frac( b );
1988
bExp = extractFloat32Exp( b );
1989
bSign = extractFloat32Sign( b );
1990
zSign = aSign ^ bSign;
1991
if ( aExp == 0xFF ) {
1992
if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1993
if ( bExp == 0xFF ) {
1994
if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1995
float_raise( float_flag_invalid STATUS_VAR);
1996
return float32_default_nan;
1998
return packFloat32( zSign, 0xFF, 0 );
2000
if ( bExp == 0xFF ) {
2001
if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2002
return packFloat32( zSign, 0, 0 );
2006
if ( ( aExp | aSig ) == 0 ) {
2007
float_raise( float_flag_invalid STATUS_VAR);
2008
return float32_default_nan;
2010
float_raise( float_flag_divbyzero STATUS_VAR);
2011
return packFloat32( zSign, 0xFF, 0 );
2013
normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2016
if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2017
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2019
zExp = aExp - bExp + 0x7D;
2020
aSig = ( aSig | 0x00800000 )<<7;
2021
bSig = ( bSig | 0x00800000 )<<8;
2022
if ( bSig <= ( aSig + aSig ) ) {
2026
zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2027
if ( ( zSig & 0x3F ) == 0 ) {
2028
zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2030
return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2034
/*----------------------------------------------------------------------------
2035
| Returns the remainder of the single-precision floating-point value `a'
2036
| with respect to the corresponding value `b'. The operation is performed
2037
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2038
*----------------------------------------------------------------------------*/
2040
float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2043
int_fast16_t aExp, bExp, expDiff;
2044
uint32_t aSig, bSig;
2046
uint64_t aSig64, bSig64, q64;
2047
uint32_t alternateASig;
2049
a = float32_squash_input_denormal(a STATUS_VAR);
2050
b = float32_squash_input_denormal(b STATUS_VAR);
2052
aSig = extractFloat32Frac( a );
2053
aExp = extractFloat32Exp( a );
2054
aSign = extractFloat32Sign( a );
2055
bSig = extractFloat32Frac( b );
2056
bExp = extractFloat32Exp( b );
2057
if ( aExp == 0xFF ) {
2058
if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2059
return propagateFloat32NaN( a, b STATUS_VAR );
2061
float_raise( float_flag_invalid STATUS_VAR);
2062
return float32_default_nan;
2064
if ( bExp == 0xFF ) {
2065
if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2070
float_raise( float_flag_invalid STATUS_VAR);
2071
return float32_default_nan;
2073
normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2076
if ( aSig == 0 ) return a;
2077
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2079
expDiff = aExp - bExp;
2082
if ( expDiff < 32 ) {
2085
if ( expDiff < 0 ) {
2086
if ( expDiff < -1 ) return a;
2089
q = ( bSig <= aSig );
2090
if ( q ) aSig -= bSig;
2091
if ( 0 < expDiff ) {
2092
q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2095
aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2103
if ( bSig <= aSig ) aSig -= bSig;
2104
aSig64 = ( (uint64_t) aSig )<<40;
2105
bSig64 = ( (uint64_t) bSig )<<40;
2107
while ( 0 < expDiff ) {
2108
q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2109
q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2110
aSig64 = - ( ( bSig * q64 )<<38 );
2114
q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2115
q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2116
q = q64>>( 64 - expDiff );
2118
aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2121
alternateASig = aSig;
2124
} while ( 0 <= (int32_t) aSig );
2125
sigMean = aSig + alternateASig;
2126
if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2127
aSig = alternateASig;
2129
zSign = ( (int32_t) aSig < 0 );
2130
if ( zSign ) aSig = - aSig;
2131
return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2135
/*----------------------------------------------------------------------------
2136
| Returns the result of multiplying the single-precision floating-point values
2137
| `a' and `b' then adding 'c', with no intermediate rounding step after the
2138
| multiplication. The operation is performed according to the IEC/IEEE
2139
| Standard for Binary Floating-Point Arithmetic 754-2008.
2140
| The flags argument allows the caller to select negation of the
2141
| addend, the intermediate product, or the final result. (The difference
2142
| between this and having the caller do a separate negation is that negating
2143
| externally will flip the sign bit on NaNs.)
2144
*----------------------------------------------------------------------------*/
2146
float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)
2148
flag aSign, bSign, cSign, zSign;
2149
int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
2150
uint32_t aSig, bSig, cSig;
2151
flag pInf, pZero, pSign;
2152
uint64_t pSig64, cSig64, zSig64;
2155
flag signflip, infzero;
2157
a = float32_squash_input_denormal(a STATUS_VAR);
2158
b = float32_squash_input_denormal(b STATUS_VAR);
2159
c = float32_squash_input_denormal(c STATUS_VAR);
2160
aSig = extractFloat32Frac(a);
2161
aExp = extractFloat32Exp(a);
2162
aSign = extractFloat32Sign(a);
2163
bSig = extractFloat32Frac(b);
2164
bExp = extractFloat32Exp(b);
2165
bSign = extractFloat32Sign(b);
2166
cSig = extractFloat32Frac(c);
2167
cExp = extractFloat32Exp(c);
2168
cSign = extractFloat32Sign(c);
2170
infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2171
(aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2173
/* It is implementation-defined whether the cases of (0,inf,qnan)
2174
* and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2175
* they return if they do), so we have to hand this information
2176
* off to the target-specific pick-a-NaN routine.
2178
if (((aExp == 0xff) && aSig) ||
2179
((bExp == 0xff) && bSig) ||
2180
((cExp == 0xff) && cSig)) {
2181
return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR);
2185
float_raise(float_flag_invalid STATUS_VAR);
2186
return float32_default_nan;
2189
if (flags & float_muladd_negate_c) {
2193
signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2195
/* Work out the sign and type of the product */
2196
pSign = aSign ^ bSign;
2197
if (flags & float_muladd_negate_product) {
2200
pInf = (aExp == 0xff) || (bExp == 0xff);
2201
pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2204
if (pInf && (pSign ^ cSign)) {
2205
/* addition of opposite-signed infinities => InvalidOperation */
2206
float_raise(float_flag_invalid STATUS_VAR);
2207
return float32_default_nan;
2209
/* Otherwise generate an infinity of the same sign */
2210
return packFloat32(cSign ^ signflip, 0xff, 0);
2214
return packFloat32(pSign ^ signflip, 0xff, 0);
2220
/* Adding two exact zeroes */
2221
if (pSign == cSign) {
2223
} else if (STATUS(float_rounding_mode) == float_round_down) {
2228
return packFloat32(zSign ^ signflip, 0, 0);
2230
/* Exact zero plus a denorm */
2231
if (STATUS(flush_to_zero)) {
2232
float_raise(float_flag_output_denormal STATUS_VAR);
2233
return packFloat32(cSign ^ signflip, 0, 0);
2236
/* Zero plus something non-zero : just return the something */
2237
return packFloat32(cSign ^ signflip, cExp, cSig);
2241
normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2244
normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2247
/* Calculate the actual result a * b + c */
2249
/* Multiply first; this is easy. */
2250
/* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2251
* because we want the true exponent, not the "one-less-than"
2252
* flavour that roundAndPackFloat32() takes.
2254
pExp = aExp + bExp - 0x7e;
2255
aSig = (aSig | 0x00800000) << 7;
2256
bSig = (bSig | 0x00800000) << 8;
2257
pSig64 = (uint64_t)aSig * bSig;
2258
if ((int64_t)(pSig64 << 1) >= 0) {
2263
zSign = pSign ^ signflip;
2265
/* Now pSig64 is the significand of the multiply, with the explicit bit in
2270
/* Throw out the special case of c being an exact zero now */
2271
shift64RightJamming(pSig64, 32, &pSig64);
2273
return roundAndPackFloat32(zSign, pExp - 1,
2276
normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2279
cSig64 = (uint64_t)cSig << (62 - 23);
2280
cSig64 |= LIT64(0x4000000000000000);
2281
expDiff = pExp - cExp;
2283
if (pSign == cSign) {
2286
/* scale c to match p */
2287
shift64RightJamming(cSig64, expDiff, &cSig64);
2289
} else if (expDiff < 0) {
2290
/* scale p to match c */
2291
shift64RightJamming(pSig64, -expDiff, &pSig64);
2294
/* no scaling needed */
2297
/* Add significands and make sure explicit bit ends up in posn 62 */
2298
zSig64 = pSig64 + cSig64;
2299
if ((int64_t)zSig64 < 0) {
2300
shift64RightJamming(zSig64, 1, &zSig64);
2307
shift64RightJamming(cSig64, expDiff, &cSig64);
2308
zSig64 = pSig64 - cSig64;
2310
} else if (expDiff < 0) {
2311
shift64RightJamming(pSig64, -expDiff, &pSig64);
2312
zSig64 = cSig64 - pSig64;
2317
if (cSig64 < pSig64) {
2318
zSig64 = pSig64 - cSig64;
2319
} else if (pSig64 < cSig64) {
2320
zSig64 = cSig64 - pSig64;
2325
if (STATUS(float_rounding_mode) == float_round_down) {
2328
return packFloat32(zSign, 0, 0);
2332
/* Normalize to put the explicit bit back into bit 62. */
2333
shiftcount = countLeadingZeros64(zSig64) - 1;
2334
zSig64 <<= shiftcount;
2337
shift64RightJamming(zSig64, 32, &zSig64);
2338
return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);
2342
/*----------------------------------------------------------------------------
2343
| Returns the square root of the single-precision floating-point value `a'.
2344
| The operation is performed according to the IEC/IEEE Standard for Binary
2345
| Floating-Point Arithmetic.
2346
*----------------------------------------------------------------------------*/
2348
float32 float32_sqrt( float32 a STATUS_PARAM )
2351
int_fast16_t aExp, zExp;
2352
uint32_t aSig, zSig;
2354
a = float32_squash_input_denormal(a STATUS_VAR);
2356
aSig = extractFloat32Frac( a );
2357
aExp = extractFloat32Exp( a );
2358
aSign = extractFloat32Sign( a );
2359
if ( aExp == 0xFF ) {
2360
if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2361
if ( ! aSign ) return a;
2362
float_raise( float_flag_invalid STATUS_VAR);
2363
return float32_default_nan;
2366
if ( ( aExp | aSig ) == 0 ) return a;
2367
float_raise( float_flag_invalid STATUS_VAR);
2368
return float32_default_nan;
2371
if ( aSig == 0 ) return float32_zero;
2372
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2374
zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2375
aSig = ( aSig | 0x00800000 )<<8;
2376
zSig = estimateSqrt32( aExp, aSig ) + 2;
2377
if ( ( zSig & 0x7F ) <= 5 ) {
2383
term = ( (uint64_t) zSig ) * zSig;
2384
rem = ( ( (uint64_t) aSig )<<32 ) - term;
2385
while ( (int64_t) rem < 0 ) {
2387
rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2389
zSig |= ( rem != 0 );
2391
shift32RightJamming( zSig, 1, &zSig );
2393
return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2397
/*----------------------------------------------------------------------------
2398
| Returns the binary exponential of the single-precision floating-point value
2399
| `a'. The operation is performed according to the IEC/IEEE Standard for
2400
| Binary Floating-Point Arithmetic.
2402
| Uses the following identities:
2404
| 1. -------------------------------------------------------------------------
2408
| 2. -------------------------------------------------------------------------
2411
| e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2413
*----------------------------------------------------------------------------*/
2415
static const float64 float32_exp2_coefficients[15] =
2417
const_float64( 0x3ff0000000000000ll ), /* 1 */
2418
const_float64( 0x3fe0000000000000ll ), /* 2 */
2419
const_float64( 0x3fc5555555555555ll ), /* 3 */
2420
const_float64( 0x3fa5555555555555ll ), /* 4 */
2421
const_float64( 0x3f81111111111111ll ), /* 5 */
2422
const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2423
const_float64( 0x3f2a01a01a01a01all ), /* 7 */
2424
const_float64( 0x3efa01a01a01a01all ), /* 8 */
2425
const_float64( 0x3ec71de3a556c734ll ), /* 9 */
2426
const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2427
const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2428
const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2429
const_float64( 0x3de6124613a86d09ll ), /* 13 */
2430
const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2431
const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2434
float32 float32_exp2( float32 a STATUS_PARAM )
2441
a = float32_squash_input_denormal(a STATUS_VAR);
2443
aSig = extractFloat32Frac( a );
2444
aExp = extractFloat32Exp( a );
2445
aSign = extractFloat32Sign( a );
2447
if ( aExp == 0xFF) {
2448
if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2449
return (aSign) ? float32_zero : a;
2452
if (aSig == 0) return float32_one;
2455
float_raise( float_flag_inexact STATUS_VAR);
2457
/* ******************************* */
2458
/* using float64 for approximation */
2459
/* ******************************* */
2460
x = float32_to_float64(a STATUS_VAR);
2461
x = float64_mul(x, float64_ln2 STATUS_VAR);
2465
for (i = 0 ; i < 15 ; i++) {
2468
f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2469
r = float64_add(r, f STATUS_VAR);
2471
xn = float64_mul(xn, x STATUS_VAR);
2474
return float64_to_float32(r, status);
2477
/*----------------------------------------------------------------------------
2478
| Returns the binary log of the single-precision floating-point value `a'.
2479
| The operation is performed according to the IEC/IEEE Standard for Binary
2480
| Floating-Point Arithmetic.
2481
*----------------------------------------------------------------------------*/
2482
float32 float32_log2( float32 a STATUS_PARAM )
2486
uint32_t aSig, zSig, i;
2488
a = float32_squash_input_denormal(a STATUS_VAR);
2489
aSig = extractFloat32Frac( a );
2490
aExp = extractFloat32Exp( a );
2491
aSign = extractFloat32Sign( a );
2494
if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2495
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2498
float_raise( float_flag_invalid STATUS_VAR);
2499
return float32_default_nan;
2501
if ( aExp == 0xFF ) {
2502
if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2511
for (i = 1 << 22; i > 0; i >>= 1) {
2512
aSig = ( (uint64_t)aSig * aSig ) >> 23;
2513
if ( aSig & 0x01000000 ) {
2522
return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2525
/*----------------------------------------------------------------------------
2526
| Returns 1 if the single-precision floating-point value `a' is equal to
2527
| the corresponding value `b', and 0 otherwise. The invalid exception is
2528
| raised if either operand is a NaN. Otherwise, the comparison is performed
2529
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2530
*----------------------------------------------------------------------------*/
2532
int float32_eq( float32 a, float32 b STATUS_PARAM )
2535
a = float32_squash_input_denormal(a STATUS_VAR);
2536
b = float32_squash_input_denormal(b STATUS_VAR);
2538
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2539
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2541
float_raise( float_flag_invalid STATUS_VAR);
2544
av = float32_val(a);
2545
bv = float32_val(b);
2546
return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2549
/*----------------------------------------------------------------------------
2550
| Returns 1 if the single-precision floating-point value `a' is less than
2551
| or equal to the corresponding value `b', and 0 otherwise. The invalid
2552
| exception is raised if either operand is a NaN. The comparison is performed
2553
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2554
*----------------------------------------------------------------------------*/
2556
int float32_le( float32 a, float32 b STATUS_PARAM )
2560
a = float32_squash_input_denormal(a STATUS_VAR);
2561
b = float32_squash_input_denormal(b STATUS_VAR);
2563
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2564
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2566
float_raise( float_flag_invalid STATUS_VAR);
2569
aSign = extractFloat32Sign( a );
2570
bSign = extractFloat32Sign( b );
2571
av = float32_val(a);
2572
bv = float32_val(b);
2573
if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2574
return ( av == bv ) || ( aSign ^ ( av < bv ) );
2578
/*----------------------------------------------------------------------------
2579
| Returns 1 if the single-precision floating-point value `a' is less than
2580
| the corresponding value `b', and 0 otherwise. The invalid exception is
2581
| raised if either operand is a NaN. The comparison is performed according
2582
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2583
*----------------------------------------------------------------------------*/
2585
int float32_lt( float32 a, float32 b STATUS_PARAM )
2589
a = float32_squash_input_denormal(a STATUS_VAR);
2590
b = float32_squash_input_denormal(b STATUS_VAR);
2592
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2593
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2595
float_raise( float_flag_invalid STATUS_VAR);
2598
aSign = extractFloat32Sign( a );
2599
bSign = extractFloat32Sign( b );
2600
av = float32_val(a);
2601
bv = float32_val(b);
2602
if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2603
return ( av != bv ) && ( aSign ^ ( av < bv ) );
2607
/*----------------------------------------------------------------------------
2608
| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2609
| be compared, and 0 otherwise. The invalid exception is raised if either
2610
| operand is a NaN. The comparison is performed according to the IEC/IEEE
2611
| Standard for Binary Floating-Point Arithmetic.
2612
*----------------------------------------------------------------------------*/
2614
int float32_unordered( float32 a, float32 b STATUS_PARAM )
2616
a = float32_squash_input_denormal(a STATUS_VAR);
2617
b = float32_squash_input_denormal(b STATUS_VAR);
2619
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2620
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2622
float_raise( float_flag_invalid STATUS_VAR);
2628
/*----------------------------------------------------------------------------
2629
| Returns 1 if the single-precision floating-point value `a' is equal to
2630
| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2631
| exception. The comparison is performed according to the IEC/IEEE Standard
2632
| for Binary Floating-Point Arithmetic.
2633
*----------------------------------------------------------------------------*/
2635
int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
2637
a = float32_squash_input_denormal(a STATUS_VAR);
2638
b = float32_squash_input_denormal(b STATUS_VAR);
2640
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2641
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2643
if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2644
float_raise( float_flag_invalid STATUS_VAR);
2648
return ( float32_val(a) == float32_val(b) ) ||
2649
( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2652
/*----------------------------------------------------------------------------
2653
| Returns 1 if the single-precision floating-point value `a' is less than or
2654
| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2655
| cause an exception. Otherwise, the comparison is performed according to the
2656
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2657
*----------------------------------------------------------------------------*/
2659
int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2663
a = float32_squash_input_denormal(a STATUS_VAR);
2664
b = float32_squash_input_denormal(b STATUS_VAR);
2666
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2667
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2669
if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2670
float_raise( float_flag_invalid STATUS_VAR);
2674
aSign = extractFloat32Sign( a );
2675
bSign = extractFloat32Sign( b );
2676
av = float32_val(a);
2677
bv = float32_val(b);
2678
if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2679
return ( av == bv ) || ( aSign ^ ( av < bv ) );
2683
/*----------------------------------------------------------------------------
2684
| Returns 1 if the single-precision floating-point value `a' is less than
2685
| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2686
| exception. Otherwise, the comparison is performed according to the IEC/IEEE
2687
| Standard for Binary Floating-Point Arithmetic.
2688
*----------------------------------------------------------------------------*/
2690
int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2694
a = float32_squash_input_denormal(a STATUS_VAR);
2695
b = float32_squash_input_denormal(b STATUS_VAR);
2697
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2698
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2700
if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2701
float_raise( float_flag_invalid STATUS_VAR);
2705
aSign = extractFloat32Sign( a );
2706
bSign = extractFloat32Sign( b );
2707
av = float32_val(a);
2708
bv = float32_val(b);
2709
if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2710
return ( av != bv ) && ( aSign ^ ( av < bv ) );
2714
/*----------------------------------------------------------------------------
2715
| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2716
| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
2717
| comparison is performed according to the IEC/IEEE Standard for Binary
2718
| Floating-Point Arithmetic.
2719
*----------------------------------------------------------------------------*/
2721
int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2723
a = float32_squash_input_denormal(a STATUS_VAR);
2724
b = float32_squash_input_denormal(b STATUS_VAR);
2726
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2727
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2729
if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2730
float_raise( float_flag_invalid STATUS_VAR);
2737
/*----------------------------------------------------------------------------
2738
| Returns the result of converting the double-precision floating-point value
2739
| `a' to the 32-bit two's complement integer format. The conversion is
2740
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2741
| Arithmetic---which means in particular that the conversion is rounded
2742
| according to the current rounding mode. If `a' is a NaN, the largest
2743
| positive integer is returned. Otherwise, if the conversion overflows, the
2744
| largest integer with the same sign as `a' is returned.
2745
*----------------------------------------------------------------------------*/
2747
int32 float64_to_int32( float64 a STATUS_PARAM )
2750
int_fast16_t aExp, shiftCount;
2752
a = float64_squash_input_denormal(a STATUS_VAR);
2754
aSig = extractFloat64Frac( a );
2755
aExp = extractFloat64Exp( a );
2756
aSign = extractFloat64Sign( a );
2757
if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2758
if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2759
shiftCount = 0x42C - aExp;
2760
if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2761
return roundAndPackInt32( aSign, aSig STATUS_VAR );
2765
/*----------------------------------------------------------------------------
2766
| Returns the result of converting the double-precision floating-point value
2767
| `a' to the 32-bit two's complement integer format. The conversion is
2768
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2769
| Arithmetic, except that the conversion is always rounded toward zero.
2770
| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2771
| the conversion overflows, the largest integer with the same sign as `a' is
2773
*----------------------------------------------------------------------------*/
2775
int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2778
int_fast16_t aExp, shiftCount;
2779
uint64_t aSig, savedASig;
2781
a = float64_squash_input_denormal(a STATUS_VAR);
2783
aSig = extractFloat64Frac( a );
2784
aExp = extractFloat64Exp( a );
2785
aSign = extractFloat64Sign( a );
2786
if ( 0x41E < aExp ) {
2787
if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2790
else if ( aExp < 0x3FF ) {
2791
if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2794
aSig |= LIT64( 0x0010000000000000 );
2795
shiftCount = 0x433 - aExp;
2797
aSig >>= shiftCount;
2799
if ( aSign ) z = - z;
2800
if ( ( z < 0 ) ^ aSign ) {
2802
float_raise( float_flag_invalid STATUS_VAR);
2803
return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2805
if ( ( aSig<<shiftCount ) != savedASig ) {
2806
STATUS(float_exception_flags) |= float_flag_inexact;
2812
/*----------------------------------------------------------------------------
2813
| Returns the result of converting the double-precision floating-point value
2814
| `a' to the 16-bit two's complement integer format. The conversion is
2815
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2816
| Arithmetic, except that the conversion is always rounded toward zero.
2817
| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2818
| the conversion overflows, the largest integer with the same sign as `a' is
2820
*----------------------------------------------------------------------------*/
2822
int_fast16_t float64_to_int16_round_to_zero(float64 a STATUS_PARAM)
2825
int_fast16_t aExp, shiftCount;
2826
uint64_t aSig, savedASig;
2829
aSig = extractFloat64Frac( a );
2830
aExp = extractFloat64Exp( a );
2831
aSign = extractFloat64Sign( a );
2832
if ( 0x40E < aExp ) {
2833
if ( ( aExp == 0x7FF ) && aSig ) {
2838
else if ( aExp < 0x3FF ) {
2839
if ( aExp || aSig ) {
2840
STATUS(float_exception_flags) |= float_flag_inexact;
2844
aSig |= LIT64( 0x0010000000000000 );
2845
shiftCount = 0x433 - aExp;
2847
aSig >>= shiftCount;
2852
if ( ( (int16_t)z < 0 ) ^ aSign ) {
2854
float_raise( float_flag_invalid STATUS_VAR);
2855
return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
2857
if ( ( aSig<<shiftCount ) != savedASig ) {
2858
STATUS(float_exception_flags) |= float_flag_inexact;
2863
/*----------------------------------------------------------------------------
2864
| Returns the result of converting the double-precision floating-point value
2865
| `a' to the 64-bit two's complement integer format. The conversion is
2866
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2867
| Arithmetic---which means in particular that the conversion is rounded
2868
| according to the current rounding mode. If `a' is a NaN, the largest
2869
| positive integer is returned. Otherwise, if the conversion overflows, the
2870
| largest integer with the same sign as `a' is returned.
2871
*----------------------------------------------------------------------------*/
2873
int64 float64_to_int64( float64 a STATUS_PARAM )
2876
int_fast16_t aExp, shiftCount;
2877
uint64_t aSig, aSigExtra;
2878
a = float64_squash_input_denormal(a STATUS_VAR);
2880
aSig = extractFloat64Frac( a );
2881
aExp = extractFloat64Exp( a );
2882
aSign = extractFloat64Sign( a );
2883
if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2884
shiftCount = 0x433 - aExp;
2885
if ( shiftCount <= 0 ) {
2886
if ( 0x43E < aExp ) {
2887
float_raise( float_flag_invalid STATUS_VAR);
2889
|| ( ( aExp == 0x7FF )
2890
&& ( aSig != LIT64( 0x0010000000000000 ) ) )
2892
return LIT64( 0x7FFFFFFFFFFFFFFF );
2894
return (int64_t) LIT64( 0x8000000000000000 );
2897
aSig <<= - shiftCount;
2900
shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2902
return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2906
/*----------------------------------------------------------------------------
2907
| Returns the result of converting the double-precision floating-point value
2908
| `a' to the 64-bit two's complement integer format. The conversion is
2909
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2910
| Arithmetic, except that the conversion is always rounded toward zero.
2911
| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2912
| the conversion overflows, the largest integer with the same sign as `a' is
2914
*----------------------------------------------------------------------------*/
2916
int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2919
int_fast16_t aExp, shiftCount;
2922
a = float64_squash_input_denormal(a STATUS_VAR);
2924
aSig = extractFloat64Frac( a );
2925
aExp = extractFloat64Exp( a );
2926
aSign = extractFloat64Sign( a );
2927
if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2928
shiftCount = aExp - 0x433;
2929
if ( 0 <= shiftCount ) {
2930
if ( 0x43E <= aExp ) {
2931
if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2932
float_raise( float_flag_invalid STATUS_VAR);
2934
|| ( ( aExp == 0x7FF )
2935
&& ( aSig != LIT64( 0x0010000000000000 ) ) )
2937
return LIT64( 0x7FFFFFFFFFFFFFFF );
2940
return (int64_t) LIT64( 0x8000000000000000 );
2942
z = aSig<<shiftCount;
2945
if ( aExp < 0x3FE ) {
2946
if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2949
z = aSig>>( - shiftCount );
2950
if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
2951
STATUS(float_exception_flags) |= float_flag_inexact;
2954
if ( aSign ) z = - z;
2959
/*----------------------------------------------------------------------------
2960
| Returns the result of converting the double-precision floating-point value
2961
| `a' to the single-precision floating-point format. The conversion is
2962
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2964
*----------------------------------------------------------------------------*/
2966
float32 float64_to_float32( float64 a STATUS_PARAM )
2972
a = float64_squash_input_denormal(a STATUS_VAR);
2974
aSig = extractFloat64Frac( a );
2975
aExp = extractFloat64Exp( a );
2976
aSign = extractFloat64Sign( a );
2977
if ( aExp == 0x7FF ) {
2978
if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2979
return packFloat32( aSign, 0xFF, 0 );
2981
shift64RightJamming( aSig, 22, &aSig );
2983
if ( aExp || zSig ) {
2987
return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2992
/*----------------------------------------------------------------------------
2993
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2994
| half-precision floating-point value, returning the result. After being
2995
| shifted into the proper positions, the three fields are simply added
2996
| together to form the result. This means that any integer portion of `zSig'
2997
| will be added into the exponent. Since a properly normalized significand
2998
| will have an integer portion equal to 1, the `zExp' input should be 1 less
2999
| than the desired result exponent whenever `zSig' is a complete, normalized
3001
*----------------------------------------------------------------------------*/
3002
static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
3004
return make_float16(
3005
(((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3008
/* Half precision floats come in two formats: standard IEEE and "ARM" format.
3009
The latter gains extra exponent range by omitting the NaN/Inf encodings. */
3011
float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
3017
aSign = extractFloat16Sign(a);
3018
aExp = extractFloat16Exp(a);
3019
aSig = extractFloat16Frac(a);
3021
if (aExp == 0x1f && ieee) {
3023
return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3025
return packFloat32(aSign, 0xff, 0);
3031
return packFloat32(aSign, 0, 0);
3034
shiftCount = countLeadingZeros32( aSig ) - 21;
3035
aSig = aSig << shiftCount;
3038
return packFloat32( aSign, aExp + 0x70, aSig << 13);
3041
float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
3049
a = float32_squash_input_denormal(a STATUS_VAR);
3051
aSig = extractFloat32Frac( a );
3052
aExp = extractFloat32Exp( a );
3053
aSign = extractFloat32Sign( a );
3054
if ( aExp == 0xFF ) {
3056
/* Input is a NaN */
3057
float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3059
return packFloat16(aSign, 0, 0);
3065
float_raise(float_flag_invalid STATUS_VAR);
3066
return packFloat16(aSign, 0x1f, 0x3ff);
3068
return packFloat16(aSign, 0x1f, 0);
3070
if (aExp == 0 && aSig == 0) {
3071
return packFloat16(aSign, 0, 0);
3073
/* Decimal point between bits 22 and 23. */
3085
float_raise( float_flag_underflow STATUS_VAR );
3086
roundingMode = STATUS(float_rounding_mode);
3087
switch (roundingMode) {
3088
case float_round_nearest_even:
3089
increment = (mask + 1) >> 1;
3090
if ((aSig & mask) == increment) {
3091
increment = aSig & (increment << 1);
3094
case float_round_up:
3095
increment = aSign ? 0 : mask;
3097
case float_round_down:
3098
increment = aSign ? mask : 0;
3100
default: /* round_to_zero */
3105
if (aSig >= 0x01000000) {
3109
} else if (aExp < -14
3110
&& STATUS(float_detect_tininess) == float_tininess_before_rounding) {
3111
float_raise( float_flag_underflow STATUS_VAR);
3116
float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
3117
return packFloat16(aSign, 0x1f, 0);
3121
float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR);
3122
return packFloat16(aSign, 0x1f, 0x3ff);
3126
return packFloat16(aSign, 0, 0);
3129
aSig >>= -14 - aExp;
3132
return packFloat16(aSign, aExp + 14, aSig >> 13);
3135
/*----------------------------------------------------------------------------
3136
| Returns the result of converting the double-precision floating-point value
3137
| `a' to the extended double-precision floating-point format. The conversion
3138
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3140
*----------------------------------------------------------------------------*/
3142
floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
3148
a = float64_squash_input_denormal(a STATUS_VAR);
3149
aSig = extractFloat64Frac( a );
3150
aExp = extractFloat64Exp( a );
3151
aSign = extractFloat64Sign( a );
3152
if ( aExp == 0x7FF ) {
3153
if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3154
return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3157
if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3158
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3162
aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3166
/*----------------------------------------------------------------------------
3167
| Returns the result of converting the double-precision floating-point value
3168
| `a' to the quadruple-precision floating-point format. The conversion is
3169
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3171
*----------------------------------------------------------------------------*/
3173
float128 float64_to_float128( float64 a STATUS_PARAM )
3177
uint64_t aSig, zSig0, zSig1;
3179
a = float64_squash_input_denormal(a STATUS_VAR);
3180
aSig = extractFloat64Frac( a );
3181
aExp = extractFloat64Exp( a );
3182
aSign = extractFloat64Sign( a );
3183
if ( aExp == 0x7FF ) {
3184
if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3185
return packFloat128( aSign, 0x7FFF, 0, 0 );
3188
if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3189
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3192
shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3193
return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3197
/*----------------------------------------------------------------------------
3198
| Rounds the double-precision floating-point value `a' to an integer, and
3199
| returns the result as a double-precision floating-point value. The
3200
| operation is performed according to the IEC/IEEE Standard for Binary
3201
| Floating-Point Arithmetic.
3202
*----------------------------------------------------------------------------*/
3204
float64 float64_round_to_int( float64 a STATUS_PARAM )
3208
uint64_t lastBitMask, roundBitsMask;
3211
a = float64_squash_input_denormal(a STATUS_VAR);
3213
aExp = extractFloat64Exp( a );
3214
if ( 0x433 <= aExp ) {
3215
if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3216
return propagateFloat64NaN( a, a STATUS_VAR );
3220
if ( aExp < 0x3FF ) {
3221
if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3222
STATUS(float_exception_flags) |= float_flag_inexact;
3223
aSign = extractFloat64Sign( a );
3224
switch ( STATUS(float_rounding_mode) ) {
3225
case float_round_nearest_even:
3226
if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3227
return packFloat64( aSign, 0x3FF, 0 );
3230
case float_round_down:
3231
return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3232
case float_round_up:
3233
return make_float64(
3234
aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3236
return packFloat64( aSign, 0, 0 );
3239
lastBitMask <<= 0x433 - aExp;
3240
roundBitsMask = lastBitMask - 1;
3242
roundingMode = STATUS(float_rounding_mode);
3243
if ( roundingMode == float_round_nearest_even ) {
3244
z += lastBitMask>>1;
3245
if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3247
else if ( roundingMode != float_round_to_zero ) {
3248
if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
3252
z &= ~ roundBitsMask;
3253
if ( z != float64_val(a) )
3254
STATUS(float_exception_flags) |= float_flag_inexact;
3255
return make_float64(z);
3259
float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3263
oldmode = STATUS(float_rounding_mode);
3264
STATUS(float_rounding_mode) = float_round_to_zero;
3265
res = float64_round_to_int(a STATUS_VAR);
3266
STATUS(float_rounding_mode) = oldmode;
3270
/*----------------------------------------------------------------------------
3271
| Returns the result of adding the absolute values of the double-precision
3272
| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3273
| before being returned. `zSign' is ignored if the result is a NaN.
3274
| The addition is performed according to the IEC/IEEE Standard for Binary
3275
| Floating-Point Arithmetic.
3276
*----------------------------------------------------------------------------*/
3278
static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3280
int_fast16_t aExp, bExp, zExp;
3281
uint64_t aSig, bSig, zSig;
3282
int_fast16_t expDiff;
3284
aSig = extractFloat64Frac( a );
3285
aExp = extractFloat64Exp( a );
3286
bSig = extractFloat64Frac( b );
3287
bExp = extractFloat64Exp( b );
3288
expDiff = aExp - bExp;
3291
if ( 0 < expDiff ) {
3292
if ( aExp == 0x7FF ) {
3293
if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3300
bSig |= LIT64( 0x2000000000000000 );
3302
shift64RightJamming( bSig, expDiff, &bSig );
3305
else if ( expDiff < 0 ) {
3306
if ( bExp == 0x7FF ) {
3307
if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3308
return packFloat64( zSign, 0x7FF, 0 );
3314
aSig |= LIT64( 0x2000000000000000 );
3316
shift64RightJamming( aSig, - expDiff, &aSig );
3320
if ( aExp == 0x7FF ) {
3321
if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3325
if (STATUS(flush_to_zero)) {
3327
float_raise(float_flag_output_denormal STATUS_VAR);
3329
return packFloat64(zSign, 0, 0);
3331
return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3333
zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3337
aSig |= LIT64( 0x2000000000000000 );
3338
zSig = ( aSig + bSig )<<1;
3340
if ( (int64_t) zSig < 0 ) {
3345
return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3349
/*----------------------------------------------------------------------------
3350
| Returns the result of subtracting the absolute values of the double-
3351
| precision floating-point values `a' and `b'. If `zSign' is 1, the
3352
| difference is negated before being returned. `zSign' is ignored if the
3353
| result is a NaN. The subtraction is performed according to the IEC/IEEE
3354
| Standard for Binary Floating-Point Arithmetic.
3355
*----------------------------------------------------------------------------*/
3357
static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3359
int_fast16_t aExp, bExp, zExp;
3360
uint64_t aSig, bSig, zSig;
3361
int_fast16_t expDiff;
3363
aSig = extractFloat64Frac( a );
3364
aExp = extractFloat64Exp( a );
3365
bSig = extractFloat64Frac( b );
3366
bExp = extractFloat64Exp( b );
3367
expDiff = aExp - bExp;
3370
if ( 0 < expDiff ) goto aExpBigger;
3371
if ( expDiff < 0 ) goto bExpBigger;
3372
if ( aExp == 0x7FF ) {
3373
if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3374
float_raise( float_flag_invalid STATUS_VAR);
3375
return float64_default_nan;
3381
if ( bSig < aSig ) goto aBigger;
3382
if ( aSig < bSig ) goto bBigger;
3383
return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3385
if ( bExp == 0x7FF ) {
3386
if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3387
return packFloat64( zSign ^ 1, 0x7FF, 0 );
3393
aSig |= LIT64( 0x4000000000000000 );
3395
shift64RightJamming( aSig, - expDiff, &aSig );
3396
bSig |= LIT64( 0x4000000000000000 );
3401
goto normalizeRoundAndPack;
3403
if ( aExp == 0x7FF ) {
3404
if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3411
bSig |= LIT64( 0x4000000000000000 );
3413
shift64RightJamming( bSig, expDiff, &bSig );
3414
aSig |= LIT64( 0x4000000000000000 );
3418
normalizeRoundAndPack:
3420
return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3424
/*----------------------------------------------------------------------------
3425
| Returns the result of adding the double-precision floating-point values `a'
3426
| and `b'. The operation is performed according to the IEC/IEEE Standard for
3427
| Binary Floating-Point Arithmetic.
3428
*----------------------------------------------------------------------------*/
3430
float64 float64_add( float64 a, float64 b STATUS_PARAM )
3433
a = float64_squash_input_denormal(a STATUS_VAR);
3434
b = float64_squash_input_denormal(b STATUS_VAR);
3436
aSign = extractFloat64Sign( a );
3437
bSign = extractFloat64Sign( b );
3438
if ( aSign == bSign ) {
3439
return addFloat64Sigs( a, b, aSign STATUS_VAR );
3442
return subFloat64Sigs( a, b, aSign STATUS_VAR );
3447
/*----------------------------------------------------------------------------
3448
| Returns the result of subtracting the double-precision floating-point values
3449
| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3450
| for Binary Floating-Point Arithmetic.
3451
*----------------------------------------------------------------------------*/
3453
float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3456
a = float64_squash_input_denormal(a STATUS_VAR);
3457
b = float64_squash_input_denormal(b STATUS_VAR);
3459
aSign = extractFloat64Sign( a );
3460
bSign = extractFloat64Sign( b );
3461
if ( aSign == bSign ) {
3462
return subFloat64Sigs( a, b, aSign STATUS_VAR );
3465
return addFloat64Sigs( a, b, aSign STATUS_VAR );
3470
/*----------------------------------------------------------------------------
3471
| Returns the result of multiplying the double-precision floating-point values
3472
| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3473
| for Binary Floating-Point Arithmetic.
3474
*----------------------------------------------------------------------------*/
3476
float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3478
flag aSign, bSign, zSign;
3479
int_fast16_t aExp, bExp, zExp;
3480
uint64_t aSig, bSig, zSig0, zSig1;
3482
a = float64_squash_input_denormal(a STATUS_VAR);
3483
b = float64_squash_input_denormal(b STATUS_VAR);
3485
aSig = extractFloat64Frac( a );
3486
aExp = extractFloat64Exp( a );
3487
aSign = extractFloat64Sign( a );
3488
bSig = extractFloat64Frac( b );
3489
bExp = extractFloat64Exp( b );
3490
bSign = extractFloat64Sign( b );
3491
zSign = aSign ^ bSign;
3492
if ( aExp == 0x7FF ) {
3493
if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3494
return propagateFloat64NaN( a, b STATUS_VAR );
3496
if ( ( bExp | bSig ) == 0 ) {
3497
float_raise( float_flag_invalid STATUS_VAR);
3498
return float64_default_nan;
3500
return packFloat64( zSign, 0x7FF, 0 );
3502
if ( bExp == 0x7FF ) {
3503
if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3504
if ( ( aExp | aSig ) == 0 ) {
3505
float_raise( float_flag_invalid STATUS_VAR);
3506
return float64_default_nan;
3508
return packFloat64( zSign, 0x7FF, 0 );
3511
if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3512
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3515
if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3516
normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3518
zExp = aExp + bExp - 0x3FF;
3519
aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3520
bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3521
mul64To128( aSig, bSig, &zSig0, &zSig1 );
3522
zSig0 |= ( zSig1 != 0 );
3523
if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
3527
return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3531
/*----------------------------------------------------------------------------
3532
| Returns the result of dividing the double-precision floating-point value `a'
3533
| by the corresponding value `b'. The operation is performed according to
3534
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3535
*----------------------------------------------------------------------------*/
3537
float64 float64_div( float64 a, float64 b STATUS_PARAM )
3539
flag aSign, bSign, zSign;
3540
int_fast16_t aExp, bExp, zExp;
3541
uint64_t aSig, bSig, zSig;
3542
uint64_t rem0, rem1;
3543
uint64_t term0, term1;
3544
a = float64_squash_input_denormal(a STATUS_VAR);
3545
b = float64_squash_input_denormal(b STATUS_VAR);
3547
aSig = extractFloat64Frac( a );
3548
aExp = extractFloat64Exp( a );
3549
aSign = extractFloat64Sign( a );
3550
bSig = extractFloat64Frac( b );
3551
bExp = extractFloat64Exp( b );
3552
bSign = extractFloat64Sign( b );
3553
zSign = aSign ^ bSign;
3554
if ( aExp == 0x7FF ) {
3555
if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3556
if ( bExp == 0x7FF ) {
3557
if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3558
float_raise( float_flag_invalid STATUS_VAR);
3559
return float64_default_nan;
3561
return packFloat64( zSign, 0x7FF, 0 );
3563
if ( bExp == 0x7FF ) {
3564
if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3565
return packFloat64( zSign, 0, 0 );
3569
if ( ( aExp | aSig ) == 0 ) {
3570
float_raise( float_flag_invalid STATUS_VAR);
3571
return float64_default_nan;
3573
float_raise( float_flag_divbyzero STATUS_VAR);
3574
return packFloat64( zSign, 0x7FF, 0 );
3576
normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3579
if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3580
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3582
zExp = aExp - bExp + 0x3FD;
3583
aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3584
bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3585
if ( bSig <= ( aSig + aSig ) ) {
3589
zSig = estimateDiv128To64( aSig, 0, bSig );
3590
if ( ( zSig & 0x1FF ) <= 2 ) {
3591
mul64To128( bSig, zSig, &term0, &term1 );
3592
sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3593
while ( (int64_t) rem0 < 0 ) {
3595
add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3597
zSig |= ( rem1 != 0 );
3599
return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3603
/*----------------------------------------------------------------------------
3604
| Returns the remainder of the double-precision floating-point value `a'
3605
| with respect to the corresponding value `b'. The operation is performed
3606
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3607
*----------------------------------------------------------------------------*/
3609
float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3612
int_fast16_t aExp, bExp, expDiff;
3613
uint64_t aSig, bSig;
3614
uint64_t q, alternateASig;
3617
a = float64_squash_input_denormal(a STATUS_VAR);
3618
b = float64_squash_input_denormal(b STATUS_VAR);
3619
aSig = extractFloat64Frac( a );
3620
aExp = extractFloat64Exp( a );
3621
aSign = extractFloat64Sign( a );
3622
bSig = extractFloat64Frac( b );
3623
bExp = extractFloat64Exp( b );
3624
if ( aExp == 0x7FF ) {
3625
if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3626
return propagateFloat64NaN( a, b STATUS_VAR );
3628
float_raise( float_flag_invalid STATUS_VAR);
3629
return float64_default_nan;
3631
if ( bExp == 0x7FF ) {
3632
if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3637
float_raise( float_flag_invalid STATUS_VAR);
3638
return float64_default_nan;
3640
normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3643
if ( aSig == 0 ) return a;
3644
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3646
expDiff = aExp - bExp;
3647
aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3648
bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3649
if ( expDiff < 0 ) {
3650
if ( expDiff < -1 ) return a;
3653
q = ( bSig <= aSig );
3654
if ( q ) aSig -= bSig;
3656
while ( 0 < expDiff ) {
3657
q = estimateDiv128To64( aSig, 0, bSig );
3658
q = ( 2 < q ) ? q - 2 : 0;
3659
aSig = - ( ( bSig>>2 ) * q );
3663
if ( 0 < expDiff ) {
3664
q = estimateDiv128To64( aSig, 0, bSig );
3665
q = ( 2 < q ) ? q - 2 : 0;
3668
aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3675
alternateASig = aSig;
3678
} while ( 0 <= (int64_t) aSig );
3679
sigMean = aSig + alternateASig;
3680
if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3681
aSig = alternateASig;
3683
zSign = ( (int64_t) aSig < 0 );
3684
if ( zSign ) aSig = - aSig;
3685
return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3689
/*----------------------------------------------------------------------------
3690
| Returns the result of multiplying the double-precision floating-point values
3691
| `a' and `b' then adding 'c', with no intermediate rounding step after the
3692
| multiplication. The operation is performed according to the IEC/IEEE
3693
| Standard for Binary Floating-Point Arithmetic 754-2008.
3694
| The flags argument allows the caller to select negation of the
3695
| addend, the intermediate product, or the final result. (The difference
3696
| between this and having the caller do a separate negation is that negating
3697
| externally will flip the sign bit on NaNs.)
3698
*----------------------------------------------------------------------------*/
3700
float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM)
3702
flag aSign, bSign, cSign, zSign;
3703
int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
3704
uint64_t aSig, bSig, cSig;
3705
flag pInf, pZero, pSign;
3706
uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
3708
flag signflip, infzero;
3710
a = float64_squash_input_denormal(a STATUS_VAR);
3711
b = float64_squash_input_denormal(b STATUS_VAR);
3712
c = float64_squash_input_denormal(c STATUS_VAR);
3713
aSig = extractFloat64Frac(a);
3714
aExp = extractFloat64Exp(a);
3715
aSign = extractFloat64Sign(a);
3716
bSig = extractFloat64Frac(b);
3717
bExp = extractFloat64Exp(b);
3718
bSign = extractFloat64Sign(b);
3719
cSig = extractFloat64Frac(c);
3720
cExp = extractFloat64Exp(c);
3721
cSign = extractFloat64Sign(c);
3723
infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
3724
(aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
3726
/* It is implementation-defined whether the cases of (0,inf,qnan)
3727
* and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
3728
* they return if they do), so we have to hand this information
3729
* off to the target-specific pick-a-NaN routine.
3731
if (((aExp == 0x7ff) && aSig) ||
3732
((bExp == 0x7ff) && bSig) ||
3733
((cExp == 0x7ff) && cSig)) {
3734
return propagateFloat64MulAddNaN(a, b, c, infzero STATUS_VAR);
3738
float_raise(float_flag_invalid STATUS_VAR);
3739
return float64_default_nan;
3742
if (flags & float_muladd_negate_c) {
3746
signflip = (flags & float_muladd_negate_result) ? 1 : 0;
3748
/* Work out the sign and type of the product */
3749
pSign = aSign ^ bSign;
3750
if (flags & float_muladd_negate_product) {
3753
pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
3754
pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
3756
if (cExp == 0x7ff) {
3757
if (pInf && (pSign ^ cSign)) {
3758
/* addition of opposite-signed infinities => InvalidOperation */
3759
float_raise(float_flag_invalid STATUS_VAR);
3760
return float64_default_nan;
3762
/* Otherwise generate an infinity of the same sign */
3763
return packFloat64(cSign ^ signflip, 0x7ff, 0);
3767
return packFloat64(pSign ^ signflip, 0x7ff, 0);
3773
/* Adding two exact zeroes */
3774
if (pSign == cSign) {
3776
} else if (STATUS(float_rounding_mode) == float_round_down) {
3781
return packFloat64(zSign ^ signflip, 0, 0);
3783
/* Exact zero plus a denorm */
3784
if (STATUS(flush_to_zero)) {
3785
float_raise(float_flag_output_denormal STATUS_VAR);
3786
return packFloat64(cSign ^ signflip, 0, 0);
3789
/* Zero plus something non-zero : just return the something */
3790
return packFloat64(cSign ^ signflip, cExp, cSig);
3794
normalizeFloat64Subnormal(aSig, &aExp, &aSig);
3797
normalizeFloat64Subnormal(bSig, &bExp, &bSig);
3800
/* Calculate the actual result a * b + c */
3802
/* Multiply first; this is easy. */
3803
/* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
3804
* because we want the true exponent, not the "one-less-than"
3805
* flavour that roundAndPackFloat64() takes.
3807
pExp = aExp + bExp - 0x3fe;
3808
aSig = (aSig | LIT64(0x0010000000000000))<<10;
3809
bSig = (bSig | LIT64(0x0010000000000000))<<11;
3810
mul64To128(aSig, bSig, &pSig0, &pSig1);
3811
if ((int64_t)(pSig0 << 1) >= 0) {
3812
shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
3816
zSign = pSign ^ signflip;
3818
/* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
3819
* bit in position 126.
3823
/* Throw out the special case of c being an exact zero now */
3824
shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
3825
return roundAndPackFloat64(zSign, pExp - 1,
3828
normalizeFloat64Subnormal(cSig, &cExp, &cSig);
3831
/* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
3832
* significand of the addend, with the explicit bit in position 126.
3834
cSig0 = cSig << (126 - 64 - 52);
3836
cSig0 |= LIT64(0x4000000000000000);
3837
expDiff = pExp - cExp;
3839
if (pSign == cSign) {
3842
/* scale c to match p */
3843
shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
3845
} else if (expDiff < 0) {
3846
/* scale p to match c */
3847
shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
3850
/* no scaling needed */
3853
/* Add significands and make sure explicit bit ends up in posn 126 */
3854
add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3855
if ((int64_t)zSig0 < 0) {
3856
shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
3860
shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
3861
return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR);
3865
shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
3866
sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3868
} else if (expDiff < 0) {
3869
shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
3870
sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
3875
if (lt128(cSig0, cSig1, pSig0, pSig1)) {
3876
sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3877
} else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
3878
sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
3883
if (STATUS(float_rounding_mode) == float_round_down) {
3886
return packFloat64(zSign, 0, 0);
3890
/* Do the equivalent of normalizeRoundAndPackFloat64() but
3891
* starting with the significand in a pair of uint64_t.
3894
shiftcount = countLeadingZeros64(zSig0) - 1;
3895
shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
3901
shiftcount = countLeadingZeros64(zSig1);
3902
if (shiftcount == 0) {
3903
zSig0 = (zSig1 >> 1) | (zSig1 & 1);
3907
zSig0 = zSig1 << shiftcount;
3908
zExp -= (shiftcount + 64);
3911
return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR);
3915
/*----------------------------------------------------------------------------
3916
| Returns the square root of the double-precision floating-point value `a'.
3917
| The operation is performed according to the IEC/IEEE Standard for Binary
3918
| Floating-Point Arithmetic.
3919
*----------------------------------------------------------------------------*/
3921
float64 float64_sqrt( float64 a STATUS_PARAM )
3924
int_fast16_t aExp, zExp;
3925
uint64_t aSig, zSig, doubleZSig;
3926
uint64_t rem0, rem1, term0, term1;
3927
a = float64_squash_input_denormal(a STATUS_VAR);
3929
aSig = extractFloat64Frac( a );
3930
aExp = extractFloat64Exp( a );
3931
aSign = extractFloat64Sign( a );
3932
if ( aExp == 0x7FF ) {
3933
if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3934
if ( ! aSign ) return a;
3935
float_raise( float_flag_invalid STATUS_VAR);
3936
return float64_default_nan;
3939
if ( ( aExp | aSig ) == 0 ) return a;
3940
float_raise( float_flag_invalid STATUS_VAR);
3941
return float64_default_nan;
3944
if ( aSig == 0 ) return float64_zero;
3945
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3947
zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3948
aSig |= LIT64( 0x0010000000000000 );
3949
zSig = estimateSqrt32( aExp, aSig>>21 );
3950
aSig <<= 9 - ( aExp & 1 );
3951
zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3952
if ( ( zSig & 0x1FF ) <= 5 ) {
3953
doubleZSig = zSig<<1;
3954
mul64To128( zSig, zSig, &term0, &term1 );
3955
sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3956
while ( (int64_t) rem0 < 0 ) {
3959
add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3961
zSig |= ( ( rem0 | rem1 ) != 0 );
3963
return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3967
/*----------------------------------------------------------------------------
3968
| Returns the binary log of the double-precision floating-point value `a'.
3969
| The operation is performed according to the IEC/IEEE Standard for Binary
3970
| Floating-Point Arithmetic.
3971
*----------------------------------------------------------------------------*/
3972
float64 float64_log2( float64 a STATUS_PARAM )
3976
uint64_t aSig, aSig0, aSig1, zSig, i;
3977
a = float64_squash_input_denormal(a STATUS_VAR);
3979
aSig = extractFloat64Frac( a );
3980
aExp = extractFloat64Exp( a );
3981
aSign = extractFloat64Sign( a );
3984
if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3985
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3988
float_raise( float_flag_invalid STATUS_VAR);
3989
return float64_default_nan;
3991
if ( aExp == 0x7FF ) {
3992
if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3997
aSig |= LIT64( 0x0010000000000000 );
3999
zSig = (uint64_t)aExp << 52;
4000
for (i = 1LL << 51; i > 0; i >>= 1) {
4001
mul64To128( aSig, aSig, &aSig0, &aSig1 );
4002
aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4003
if ( aSig & LIT64( 0x0020000000000000 ) ) {
4011
return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
4014
/*----------------------------------------------------------------------------
4015
| Returns 1 if the double-precision floating-point value `a' is equal to the
4016
| corresponding value `b', and 0 otherwise. The invalid exception is raised
4017
| if either operand is a NaN. Otherwise, the comparison is performed
4018
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4019
*----------------------------------------------------------------------------*/
4021
int float64_eq( float64 a, float64 b STATUS_PARAM )
4024
a = float64_squash_input_denormal(a STATUS_VAR);
4025
b = float64_squash_input_denormal(b STATUS_VAR);
4027
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4028
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4030
float_raise( float_flag_invalid STATUS_VAR);
4033
av = float64_val(a);
4034
bv = float64_val(b);
4035
return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4039
/*----------------------------------------------------------------------------
4040
| Returns 1 if the double-precision floating-point value `a' is less than or
4041
| equal to the corresponding value `b', and 0 otherwise. The invalid
4042
| exception is raised if either operand is a NaN. The comparison is performed
4043
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4044
*----------------------------------------------------------------------------*/
4046
int float64_le( float64 a, float64 b STATUS_PARAM )
4050
a = float64_squash_input_denormal(a STATUS_VAR);
4051
b = float64_squash_input_denormal(b STATUS_VAR);
4053
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4054
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4056
float_raise( float_flag_invalid STATUS_VAR);
4059
aSign = extractFloat64Sign( a );
4060
bSign = extractFloat64Sign( b );
4061
av = float64_val(a);
4062
bv = float64_val(b);
4063
if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4064
return ( av == bv ) || ( aSign ^ ( av < bv ) );
4068
/*----------------------------------------------------------------------------
4069
| Returns 1 if the double-precision floating-point value `a' is less than
4070
| the corresponding value `b', and 0 otherwise. The invalid exception is
4071
| raised if either operand is a NaN. The comparison is performed according
4072
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4073
*----------------------------------------------------------------------------*/
4075
int float64_lt( float64 a, float64 b STATUS_PARAM )
4080
a = float64_squash_input_denormal(a STATUS_VAR);
4081
b = float64_squash_input_denormal(b STATUS_VAR);
4082
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4083
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4085
float_raise( float_flag_invalid STATUS_VAR);
4088
aSign = extractFloat64Sign( a );
4089
bSign = extractFloat64Sign( b );
4090
av = float64_val(a);
4091
bv = float64_val(b);
4092
if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4093
return ( av != bv ) && ( aSign ^ ( av < bv ) );
4097
/*----------------------------------------------------------------------------
4098
| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4099
| be compared, and 0 otherwise. The invalid exception is raised if either
4100
| operand is a NaN. The comparison is performed according to the IEC/IEEE
4101
| Standard for Binary Floating-Point Arithmetic.
4102
*----------------------------------------------------------------------------*/
4104
int float64_unordered( float64 a, float64 b STATUS_PARAM )
4106
a = float64_squash_input_denormal(a STATUS_VAR);
4107
b = float64_squash_input_denormal(b STATUS_VAR);
4109
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4110
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4112
float_raise( float_flag_invalid STATUS_VAR);
4118
/*----------------------------------------------------------------------------
4119
| Returns 1 if the double-precision floating-point value `a' is equal to the
4120
| corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4121
| exception.The comparison is performed according to the IEC/IEEE Standard
4122
| for Binary Floating-Point Arithmetic.
4123
*----------------------------------------------------------------------------*/
4125
int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
4128
a = float64_squash_input_denormal(a STATUS_VAR);
4129
b = float64_squash_input_denormal(b STATUS_VAR);
4131
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4132
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4134
if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4135
float_raise( float_flag_invalid STATUS_VAR);
4139
av = float64_val(a);
4140
bv = float64_val(b);
4141
return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4145
/*----------------------------------------------------------------------------
4146
| Returns 1 if the double-precision floating-point value `a' is less than or
4147
| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4148
| cause an exception. Otherwise, the comparison is performed according to the
4149
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4150
*----------------------------------------------------------------------------*/
4152
int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
4156
a = float64_squash_input_denormal(a STATUS_VAR);
4157
b = float64_squash_input_denormal(b STATUS_VAR);
4159
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4160
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4162
if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4163
float_raise( float_flag_invalid STATUS_VAR);
4167
aSign = extractFloat64Sign( a );
4168
bSign = extractFloat64Sign( b );
4169
av = float64_val(a);
4170
bv = float64_val(b);
4171
if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4172
return ( av == bv ) || ( aSign ^ ( av < bv ) );
4176
/*----------------------------------------------------------------------------
4177
| Returns 1 if the double-precision floating-point value `a' is less than
4178
| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4179
| exception. Otherwise, the comparison is performed according to the IEC/IEEE
4180
| Standard for Binary Floating-Point Arithmetic.
4181
*----------------------------------------------------------------------------*/
4183
int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
4187
a = float64_squash_input_denormal(a STATUS_VAR);
4188
b = float64_squash_input_denormal(b STATUS_VAR);
4190
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4191
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4193
if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4194
float_raise( float_flag_invalid STATUS_VAR);
4198
aSign = extractFloat64Sign( a );
4199
bSign = extractFloat64Sign( b );
4200
av = float64_val(a);
4201
bv = float64_val(b);
4202
if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4203
return ( av != bv ) && ( aSign ^ ( av < bv ) );
4207
/*----------------------------------------------------------------------------
4208
| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4209
| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4210
| comparison is performed according to the IEC/IEEE Standard for Binary
4211
| Floating-Point Arithmetic.
4212
*----------------------------------------------------------------------------*/
4214
int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
4216
a = float64_squash_input_denormal(a STATUS_VAR);
4217
b = float64_squash_input_denormal(b STATUS_VAR);
4219
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4220
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4222
if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4223
float_raise( float_flag_invalid STATUS_VAR);
4230
/*----------------------------------------------------------------------------
4231
| Returns the result of converting the extended double-precision floating-
4232
| point value `a' to the 32-bit two's complement integer format. The
4233
| conversion is performed according to the IEC/IEEE Standard for Binary
4234
| Floating-Point Arithmetic---which means in particular that the conversion
4235
| is rounded according to the current rounding mode. If `a' is a NaN, the
4236
| largest positive integer is returned. Otherwise, if the conversion
4237
| overflows, the largest integer with the same sign as `a' is returned.
4238
*----------------------------------------------------------------------------*/
4240
int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
4243
int32 aExp, shiftCount;
4246
aSig = extractFloatx80Frac( a );
4247
aExp = extractFloatx80Exp( a );
4248
aSign = extractFloatx80Sign( a );
4249
if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4250
shiftCount = 0x4037 - aExp;
4251
if ( shiftCount <= 0 ) shiftCount = 1;
4252
shift64RightJamming( aSig, shiftCount, &aSig );
4253
return roundAndPackInt32( aSign, aSig STATUS_VAR );
4257
/*----------------------------------------------------------------------------
4258
| Returns the result of converting the extended double-precision floating-
4259
| point value `a' to the 32-bit two's complement integer format. The
4260
| conversion is performed according to the IEC/IEEE Standard for Binary
4261
| Floating-Point Arithmetic, except that the conversion is always rounded
4262
| toward zero. If `a' is a NaN, the largest positive integer is returned.
4263
| Otherwise, if the conversion overflows, the largest integer with the same
4264
| sign as `a' is returned.
4265
*----------------------------------------------------------------------------*/
4267
int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
4270
int32 aExp, shiftCount;
4271
uint64_t aSig, savedASig;
4274
aSig = extractFloatx80Frac( a );
4275
aExp = extractFloatx80Exp( a );
4276
aSign = extractFloatx80Sign( a );
4277
if ( 0x401E < aExp ) {
4278
if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4281
else if ( aExp < 0x3FFF ) {
4282
if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4285
shiftCount = 0x403E - aExp;
4287
aSig >>= shiftCount;
4289
if ( aSign ) z = - z;
4290
if ( ( z < 0 ) ^ aSign ) {
4292
float_raise( float_flag_invalid STATUS_VAR);
4293
return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4295
if ( ( aSig<<shiftCount ) != savedASig ) {
4296
STATUS(float_exception_flags) |= float_flag_inexact;
4302
/*----------------------------------------------------------------------------
4303
| Returns the result of converting the extended double-precision floating-
4304
| point value `a' to the 64-bit two's complement integer format. The
4305
| conversion is performed according to the IEC/IEEE Standard for Binary
4306
| Floating-Point Arithmetic---which means in particular that the conversion
4307
| is rounded according to the current rounding mode. If `a' is a NaN,
4308
| the largest positive integer is returned. Otherwise, if the conversion
4309
| overflows, the largest integer with the same sign as `a' is returned.
4310
*----------------------------------------------------------------------------*/
4312
int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
4315
int32 aExp, shiftCount;
4316
uint64_t aSig, aSigExtra;
4318
aSig = extractFloatx80Frac( a );
4319
aExp = extractFloatx80Exp( a );
4320
aSign = extractFloatx80Sign( a );
4321
shiftCount = 0x403E - aExp;
4322
if ( shiftCount <= 0 ) {
4324
float_raise( float_flag_invalid STATUS_VAR);
4326
|| ( ( aExp == 0x7FFF )
4327
&& ( aSig != LIT64( 0x8000000000000000 ) ) )
4329
return LIT64( 0x7FFFFFFFFFFFFFFF );
4331
return (int64_t) LIT64( 0x8000000000000000 );
4336
shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4338
return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
4342
/*----------------------------------------------------------------------------
4343
| Returns the result of converting the extended double-precision floating-
4344
| point value `a' to the 64-bit two's complement integer format. The
4345
| conversion is performed according to the IEC/IEEE Standard for Binary
4346
| Floating-Point Arithmetic, except that the conversion is always rounded
4347
| toward zero. If `a' is a NaN, the largest positive integer is returned.
4348
| Otherwise, if the conversion overflows, the largest integer with the same
4349
| sign as `a' is returned.
4350
*----------------------------------------------------------------------------*/
4352
int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
4355
int32 aExp, shiftCount;
4359
aSig = extractFloatx80Frac( a );
4360
aExp = extractFloatx80Exp( a );
4361
aSign = extractFloatx80Sign( a );
4362
shiftCount = aExp - 0x403E;
4363
if ( 0 <= shiftCount ) {
4364
aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4365
if ( ( a.high != 0xC03E ) || aSig ) {
4366
float_raise( float_flag_invalid STATUS_VAR);
4367
if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4368
return LIT64( 0x7FFFFFFFFFFFFFFF );
4371
return (int64_t) LIT64( 0x8000000000000000 );
4373
else if ( aExp < 0x3FFF ) {
4374
if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4377
z = aSig>>( - shiftCount );
4378
if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4379
STATUS(float_exception_flags) |= float_flag_inexact;
4381
if ( aSign ) z = - z;
4386
/*----------------------------------------------------------------------------
4387
| Returns the result of converting the extended double-precision floating-
4388
| point value `a' to the single-precision floating-point format. The
4389
| conversion is performed according to the IEC/IEEE Standard for Binary
4390
| Floating-Point Arithmetic.
4391
*----------------------------------------------------------------------------*/
4393
float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
4399
aSig = extractFloatx80Frac( a );
4400
aExp = extractFloatx80Exp( a );
4401
aSign = extractFloatx80Sign( a );
4402
if ( aExp == 0x7FFF ) {
4403
if ( (uint64_t) ( aSig<<1 ) ) {
4404
return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4406
return packFloat32( aSign, 0xFF, 0 );
4408
shift64RightJamming( aSig, 33, &aSig );
4409
if ( aExp || aSig ) aExp -= 0x3F81;
4410
return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
4414
/*----------------------------------------------------------------------------
4415
| Returns the result of converting the extended double-precision floating-
4416
| point value `a' to the double-precision floating-point format. The
4417
| conversion is performed according to the IEC/IEEE Standard for Binary
4418
| Floating-Point Arithmetic.
4419
*----------------------------------------------------------------------------*/
4421
float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
4425
uint64_t aSig, zSig;
4427
aSig = extractFloatx80Frac( a );
4428
aExp = extractFloatx80Exp( a );
4429
aSign = extractFloatx80Sign( a );
4430
if ( aExp == 0x7FFF ) {
4431
if ( (uint64_t) ( aSig<<1 ) ) {
4432
return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4434
return packFloat64( aSign, 0x7FF, 0 );
4436
shift64RightJamming( aSig, 1, &zSig );
4437
if ( aExp || aSig ) aExp -= 0x3C01;
4438
return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
4442
/*----------------------------------------------------------------------------
4443
| Returns the result of converting the extended double-precision floating-
4444
| point value `a' to the quadruple-precision floating-point format. The
4445
| conversion is performed according to the IEC/IEEE Standard for Binary
4446
| Floating-Point Arithmetic.
4447
*----------------------------------------------------------------------------*/
4449
float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4453
uint64_t aSig, zSig0, zSig1;
4455
aSig = extractFloatx80Frac( a );
4456
aExp = extractFloatx80Exp( a );
4457
aSign = extractFloatx80Sign( a );
4458
if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4459
return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4461
shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4462
return packFloat128( aSign, aExp, zSig0, zSig1 );
4466
/*----------------------------------------------------------------------------
4467
| Rounds the extended double-precision floating-point value `a' to an integer,
4468
| and returns the result as an extended quadruple-precision floating-point
4469
| value. The operation is performed according to the IEC/IEEE Standard for
4470
| Binary Floating-Point Arithmetic.
4471
*----------------------------------------------------------------------------*/
4473
floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4477
uint64_t lastBitMask, roundBitsMask;
4481
aExp = extractFloatx80Exp( a );
4482
if ( 0x403E <= aExp ) {
4483
if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4484
return propagateFloatx80NaN( a, a STATUS_VAR );
4488
if ( aExp < 0x3FFF ) {
4490
&& ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4493
STATUS(float_exception_flags) |= float_flag_inexact;
4494
aSign = extractFloatx80Sign( a );
4495
switch ( STATUS(float_rounding_mode) ) {
4496
case float_round_nearest_even:
4497
if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4500
packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4503
case float_round_down:
4506
packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4507
: packFloatx80( 0, 0, 0 );
4508
case float_round_up:
4510
aSign ? packFloatx80( 1, 0, 0 )
4511
: packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4513
return packFloatx80( aSign, 0, 0 );
4516
lastBitMask <<= 0x403E - aExp;
4517
roundBitsMask = lastBitMask - 1;
4519
roundingMode = STATUS(float_rounding_mode);
4520
if ( roundingMode == float_round_nearest_even ) {
4521
z.low += lastBitMask>>1;
4522
if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4524
else if ( roundingMode != float_round_to_zero ) {
4525
if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4526
z.low += roundBitsMask;
4529
z.low &= ~ roundBitsMask;
4532
z.low = LIT64( 0x8000000000000000 );
4534
if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4539
/*----------------------------------------------------------------------------
4540
| Returns the result of adding the absolute values of the extended double-
4541
| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4542
| negated before being returned. `zSign' is ignored if the result is a NaN.
4543
| The addition is performed according to the IEC/IEEE Standard for Binary
4544
| Floating-Point Arithmetic.
4545
*----------------------------------------------------------------------------*/
4547
static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4549
int32 aExp, bExp, zExp;
4550
uint64_t aSig, bSig, zSig0, zSig1;
4553
aSig = extractFloatx80Frac( a );
4554
aExp = extractFloatx80Exp( a );
4555
bSig = extractFloatx80Frac( b );
4556
bExp = extractFloatx80Exp( b );
4557
expDiff = aExp - bExp;
4558
if ( 0 < expDiff ) {
4559
if ( aExp == 0x7FFF ) {
4560
if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4563
if ( bExp == 0 ) --expDiff;
4564
shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4567
else if ( expDiff < 0 ) {
4568
if ( bExp == 0x7FFF ) {
4569
if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4570
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4572
if ( aExp == 0 ) ++expDiff;
4573
shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4577
if ( aExp == 0x7FFF ) {
4578
if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4579
return propagateFloatx80NaN( a, b STATUS_VAR );
4584
zSig0 = aSig + bSig;
4586
normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4592
zSig0 = aSig + bSig;
4593
if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4595
shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4596
zSig0 |= LIT64( 0x8000000000000000 );
4600
roundAndPackFloatx80(
4601
STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4605
/*----------------------------------------------------------------------------
4606
| Returns the result of subtracting the absolute values of the extended
4607
| double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4608
| difference is negated before being returned. `zSign' is ignored if the
4609
| result is a NaN. The subtraction is performed according to the IEC/IEEE
4610
| Standard for Binary Floating-Point Arithmetic.
4611
*----------------------------------------------------------------------------*/
4613
static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4615
int32 aExp, bExp, zExp;
4616
uint64_t aSig, bSig, zSig0, zSig1;
4620
aSig = extractFloatx80Frac( a );
4621
aExp = extractFloatx80Exp( a );
4622
bSig = extractFloatx80Frac( b );
4623
bExp = extractFloatx80Exp( b );
4624
expDiff = aExp - bExp;
4625
if ( 0 < expDiff ) goto aExpBigger;
4626
if ( expDiff < 0 ) goto bExpBigger;
4627
if ( aExp == 0x7FFF ) {
4628
if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4629
return propagateFloatx80NaN( a, b STATUS_VAR );
4631
float_raise( float_flag_invalid STATUS_VAR);
4632
z.low = floatx80_default_nan_low;
4633
z.high = floatx80_default_nan_high;
4641
if ( bSig < aSig ) goto aBigger;
4642
if ( aSig < bSig ) goto bBigger;
4643
return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4645
if ( bExp == 0x7FFF ) {
4646
if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4647
return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4649
if ( aExp == 0 ) ++expDiff;
4650
shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4652
sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4655
goto normalizeRoundAndPack;
4657
if ( aExp == 0x7FFF ) {
4658
if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4661
if ( bExp == 0 ) --expDiff;
4662
shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4664
sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4666
normalizeRoundAndPack:
4668
normalizeRoundAndPackFloatx80(
4669
STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4673
/*----------------------------------------------------------------------------
4674
| Returns the result of adding the extended double-precision floating-point
4675
| values `a' and `b'. The operation is performed according to the IEC/IEEE
4676
| Standard for Binary Floating-Point Arithmetic.
4677
*----------------------------------------------------------------------------*/
4679
floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4683
aSign = extractFloatx80Sign( a );
4684
bSign = extractFloatx80Sign( b );
4685
if ( aSign == bSign ) {
4686
return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4689
return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4694
/*----------------------------------------------------------------------------
4695
| Returns the result of subtracting the extended double-precision floating-
4696
| point values `a' and `b'. The operation is performed according to the
4697
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4698
*----------------------------------------------------------------------------*/
4700
floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4704
aSign = extractFloatx80Sign( a );
4705
bSign = extractFloatx80Sign( b );
4706
if ( aSign == bSign ) {
4707
return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4710
return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4715
/*----------------------------------------------------------------------------
4716
| Returns the result of multiplying the extended double-precision floating-
4717
| point values `a' and `b'. The operation is performed according to the
4718
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4719
*----------------------------------------------------------------------------*/
4721
floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4723
flag aSign, bSign, zSign;
4724
int32 aExp, bExp, zExp;
4725
uint64_t aSig, bSig, zSig0, zSig1;
4728
aSig = extractFloatx80Frac( a );
4729
aExp = extractFloatx80Exp( a );
4730
aSign = extractFloatx80Sign( a );
4731
bSig = extractFloatx80Frac( b );
4732
bExp = extractFloatx80Exp( b );
4733
bSign = extractFloatx80Sign( b );
4734
zSign = aSign ^ bSign;
4735
if ( aExp == 0x7FFF ) {
4736
if ( (uint64_t) ( aSig<<1 )
4737
|| ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4738
return propagateFloatx80NaN( a, b STATUS_VAR );
4740
if ( ( bExp | bSig ) == 0 ) goto invalid;
4741
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4743
if ( bExp == 0x7FFF ) {
4744
if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4745
if ( ( aExp | aSig ) == 0 ) {
4747
float_raise( float_flag_invalid STATUS_VAR);
4748
z.low = floatx80_default_nan_low;
4749
z.high = floatx80_default_nan_high;
4752
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4755
if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4756
normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4759
if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4760
normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4762
zExp = aExp + bExp - 0x3FFE;
4763
mul64To128( aSig, bSig, &zSig0, &zSig1 );
4764
if ( 0 < (int64_t) zSig0 ) {
4765
shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4769
roundAndPackFloatx80(
4770
STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4774
/*----------------------------------------------------------------------------
4775
| Returns the result of dividing the extended double-precision floating-point
4776
| value `a' by the corresponding value `b'. The operation is performed
4777
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4778
*----------------------------------------------------------------------------*/
4780
floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4782
flag aSign, bSign, zSign;
4783
int32 aExp, bExp, zExp;
4784
uint64_t aSig, bSig, zSig0, zSig1;
4785
uint64_t rem0, rem1, rem2, term0, term1, term2;
4788
aSig = extractFloatx80Frac( a );
4789
aExp = extractFloatx80Exp( a );
4790
aSign = extractFloatx80Sign( a );
4791
bSig = extractFloatx80Frac( b );
4792
bExp = extractFloatx80Exp( b );
4793
bSign = extractFloatx80Sign( b );
4794
zSign = aSign ^ bSign;
4795
if ( aExp == 0x7FFF ) {
4796
if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4797
if ( bExp == 0x7FFF ) {
4798
if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4801
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4803
if ( bExp == 0x7FFF ) {
4804
if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4805
return packFloatx80( zSign, 0, 0 );
4809
if ( ( aExp | aSig ) == 0 ) {
4811
float_raise( float_flag_invalid STATUS_VAR);
4812
z.low = floatx80_default_nan_low;
4813
z.high = floatx80_default_nan_high;
4816
float_raise( float_flag_divbyzero STATUS_VAR);
4817
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4819
normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4822
if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4823
normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4825
zExp = aExp - bExp + 0x3FFE;
4827
if ( bSig <= aSig ) {
4828
shift128Right( aSig, 0, 1, &aSig, &rem1 );
4831
zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4832
mul64To128( bSig, zSig0, &term0, &term1 );
4833
sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4834
while ( (int64_t) rem0 < 0 ) {
4836
add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4838
zSig1 = estimateDiv128To64( rem1, 0, bSig );
4839
if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
4840
mul64To128( bSig, zSig1, &term1, &term2 );
4841
sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4842
while ( (int64_t) rem1 < 0 ) {
4844
add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4846
zSig1 |= ( ( rem1 | rem2 ) != 0 );
4849
roundAndPackFloatx80(
4850
STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4854
/*----------------------------------------------------------------------------
4855
| Returns the remainder of the extended double-precision floating-point value
4856
| `a' with respect to the corresponding value `b'. The operation is performed
4857
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4858
*----------------------------------------------------------------------------*/
4860
floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4863
int32 aExp, bExp, expDiff;
4864
uint64_t aSig0, aSig1, bSig;
4865
uint64_t q, term0, term1, alternateASig0, alternateASig1;
4868
aSig0 = extractFloatx80Frac( a );
4869
aExp = extractFloatx80Exp( a );
4870
aSign = extractFloatx80Sign( a );
4871
bSig = extractFloatx80Frac( b );
4872
bExp = extractFloatx80Exp( b );
4873
if ( aExp == 0x7FFF ) {
4874
if ( (uint64_t) ( aSig0<<1 )
4875
|| ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4876
return propagateFloatx80NaN( a, b STATUS_VAR );
4880
if ( bExp == 0x7FFF ) {
4881
if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4887
float_raise( float_flag_invalid STATUS_VAR);
4888
z.low = floatx80_default_nan_low;
4889
z.high = floatx80_default_nan_high;
4892
normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4895
if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
4896
normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4898
bSig |= LIT64( 0x8000000000000000 );
4900
expDiff = aExp - bExp;
4902
if ( expDiff < 0 ) {
4903
if ( expDiff < -1 ) return a;
4904
shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4907
q = ( bSig <= aSig0 );
4908
if ( q ) aSig0 -= bSig;
4910
while ( 0 < expDiff ) {
4911
q = estimateDiv128To64( aSig0, aSig1, bSig );
4912
q = ( 2 < q ) ? q - 2 : 0;
4913
mul64To128( bSig, q, &term0, &term1 );
4914
sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4915
shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4919
if ( 0 < expDiff ) {
4920
q = estimateDiv128To64( aSig0, aSig1, bSig );
4921
q = ( 2 < q ) ? q - 2 : 0;
4923
mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4924
sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4925
shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4926
while ( le128( term0, term1, aSig0, aSig1 ) ) {
4928
sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4935
sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4936
if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4937
|| ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4940
aSig0 = alternateASig0;
4941
aSig1 = alternateASig1;
4945
normalizeRoundAndPackFloatx80(
4946
80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4950
/*----------------------------------------------------------------------------
4951
| Returns the square root of the extended double-precision floating-point
4952
| value `a'. The operation is performed according to the IEC/IEEE Standard
4953
| for Binary Floating-Point Arithmetic.
4954
*----------------------------------------------------------------------------*/
4956
floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4960
uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4961
uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4964
aSig0 = extractFloatx80Frac( a );
4965
aExp = extractFloatx80Exp( a );
4966
aSign = extractFloatx80Sign( a );
4967
if ( aExp == 0x7FFF ) {
4968
if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4969
if ( ! aSign ) return a;
4973
if ( ( aExp | aSig0 ) == 0 ) return a;
4975
float_raise( float_flag_invalid STATUS_VAR);
4976
z.low = floatx80_default_nan_low;
4977
z.high = floatx80_default_nan_high;
4981
if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4982
normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4984
zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4985
zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4986
shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4987
zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4988
doubleZSig0 = zSig0<<1;
4989
mul64To128( zSig0, zSig0, &term0, &term1 );
4990
sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4991
while ( (int64_t) rem0 < 0 ) {
4994
add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4996
zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4997
if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4998
if ( zSig1 == 0 ) zSig1 = 1;
4999
mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5000
sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5001
mul64To128( zSig1, zSig1, &term2, &term3 );
5002
sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5003
while ( (int64_t) rem1 < 0 ) {
5005
shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5007
term2 |= doubleZSig0;
5008
add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5010
zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5012
shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5013
zSig0 |= doubleZSig0;
5015
roundAndPackFloatx80(
5016
STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
5020
/*----------------------------------------------------------------------------
5021
| Returns 1 if the extended double-precision floating-point value `a' is equal
5022
| to the corresponding value `b', and 0 otherwise. The invalid exception is
5023
| raised if either operand is a NaN. Otherwise, the comparison is performed
5024
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5025
*----------------------------------------------------------------------------*/
5027
int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
5030
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5031
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5032
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
5033
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5035
float_raise( float_flag_invalid STATUS_VAR);
5040
&& ( ( a.high == b.high )
5042
&& ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5047
/*----------------------------------------------------------------------------
5048
| Returns 1 if the extended double-precision floating-point value `a' is
5049
| less than or equal to the corresponding value `b', and 0 otherwise. The
5050
| invalid exception is raised if either operand is a NaN. The comparison is
5051
| performed according to the IEC/IEEE Standard for Binary Floating-Point
5053
*----------------------------------------------------------------------------*/
5055
int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
5059
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5060
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5061
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
5062
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5064
float_raise( float_flag_invalid STATUS_VAR);
5067
aSign = extractFloatx80Sign( a );
5068
bSign = extractFloatx80Sign( b );
5069
if ( aSign != bSign ) {
5072
|| ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5076
aSign ? le128( b.high, b.low, a.high, a.low )
5077
: le128( a.high, a.low, b.high, b.low );
5081
/*----------------------------------------------------------------------------
5082
| Returns 1 if the extended double-precision floating-point value `a' is
5083
| less than the corresponding value `b', and 0 otherwise. The invalid
5084
| exception is raised if either operand is a NaN. The comparison is performed
5085
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5086
*----------------------------------------------------------------------------*/
5088
int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
5092
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5093
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5094
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
5095
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5097
float_raise( float_flag_invalid STATUS_VAR);
5100
aSign = extractFloatx80Sign( a );
5101
bSign = extractFloatx80Sign( b );
5102
if ( aSign != bSign ) {
5105
&& ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5109
aSign ? lt128( b.high, b.low, a.high, a.low )
5110
: lt128( a.high, a.low, b.high, b.low );
5114
/*----------------------------------------------------------------------------
5115
| Returns 1 if the extended double-precision floating-point values `a' and `b'
5116
| cannot be compared, and 0 otherwise. The invalid exception is raised if
5117
| either operand is a NaN. The comparison is performed according to the
5118
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5119
*----------------------------------------------------------------------------*/
5120
int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
5122
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5123
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5124
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
5125
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5127
float_raise( float_flag_invalid STATUS_VAR);
5133
/*----------------------------------------------------------------------------
5134
| Returns 1 if the extended double-precision floating-point value `a' is
5135
| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5136
| cause an exception. The comparison is performed according to the IEC/IEEE
5137
| Standard for Binary Floating-Point Arithmetic.
5138
*----------------------------------------------------------------------------*/
5140
int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5143
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5144
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5145
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
5146
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5148
if ( floatx80_is_signaling_nan( a )
5149
|| floatx80_is_signaling_nan( b ) ) {
5150
float_raise( float_flag_invalid STATUS_VAR);
5156
&& ( ( a.high == b.high )
5158
&& ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5163
/*----------------------------------------------------------------------------
5164
| Returns 1 if the extended double-precision floating-point value `a' is less
5165
| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5166
| do not cause an exception. Otherwise, the comparison is performed according
5167
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5168
*----------------------------------------------------------------------------*/
5170
int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5174
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5175
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5176
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
5177
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5179
if ( floatx80_is_signaling_nan( a )
5180
|| floatx80_is_signaling_nan( b ) ) {
5181
float_raise( float_flag_invalid STATUS_VAR);
5185
aSign = extractFloatx80Sign( a );
5186
bSign = extractFloatx80Sign( b );
5187
if ( aSign != bSign ) {
5190
|| ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5194
aSign ? le128( b.high, b.low, a.high, a.low )
5195
: le128( a.high, a.low, b.high, b.low );
5199
/*----------------------------------------------------------------------------
5200
| Returns 1 if the extended double-precision floating-point value `a' is less
5201
| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5202
| an exception. Otherwise, the comparison is performed according to the
5203
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5204
*----------------------------------------------------------------------------*/
5206
int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5210
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5211
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5212
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
5213
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5215
if ( floatx80_is_signaling_nan( a )
5216
|| floatx80_is_signaling_nan( b ) ) {
5217
float_raise( float_flag_invalid STATUS_VAR);
5221
aSign = extractFloatx80Sign( a );
5222
bSign = extractFloatx80Sign( b );
5223
if ( aSign != bSign ) {
5226
&& ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5230
aSign ? lt128( b.high, b.low, a.high, a.low )
5231
: lt128( a.high, a.low, b.high, b.low );
5235
/*----------------------------------------------------------------------------
5236
| Returns 1 if the extended double-precision floating-point values `a' and `b'
5237
| cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5238
| The comparison is performed according to the IEC/IEEE Standard for Binary
5239
| Floating-Point Arithmetic.
5240
*----------------------------------------------------------------------------*/
5241
int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5243
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5244
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5245
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
5246
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5248
if ( floatx80_is_signaling_nan( a )
5249
|| floatx80_is_signaling_nan( b ) ) {
5250
float_raise( float_flag_invalid STATUS_VAR);
5257
/*----------------------------------------------------------------------------
5258
| Returns the result of converting the quadruple-precision floating-point
5259
| value `a' to the 32-bit two's complement integer format. The conversion
5260
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5261
| Arithmetic---which means in particular that the conversion is rounded
5262
| according to the current rounding mode. If `a' is a NaN, the largest
5263
| positive integer is returned. Otherwise, if the conversion overflows, the
5264
| largest integer with the same sign as `a' is returned.
5265
*----------------------------------------------------------------------------*/
5267
int32 float128_to_int32( float128 a STATUS_PARAM )
5270
int32 aExp, shiftCount;
5271
uint64_t aSig0, aSig1;
5273
aSig1 = extractFloat128Frac1( a );
5274
aSig0 = extractFloat128Frac0( a );
5275
aExp = extractFloat128Exp( a );
5276
aSign = extractFloat128Sign( a );
5277
if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5278
if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5279
aSig0 |= ( aSig1 != 0 );
5280
shiftCount = 0x4028 - aExp;
5281
if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5282
return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
5286
/*----------------------------------------------------------------------------
5287
| Returns the result of converting the quadruple-precision floating-point
5288
| value `a' to the 32-bit two's complement integer format. The conversion
5289
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5290
| Arithmetic, except that the conversion is always rounded toward zero. If
5291
| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5292
| conversion overflows, the largest integer with the same sign as `a' is
5294
*----------------------------------------------------------------------------*/
5296
int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
5299
int32 aExp, shiftCount;
5300
uint64_t aSig0, aSig1, savedASig;
5303
aSig1 = extractFloat128Frac1( a );
5304
aSig0 = extractFloat128Frac0( a );
5305
aExp = extractFloat128Exp( a );
5306
aSign = extractFloat128Sign( a );
5307
aSig0 |= ( aSig1 != 0 );
5308
if ( 0x401E < aExp ) {
5309
if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5312
else if ( aExp < 0x3FFF ) {
5313
if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
5316
aSig0 |= LIT64( 0x0001000000000000 );
5317
shiftCount = 0x402F - aExp;
5319
aSig0 >>= shiftCount;
5321
if ( aSign ) z = - z;
5322
if ( ( z < 0 ) ^ aSign ) {
5324
float_raise( float_flag_invalid STATUS_VAR);
5325
return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5327
if ( ( aSig0<<shiftCount ) != savedASig ) {
5328
STATUS(float_exception_flags) |= float_flag_inexact;
5334
/*----------------------------------------------------------------------------
5335
| Returns the result of converting the quadruple-precision floating-point
5336
| value `a' to the 64-bit two's complement integer format. The conversion
5337
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5338
| Arithmetic---which means in particular that the conversion is rounded
5339
| according to the current rounding mode. If `a' is a NaN, the largest
5340
| positive integer is returned. Otherwise, if the conversion overflows, the
5341
| largest integer with the same sign as `a' is returned.
5342
*----------------------------------------------------------------------------*/
5344
int64 float128_to_int64( float128 a STATUS_PARAM )
5347
int32 aExp, shiftCount;
5348
uint64_t aSig0, aSig1;
5350
aSig1 = extractFloat128Frac1( a );
5351
aSig0 = extractFloat128Frac0( a );
5352
aExp = extractFloat128Exp( a );
5353
aSign = extractFloat128Sign( a );
5354
if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5355
shiftCount = 0x402F - aExp;
5356
if ( shiftCount <= 0 ) {
5357
if ( 0x403E < aExp ) {
5358
float_raise( float_flag_invalid STATUS_VAR);
5360
|| ( ( aExp == 0x7FFF )
5361
&& ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5364
return LIT64( 0x7FFFFFFFFFFFFFFF );
5366
return (int64_t) LIT64( 0x8000000000000000 );
5368
shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5371
shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5373
return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
5377
/*----------------------------------------------------------------------------
5378
| Returns the result of converting the quadruple-precision floating-point
5379
| value `a' to the 64-bit two's complement integer format. The conversion
5380
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5381
| Arithmetic, except that the conversion is always rounded toward zero.
5382
| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5383
| the conversion overflows, the largest integer with the same sign as `a' is
5385
*----------------------------------------------------------------------------*/
5387
int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
5390
int32 aExp, shiftCount;
5391
uint64_t aSig0, aSig1;
5394
aSig1 = extractFloat128Frac1( a );
5395
aSig0 = extractFloat128Frac0( a );
5396
aExp = extractFloat128Exp( a );
5397
aSign = extractFloat128Sign( a );
5398
if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5399
shiftCount = aExp - 0x402F;
5400
if ( 0 < shiftCount ) {
5401
if ( 0x403E <= aExp ) {
5402
aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5403
if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5404
&& ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5405
if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
5408
float_raise( float_flag_invalid STATUS_VAR);
5409
if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5410
return LIT64( 0x7FFFFFFFFFFFFFFF );
5413
return (int64_t) LIT64( 0x8000000000000000 );
5415
z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5416
if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5417
STATUS(float_exception_flags) |= float_flag_inexact;
5421
if ( aExp < 0x3FFF ) {
5422
if ( aExp | aSig0 | aSig1 ) {
5423
STATUS(float_exception_flags) |= float_flag_inexact;
5427
z = aSig0>>( - shiftCount );
5429
|| ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5430
STATUS(float_exception_flags) |= float_flag_inexact;
5433
if ( aSign ) z = - z;
5438
/*----------------------------------------------------------------------------
5439
| Returns the result of converting the quadruple-precision floating-point
5440
| value `a' to the single-precision floating-point format. The conversion
5441
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5443
*----------------------------------------------------------------------------*/
5445
float32 float128_to_float32( float128 a STATUS_PARAM )
5449
uint64_t aSig0, aSig1;
5452
aSig1 = extractFloat128Frac1( a );
5453
aSig0 = extractFloat128Frac0( a );
5454
aExp = extractFloat128Exp( a );
5455
aSign = extractFloat128Sign( a );
5456
if ( aExp == 0x7FFF ) {
5457
if ( aSig0 | aSig1 ) {
5458
return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5460
return packFloat32( aSign, 0xFF, 0 );
5462
aSig0 |= ( aSig1 != 0 );
5463
shift64RightJamming( aSig0, 18, &aSig0 );
5465
if ( aExp || zSig ) {
5469
return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5473
/*----------------------------------------------------------------------------
5474
| Returns the result of converting the quadruple-precision floating-point
5475
| value `a' to the double-precision floating-point format. The conversion
5476
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5478
*----------------------------------------------------------------------------*/
5480
float64 float128_to_float64( float128 a STATUS_PARAM )
5484
uint64_t aSig0, aSig1;
5486
aSig1 = extractFloat128Frac1( a );
5487
aSig0 = extractFloat128Frac0( a );
5488
aExp = extractFloat128Exp( a );
5489
aSign = extractFloat128Sign( a );
5490
if ( aExp == 0x7FFF ) {
5491
if ( aSig0 | aSig1 ) {
5492
return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5494
return packFloat64( aSign, 0x7FF, 0 );
5496
shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5497
aSig0 |= ( aSig1 != 0 );
5498
if ( aExp || aSig0 ) {
5499
aSig0 |= LIT64( 0x4000000000000000 );
5502
return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5506
/*----------------------------------------------------------------------------
5507
| Returns the result of converting the quadruple-precision floating-point
5508
| value `a' to the extended double-precision floating-point format. The
5509
| conversion is performed according to the IEC/IEEE Standard for Binary
5510
| Floating-Point Arithmetic.
5511
*----------------------------------------------------------------------------*/
5513
floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5517
uint64_t aSig0, aSig1;
5519
aSig1 = extractFloat128Frac1( a );
5520
aSig0 = extractFloat128Frac0( a );
5521
aExp = extractFloat128Exp( a );
5522
aSign = extractFloat128Sign( a );
5523
if ( aExp == 0x7FFF ) {
5524
if ( aSig0 | aSig1 ) {
5525
return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5527
return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5530
if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5531
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5534
aSig0 |= LIT64( 0x0001000000000000 );
5536
shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5537
return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5541
/*----------------------------------------------------------------------------
5542
| Rounds the quadruple-precision floating-point value `a' to an integer, and
5543
| returns the result as a quadruple-precision floating-point value. The
5544
| operation is performed according to the IEC/IEEE Standard for Binary
5545
| Floating-Point Arithmetic.
5546
*----------------------------------------------------------------------------*/
5548
float128 float128_round_to_int( float128 a STATUS_PARAM )
5552
uint64_t lastBitMask, roundBitsMask;
5556
aExp = extractFloat128Exp( a );
5557
if ( 0x402F <= aExp ) {
5558
if ( 0x406F <= aExp ) {
5559
if ( ( aExp == 0x7FFF )
5560
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5562
return propagateFloat128NaN( a, a STATUS_VAR );
5567
lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5568
roundBitsMask = lastBitMask - 1;
5570
roundingMode = STATUS(float_rounding_mode);
5571
if ( roundingMode == float_round_nearest_even ) {
5572
if ( lastBitMask ) {
5573
add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5574
if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5577
if ( (int64_t) z.low < 0 ) {
5579
if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5583
else if ( roundingMode != float_round_to_zero ) {
5584
if ( extractFloat128Sign( z )
5585
^ ( roundingMode == float_round_up ) ) {
5586
add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5589
z.low &= ~ roundBitsMask;
5592
if ( aExp < 0x3FFF ) {
5593
if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5594
STATUS(float_exception_flags) |= float_flag_inexact;
5595
aSign = extractFloat128Sign( a );
5596
switch ( STATUS(float_rounding_mode) ) {
5597
case float_round_nearest_even:
5598
if ( ( aExp == 0x3FFE )
5599
&& ( extractFloat128Frac0( a )
5600
| extractFloat128Frac1( a ) )
5602
return packFloat128( aSign, 0x3FFF, 0, 0 );
5605
case float_round_down:
5607
aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5608
: packFloat128( 0, 0, 0, 0 );
5609
case float_round_up:
5611
aSign ? packFloat128( 1, 0, 0, 0 )
5612
: packFloat128( 0, 0x3FFF, 0, 0 );
5614
return packFloat128( aSign, 0, 0, 0 );
5617
lastBitMask <<= 0x402F - aExp;
5618
roundBitsMask = lastBitMask - 1;
5621
roundingMode = STATUS(float_rounding_mode);
5622
if ( roundingMode == float_round_nearest_even ) {
5623
z.high += lastBitMask>>1;
5624
if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5625
z.high &= ~ lastBitMask;
5628
else if ( roundingMode != float_round_to_zero ) {
5629
if ( extractFloat128Sign( z )
5630
^ ( roundingMode == float_round_up ) ) {
5631
z.high |= ( a.low != 0 );
5632
z.high += roundBitsMask;
5635
z.high &= ~ roundBitsMask;
5637
if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5638
STATUS(float_exception_flags) |= float_flag_inexact;
5644
/*----------------------------------------------------------------------------
5645
| Returns the result of adding the absolute values of the quadruple-precision
5646
| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
5647
| before being returned. `zSign' is ignored if the result is a NaN.
5648
| The addition is performed according to the IEC/IEEE Standard for Binary
5649
| Floating-Point Arithmetic.
5650
*----------------------------------------------------------------------------*/
5652
static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5654
int32 aExp, bExp, zExp;
5655
uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5658
aSig1 = extractFloat128Frac1( a );
5659
aSig0 = extractFloat128Frac0( a );
5660
aExp = extractFloat128Exp( a );
5661
bSig1 = extractFloat128Frac1( b );
5662
bSig0 = extractFloat128Frac0( b );
5663
bExp = extractFloat128Exp( b );
5664
expDiff = aExp - bExp;
5665
if ( 0 < expDiff ) {
5666
if ( aExp == 0x7FFF ) {
5667
if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5674
bSig0 |= LIT64( 0x0001000000000000 );
5676
shift128ExtraRightJamming(
5677
bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5680
else if ( expDiff < 0 ) {
5681
if ( bExp == 0x7FFF ) {
5682
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5683
return packFloat128( zSign, 0x7FFF, 0, 0 );
5689
aSig0 |= LIT64( 0x0001000000000000 );
5691
shift128ExtraRightJamming(
5692
aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5696
if ( aExp == 0x7FFF ) {
5697
if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5698
return propagateFloat128NaN( a, b STATUS_VAR );
5702
add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5704
if (STATUS(flush_to_zero)) {
5705
if (zSig0 | zSig1) {
5706
float_raise(float_flag_output_denormal STATUS_VAR);
5708
return packFloat128(zSign, 0, 0, 0);
5710
return packFloat128( zSign, 0, zSig0, zSig1 );
5713
zSig0 |= LIT64( 0x0002000000000000 );
5717
aSig0 |= LIT64( 0x0001000000000000 );
5718
add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5720
if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5723
shift128ExtraRightJamming(
5724
zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5726
return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5730
/*----------------------------------------------------------------------------
5731
| Returns the result of subtracting the absolute values of the quadruple-
5732
| precision floating-point values `a' and `b'. If `zSign' is 1, the
5733
| difference is negated before being returned. `zSign' is ignored if the
5734
| result is a NaN. The subtraction is performed according to the IEC/IEEE
5735
| Standard for Binary Floating-Point Arithmetic.
5736
*----------------------------------------------------------------------------*/
5738
static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5740
int32 aExp, bExp, zExp;
5741
uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5745
aSig1 = extractFloat128Frac1( a );
5746
aSig0 = extractFloat128Frac0( a );
5747
aExp = extractFloat128Exp( a );
5748
bSig1 = extractFloat128Frac1( b );
5749
bSig0 = extractFloat128Frac0( b );
5750
bExp = extractFloat128Exp( b );
5751
expDiff = aExp - bExp;
5752
shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5753
shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5754
if ( 0 < expDiff ) goto aExpBigger;
5755
if ( expDiff < 0 ) goto bExpBigger;
5756
if ( aExp == 0x7FFF ) {
5757
if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5758
return propagateFloat128NaN( a, b STATUS_VAR );
5760
float_raise( float_flag_invalid STATUS_VAR);
5761
z.low = float128_default_nan_low;
5762
z.high = float128_default_nan_high;
5769
if ( bSig0 < aSig0 ) goto aBigger;
5770
if ( aSig0 < bSig0 ) goto bBigger;
5771
if ( bSig1 < aSig1 ) goto aBigger;
5772
if ( aSig1 < bSig1 ) goto bBigger;
5773
return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5775
if ( bExp == 0x7FFF ) {
5776
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5777
return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5783
aSig0 |= LIT64( 0x4000000000000000 );
5785
shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5786
bSig0 |= LIT64( 0x4000000000000000 );
5788
sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5791
goto normalizeRoundAndPack;
5793
if ( aExp == 0x7FFF ) {
5794
if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5801
bSig0 |= LIT64( 0x4000000000000000 );
5803
shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5804
aSig0 |= LIT64( 0x4000000000000000 );
5806
sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5808
normalizeRoundAndPack:
5810
return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5814
/*----------------------------------------------------------------------------
5815
| Returns the result of adding the quadruple-precision floating-point values
5816
| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
5817
| for Binary Floating-Point Arithmetic.
5818
*----------------------------------------------------------------------------*/
5820
float128 float128_add( float128 a, float128 b STATUS_PARAM )
5824
aSign = extractFloat128Sign( a );
5825
bSign = extractFloat128Sign( b );
5826
if ( aSign == bSign ) {
5827
return addFloat128Sigs( a, b, aSign STATUS_VAR );
5830
return subFloat128Sigs( a, b, aSign STATUS_VAR );
5835
/*----------------------------------------------------------------------------
5836
| Returns the result of subtracting the quadruple-precision floating-point
5837
| values `a' and `b'. The operation is performed according to the IEC/IEEE
5838
| Standard for Binary Floating-Point Arithmetic.
5839
*----------------------------------------------------------------------------*/
5841
float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5845
aSign = extractFloat128Sign( a );
5846
bSign = extractFloat128Sign( b );
5847
if ( aSign == bSign ) {
5848
return subFloat128Sigs( a, b, aSign STATUS_VAR );
5851
return addFloat128Sigs( a, b, aSign STATUS_VAR );
5856
/*----------------------------------------------------------------------------
5857
| Returns the result of multiplying the quadruple-precision floating-point
5858
| values `a' and `b'. The operation is performed according to the IEC/IEEE
5859
| Standard for Binary Floating-Point Arithmetic.
5860
*----------------------------------------------------------------------------*/
5862
float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5864
flag aSign, bSign, zSign;
5865
int32 aExp, bExp, zExp;
5866
uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5869
aSig1 = extractFloat128Frac1( a );
5870
aSig0 = extractFloat128Frac0( a );
5871
aExp = extractFloat128Exp( a );
5872
aSign = extractFloat128Sign( a );
5873
bSig1 = extractFloat128Frac1( b );
5874
bSig0 = extractFloat128Frac0( b );
5875
bExp = extractFloat128Exp( b );
5876
bSign = extractFloat128Sign( b );
5877
zSign = aSign ^ bSign;
5878
if ( aExp == 0x7FFF ) {
5879
if ( ( aSig0 | aSig1 )
5880
|| ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5881
return propagateFloat128NaN( a, b STATUS_VAR );
5883
if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5884
return packFloat128( zSign, 0x7FFF, 0, 0 );
5886
if ( bExp == 0x7FFF ) {
5887
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5888
if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5890
float_raise( float_flag_invalid STATUS_VAR);
5891
z.low = float128_default_nan_low;
5892
z.high = float128_default_nan_high;
5895
return packFloat128( zSign, 0x7FFF, 0, 0 );
5898
if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5899
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5902
if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5903
normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5905
zExp = aExp + bExp - 0x4000;
5906
aSig0 |= LIT64( 0x0001000000000000 );
5907
shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5908
mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5909
add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5910
zSig2 |= ( zSig3 != 0 );
5911
if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5912
shift128ExtraRightJamming(
5913
zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5916
return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5920
/*----------------------------------------------------------------------------
5921
| Returns the result of dividing the quadruple-precision floating-point value
5922
| `a' by the corresponding value `b'. The operation is performed according to
5923
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5924
*----------------------------------------------------------------------------*/
5926
float128 float128_div( float128 a, float128 b STATUS_PARAM )
5928
flag aSign, bSign, zSign;
5929
int32 aExp, bExp, zExp;
5930
uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5931
uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5934
aSig1 = extractFloat128Frac1( a );
5935
aSig0 = extractFloat128Frac0( a );
5936
aExp = extractFloat128Exp( a );
5937
aSign = extractFloat128Sign( a );
5938
bSig1 = extractFloat128Frac1( b );
5939
bSig0 = extractFloat128Frac0( b );
5940
bExp = extractFloat128Exp( b );
5941
bSign = extractFloat128Sign( b );
5942
zSign = aSign ^ bSign;
5943
if ( aExp == 0x7FFF ) {
5944
if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5945
if ( bExp == 0x7FFF ) {
5946
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5949
return packFloat128( zSign, 0x7FFF, 0, 0 );
5951
if ( bExp == 0x7FFF ) {
5952
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5953
return packFloat128( zSign, 0, 0, 0 );
5956
if ( ( bSig0 | bSig1 ) == 0 ) {
5957
if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5959
float_raise( float_flag_invalid STATUS_VAR);
5960
z.low = float128_default_nan_low;
5961
z.high = float128_default_nan_high;
5964
float_raise( float_flag_divbyzero STATUS_VAR);
5965
return packFloat128( zSign, 0x7FFF, 0, 0 );
5967
normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5970
if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5971
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5973
zExp = aExp - bExp + 0x3FFD;
5975
aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5977
bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5978
if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5979
shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5982
zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5983
mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5984
sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5985
while ( (int64_t) rem0 < 0 ) {
5987
add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5989
zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5990
if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5991
mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5992
sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5993
while ( (int64_t) rem1 < 0 ) {
5995
add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5997
zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5999
shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6000
return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6004
/*----------------------------------------------------------------------------
6005
| Returns the remainder of the quadruple-precision floating-point value `a'
6006
| with respect to the corresponding value `b'. The operation is performed
6007
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6008
*----------------------------------------------------------------------------*/
6010
float128 float128_rem( float128 a, float128 b STATUS_PARAM )
6013
int32 aExp, bExp, expDiff;
6014
uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6015
uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6019
aSig1 = extractFloat128Frac1( a );
6020
aSig0 = extractFloat128Frac0( a );
6021
aExp = extractFloat128Exp( a );
6022
aSign = extractFloat128Sign( a );
6023
bSig1 = extractFloat128Frac1( b );
6024
bSig0 = extractFloat128Frac0( b );
6025
bExp = extractFloat128Exp( b );
6026
if ( aExp == 0x7FFF ) {
6027
if ( ( aSig0 | aSig1 )
6028
|| ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6029
return propagateFloat128NaN( a, b STATUS_VAR );
6033
if ( bExp == 0x7FFF ) {
6034
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6038
if ( ( bSig0 | bSig1 ) == 0 ) {
6040
float_raise( float_flag_invalid STATUS_VAR);
6041
z.low = float128_default_nan_low;
6042
z.high = float128_default_nan_high;
6045
normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6048
if ( ( aSig0 | aSig1 ) == 0 ) return a;
6049
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6051
expDiff = aExp - bExp;
6052
if ( expDiff < -1 ) return a;
6054
aSig0 | LIT64( 0x0001000000000000 ),
6056
15 - ( expDiff < 0 ),
6061
bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6062
q = le128( bSig0, bSig1, aSig0, aSig1 );
6063
if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6065
while ( 0 < expDiff ) {
6066
q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6067
q = ( 4 < q ) ? q - 4 : 0;
6068
mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6069
shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6070
shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6071
sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6074
if ( -64 < expDiff ) {
6075
q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6076
q = ( 4 < q ) ? q - 4 : 0;
6078
shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6080
if ( expDiff < 0 ) {
6081
shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6084
shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6086
mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6087
sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6090
shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6091
shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6094
alternateASig0 = aSig0;
6095
alternateASig1 = aSig1;
6097
sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6098
} while ( 0 <= (int64_t) aSig0 );
6100
aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6101
if ( ( sigMean0 < 0 )
6102
|| ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6103
aSig0 = alternateASig0;
6104
aSig1 = alternateASig1;
6106
zSign = ( (int64_t) aSig0 < 0 );
6107
if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6109
normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
6113
/*----------------------------------------------------------------------------
6114
| Returns the square root of the quadruple-precision floating-point value `a'.
6115
| The operation is performed according to the IEC/IEEE Standard for Binary
6116
| Floating-Point Arithmetic.
6117
*----------------------------------------------------------------------------*/
6119
float128 float128_sqrt( float128 a STATUS_PARAM )
6123
uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6124
uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6127
aSig1 = extractFloat128Frac1( a );
6128
aSig0 = extractFloat128Frac0( a );
6129
aExp = extractFloat128Exp( a );
6130
aSign = extractFloat128Sign( a );
6131
if ( aExp == 0x7FFF ) {
6132
if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
6133
if ( ! aSign ) return a;
6137
if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6139
float_raise( float_flag_invalid STATUS_VAR);
6140
z.low = float128_default_nan_low;
6141
z.high = float128_default_nan_high;
6145
if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6146
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6148
zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6149
aSig0 |= LIT64( 0x0001000000000000 );
6150
zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6151
shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6152
zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6153
doubleZSig0 = zSig0<<1;
6154
mul64To128( zSig0, zSig0, &term0, &term1 );
6155
sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6156
while ( (int64_t) rem0 < 0 ) {
6159
add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6161
zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6162
if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6163
if ( zSig1 == 0 ) zSig1 = 1;
6164
mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6165
sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6166
mul64To128( zSig1, zSig1, &term2, &term3 );
6167
sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6168
while ( (int64_t) rem1 < 0 ) {
6170
shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6172
term2 |= doubleZSig0;
6173
add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6175
zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6177
shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6178
return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6182
/*----------------------------------------------------------------------------
6183
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
6184
| the corresponding value `b', and 0 otherwise. The invalid exception is
6185
| raised if either operand is a NaN. Otherwise, the comparison is performed
6186
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6187
*----------------------------------------------------------------------------*/
6189
int float128_eq( float128 a, float128 b STATUS_PARAM )
6192
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6193
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6194
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
6195
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6197
float_raise( float_flag_invalid STATUS_VAR);
6202
&& ( ( a.high == b.high )
6204
&& ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6209
/*----------------------------------------------------------------------------
6210
| Returns 1 if the quadruple-precision floating-point value `a' is less than
6211
| or equal to the corresponding value `b', and 0 otherwise. The invalid
6212
| exception is raised if either operand is a NaN. The comparison is performed
6213
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6214
*----------------------------------------------------------------------------*/
6216
int float128_le( float128 a, float128 b STATUS_PARAM )
6220
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6221
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6222
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
6223
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6225
float_raise( float_flag_invalid STATUS_VAR);
6228
aSign = extractFloat128Sign( a );
6229
bSign = extractFloat128Sign( b );
6230
if ( aSign != bSign ) {
6233
|| ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6237
aSign ? le128( b.high, b.low, a.high, a.low )
6238
: le128( a.high, a.low, b.high, b.low );
6242
/*----------------------------------------------------------------------------
6243
| Returns 1 if the quadruple-precision floating-point value `a' is less than
6244
| the corresponding value `b', and 0 otherwise. The invalid exception is
6245
| raised if either operand is a NaN. The comparison is performed according
6246
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6247
*----------------------------------------------------------------------------*/
6249
int float128_lt( float128 a, float128 b STATUS_PARAM )
6253
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6254
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6255
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
6256
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6258
float_raise( float_flag_invalid STATUS_VAR);
6261
aSign = extractFloat128Sign( a );
6262
bSign = extractFloat128Sign( b );
6263
if ( aSign != bSign ) {
6266
&& ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6270
aSign ? lt128( b.high, b.low, a.high, a.low )
6271
: lt128( a.high, a.low, b.high, b.low );
6275
/*----------------------------------------------------------------------------
6276
| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6277
| be compared, and 0 otherwise. The invalid exception is raised if either
6278
| operand is a NaN. The comparison is performed according to the IEC/IEEE
6279
| Standard for Binary Floating-Point Arithmetic.
6280
*----------------------------------------------------------------------------*/
6282
int float128_unordered( float128 a, float128 b STATUS_PARAM )
6284
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6285
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6286
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
6287
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6289
float_raise( float_flag_invalid STATUS_VAR);
6295
/*----------------------------------------------------------------------------
6296
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
6297
| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6298
| exception. The comparison is performed according to the IEC/IEEE Standard
6299
| for Binary Floating-Point Arithmetic.
6300
*----------------------------------------------------------------------------*/
6302
int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
6305
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6306
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6307
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
6308
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6310
if ( float128_is_signaling_nan( a )
6311
|| float128_is_signaling_nan( b ) ) {
6312
float_raise( float_flag_invalid STATUS_VAR);
6318
&& ( ( a.high == b.high )
6320
&& ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6325
/*----------------------------------------------------------------------------
6326
| Returns 1 if the quadruple-precision floating-point value `a' is less than
6327
| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6328
| cause an exception. Otherwise, the comparison is performed according to the
6329
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6330
*----------------------------------------------------------------------------*/
6332
int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
6336
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6337
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6338
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
6339
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6341
if ( float128_is_signaling_nan( a )
6342
|| float128_is_signaling_nan( b ) ) {
6343
float_raise( float_flag_invalid STATUS_VAR);
6347
aSign = extractFloat128Sign( a );
6348
bSign = extractFloat128Sign( b );
6349
if ( aSign != bSign ) {
6352
|| ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6356
aSign ? le128( b.high, b.low, a.high, a.low )
6357
: le128( a.high, a.low, b.high, b.low );
6361
/*----------------------------------------------------------------------------
6362
| Returns 1 if the quadruple-precision floating-point value `a' is less than
6363
| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6364
| exception. Otherwise, the comparison is performed according to the IEC/IEEE
6365
| Standard for Binary Floating-Point Arithmetic.
6366
*----------------------------------------------------------------------------*/
6368
int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
6372
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6373
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6374
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
6375
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6377
if ( float128_is_signaling_nan( a )
6378
|| float128_is_signaling_nan( b ) ) {
6379
float_raise( float_flag_invalid STATUS_VAR);
6383
aSign = extractFloat128Sign( a );
6384
bSign = extractFloat128Sign( b );
6385
if ( aSign != bSign ) {
6388
&& ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6392
aSign ? lt128( b.high, b.low, a.high, a.low )
6393
: lt128( a.high, a.low, b.high, b.low );
6397
/*----------------------------------------------------------------------------
6398
| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6399
| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6400
| comparison is performed according to the IEC/IEEE Standard for Binary
6401
| Floating-Point Arithmetic.
6402
*----------------------------------------------------------------------------*/
6404
int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
6406
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6407
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6408
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
6409
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6411
if ( float128_is_signaling_nan( a )
6412
|| float128_is_signaling_nan( b ) ) {
6413
float_raise( float_flag_invalid STATUS_VAR);
6420
/* misc functions */
6421
float32 uint32_to_float32( uint32 a STATUS_PARAM )
6423
return int64_to_float32(a STATUS_VAR);
6426
float64 uint32_to_float64( uint32 a STATUS_PARAM )
6428
return int64_to_float64(a STATUS_VAR);
6431
uint32 float32_to_uint32( float32 a STATUS_PARAM )
6436
v = float32_to_int64(a STATUS_VAR);
6439
float_raise( float_flag_invalid STATUS_VAR);
6440
} else if (v > 0xffffffff) {
6442
float_raise( float_flag_invalid STATUS_VAR);
6449
uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
6454
v = float32_to_int64_round_to_zero(a STATUS_VAR);
6457
float_raise( float_flag_invalid STATUS_VAR);
6458
} else if (v > 0xffffffff) {
6460
float_raise( float_flag_invalid STATUS_VAR);
6467
uint_fast16_t float32_to_uint16_round_to_zero(float32 a STATUS_PARAM)
6472
v = float32_to_int64_round_to_zero(a STATUS_VAR);
6475
float_raise( float_flag_invalid STATUS_VAR);
6476
} else if (v > 0xffff) {
6478
float_raise( float_flag_invalid STATUS_VAR);
6485
uint32 float64_to_uint32( float64 a STATUS_PARAM )
6490
v = float64_to_int64(a STATUS_VAR);
6493
float_raise( float_flag_invalid STATUS_VAR);
6494
} else if (v > 0xffffffff) {
6496
float_raise( float_flag_invalid STATUS_VAR);
6503
uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
6508
v = float64_to_int64_round_to_zero(a STATUS_VAR);
6511
float_raise( float_flag_invalid STATUS_VAR);
6512
} else if (v > 0xffffffff) {
6514
float_raise( float_flag_invalid STATUS_VAR);
6521
uint_fast16_t float64_to_uint16_round_to_zero(float64 a STATUS_PARAM)
6526
v = float64_to_int64_round_to_zero(a STATUS_VAR);
6529
float_raise( float_flag_invalid STATUS_VAR);
6530
} else if (v > 0xffff) {
6532
float_raise( float_flag_invalid STATUS_VAR);
6539
/* FIXME: This looks broken. */
6540
uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
6544
v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6545
v += float64_val(a);
6546
v = float64_to_int64(make_float64(v) STATUS_VAR);
6548
return v - INT64_MIN;
6551
uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
6555
v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6556
v += float64_val(a);
6557
v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
6559
return v - INT64_MIN;
6562
#define COMPARE(s, nan_exp) \
6563
INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
6564
int is_quiet STATUS_PARAM ) \
6566
flag aSign, bSign; \
6567
uint ## s ## _t av, bv; \
6568
a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6569
b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
6571
if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
6572
extractFloat ## s ## Frac( a ) ) || \
6573
( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
6574
extractFloat ## s ## Frac( b ) )) { \
6576
float ## s ## _is_signaling_nan( a ) || \
6577
float ## s ## _is_signaling_nan( b ) ) { \
6578
float_raise( float_flag_invalid STATUS_VAR); \
6580
return float_relation_unordered; \
6582
aSign = extractFloat ## s ## Sign( a ); \
6583
bSign = extractFloat ## s ## Sign( b ); \
6584
av = float ## s ## _val(a); \
6585
bv = float ## s ## _val(b); \
6586
if ( aSign != bSign ) { \
6587
if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
6589
return float_relation_equal; \
6591
return 1 - (2 * aSign); \
6595
return float_relation_equal; \
6597
return 1 - 2 * (aSign ^ ( av < bv )); \
6602
int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
6604
return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
6607
int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
6609
return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
6615
INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,
6616
int is_quiet STATUS_PARAM )
6620
if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6621
( extractFloatx80Frac( a )<<1 ) ) ||
6622
( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6623
( extractFloatx80Frac( b )<<1 ) )) {
6625
floatx80_is_signaling_nan( a ) ||
6626
floatx80_is_signaling_nan( b ) ) {
6627
float_raise( float_flag_invalid STATUS_VAR);
6629
return float_relation_unordered;
6631
aSign = extractFloatx80Sign( a );
6632
bSign = extractFloatx80Sign( b );
6633
if ( aSign != bSign ) {
6635
if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6636
( ( a.low | b.low ) == 0 ) ) {
6638
return float_relation_equal;
6640
return 1 - (2 * aSign);
6643
if (a.low == b.low && a.high == b.high) {
6644
return float_relation_equal;
6646
return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6651
int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
6653
return floatx80_compare_internal(a, b, 0 STATUS_VAR);
6656
int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
6658
return floatx80_compare_internal(a, b, 1 STATUS_VAR);
6661
INLINE int float128_compare_internal( float128 a, float128 b,
6662
int is_quiet STATUS_PARAM )
6666
if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6667
( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6668
( ( extractFloat128Exp( b ) == 0x7fff ) &&
6669
( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6671
float128_is_signaling_nan( a ) ||
6672
float128_is_signaling_nan( b ) ) {
6673
float_raise( float_flag_invalid STATUS_VAR);
6675
return float_relation_unordered;
6677
aSign = extractFloat128Sign( a );
6678
bSign = extractFloat128Sign( b );
6679
if ( aSign != bSign ) {
6680
if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6682
return float_relation_equal;
6684
return 1 - (2 * aSign);
6687
if (a.low == b.low && a.high == b.high) {
6688
return float_relation_equal;
6690
return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6695
int float128_compare( float128 a, float128 b STATUS_PARAM )
6697
return float128_compare_internal(a, b, 0 STATUS_VAR);
6700
int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6702
return float128_compare_internal(a, b, 1 STATUS_VAR);
6705
/* min() and max() functions. These can't be implemented as
6706
* 'compare and pick one input' because that would mishandle
6707
* NaNs and +0 vs -0.
6709
#define MINMAX(s, nan_exp) \
6710
INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b, \
6711
int ismin STATUS_PARAM ) \
6713
flag aSign, bSign; \
6714
uint ## s ## _t av, bv; \
6715
a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6716
b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
6717
if (float ## s ## _is_any_nan(a) || \
6718
float ## s ## _is_any_nan(b)) { \
6719
return propagateFloat ## s ## NaN(a, b STATUS_VAR); \
6721
aSign = extractFloat ## s ## Sign(a); \
6722
bSign = extractFloat ## s ## Sign(b); \
6723
av = float ## s ## _val(a); \
6724
bv = float ## s ## _val(b); \
6725
if (aSign != bSign) { \
6727
return aSign ? a : b; \
6729
return aSign ? b : a; \
6733
return (aSign ^ (av < bv)) ? a : b; \
6735
return (aSign ^ (av < bv)) ? b : a; \
6740
float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM) \
6742
return float ## s ## _minmax(a, b, 1 STATUS_VAR); \
6745
float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM) \
6747
return float ## s ## _minmax(a, b, 0 STATUS_VAR); \
6754
/* Multiply A by 2 raised to the power N. */
6755
float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6761
a = float32_squash_input_denormal(a STATUS_VAR);
6762
aSig = extractFloat32Frac( a );
6763
aExp = extractFloat32Exp( a );
6764
aSign = extractFloat32Sign( a );
6766
if ( aExp == 0xFF ) {
6768
return propagateFloat32NaN( a, a STATUS_VAR );
6774
else if ( aSig == 0 )
6779
} else if (n < -0x200) {
6785
return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
6788
float64 float64_scalbn( float64 a, int n STATUS_PARAM )
6794
a = float64_squash_input_denormal(a STATUS_VAR);
6795
aSig = extractFloat64Frac( a );
6796
aExp = extractFloat64Exp( a );
6797
aSign = extractFloat64Sign( a );
6799
if ( aExp == 0x7FF ) {
6801
return propagateFloat64NaN( a, a STATUS_VAR );
6806
aSig |= LIT64( 0x0010000000000000 );
6807
else if ( aSig == 0 )
6812
} else if (n < -0x1000) {
6818
return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
6821
floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
6827
aSig = extractFloatx80Frac( a );
6828
aExp = extractFloatx80Exp( a );
6829
aSign = extractFloatx80Sign( a );
6831
if ( aExp == 0x7FFF ) {
6833
return propagateFloatx80NaN( a, a STATUS_VAR );
6838
if (aExp == 0 && aSig == 0)
6843
} else if (n < -0x10000) {
6848
return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
6849
aSign, aExp, aSig, 0 STATUS_VAR );
6852
float128 float128_scalbn( float128 a, int n STATUS_PARAM )
6856
uint64_t aSig0, aSig1;
6858
aSig1 = extractFloat128Frac1( a );
6859
aSig0 = extractFloat128Frac0( a );
6860
aExp = extractFloat128Exp( a );
6861
aSign = extractFloat128Sign( a );
6862
if ( aExp == 0x7FFF ) {
6863
if ( aSig0 | aSig1 ) {
6864
return propagateFloat128NaN( a, a STATUS_VAR );
6869
aSig0 |= LIT64( 0x0001000000000000 );
6870
else if ( aSig0 == 0 && aSig1 == 0 )
6875
} else if (n < -0x10000) {
6880
return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1