4
* Derived from SoftFloat.
7
/*============================================================================
9
This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
12
Written by John R. Hauser. This work was made possible in part by the
13
International Computer Science Institute, located at Suite 600, 1947 Center
14
Street, Berkeley, California 94704. Funding was partially provided by the
15
National Science Foundation under grant MIP-9311980. The original version
16
of this code was written as part of a project to build a fixed-point vector
17
processor in collaboration with the University of California at Berkeley,
18
overseen by Profs. Nelson Morgan and John Wawrzynek. More information
19
is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
20
arithmetic/SoftFloat.html'.
22
THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
23
been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
24
RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
25
AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
26
COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
27
EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
28
INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
29
OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
31
Derivative works are acceptable, even for commercial purposes, so long as
32
(1) the source code for the derivative work includes prominent notice that
33
the work is derivative, and (2) the source code includes prominent notice with
34
these four paragraphs for those parts of this code that are retained.
36
=============================================================================*/
38
/* softfloat (and in particular the code in softfloat-specialize.h) is
39
* target-dependent and needs the TARGET_* macros.
43
#include "fpu/softfloat.h"
45
/*----------------------------------------------------------------------------
46
| Primitive arithmetic functions, including multi-word arithmetic, and
47
| division and square root approximations. (Can be specialized to target if
49
*----------------------------------------------------------------------------*/
50
#include "softfloat-macros.h"
52
/*----------------------------------------------------------------------------
53
| Functions and definitions to determine: (1) whether tininess for underflow
54
| is detected before or after rounding by default, (2) what (if anything)
55
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
56
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
57
| are propagated from function inputs to output. These details are target-
59
*----------------------------------------------------------------------------*/
60
#include "softfloat-specialize.h"
62
void set_float_rounding_mode(int val STATUS_PARAM)
64
STATUS(float_rounding_mode) = val;
67
void set_float_exception_flags(int val STATUS_PARAM)
69
STATUS(float_exception_flags) = val;
72
void set_floatx80_rounding_precision(int val STATUS_PARAM)
74
STATUS(floatx80_rounding_precision) = val;
77
/*----------------------------------------------------------------------------
78
| Returns the fraction bits of the half-precision floating-point value `a'.
79
*----------------------------------------------------------------------------*/
81
INLINE uint32_t extractFloat16Frac(float16 a)
83
return float16_val(a) & 0x3ff;
86
/*----------------------------------------------------------------------------
87
| Returns the exponent bits of the half-precision floating-point value `a'.
88
*----------------------------------------------------------------------------*/
90
INLINE int_fast16_t extractFloat16Exp(float16 a)
92
return (float16_val(a) >> 10) & 0x1f;
95
/*----------------------------------------------------------------------------
96
| Returns the sign bit of the single-precision floating-point value `a'.
97
*----------------------------------------------------------------------------*/
99
INLINE flag extractFloat16Sign(float16 a)
101
return float16_val(a)>>15;
104
/*----------------------------------------------------------------------------
105
| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
106
| and 7, and returns the properly rounded 32-bit integer corresponding to the
107
| input. If `zSign' is 1, the input is negated before being converted to an
108
| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
109
| is simply rounded to an integer, with the inexact exception raised if the
110
| input cannot be represented exactly as an integer. However, if the fixed-
111
| point input is too large, the invalid exception is raised and the largest
112
| positive or negative integer is returned.
113
*----------------------------------------------------------------------------*/
115
static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
118
flag roundNearestEven;
119
int8 roundIncrement, roundBits;
122
roundingMode = STATUS(float_rounding_mode);
123
roundNearestEven = ( roundingMode == float_round_nearest_even );
124
roundIncrement = 0x40;
125
if ( ! roundNearestEven ) {
126
if ( roundingMode == float_round_to_zero ) {
130
roundIncrement = 0x7F;
132
if ( roundingMode == float_round_up ) roundIncrement = 0;
135
if ( roundingMode == float_round_down ) roundIncrement = 0;
139
roundBits = absZ & 0x7F;
140
absZ = ( absZ + roundIncrement )>>7;
141
absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
143
if ( zSign ) z = - z;
144
if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
145
float_raise( float_flag_invalid STATUS_VAR);
146
return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
148
if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
153
/*----------------------------------------------------------------------------
154
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
155
| `absZ1', with binary point between bits 63 and 64 (between the input words),
156
| and returns the properly rounded 64-bit integer corresponding to the input.
157
| If `zSign' is 1, the input is negated before being converted to an integer.
158
| Ordinarily, the fixed-point input is simply rounded to an integer, with
159
| the inexact exception raised if the input cannot be represented exactly as
160
| an integer. However, if the fixed-point input is too large, the invalid
161
| exception is raised and the largest positive or negative integer is
163
*----------------------------------------------------------------------------*/
165
static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
168
flag roundNearestEven, increment;
171
roundingMode = STATUS(float_rounding_mode);
172
roundNearestEven = ( roundingMode == float_round_nearest_even );
173
increment = ( (int64_t) absZ1 < 0 );
174
if ( ! roundNearestEven ) {
175
if ( roundingMode == float_round_to_zero ) {
180
increment = ( roundingMode == float_round_down ) && absZ1;
183
increment = ( roundingMode == float_round_up ) && absZ1;
189
if ( absZ0 == 0 ) goto overflow;
190
absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
193
if ( zSign ) z = - z;
194
if ( z && ( ( z < 0 ) ^ zSign ) ) {
196
float_raise( float_flag_invalid STATUS_VAR);
198
zSign ? (int64_t) LIT64( 0x8000000000000000 )
199
: LIT64( 0x7FFFFFFFFFFFFFFF );
201
if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
206
/*----------------------------------------------------------------------------
207
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
208
| `absZ1', with binary point between bits 63 and 64 (between the input words),
209
| and returns the properly rounded 64-bit unsigned integer corresponding to the
210
| input. Ordinarily, the fixed-point input is simply rounded to an integer,
211
| with the inexact exception raised if the input cannot be represented exactly
212
| as an integer. However, if the fixed-point input is too large, the invalid
213
| exception is raised and the largest unsigned integer is returned.
214
*----------------------------------------------------------------------------*/
216
static int64 roundAndPackUint64(flag zSign, uint64_t absZ0,
217
uint64_t absZ1 STATUS_PARAM)
220
flag roundNearestEven, increment;
222
roundingMode = STATUS(float_rounding_mode);
223
roundNearestEven = (roundingMode == float_round_nearest_even);
224
increment = ((int64_t)absZ1 < 0);
225
if (!roundNearestEven) {
226
if (roundingMode == float_round_to_zero) {
230
increment = (roundingMode == float_round_down) && absZ1;
232
increment = (roundingMode == float_round_up) && absZ1;
239
float_raise(float_flag_invalid STATUS_VAR);
240
return LIT64(0xFFFFFFFFFFFFFFFF);
242
absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
245
if (zSign && absZ0) {
246
float_raise(float_flag_invalid STATUS_VAR);
251
STATUS(float_exception_flags) |= float_flag_inexact;
256
/*----------------------------------------------------------------------------
257
| Returns the fraction bits of the single-precision floating-point value `a'.
258
*----------------------------------------------------------------------------*/
260
INLINE uint32_t extractFloat32Frac( float32 a )
263
return float32_val(a) & 0x007FFFFF;
267
/*----------------------------------------------------------------------------
268
| Returns the exponent bits of the single-precision floating-point value `a'.
269
*----------------------------------------------------------------------------*/
271
INLINE int_fast16_t extractFloat32Exp(float32 a)
274
return ( float32_val(a)>>23 ) & 0xFF;
278
/*----------------------------------------------------------------------------
279
| Returns the sign bit of the single-precision floating-point value `a'.
280
*----------------------------------------------------------------------------*/
282
INLINE flag extractFloat32Sign( float32 a )
285
return float32_val(a)>>31;
289
/*----------------------------------------------------------------------------
290
| If `a' is denormal and we are in flush-to-zero mode then set the
291
| input-denormal exception and return zero. Otherwise just return the value.
292
*----------------------------------------------------------------------------*/
293
static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
295
if (STATUS(flush_inputs_to_zero)) {
296
if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
297
float_raise(float_flag_input_denormal STATUS_VAR);
298
return make_float32(float32_val(a) & 0x80000000);
304
/*----------------------------------------------------------------------------
305
| Normalizes the subnormal single-precision floating-point value represented
306
| by the denormalized significand `aSig'. The normalized exponent and
307
| significand are stored at the locations pointed to by `zExpPtr' and
308
| `zSigPtr', respectively.
309
*----------------------------------------------------------------------------*/
312
normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
316
shiftCount = countLeadingZeros32( aSig ) - 8;
317
*zSigPtr = aSig<<shiftCount;
318
*zExpPtr = 1 - shiftCount;
322
/*----------------------------------------------------------------------------
323
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
324
| single-precision floating-point value, returning the result. After being
325
| shifted into the proper positions, the three fields are simply added
326
| together to form the result. This means that any integer portion of `zSig'
327
| will be added into the exponent. Since a properly normalized significand
328
| will have an integer portion equal to 1, the `zExp' input should be 1 less
329
| than the desired result exponent whenever `zSig' is a complete, normalized
331
*----------------------------------------------------------------------------*/
333
INLINE float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
337
( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
341
/*----------------------------------------------------------------------------
342
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
343
| and significand `zSig', and returns the proper single-precision floating-
344
| point value corresponding to the abstract input. Ordinarily, the abstract
345
| value is simply rounded and packed into the single-precision format, with
346
| the inexact exception raised if the abstract input cannot be represented
347
| exactly. However, if the abstract value is too large, the overflow and
348
| inexact exceptions are raised and an infinity or maximal finite value is
349
| returned. If the abstract value is too small, the input value is rounded to
350
| a subnormal number, and the underflow and inexact exceptions are raised if
351
| the abstract input cannot be represented exactly as a subnormal single-
352
| precision floating-point number.
353
| The input significand `zSig' has its binary point between bits 30
354
| and 29, which is 7 bits to the left of the usual location. This shifted
355
| significand must be normalized or smaller. If `zSig' is not normalized,
356
| `zExp' must be 0; in that case, the result returned is a subnormal number,
357
| and it must not require rounding. In the usual case that `zSig' is
358
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
359
| The handling of underflow and overflow follows the IEC/IEEE Standard for
360
| Binary Floating-Point Arithmetic.
361
*----------------------------------------------------------------------------*/
363
static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
366
flag roundNearestEven;
367
int8 roundIncrement, roundBits;
370
roundingMode = STATUS(float_rounding_mode);
371
roundNearestEven = ( roundingMode == float_round_nearest_even );
372
roundIncrement = 0x40;
373
if ( ! roundNearestEven ) {
374
if ( roundingMode == float_round_to_zero ) {
378
roundIncrement = 0x7F;
380
if ( roundingMode == float_round_up ) roundIncrement = 0;
383
if ( roundingMode == float_round_down ) roundIncrement = 0;
387
roundBits = zSig & 0x7F;
388
if ( 0xFD <= (uint16_t) zExp ) {
390
|| ( ( zExp == 0xFD )
391
&& ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
393
float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
394
return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
397
if (STATUS(flush_to_zero)) {
398
float_raise(float_flag_output_denormal STATUS_VAR);
399
return packFloat32(zSign, 0, 0);
402
( STATUS(float_detect_tininess) == float_tininess_before_rounding )
404
|| ( zSig + roundIncrement < 0x80000000 );
405
shift32RightJamming( zSig, - zExp, &zSig );
407
roundBits = zSig & 0x7F;
408
if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
411
if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
412
zSig = ( zSig + roundIncrement )>>7;
413
zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
414
if ( zSig == 0 ) zExp = 0;
415
return packFloat32( zSign, zExp, zSig );
419
/*----------------------------------------------------------------------------
420
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
421
| and significand `zSig', and returns the proper single-precision floating-
422
| point value corresponding to the abstract input. This routine is just like
423
| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
424
| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
425
| floating-point exponent.
426
*----------------------------------------------------------------------------*/
429
normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
433
shiftCount = countLeadingZeros32( zSig ) - 1;
434
return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
438
/*----------------------------------------------------------------------------
439
| Returns the fraction bits of the double-precision floating-point value `a'.
440
*----------------------------------------------------------------------------*/
442
INLINE uint64_t extractFloat64Frac( float64 a )
445
return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
449
/*----------------------------------------------------------------------------
450
| Returns the exponent bits of the double-precision floating-point value `a'.
451
*----------------------------------------------------------------------------*/
453
INLINE int_fast16_t extractFloat64Exp(float64 a)
456
return ( float64_val(a)>>52 ) & 0x7FF;
460
/*----------------------------------------------------------------------------
461
| Returns the sign bit of the double-precision floating-point value `a'.
462
*----------------------------------------------------------------------------*/
464
INLINE flag extractFloat64Sign( float64 a )
467
return float64_val(a)>>63;
471
/*----------------------------------------------------------------------------
472
| If `a' is denormal and we are in flush-to-zero mode then set the
473
| input-denormal exception and return zero. Otherwise just return the value.
474
*----------------------------------------------------------------------------*/
475
static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
477
if (STATUS(flush_inputs_to_zero)) {
478
if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
479
float_raise(float_flag_input_denormal STATUS_VAR);
480
return make_float64(float64_val(a) & (1ULL << 63));
486
/*----------------------------------------------------------------------------
487
| Normalizes the subnormal double-precision floating-point value represented
488
| by the denormalized significand `aSig'. The normalized exponent and
489
| significand are stored at the locations pointed to by `zExpPtr' and
490
| `zSigPtr', respectively.
491
*----------------------------------------------------------------------------*/
494
normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
498
shiftCount = countLeadingZeros64( aSig ) - 11;
499
*zSigPtr = aSig<<shiftCount;
500
*zExpPtr = 1 - shiftCount;
504
/*----------------------------------------------------------------------------
505
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
506
| double-precision floating-point value, returning the result. After being
507
| shifted into the proper positions, the three fields are simply added
508
| together to form the result. This means that any integer portion of `zSig'
509
| will be added into the exponent. Since a properly normalized significand
510
| will have an integer portion equal to 1, the `zExp' input should be 1 less
511
| than the desired result exponent whenever `zSig' is a complete, normalized
513
*----------------------------------------------------------------------------*/
515
INLINE float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
519
( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
523
/*----------------------------------------------------------------------------
524
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
525
| and significand `zSig', and returns the proper double-precision floating-
526
| point value corresponding to the abstract input. Ordinarily, the abstract
527
| value is simply rounded and packed into the double-precision format, with
528
| the inexact exception raised if the abstract input cannot be represented
529
| exactly. However, if the abstract value is too large, the overflow and
530
| inexact exceptions are raised and an infinity or maximal finite value is
531
| returned. If the abstract value is too small, the input value is rounded
532
| to a subnormal number, and the underflow and inexact exceptions are raised
533
| if the abstract input cannot be represented exactly as a subnormal double-
534
| precision floating-point number.
535
| The input significand `zSig' has its binary point between bits 62
536
| and 61, which is 10 bits to the left of the usual location. This shifted
537
| significand must be normalized or smaller. If `zSig' is not normalized,
538
| `zExp' must be 0; in that case, the result returned is a subnormal number,
539
| and it must not require rounding. In the usual case that `zSig' is
540
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
541
| The handling of underflow and overflow follows the IEC/IEEE Standard for
542
| Binary Floating-Point Arithmetic.
543
*----------------------------------------------------------------------------*/
545
static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
548
flag roundNearestEven;
549
int_fast16_t roundIncrement, roundBits;
552
roundingMode = STATUS(float_rounding_mode);
553
roundNearestEven = ( roundingMode == float_round_nearest_even );
554
roundIncrement = 0x200;
555
if ( ! roundNearestEven ) {
556
if ( roundingMode == float_round_to_zero ) {
560
roundIncrement = 0x3FF;
562
if ( roundingMode == float_round_up ) roundIncrement = 0;
565
if ( roundingMode == float_round_down ) roundIncrement = 0;
569
roundBits = zSig & 0x3FF;
570
if ( 0x7FD <= (uint16_t) zExp ) {
571
if ( ( 0x7FD < zExp )
572
|| ( ( zExp == 0x7FD )
573
&& ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
575
float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
576
return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
579
if (STATUS(flush_to_zero)) {
580
float_raise(float_flag_output_denormal STATUS_VAR);
581
return packFloat64(zSign, 0, 0);
584
( STATUS(float_detect_tininess) == float_tininess_before_rounding )
586
|| ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
587
shift64RightJamming( zSig, - zExp, &zSig );
589
roundBits = zSig & 0x3FF;
590
if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
593
if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
594
zSig = ( zSig + roundIncrement )>>10;
595
zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
596
if ( zSig == 0 ) zExp = 0;
597
return packFloat64( zSign, zExp, zSig );
601
/*----------------------------------------------------------------------------
602
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
603
| and significand `zSig', and returns the proper double-precision floating-
604
| point value corresponding to the abstract input. This routine is just like
605
| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
606
| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
607
| floating-point exponent.
608
*----------------------------------------------------------------------------*/
611
normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
615
shiftCount = countLeadingZeros64( zSig ) - 1;
616
return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
620
/*----------------------------------------------------------------------------
621
| Returns the fraction bits of the extended double-precision floating-point
623
*----------------------------------------------------------------------------*/
625
INLINE uint64_t extractFloatx80Frac( floatx80 a )
632
/*----------------------------------------------------------------------------
633
| Returns the exponent bits of the extended double-precision floating-point
635
*----------------------------------------------------------------------------*/
637
INLINE int32 extractFloatx80Exp( floatx80 a )
640
return a.high & 0x7FFF;
644
/*----------------------------------------------------------------------------
645
| Returns the sign bit of the extended double-precision floating-point value
647
*----------------------------------------------------------------------------*/
649
INLINE flag extractFloatx80Sign( floatx80 a )
656
/*----------------------------------------------------------------------------
657
| Normalizes the subnormal extended double-precision floating-point value
658
| represented by the denormalized significand `aSig'. The normalized exponent
659
| and significand are stored at the locations pointed to by `zExpPtr' and
660
| `zSigPtr', respectively.
661
*----------------------------------------------------------------------------*/
664
normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
668
shiftCount = countLeadingZeros64( aSig );
669
*zSigPtr = aSig<<shiftCount;
670
*zExpPtr = 1 - shiftCount;
674
/*----------------------------------------------------------------------------
675
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
676
| extended double-precision floating-point value, returning the result.
677
*----------------------------------------------------------------------------*/
679
INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
684
z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
689
/*----------------------------------------------------------------------------
690
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
691
| and extended significand formed by the concatenation of `zSig0' and `zSig1',
692
| and returns the proper extended double-precision floating-point value
693
| corresponding to the abstract input. Ordinarily, the abstract value is
694
| rounded and packed into the extended double-precision format, with the
695
| inexact exception raised if the abstract input cannot be represented
696
| exactly. However, if the abstract value is too large, the overflow and
697
| inexact exceptions are raised and an infinity or maximal finite value is
698
| returned. If the abstract value is too small, the input value is rounded to
699
| a subnormal number, and the underflow and inexact exceptions are raised if
700
| the abstract input cannot be represented exactly as a subnormal extended
701
| double-precision floating-point number.
702
| If `roundingPrecision' is 32 or 64, the result is rounded to the same
703
| number of bits as single or double precision, respectively. Otherwise, the
704
| result is rounded to the full precision of the extended double-precision
706
| The input significand must be normalized or smaller. If the input
707
| significand is not normalized, `zExp' must be 0; in that case, the result
708
| returned is a subnormal number, and it must not require rounding. The
709
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
710
| Floating-Point Arithmetic.
711
*----------------------------------------------------------------------------*/
714
roundAndPackFloatx80(
715
int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
719
flag roundNearestEven, increment, isTiny;
720
int64 roundIncrement, roundMask, roundBits;
722
roundingMode = STATUS(float_rounding_mode);
723
roundNearestEven = ( roundingMode == float_round_nearest_even );
724
if ( roundingPrecision == 80 ) goto precision80;
725
if ( roundingPrecision == 64 ) {
726
roundIncrement = LIT64( 0x0000000000000400 );
727
roundMask = LIT64( 0x00000000000007FF );
729
else if ( roundingPrecision == 32 ) {
730
roundIncrement = LIT64( 0x0000008000000000 );
731
roundMask = LIT64( 0x000000FFFFFFFFFF );
736
zSig0 |= ( zSig1 != 0 );
737
if ( ! roundNearestEven ) {
738
if ( roundingMode == float_round_to_zero ) {
742
roundIncrement = roundMask;
744
if ( roundingMode == float_round_up ) roundIncrement = 0;
747
if ( roundingMode == float_round_down ) roundIncrement = 0;
751
roundBits = zSig0 & roundMask;
752
if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
753
if ( ( 0x7FFE < zExp )
754
|| ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
759
if (STATUS(flush_to_zero)) {
760
float_raise(float_flag_output_denormal STATUS_VAR);
761
return packFloatx80(zSign, 0, 0);
764
( STATUS(float_detect_tininess) == float_tininess_before_rounding )
766
|| ( zSig0 <= zSig0 + roundIncrement );
767
shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
769
roundBits = zSig0 & roundMask;
770
if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
771
if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
772
zSig0 += roundIncrement;
773
if ( (int64_t) zSig0 < 0 ) zExp = 1;
774
roundIncrement = roundMask + 1;
775
if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
776
roundMask |= roundIncrement;
778
zSig0 &= ~ roundMask;
779
return packFloatx80( zSign, zExp, zSig0 );
782
if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
783
zSig0 += roundIncrement;
784
if ( zSig0 < roundIncrement ) {
786
zSig0 = LIT64( 0x8000000000000000 );
788
roundIncrement = roundMask + 1;
789
if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
790
roundMask |= roundIncrement;
792
zSig0 &= ~ roundMask;
793
if ( zSig0 == 0 ) zExp = 0;
794
return packFloatx80( zSign, zExp, zSig0 );
796
increment = ( (int64_t) zSig1 < 0 );
797
if ( ! roundNearestEven ) {
798
if ( roundingMode == float_round_to_zero ) {
803
increment = ( roundingMode == float_round_down ) && zSig1;
806
increment = ( roundingMode == float_round_up ) && zSig1;
810
if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
811
if ( ( 0x7FFE < zExp )
812
|| ( ( zExp == 0x7FFE )
813
&& ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
819
float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
820
if ( ( roundingMode == float_round_to_zero )
821
|| ( zSign && ( roundingMode == float_round_up ) )
822
|| ( ! zSign && ( roundingMode == float_round_down ) )
824
return packFloatx80( zSign, 0x7FFE, ~ roundMask );
826
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
830
( STATUS(float_detect_tininess) == float_tininess_before_rounding )
833
|| ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
834
shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
836
if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
837
if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
838
if ( roundNearestEven ) {
839
increment = ( (int64_t) zSig1 < 0 );
843
increment = ( roundingMode == float_round_down ) && zSig1;
846
increment = ( roundingMode == float_round_up ) && zSig1;
852
~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
853
if ( (int64_t) zSig0 < 0 ) zExp = 1;
855
return packFloatx80( zSign, zExp, zSig0 );
858
if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
863
zSig0 = LIT64( 0x8000000000000000 );
866
zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
870
if ( zSig0 == 0 ) zExp = 0;
872
return packFloatx80( zSign, zExp, zSig0 );
876
/*----------------------------------------------------------------------------
877
| Takes an abstract floating-point value having sign `zSign', exponent
878
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
879
| and returns the proper extended double-precision floating-point value
880
| corresponding to the abstract input. This routine is just like
881
| `roundAndPackFloatx80' except that the input significand does not have to be
883
*----------------------------------------------------------------------------*/
886
normalizeRoundAndPackFloatx80(
887
int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
897
shiftCount = countLeadingZeros64( zSig0 );
898
shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
901
roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
905
/*----------------------------------------------------------------------------
906
| Returns the least-significant 64 fraction bits of the quadruple-precision
907
| floating-point value `a'.
908
*----------------------------------------------------------------------------*/
910
INLINE uint64_t extractFloat128Frac1( float128 a )
917
/*----------------------------------------------------------------------------
918
| Returns the most-significant 48 fraction bits of the quadruple-precision
919
| floating-point value `a'.
920
*----------------------------------------------------------------------------*/
922
INLINE uint64_t extractFloat128Frac0( float128 a )
925
return a.high & LIT64( 0x0000FFFFFFFFFFFF );
929
/*----------------------------------------------------------------------------
930
| Returns the exponent bits of the quadruple-precision floating-point value
932
*----------------------------------------------------------------------------*/
934
INLINE int32 extractFloat128Exp( float128 a )
937
return ( a.high>>48 ) & 0x7FFF;
941
/*----------------------------------------------------------------------------
942
| Returns the sign bit of the quadruple-precision floating-point value `a'.
943
*----------------------------------------------------------------------------*/
945
INLINE flag extractFloat128Sign( float128 a )
952
/*----------------------------------------------------------------------------
953
| Normalizes the subnormal quadruple-precision floating-point value
954
| represented by the denormalized significand formed by the concatenation of
955
| `aSig0' and `aSig1'. The normalized exponent is stored at the location
956
| pointed to by `zExpPtr'. The most significant 49 bits of the normalized
957
| significand are stored at the location pointed to by `zSig0Ptr', and the
958
| least significant 64 bits of the normalized significand are stored at the
959
| location pointed to by `zSig1Ptr'.
960
*----------------------------------------------------------------------------*/
963
normalizeFloat128Subnormal(
974
shiftCount = countLeadingZeros64( aSig1 ) - 15;
975
if ( shiftCount < 0 ) {
976
*zSig0Ptr = aSig1>>( - shiftCount );
977
*zSig1Ptr = aSig1<<( shiftCount & 63 );
980
*zSig0Ptr = aSig1<<shiftCount;
983
*zExpPtr = - shiftCount - 63;
986
shiftCount = countLeadingZeros64( aSig0 ) - 15;
987
shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
988
*zExpPtr = 1 - shiftCount;
993
/*----------------------------------------------------------------------------
994
| Packs the sign `zSign', the exponent `zExp', and the significand formed
995
| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
996
| floating-point value, returning the result. After being shifted into the
997
| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
998
| added together to form the most significant 32 bits of the result. This
999
| means that any integer portion of `zSig0' will be added into the exponent.
1000
| Since a properly normalized significand will have an integer portion equal
1001
| to 1, the `zExp' input should be 1 less than the desired result exponent
1002
| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1004
*----------------------------------------------------------------------------*/
1007
packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
1012
z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
1017
/*----------------------------------------------------------------------------
1018
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1019
| and extended significand formed by the concatenation of `zSig0', `zSig1',
1020
| and `zSig2', and returns the proper quadruple-precision floating-point value
1021
| corresponding to the abstract input. Ordinarily, the abstract value is
1022
| simply rounded and packed into the quadruple-precision format, with the
1023
| inexact exception raised if the abstract input cannot be represented
1024
| exactly. However, if the abstract value is too large, the overflow and
1025
| inexact exceptions are raised and an infinity or maximal finite value is
1026
| returned. If the abstract value is too small, the input value is rounded to
1027
| a subnormal number, and the underflow and inexact exceptions are raised if
1028
| the abstract input cannot be represented exactly as a subnormal quadruple-
1029
| precision floating-point number.
1030
| The input significand must be normalized or smaller. If the input
1031
| significand is not normalized, `zExp' must be 0; in that case, the result
1032
| returned is a subnormal number, and it must not require rounding. In the
1033
| usual case that the input significand is normalized, `zExp' must be 1 less
1034
| than the ``true'' floating-point exponent. The handling of underflow and
1035
| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1036
*----------------------------------------------------------------------------*/
1039
roundAndPackFloat128(
1040
flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
1043
flag roundNearestEven, increment, isTiny;
1045
roundingMode = STATUS(float_rounding_mode);
1046
roundNearestEven = ( roundingMode == float_round_nearest_even );
1047
increment = ( (int64_t) zSig2 < 0 );
1048
if ( ! roundNearestEven ) {
1049
if ( roundingMode == float_round_to_zero ) {
1054
increment = ( roundingMode == float_round_down ) && zSig2;
1057
increment = ( roundingMode == float_round_up ) && zSig2;
1061
if ( 0x7FFD <= (uint32_t) zExp ) {
1062
if ( ( 0x7FFD < zExp )
1063
|| ( ( zExp == 0x7FFD )
1065
LIT64( 0x0001FFFFFFFFFFFF ),
1066
LIT64( 0xFFFFFFFFFFFFFFFF ),
1073
float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1074
if ( ( roundingMode == float_round_to_zero )
1075
|| ( zSign && ( roundingMode == float_round_up ) )
1076
|| ( ! zSign && ( roundingMode == float_round_down ) )
1082
LIT64( 0x0000FFFFFFFFFFFF ),
1083
LIT64( 0xFFFFFFFFFFFFFFFF )
1086
return packFloat128( zSign, 0x7FFF, 0, 0 );
1089
if (STATUS(flush_to_zero)) {
1090
float_raise(float_flag_output_denormal STATUS_VAR);
1091
return packFloat128(zSign, 0, 0, 0);
1094
( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1100
LIT64( 0x0001FFFFFFFFFFFF ),
1101
LIT64( 0xFFFFFFFFFFFFFFFF )
1103
shift128ExtraRightJamming(
1104
zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1106
if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1107
if ( roundNearestEven ) {
1108
increment = ( (int64_t) zSig2 < 0 );
1112
increment = ( roundingMode == float_round_down ) && zSig2;
1115
increment = ( roundingMode == float_round_up ) && zSig2;
1120
if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1122
add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1123
zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1126
if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1128
return packFloat128( zSign, zExp, zSig0, zSig1 );
1132
/*----------------------------------------------------------------------------
1133
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1134
| and significand formed by the concatenation of `zSig0' and `zSig1', and
1135
| returns the proper quadruple-precision floating-point value corresponding
1136
| to the abstract input. This routine is just like `roundAndPackFloat128'
1137
| except that the input significand has fewer bits and does not have to be
1138
| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1140
*----------------------------------------------------------------------------*/
1143
normalizeRoundAndPackFloat128(
1144
flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1154
shiftCount = countLeadingZeros64( zSig0 ) - 15;
1155
if ( 0 <= shiftCount ) {
1157
shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1160
shift128ExtraRightJamming(
1161
zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1164
return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1168
/*----------------------------------------------------------------------------
1169
| Returns the result of converting the 32-bit two's complement integer `a'
1170
| to the single-precision floating-point format. The conversion is performed
1171
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1172
*----------------------------------------------------------------------------*/
1174
float32 int32_to_float32(int32_t a STATUS_PARAM)
1178
if ( a == 0 ) return float32_zero;
1179
if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1181
return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1185
/*----------------------------------------------------------------------------
1186
| Returns the result of converting the 32-bit two's complement integer `a'
1187
| to the double-precision floating-point format. The conversion is performed
1188
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1189
*----------------------------------------------------------------------------*/
1191
float64 int32_to_float64(int32_t a STATUS_PARAM)
1198
if ( a == 0 ) return float64_zero;
1200
absA = zSign ? - a : a;
1201
shiftCount = countLeadingZeros32( absA ) + 21;
1203
return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1207
/*----------------------------------------------------------------------------
1208
| Returns the result of converting the 32-bit two's complement integer `a'
1209
| to the extended double-precision floating-point format. The conversion
1210
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1212
*----------------------------------------------------------------------------*/
1214
floatx80 int32_to_floatx80(int32_t a STATUS_PARAM)
1221
if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1223
absA = zSign ? - a : a;
1224
shiftCount = countLeadingZeros32( absA ) + 32;
1226
return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1230
/*----------------------------------------------------------------------------
1231
| Returns the result of converting the 32-bit two's complement integer `a' to
1232
| the quadruple-precision floating-point format. The conversion is performed
1233
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1234
*----------------------------------------------------------------------------*/
1236
float128 int32_to_float128(int32_t a STATUS_PARAM)
1243
if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1245
absA = zSign ? - a : a;
1246
shiftCount = countLeadingZeros32( absA ) + 17;
1248
return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1252
/*----------------------------------------------------------------------------
1253
| Returns the result of converting the 64-bit two's complement integer `a'
1254
| to the single-precision floating-point format. The conversion is performed
1255
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1256
*----------------------------------------------------------------------------*/
1258
float32 int64_to_float32(int64_t a STATUS_PARAM)
1264
if ( a == 0 ) return float32_zero;
1266
absA = zSign ? - a : a;
1267
shiftCount = countLeadingZeros64( absA ) - 40;
1268
if ( 0 <= shiftCount ) {
1269
return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1273
if ( shiftCount < 0 ) {
1274
shift64RightJamming( absA, - shiftCount, &absA );
1277
absA <<= shiftCount;
1279
return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1284
float32 uint64_to_float32(uint64_t a STATUS_PARAM)
1288
if ( a == 0 ) return float32_zero;
1289
shiftCount = countLeadingZeros64( a ) - 40;
1290
if ( 0 <= shiftCount ) {
1291
return packFloat32(0, 0x95 - shiftCount, a<<shiftCount);
1295
if ( shiftCount < 0 ) {
1296
shift64RightJamming( a, - shiftCount, &a );
1301
return roundAndPackFloat32(0, 0x9C - shiftCount, a STATUS_VAR);
1305
/*----------------------------------------------------------------------------
1306
| Returns the result of converting the 64-bit two's complement integer `a'
1307
| to the double-precision floating-point format. The conversion is performed
1308
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1309
*----------------------------------------------------------------------------*/
1311
float64 int64_to_float64(int64_t a STATUS_PARAM)
1315
if ( a == 0 ) return float64_zero;
1316
if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1317
return packFloat64( 1, 0x43E, 0 );
1320
return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1324
float64 uint64_to_float64(uint64_t a STATUS_PARAM)
1329
return float64_zero;
1331
if ((int64_t)a < 0) {
1332
shift64RightJamming(a, 1, &a);
1335
return normalizeRoundAndPackFloat64(0, exp, a STATUS_VAR);
1338
/*----------------------------------------------------------------------------
1339
| Returns the result of converting the 64-bit two's complement integer `a'
1340
| to the extended double-precision floating-point format. The conversion
1341
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1343
*----------------------------------------------------------------------------*/
1345
floatx80 int64_to_floatx80(int64_t a STATUS_PARAM)
1351
if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1353
absA = zSign ? - a : a;
1354
shiftCount = countLeadingZeros64( absA );
1355
return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1359
/*----------------------------------------------------------------------------
1360
| Returns the result of converting the 64-bit two's complement integer `a' to
1361
| the quadruple-precision floating-point format. The conversion is performed
1362
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1363
*----------------------------------------------------------------------------*/
1365
float128 int64_to_float128(int64_t a STATUS_PARAM)
1371
uint64_t zSig0, zSig1;
1373
if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1375
absA = zSign ? - a : a;
1376
shiftCount = countLeadingZeros64( absA ) + 49;
1377
zExp = 0x406E - shiftCount;
1378
if ( 64 <= shiftCount ) {
1387
shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1388
return packFloat128( zSign, zExp, zSig0, zSig1 );
1392
float128 uint64_to_float128(uint64_t a STATUS_PARAM)
1395
return float128_zero;
1397
return normalizeRoundAndPackFloat128(0, 0x406E, a, 0 STATUS_VAR);
1400
/*----------------------------------------------------------------------------
1401
| Returns the result of converting the single-precision floating-point value
1402
| `a' to the 32-bit two's complement integer format. The conversion is
1403
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1404
| Arithmetic---which means in particular that the conversion is rounded
1405
| according to the current rounding mode. If `a' is a NaN, the largest
1406
| positive integer is returned. Otherwise, if the conversion overflows, the
1407
| largest integer with the same sign as `a' is returned.
1408
*----------------------------------------------------------------------------*/
1410
int32 float32_to_int32( float32 a STATUS_PARAM )
1413
int_fast16_t aExp, shiftCount;
1417
a = float32_squash_input_denormal(a STATUS_VAR);
1418
aSig = extractFloat32Frac( a );
1419
aExp = extractFloat32Exp( a );
1420
aSign = extractFloat32Sign( a );
1421
if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1422
if ( aExp ) aSig |= 0x00800000;
1423
shiftCount = 0xAF - aExp;
1426
if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1427
return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1431
/*----------------------------------------------------------------------------
1432
| Returns the result of converting the single-precision floating-point value
1433
| `a' to the 32-bit two's complement integer format. The conversion is
1434
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1435
| Arithmetic, except that the conversion is always rounded toward zero.
1436
| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1437
| the conversion overflows, the largest integer with the same sign as `a' is
1439
*----------------------------------------------------------------------------*/
1441
int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1444
int_fast16_t aExp, shiftCount;
1447
a = float32_squash_input_denormal(a STATUS_VAR);
1449
aSig = extractFloat32Frac( a );
1450
aExp = extractFloat32Exp( a );
1451
aSign = extractFloat32Sign( a );
1452
shiftCount = aExp - 0x9E;
1453
if ( 0 <= shiftCount ) {
1454
if ( float32_val(a) != 0xCF000000 ) {
1455
float_raise( float_flag_invalid STATUS_VAR);
1456
if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1458
return (int32_t) 0x80000000;
1460
else if ( aExp <= 0x7E ) {
1461
if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1464
aSig = ( aSig | 0x00800000 )<<8;
1465
z = aSig>>( - shiftCount );
1466
if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1467
STATUS(float_exception_flags) |= float_flag_inexact;
1469
if ( aSign ) z = - z;
1474
/*----------------------------------------------------------------------------
1475
| Returns the result of converting the single-precision floating-point value
1476
| `a' to the 16-bit two's complement integer format. The conversion is
1477
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1478
| Arithmetic, except that the conversion is always rounded toward zero.
1479
| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1480
| the conversion overflows, the largest integer with the same sign as `a' is
1482
*----------------------------------------------------------------------------*/
1484
int_fast16_t float32_to_int16_round_to_zero(float32 a STATUS_PARAM)
1487
int_fast16_t aExp, shiftCount;
1491
aSig = extractFloat32Frac( a );
1492
aExp = extractFloat32Exp( a );
1493
aSign = extractFloat32Sign( a );
1494
shiftCount = aExp - 0x8E;
1495
if ( 0 <= shiftCount ) {
1496
if ( float32_val(a) != 0xC7000000 ) {
1497
float_raise( float_flag_invalid STATUS_VAR);
1498
if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1502
return (int32_t) 0xffff8000;
1504
else if ( aExp <= 0x7E ) {
1505
if ( aExp | aSig ) {
1506
STATUS(float_exception_flags) |= float_flag_inexact;
1511
aSig = ( aSig | 0x00800000 )<<8;
1512
z = aSig>>( - shiftCount );
1513
if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1514
STATUS(float_exception_flags) |= float_flag_inexact;
1523
/*----------------------------------------------------------------------------
1524
| Returns the result of converting the single-precision floating-point value
1525
| `a' to the 64-bit two's complement integer format. The conversion is
1526
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1527
| Arithmetic---which means in particular that the conversion is rounded
1528
| according to the current rounding mode. If `a' is a NaN, the largest
1529
| positive integer is returned. Otherwise, if the conversion overflows, the
1530
| largest integer with the same sign as `a' is returned.
1531
*----------------------------------------------------------------------------*/
1533
int64 float32_to_int64( float32 a STATUS_PARAM )
1536
int_fast16_t aExp, shiftCount;
1538
uint64_t aSig64, aSigExtra;
1539
a = float32_squash_input_denormal(a STATUS_VAR);
1541
aSig = extractFloat32Frac( a );
1542
aExp = extractFloat32Exp( a );
1543
aSign = extractFloat32Sign( a );
1544
shiftCount = 0xBE - aExp;
1545
if ( shiftCount < 0 ) {
1546
float_raise( float_flag_invalid STATUS_VAR);
1547
if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1548
return LIT64( 0x7FFFFFFFFFFFFFFF );
1550
return (int64_t) LIT64( 0x8000000000000000 );
1552
if ( aExp ) aSig |= 0x00800000;
1555
shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1556
return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1560
/*----------------------------------------------------------------------------
1561
| Returns the result of converting the single-precision floating-point value
1562
| `a' to the 64-bit two's complement integer format. The conversion is
1563
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1564
| Arithmetic, except that the conversion is always rounded toward zero. If
1565
| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1566
| conversion overflows, the largest integer with the same sign as `a' is
1568
*----------------------------------------------------------------------------*/
1570
int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1573
int_fast16_t aExp, shiftCount;
1577
a = float32_squash_input_denormal(a STATUS_VAR);
1579
aSig = extractFloat32Frac( a );
1580
aExp = extractFloat32Exp( a );
1581
aSign = extractFloat32Sign( a );
1582
shiftCount = aExp - 0xBE;
1583
if ( 0 <= shiftCount ) {
1584
if ( float32_val(a) != 0xDF000000 ) {
1585
float_raise( float_flag_invalid STATUS_VAR);
1586
if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1587
return LIT64( 0x7FFFFFFFFFFFFFFF );
1590
return (int64_t) LIT64( 0x8000000000000000 );
1592
else if ( aExp <= 0x7E ) {
1593
if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1596
aSig64 = aSig | 0x00800000;
1598
z = aSig64>>( - shiftCount );
1599
if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1600
STATUS(float_exception_flags) |= float_flag_inexact;
1602
if ( aSign ) z = - z;
1607
/*----------------------------------------------------------------------------
1608
| Returns the result of converting the single-precision floating-point value
1609
| `a' to the double-precision floating-point format. The conversion is
1610
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1612
*----------------------------------------------------------------------------*/
1614
float64 float32_to_float64( float32 a STATUS_PARAM )
1619
a = float32_squash_input_denormal(a STATUS_VAR);
1621
aSig = extractFloat32Frac( a );
1622
aExp = extractFloat32Exp( a );
1623
aSign = extractFloat32Sign( a );
1624
if ( aExp == 0xFF ) {
1625
if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1626
return packFloat64( aSign, 0x7FF, 0 );
1629
if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1630
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1633
return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1637
/*----------------------------------------------------------------------------
1638
| Returns the result of converting the single-precision floating-point value
1639
| `a' to the extended double-precision floating-point format. The conversion
1640
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1642
*----------------------------------------------------------------------------*/
1644
floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1650
a = float32_squash_input_denormal(a STATUS_VAR);
1651
aSig = extractFloat32Frac( a );
1652
aExp = extractFloat32Exp( a );
1653
aSign = extractFloat32Sign( a );
1654
if ( aExp == 0xFF ) {
1655
if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1656
return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1659
if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1660
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1663
return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1667
/*----------------------------------------------------------------------------
1668
| Returns the result of converting the single-precision floating-point value
1669
| `a' to the double-precision floating-point format. The conversion is
1670
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1672
*----------------------------------------------------------------------------*/
1674
float128 float32_to_float128( float32 a STATUS_PARAM )
1680
a = float32_squash_input_denormal(a STATUS_VAR);
1681
aSig = extractFloat32Frac( a );
1682
aExp = extractFloat32Exp( a );
1683
aSign = extractFloat32Sign( a );
1684
if ( aExp == 0xFF ) {
1685
if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1686
return packFloat128( aSign, 0x7FFF, 0, 0 );
1689
if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1690
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1693
return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1697
/*----------------------------------------------------------------------------
1698
| Rounds the single-precision floating-point value `a' to an integer, and
1699
| returns the result as a single-precision floating-point value. The
1700
| operation is performed according to the IEC/IEEE Standard for Binary
1701
| Floating-Point Arithmetic.
1702
*----------------------------------------------------------------------------*/
1704
float32 float32_round_to_int( float32 a STATUS_PARAM)
1708
uint32_t lastBitMask, roundBitsMask;
1711
a = float32_squash_input_denormal(a STATUS_VAR);
1713
aExp = extractFloat32Exp( a );
1714
if ( 0x96 <= aExp ) {
1715
if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1716
return propagateFloat32NaN( a, a STATUS_VAR );
1720
if ( aExp <= 0x7E ) {
1721
if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1722
STATUS(float_exception_flags) |= float_flag_inexact;
1723
aSign = extractFloat32Sign( a );
1724
switch ( STATUS(float_rounding_mode) ) {
1725
case float_round_nearest_even:
1726
if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1727
return packFloat32( aSign, 0x7F, 0 );
1730
case float_round_down:
1731
return make_float32(aSign ? 0xBF800000 : 0);
1732
case float_round_up:
1733
return make_float32(aSign ? 0x80000000 : 0x3F800000);
1735
return packFloat32( aSign, 0, 0 );
1738
lastBitMask <<= 0x96 - aExp;
1739
roundBitsMask = lastBitMask - 1;
1741
roundingMode = STATUS(float_rounding_mode);
1742
if ( roundingMode == float_round_nearest_even ) {
1743
z += lastBitMask>>1;
1744
if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1746
else if ( roundingMode != float_round_to_zero ) {
1747
if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1751
z &= ~ roundBitsMask;
1752
if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1753
return make_float32(z);
1757
/*----------------------------------------------------------------------------
1758
| Returns the result of adding the absolute values of the single-precision
1759
| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1760
| before being returned. `zSign' is ignored if the result is a NaN.
1761
| The addition is performed according to the IEC/IEEE Standard for Binary
1762
| Floating-Point Arithmetic.
1763
*----------------------------------------------------------------------------*/
1765
static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1767
int_fast16_t aExp, bExp, zExp;
1768
uint32_t aSig, bSig, zSig;
1769
int_fast16_t expDiff;
1771
aSig = extractFloat32Frac( a );
1772
aExp = extractFloat32Exp( a );
1773
bSig = extractFloat32Frac( b );
1774
bExp = extractFloat32Exp( b );
1775
expDiff = aExp - bExp;
1778
if ( 0 < expDiff ) {
1779
if ( aExp == 0xFF ) {
1780
if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1789
shift32RightJamming( bSig, expDiff, &bSig );
1792
else if ( expDiff < 0 ) {
1793
if ( bExp == 0xFF ) {
1794
if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1795
return packFloat32( zSign, 0xFF, 0 );
1803
shift32RightJamming( aSig, - expDiff, &aSig );
1807
if ( aExp == 0xFF ) {
1808
if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1812
if (STATUS(flush_to_zero)) {
1814
float_raise(float_flag_output_denormal STATUS_VAR);
1816
return packFloat32(zSign, 0, 0);
1818
return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1820
zSig = 0x40000000 + aSig + bSig;
1825
zSig = ( aSig + bSig )<<1;
1827
if ( (int32_t) zSig < 0 ) {
1832
return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1836
/*----------------------------------------------------------------------------
1837
| Returns the result of subtracting the absolute values of the single-
1838
| precision floating-point values `a' and `b'. If `zSign' is 1, the
1839
| difference is negated before being returned. `zSign' is ignored if the
1840
| result is a NaN. The subtraction is performed according to the IEC/IEEE
1841
| Standard for Binary Floating-Point Arithmetic.
1842
*----------------------------------------------------------------------------*/
1844
static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1846
int_fast16_t aExp, bExp, zExp;
1847
uint32_t aSig, bSig, zSig;
1848
int_fast16_t expDiff;
1850
aSig = extractFloat32Frac( a );
1851
aExp = extractFloat32Exp( a );
1852
bSig = extractFloat32Frac( b );
1853
bExp = extractFloat32Exp( b );
1854
expDiff = aExp - bExp;
1857
if ( 0 < expDiff ) goto aExpBigger;
1858
if ( expDiff < 0 ) goto bExpBigger;
1859
if ( aExp == 0xFF ) {
1860
if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1861
float_raise( float_flag_invalid STATUS_VAR);
1862
return float32_default_nan;
1868
if ( bSig < aSig ) goto aBigger;
1869
if ( aSig < bSig ) goto bBigger;
1870
return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1872
if ( bExp == 0xFF ) {
1873
if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1874
return packFloat32( zSign ^ 1, 0xFF, 0 );
1882
shift32RightJamming( aSig, - expDiff, &aSig );
1888
goto normalizeRoundAndPack;
1890
if ( aExp == 0xFF ) {
1891
if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1900
shift32RightJamming( bSig, expDiff, &bSig );
1905
normalizeRoundAndPack:
1907
return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1911
/*----------------------------------------------------------------------------
1912
| Returns the result of adding the single-precision floating-point values `a'
1913
| and `b'. The operation is performed according to the IEC/IEEE Standard for
1914
| Binary Floating-Point Arithmetic.
1915
*----------------------------------------------------------------------------*/
1917
float32 float32_add( float32 a, float32 b STATUS_PARAM )
1920
a = float32_squash_input_denormal(a STATUS_VAR);
1921
b = float32_squash_input_denormal(b STATUS_VAR);
1923
aSign = extractFloat32Sign( a );
1924
bSign = extractFloat32Sign( b );
1925
if ( aSign == bSign ) {
1926
return addFloat32Sigs( a, b, aSign STATUS_VAR);
1929
return subFloat32Sigs( a, b, aSign STATUS_VAR );
1934
/*----------------------------------------------------------------------------
1935
| Returns the result of subtracting the single-precision floating-point values
1936
| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1937
| for Binary Floating-Point Arithmetic.
1938
*----------------------------------------------------------------------------*/
1940
float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1943
a = float32_squash_input_denormal(a STATUS_VAR);
1944
b = float32_squash_input_denormal(b STATUS_VAR);
1946
aSign = extractFloat32Sign( a );
1947
bSign = extractFloat32Sign( b );
1948
if ( aSign == bSign ) {
1949
return subFloat32Sigs( a, b, aSign STATUS_VAR );
1952
return addFloat32Sigs( a, b, aSign STATUS_VAR );
1957
/*----------------------------------------------------------------------------
1958
| Returns the result of multiplying the single-precision floating-point values
1959
| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1960
| for Binary Floating-Point Arithmetic.
1961
*----------------------------------------------------------------------------*/
1963
float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1965
flag aSign, bSign, zSign;
1966
int_fast16_t aExp, bExp, zExp;
1967
uint32_t aSig, bSig;
1971
a = float32_squash_input_denormal(a STATUS_VAR);
1972
b = float32_squash_input_denormal(b STATUS_VAR);
1974
aSig = extractFloat32Frac( a );
1975
aExp = extractFloat32Exp( a );
1976
aSign = extractFloat32Sign( a );
1977
bSig = extractFloat32Frac( b );
1978
bExp = extractFloat32Exp( b );
1979
bSign = extractFloat32Sign( b );
1980
zSign = aSign ^ bSign;
1981
if ( aExp == 0xFF ) {
1982
if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1983
return propagateFloat32NaN( a, b STATUS_VAR );
1985
if ( ( bExp | bSig ) == 0 ) {
1986
float_raise( float_flag_invalid STATUS_VAR);
1987
return float32_default_nan;
1989
return packFloat32( zSign, 0xFF, 0 );
1991
if ( bExp == 0xFF ) {
1992
if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1993
if ( ( aExp | aSig ) == 0 ) {
1994
float_raise( float_flag_invalid STATUS_VAR);
1995
return float32_default_nan;
1997
return packFloat32( zSign, 0xFF, 0 );
2000
if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2001
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2004
if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2005
normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2007
zExp = aExp + bExp - 0x7F;
2008
aSig = ( aSig | 0x00800000 )<<7;
2009
bSig = ( bSig | 0x00800000 )<<8;
2010
shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
2012
if ( 0 <= (int32_t) ( zSig<<1 ) ) {
2016
return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2020
/*----------------------------------------------------------------------------
2021
| Returns the result of dividing the single-precision floating-point value `a'
2022
| by the corresponding value `b'. The operation is performed according to the
2023
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2024
*----------------------------------------------------------------------------*/
2026
float32 float32_div( float32 a, float32 b STATUS_PARAM )
2028
flag aSign, bSign, zSign;
2029
int_fast16_t aExp, bExp, zExp;
2030
uint32_t aSig, bSig, zSig;
2031
a = float32_squash_input_denormal(a STATUS_VAR);
2032
b = float32_squash_input_denormal(b STATUS_VAR);
2034
aSig = extractFloat32Frac( a );
2035
aExp = extractFloat32Exp( a );
2036
aSign = extractFloat32Sign( a );
2037
bSig = extractFloat32Frac( b );
2038
bExp = extractFloat32Exp( b );
2039
bSign = extractFloat32Sign( b );
2040
zSign = aSign ^ bSign;
2041
if ( aExp == 0xFF ) {
2042
if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2043
if ( bExp == 0xFF ) {
2044
if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2045
float_raise( float_flag_invalid STATUS_VAR);
2046
return float32_default_nan;
2048
return packFloat32( zSign, 0xFF, 0 );
2050
if ( bExp == 0xFF ) {
2051
if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2052
return packFloat32( zSign, 0, 0 );
2056
if ( ( aExp | aSig ) == 0 ) {
2057
float_raise( float_flag_invalid STATUS_VAR);
2058
return float32_default_nan;
2060
float_raise( float_flag_divbyzero STATUS_VAR);
2061
return packFloat32( zSign, 0xFF, 0 );
2063
normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2066
if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2067
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2069
zExp = aExp - bExp + 0x7D;
2070
aSig = ( aSig | 0x00800000 )<<7;
2071
bSig = ( bSig | 0x00800000 )<<8;
2072
if ( bSig <= ( aSig + aSig ) ) {
2076
zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2077
if ( ( zSig & 0x3F ) == 0 ) {
2078
zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2080
return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2084
/*----------------------------------------------------------------------------
2085
| Returns the remainder of the single-precision floating-point value `a'
2086
| with respect to the corresponding value `b'. The operation is performed
2087
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2088
*----------------------------------------------------------------------------*/
2090
float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2093
int_fast16_t aExp, bExp, expDiff;
2094
uint32_t aSig, bSig;
2096
uint64_t aSig64, bSig64, q64;
2097
uint32_t alternateASig;
2099
a = float32_squash_input_denormal(a STATUS_VAR);
2100
b = float32_squash_input_denormal(b STATUS_VAR);
2102
aSig = extractFloat32Frac( a );
2103
aExp = extractFloat32Exp( a );
2104
aSign = extractFloat32Sign( a );
2105
bSig = extractFloat32Frac( b );
2106
bExp = extractFloat32Exp( b );
2107
if ( aExp == 0xFF ) {
2108
if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2109
return propagateFloat32NaN( a, b STATUS_VAR );
2111
float_raise( float_flag_invalid STATUS_VAR);
2112
return float32_default_nan;
2114
if ( bExp == 0xFF ) {
2115
if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2120
float_raise( float_flag_invalid STATUS_VAR);
2121
return float32_default_nan;
2123
normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2126
if ( aSig == 0 ) return a;
2127
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2129
expDiff = aExp - bExp;
2132
if ( expDiff < 32 ) {
2135
if ( expDiff < 0 ) {
2136
if ( expDiff < -1 ) return a;
2139
q = ( bSig <= aSig );
2140
if ( q ) aSig -= bSig;
2141
if ( 0 < expDiff ) {
2142
q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2145
aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2153
if ( bSig <= aSig ) aSig -= bSig;
2154
aSig64 = ( (uint64_t) aSig )<<40;
2155
bSig64 = ( (uint64_t) bSig )<<40;
2157
while ( 0 < expDiff ) {
2158
q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2159
q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2160
aSig64 = - ( ( bSig * q64 )<<38 );
2164
q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2165
q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2166
q = q64>>( 64 - expDiff );
2168
aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2171
alternateASig = aSig;
2174
} while ( 0 <= (int32_t) aSig );
2175
sigMean = aSig + alternateASig;
2176
if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2177
aSig = alternateASig;
2179
zSign = ( (int32_t) aSig < 0 );
2180
if ( zSign ) aSig = - aSig;
2181
return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2185
/*----------------------------------------------------------------------------
2186
| Returns the result of multiplying the single-precision floating-point values
2187
| `a' and `b' then adding 'c', with no intermediate rounding step after the
2188
| multiplication. The operation is performed according to the IEC/IEEE
2189
| Standard for Binary Floating-Point Arithmetic 754-2008.
2190
| The flags argument allows the caller to select negation of the
2191
| addend, the intermediate product, or the final result. (The difference
2192
| between this and having the caller do a separate negation is that negating
2193
| externally will flip the sign bit on NaNs.)
2194
*----------------------------------------------------------------------------*/
2196
float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)
2198
flag aSign, bSign, cSign, zSign;
2199
int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
2200
uint32_t aSig, bSig, cSig;
2201
flag pInf, pZero, pSign;
2202
uint64_t pSig64, cSig64, zSig64;
2205
flag signflip, infzero;
2207
a = float32_squash_input_denormal(a STATUS_VAR);
2208
b = float32_squash_input_denormal(b STATUS_VAR);
2209
c = float32_squash_input_denormal(c STATUS_VAR);
2210
aSig = extractFloat32Frac(a);
2211
aExp = extractFloat32Exp(a);
2212
aSign = extractFloat32Sign(a);
2213
bSig = extractFloat32Frac(b);
2214
bExp = extractFloat32Exp(b);
2215
bSign = extractFloat32Sign(b);
2216
cSig = extractFloat32Frac(c);
2217
cExp = extractFloat32Exp(c);
2218
cSign = extractFloat32Sign(c);
2220
infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2221
(aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2223
/* It is implementation-defined whether the cases of (0,inf,qnan)
2224
* and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2225
* they return if they do), so we have to hand this information
2226
* off to the target-specific pick-a-NaN routine.
2228
if (((aExp == 0xff) && aSig) ||
2229
((bExp == 0xff) && bSig) ||
2230
((cExp == 0xff) && cSig)) {
2231
return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR);
2235
float_raise(float_flag_invalid STATUS_VAR);
2236
return float32_default_nan;
2239
if (flags & float_muladd_negate_c) {
2243
signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2245
/* Work out the sign and type of the product */
2246
pSign = aSign ^ bSign;
2247
if (flags & float_muladd_negate_product) {
2250
pInf = (aExp == 0xff) || (bExp == 0xff);
2251
pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2254
if (pInf && (pSign ^ cSign)) {
2255
/* addition of opposite-signed infinities => InvalidOperation */
2256
float_raise(float_flag_invalid STATUS_VAR);
2257
return float32_default_nan;
2259
/* Otherwise generate an infinity of the same sign */
2260
return packFloat32(cSign ^ signflip, 0xff, 0);
2264
return packFloat32(pSign ^ signflip, 0xff, 0);
2270
/* Adding two exact zeroes */
2271
if (pSign == cSign) {
2273
} else if (STATUS(float_rounding_mode) == float_round_down) {
2278
return packFloat32(zSign ^ signflip, 0, 0);
2280
/* Exact zero plus a denorm */
2281
if (STATUS(flush_to_zero)) {
2282
float_raise(float_flag_output_denormal STATUS_VAR);
2283
return packFloat32(cSign ^ signflip, 0, 0);
2286
/* Zero plus something non-zero : just return the something */
2287
return packFloat32(cSign ^ signflip, cExp, cSig);
2291
normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2294
normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2297
/* Calculate the actual result a * b + c */
2299
/* Multiply first; this is easy. */
2300
/* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2301
* because we want the true exponent, not the "one-less-than"
2302
* flavour that roundAndPackFloat32() takes.
2304
pExp = aExp + bExp - 0x7e;
2305
aSig = (aSig | 0x00800000) << 7;
2306
bSig = (bSig | 0x00800000) << 8;
2307
pSig64 = (uint64_t)aSig * bSig;
2308
if ((int64_t)(pSig64 << 1) >= 0) {
2313
zSign = pSign ^ signflip;
2315
/* Now pSig64 is the significand of the multiply, with the explicit bit in
2320
/* Throw out the special case of c being an exact zero now */
2321
shift64RightJamming(pSig64, 32, &pSig64);
2323
return roundAndPackFloat32(zSign, pExp - 1,
2326
normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2329
cSig64 = (uint64_t)cSig << (62 - 23);
2330
cSig64 |= LIT64(0x4000000000000000);
2331
expDiff = pExp - cExp;
2333
if (pSign == cSign) {
2336
/* scale c to match p */
2337
shift64RightJamming(cSig64, expDiff, &cSig64);
2339
} else if (expDiff < 0) {
2340
/* scale p to match c */
2341
shift64RightJamming(pSig64, -expDiff, &pSig64);
2344
/* no scaling needed */
2347
/* Add significands and make sure explicit bit ends up in posn 62 */
2348
zSig64 = pSig64 + cSig64;
2349
if ((int64_t)zSig64 < 0) {
2350
shift64RightJamming(zSig64, 1, &zSig64);
2357
shift64RightJamming(cSig64, expDiff, &cSig64);
2358
zSig64 = pSig64 - cSig64;
2360
} else if (expDiff < 0) {
2361
shift64RightJamming(pSig64, -expDiff, &pSig64);
2362
zSig64 = cSig64 - pSig64;
2367
if (cSig64 < pSig64) {
2368
zSig64 = pSig64 - cSig64;
2369
} else if (pSig64 < cSig64) {
2370
zSig64 = cSig64 - pSig64;
2375
if (STATUS(float_rounding_mode) == float_round_down) {
2378
return packFloat32(zSign, 0, 0);
2382
/* Normalize to put the explicit bit back into bit 62. */
2383
shiftcount = countLeadingZeros64(zSig64) - 1;
2384
zSig64 <<= shiftcount;
2387
shift64RightJamming(zSig64, 32, &zSig64);
2388
return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);
2392
/*----------------------------------------------------------------------------
2393
| Returns the square root of the single-precision floating-point value `a'.
2394
| The operation is performed according to the IEC/IEEE Standard for Binary
2395
| Floating-Point Arithmetic.
2396
*----------------------------------------------------------------------------*/
2398
float32 float32_sqrt( float32 a STATUS_PARAM )
2401
int_fast16_t aExp, zExp;
2402
uint32_t aSig, zSig;
2404
a = float32_squash_input_denormal(a STATUS_VAR);
2406
aSig = extractFloat32Frac( a );
2407
aExp = extractFloat32Exp( a );
2408
aSign = extractFloat32Sign( a );
2409
if ( aExp == 0xFF ) {
2410
if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2411
if ( ! aSign ) return a;
2412
float_raise( float_flag_invalid STATUS_VAR);
2413
return float32_default_nan;
2416
if ( ( aExp | aSig ) == 0 ) return a;
2417
float_raise( float_flag_invalid STATUS_VAR);
2418
return float32_default_nan;
2421
if ( aSig == 0 ) return float32_zero;
2422
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2424
zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2425
aSig = ( aSig | 0x00800000 )<<8;
2426
zSig = estimateSqrt32( aExp, aSig ) + 2;
2427
if ( ( zSig & 0x7F ) <= 5 ) {
2433
term = ( (uint64_t) zSig ) * zSig;
2434
rem = ( ( (uint64_t) aSig )<<32 ) - term;
2435
while ( (int64_t) rem < 0 ) {
2437
rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2439
zSig |= ( rem != 0 );
2441
shift32RightJamming( zSig, 1, &zSig );
2443
return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2447
/*----------------------------------------------------------------------------
2448
| Returns the binary exponential of the single-precision floating-point value
2449
| `a'. The operation is performed according to the IEC/IEEE Standard for
2450
| Binary Floating-Point Arithmetic.
2452
| Uses the following identities:
2454
| 1. -------------------------------------------------------------------------
2458
| 2. -------------------------------------------------------------------------
2461
| e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2463
*----------------------------------------------------------------------------*/
2465
static const float64 float32_exp2_coefficients[15] =
2467
const_float64( 0x3ff0000000000000ll ), /* 1 */
2468
const_float64( 0x3fe0000000000000ll ), /* 2 */
2469
const_float64( 0x3fc5555555555555ll ), /* 3 */
2470
const_float64( 0x3fa5555555555555ll ), /* 4 */
2471
const_float64( 0x3f81111111111111ll ), /* 5 */
2472
const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2473
const_float64( 0x3f2a01a01a01a01all ), /* 7 */
2474
const_float64( 0x3efa01a01a01a01all ), /* 8 */
2475
const_float64( 0x3ec71de3a556c734ll ), /* 9 */
2476
const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2477
const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2478
const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2479
const_float64( 0x3de6124613a86d09ll ), /* 13 */
2480
const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2481
const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2484
float32 float32_exp2( float32 a STATUS_PARAM )
2491
a = float32_squash_input_denormal(a STATUS_VAR);
2493
aSig = extractFloat32Frac( a );
2494
aExp = extractFloat32Exp( a );
2495
aSign = extractFloat32Sign( a );
2497
if ( aExp == 0xFF) {
2498
if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2499
return (aSign) ? float32_zero : a;
2502
if (aSig == 0) return float32_one;
2505
float_raise( float_flag_inexact STATUS_VAR);
2507
/* ******************************* */
2508
/* using float64 for approximation */
2509
/* ******************************* */
2510
x = float32_to_float64(a STATUS_VAR);
2511
x = float64_mul(x, float64_ln2 STATUS_VAR);
2515
for (i = 0 ; i < 15 ; i++) {
2518
f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2519
r = float64_add(r, f STATUS_VAR);
2521
xn = float64_mul(xn, x STATUS_VAR);
2524
return float64_to_float32(r, status);
2527
/*----------------------------------------------------------------------------
2528
| Returns the binary log of the single-precision floating-point value `a'.
2529
| The operation is performed according to the IEC/IEEE Standard for Binary
2530
| Floating-Point Arithmetic.
2531
*----------------------------------------------------------------------------*/
2532
float32 float32_log2( float32 a STATUS_PARAM )
2536
uint32_t aSig, zSig, i;
2538
a = float32_squash_input_denormal(a STATUS_VAR);
2539
aSig = extractFloat32Frac( a );
2540
aExp = extractFloat32Exp( a );
2541
aSign = extractFloat32Sign( a );
2544
if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2545
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2548
float_raise( float_flag_invalid STATUS_VAR);
2549
return float32_default_nan;
2551
if ( aExp == 0xFF ) {
2552
if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2561
for (i = 1 << 22; i > 0; i >>= 1) {
2562
aSig = ( (uint64_t)aSig * aSig ) >> 23;
2563
if ( aSig & 0x01000000 ) {
2572
return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2575
/*----------------------------------------------------------------------------
2576
| Returns 1 if the single-precision floating-point value `a' is equal to
2577
| the corresponding value `b', and 0 otherwise. The invalid exception is
2578
| raised if either operand is a NaN. Otherwise, the comparison is performed
2579
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2580
*----------------------------------------------------------------------------*/
2582
int float32_eq( float32 a, float32 b STATUS_PARAM )
2585
a = float32_squash_input_denormal(a STATUS_VAR);
2586
b = float32_squash_input_denormal(b STATUS_VAR);
2588
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2589
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2591
float_raise( float_flag_invalid STATUS_VAR);
2594
av = float32_val(a);
2595
bv = float32_val(b);
2596
return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2599
/*----------------------------------------------------------------------------
2600
| Returns 1 if the single-precision floating-point value `a' is less than
2601
| or equal to the corresponding value `b', and 0 otherwise. The invalid
2602
| exception is raised if either operand is a NaN. The comparison is performed
2603
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2604
*----------------------------------------------------------------------------*/
2606
int float32_le( float32 a, float32 b STATUS_PARAM )
2610
a = float32_squash_input_denormal(a STATUS_VAR);
2611
b = float32_squash_input_denormal(b STATUS_VAR);
2613
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2614
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2616
float_raise( float_flag_invalid STATUS_VAR);
2619
aSign = extractFloat32Sign( a );
2620
bSign = extractFloat32Sign( b );
2621
av = float32_val(a);
2622
bv = float32_val(b);
2623
if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2624
return ( av == bv ) || ( aSign ^ ( av < bv ) );
2628
/*----------------------------------------------------------------------------
2629
| Returns 1 if the single-precision floating-point value `a' is less than
2630
| the corresponding value `b', and 0 otherwise. The invalid exception is
2631
| raised if either operand is a NaN. The comparison is performed according
2632
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2633
*----------------------------------------------------------------------------*/
2635
int float32_lt( float32 a, float32 b STATUS_PARAM )
2639
a = float32_squash_input_denormal(a STATUS_VAR);
2640
b = float32_squash_input_denormal(b STATUS_VAR);
2642
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2643
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2645
float_raise( float_flag_invalid STATUS_VAR);
2648
aSign = extractFloat32Sign( a );
2649
bSign = extractFloat32Sign( b );
2650
av = float32_val(a);
2651
bv = float32_val(b);
2652
if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2653
return ( av != bv ) && ( aSign ^ ( av < bv ) );
2657
/*----------------------------------------------------------------------------
2658
| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2659
| be compared, and 0 otherwise. The invalid exception is raised if either
2660
| operand is a NaN. The comparison is performed according to the IEC/IEEE
2661
| Standard for Binary Floating-Point Arithmetic.
2662
*----------------------------------------------------------------------------*/
2664
int float32_unordered( float32 a, float32 b STATUS_PARAM )
2666
a = float32_squash_input_denormal(a STATUS_VAR);
2667
b = float32_squash_input_denormal(b STATUS_VAR);
2669
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2670
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2672
float_raise( float_flag_invalid STATUS_VAR);
2678
/*----------------------------------------------------------------------------
2679
| Returns 1 if the single-precision floating-point value `a' is equal to
2680
| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2681
| exception. The comparison is performed according to the IEC/IEEE Standard
2682
| for Binary Floating-Point Arithmetic.
2683
*----------------------------------------------------------------------------*/
2685
int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
2687
a = float32_squash_input_denormal(a STATUS_VAR);
2688
b = float32_squash_input_denormal(b STATUS_VAR);
2690
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2691
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2693
if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2694
float_raise( float_flag_invalid STATUS_VAR);
2698
return ( float32_val(a) == float32_val(b) ) ||
2699
( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2702
/*----------------------------------------------------------------------------
2703
| Returns 1 if the single-precision floating-point value `a' is less than or
2704
| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2705
| cause an exception. Otherwise, the comparison is performed according to the
2706
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2707
*----------------------------------------------------------------------------*/
2709
int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2713
a = float32_squash_input_denormal(a STATUS_VAR);
2714
b = float32_squash_input_denormal(b STATUS_VAR);
2716
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2717
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2719
if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2720
float_raise( float_flag_invalid STATUS_VAR);
2724
aSign = extractFloat32Sign( a );
2725
bSign = extractFloat32Sign( b );
2726
av = float32_val(a);
2727
bv = float32_val(b);
2728
if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2729
return ( av == bv ) || ( aSign ^ ( av < bv ) );
2733
/*----------------------------------------------------------------------------
2734
| Returns 1 if the single-precision floating-point value `a' is less than
2735
| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2736
| exception. Otherwise, the comparison is performed according to the IEC/IEEE
2737
| Standard for Binary Floating-Point Arithmetic.
2738
*----------------------------------------------------------------------------*/
2740
int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2744
a = float32_squash_input_denormal(a STATUS_VAR);
2745
b = float32_squash_input_denormal(b STATUS_VAR);
2747
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2748
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2750
if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2751
float_raise( float_flag_invalid STATUS_VAR);
2755
aSign = extractFloat32Sign( a );
2756
bSign = extractFloat32Sign( b );
2757
av = float32_val(a);
2758
bv = float32_val(b);
2759
if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2760
return ( av != bv ) && ( aSign ^ ( av < bv ) );
2764
/*----------------------------------------------------------------------------
2765
| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2766
| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
2767
| comparison is performed according to the IEC/IEEE Standard for Binary
2768
| Floating-Point Arithmetic.
2769
*----------------------------------------------------------------------------*/
2771
int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2773
a = float32_squash_input_denormal(a STATUS_VAR);
2774
b = float32_squash_input_denormal(b STATUS_VAR);
2776
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2777
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2779
if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2780
float_raise( float_flag_invalid STATUS_VAR);
2787
/*----------------------------------------------------------------------------
2788
| Returns the result of converting the double-precision floating-point value
2789
| `a' to the 32-bit two's complement integer format. The conversion is
2790
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2791
| Arithmetic---which means in particular that the conversion is rounded
2792
| according to the current rounding mode. If `a' is a NaN, the largest
2793
| positive integer is returned. Otherwise, if the conversion overflows, the
2794
| largest integer with the same sign as `a' is returned.
2795
*----------------------------------------------------------------------------*/
2797
int32 float64_to_int32( float64 a STATUS_PARAM )
2800
int_fast16_t aExp, shiftCount;
2802
a = float64_squash_input_denormal(a STATUS_VAR);
2804
aSig = extractFloat64Frac( a );
2805
aExp = extractFloat64Exp( a );
2806
aSign = extractFloat64Sign( a );
2807
if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2808
if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2809
shiftCount = 0x42C - aExp;
2810
if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2811
return roundAndPackInt32( aSign, aSig STATUS_VAR );
2815
/*----------------------------------------------------------------------------
2816
| Returns the result of converting the double-precision floating-point value
2817
| `a' to the 32-bit two's complement integer format. The conversion is
2818
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2819
| Arithmetic, except that the conversion is always rounded toward zero.
2820
| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2821
| the conversion overflows, the largest integer with the same sign as `a' is
2823
*----------------------------------------------------------------------------*/
2825
int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2828
int_fast16_t aExp, shiftCount;
2829
uint64_t aSig, savedASig;
2831
a = float64_squash_input_denormal(a STATUS_VAR);
2833
aSig = extractFloat64Frac( a );
2834
aExp = extractFloat64Exp( a );
2835
aSign = extractFloat64Sign( a );
2836
if ( 0x41E < aExp ) {
2837
if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2840
else if ( aExp < 0x3FF ) {
2841
if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2844
aSig |= LIT64( 0x0010000000000000 );
2845
shiftCount = 0x433 - aExp;
2847
aSig >>= shiftCount;
2849
if ( aSign ) z = - z;
2850
if ( ( z < 0 ) ^ aSign ) {
2852
float_raise( float_flag_invalid STATUS_VAR);
2853
return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2855
if ( ( aSig<<shiftCount ) != savedASig ) {
2856
STATUS(float_exception_flags) |= float_flag_inexact;
2862
/*----------------------------------------------------------------------------
2863
| Returns the result of converting the double-precision floating-point value
2864
| `a' to the 16-bit two's complement integer format. The conversion is
2865
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2866
| Arithmetic, except that the conversion is always rounded toward zero.
2867
| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2868
| the conversion overflows, the largest integer with the same sign as `a' is
2870
*----------------------------------------------------------------------------*/
2872
int_fast16_t float64_to_int16_round_to_zero(float64 a STATUS_PARAM)
2875
int_fast16_t aExp, shiftCount;
2876
uint64_t aSig, savedASig;
2879
aSig = extractFloat64Frac( a );
2880
aExp = extractFloat64Exp( a );
2881
aSign = extractFloat64Sign( a );
2882
if ( 0x40E < aExp ) {
2883
if ( ( aExp == 0x7FF ) && aSig ) {
2888
else if ( aExp < 0x3FF ) {
2889
if ( aExp || aSig ) {
2890
STATUS(float_exception_flags) |= float_flag_inexact;
2894
aSig |= LIT64( 0x0010000000000000 );
2895
shiftCount = 0x433 - aExp;
2897
aSig >>= shiftCount;
2902
if ( ( (int16_t)z < 0 ) ^ aSign ) {
2904
float_raise( float_flag_invalid STATUS_VAR);
2905
return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
2907
if ( ( aSig<<shiftCount ) != savedASig ) {
2908
STATUS(float_exception_flags) |= float_flag_inexact;
2913
/*----------------------------------------------------------------------------
2914
| Returns the result of converting the double-precision floating-point value
2915
| `a' to the 64-bit two's complement integer format. The conversion is
2916
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2917
| Arithmetic---which means in particular that the conversion is rounded
2918
| according to the current rounding mode. If `a' is a NaN, the largest
2919
| positive integer is returned. Otherwise, if the conversion overflows, the
2920
| largest integer with the same sign as `a' is returned.
2921
*----------------------------------------------------------------------------*/
2923
int64 float64_to_int64( float64 a STATUS_PARAM )
2926
int_fast16_t aExp, shiftCount;
2927
uint64_t aSig, aSigExtra;
2928
a = float64_squash_input_denormal(a STATUS_VAR);
2930
aSig = extractFloat64Frac( a );
2931
aExp = extractFloat64Exp( a );
2932
aSign = extractFloat64Sign( a );
2933
if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2934
shiftCount = 0x433 - aExp;
2935
if ( shiftCount <= 0 ) {
2936
if ( 0x43E < aExp ) {
2937
float_raise( float_flag_invalid STATUS_VAR);
2939
|| ( ( aExp == 0x7FF )
2940
&& ( aSig != LIT64( 0x0010000000000000 ) ) )
2942
return LIT64( 0x7FFFFFFFFFFFFFFF );
2944
return (int64_t) LIT64( 0x8000000000000000 );
2947
aSig <<= - shiftCount;
2950
shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2952
return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2956
/*----------------------------------------------------------------------------
2957
| Returns the result of converting the double-precision floating-point value
2958
| `a' to the 64-bit two's complement integer format. The conversion is
2959
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2960
| Arithmetic, except that the conversion is always rounded toward zero.
2961
| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2962
| the conversion overflows, the largest integer with the same sign as `a' is
2964
*----------------------------------------------------------------------------*/
2966
int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2969
int_fast16_t aExp, shiftCount;
2972
a = float64_squash_input_denormal(a STATUS_VAR);
2974
aSig = extractFloat64Frac( a );
2975
aExp = extractFloat64Exp( a );
2976
aSign = extractFloat64Sign( a );
2977
if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2978
shiftCount = aExp - 0x433;
2979
if ( 0 <= shiftCount ) {
2980
if ( 0x43E <= aExp ) {
2981
if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2982
float_raise( float_flag_invalid STATUS_VAR);
2984
|| ( ( aExp == 0x7FF )
2985
&& ( aSig != LIT64( 0x0010000000000000 ) ) )
2987
return LIT64( 0x7FFFFFFFFFFFFFFF );
2990
return (int64_t) LIT64( 0x8000000000000000 );
2992
z = aSig<<shiftCount;
2995
if ( aExp < 0x3FE ) {
2996
if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2999
z = aSig>>( - shiftCount );
3000
if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3001
STATUS(float_exception_flags) |= float_flag_inexact;
3004
if ( aSign ) z = - z;
3009
/*----------------------------------------------------------------------------
3010
| Returns the result of converting the double-precision floating-point value
3011
| `a' to the single-precision floating-point format. The conversion is
3012
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3014
*----------------------------------------------------------------------------*/
3016
float32 float64_to_float32( float64 a STATUS_PARAM )
3022
a = float64_squash_input_denormal(a STATUS_VAR);
3024
aSig = extractFloat64Frac( a );
3025
aExp = extractFloat64Exp( a );
3026
aSign = extractFloat64Sign( a );
3027
if ( aExp == 0x7FF ) {
3028
if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3029
return packFloat32( aSign, 0xFF, 0 );
3031
shift64RightJamming( aSig, 22, &aSig );
3033
if ( aExp || zSig ) {
3037
return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
3042
/*----------------------------------------------------------------------------
3043
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3044
| half-precision floating-point value, returning the result. After being
3045
| shifted into the proper positions, the three fields are simply added
3046
| together to form the result. This means that any integer portion of `zSig'
3047
| will be added into the exponent. Since a properly normalized significand
3048
| will have an integer portion equal to 1, the `zExp' input should be 1 less
3049
| than the desired result exponent whenever `zSig' is a complete, normalized
3051
*----------------------------------------------------------------------------*/
3052
static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
3054
return make_float16(
3055
(((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3058
/* Half precision floats come in two formats: standard IEEE and "ARM" format.
3059
The latter gains extra exponent range by omitting the NaN/Inf encodings. */
3061
float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
3067
aSign = extractFloat16Sign(a);
3068
aExp = extractFloat16Exp(a);
3069
aSig = extractFloat16Frac(a);
3071
if (aExp == 0x1f && ieee) {
3073
return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3075
return packFloat32(aSign, 0xff, 0);
3081
return packFloat32(aSign, 0, 0);
3084
shiftCount = countLeadingZeros32( aSig ) - 21;
3085
aSig = aSig << shiftCount;
3088
return packFloat32( aSign, aExp + 0x70, aSig << 13);
3091
float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
3099
int maxexp = ieee ? 15 : 16;
3100
bool rounding_bumps_exp;
3101
bool is_tiny = false;
3103
a = float32_squash_input_denormal(a STATUS_VAR);
3105
aSig = extractFloat32Frac( a );
3106
aExp = extractFloat32Exp( a );
3107
aSign = extractFloat32Sign( a );
3108
if ( aExp == 0xFF ) {
3110
/* Input is a NaN */
3112
float_raise(float_flag_invalid STATUS_VAR);
3113
return packFloat16(aSign, 0, 0);
3115
return commonNaNToFloat16(
3116
float32ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3120
float_raise(float_flag_invalid STATUS_VAR);
3121
return packFloat16(aSign, 0x1f, 0x3ff);
3123
return packFloat16(aSign, 0x1f, 0);
3125
if (aExp == 0 && aSig == 0) {
3126
return packFloat16(aSign, 0, 0);
3128
/* Decimal point between bits 22 and 23. Note that we add the 1 bit
3129
* even if the input is denormal; however this is harmless because
3130
* the largest possible single-precision denormal is still smaller
3131
* than the smallest representable half-precision denormal, and so we
3132
* will end up ignoring aSig and returning via the "always return zero"
3137
/* Calculate the mask of bits of the mantissa which are not
3138
* representable in half-precision and will be lost.
3141
/* Will be denormal in halfprec */
3147
/* Normal number in halfprec */
3151
roundingMode = STATUS(float_rounding_mode);
3152
switch (roundingMode) {
3153
case float_round_nearest_even:
3154
increment = (mask + 1) >> 1;
3155
if ((aSig & mask) == increment) {
3156
increment = aSig & (increment << 1);
3159
case float_round_up:
3160
increment = aSign ? 0 : mask;
3162
case float_round_down:
3163
increment = aSign ? mask : 0;
3165
default: /* round_to_zero */
3170
rounding_bumps_exp = (aSig + increment >= 0x01000000);
3172
if (aExp > maxexp || (aExp == maxexp && rounding_bumps_exp)) {
3174
float_raise(float_flag_overflow | float_flag_inexact STATUS_VAR);
3175
return packFloat16(aSign, 0x1f, 0);
3177
float_raise(float_flag_invalid STATUS_VAR);
3178
return packFloat16(aSign, 0x1f, 0x3ff);
3183
/* Note that flush-to-zero does not affect half-precision results */
3185
(STATUS(float_detect_tininess) == float_tininess_before_rounding)
3187
|| (!rounding_bumps_exp);
3190
float_raise(float_flag_inexact STATUS_VAR);
3192
float_raise(float_flag_underflow STATUS_VAR);
3197
if (rounding_bumps_exp) {
3203
return packFloat16(aSign, 0, 0);
3206
aSig >>= -14 - aExp;
3209
return packFloat16(aSign, aExp + 14, aSig >> 13);
3212
/*----------------------------------------------------------------------------
3213
| Returns the result of converting the double-precision floating-point value
3214
| `a' to the extended double-precision floating-point format. The conversion
3215
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3217
*----------------------------------------------------------------------------*/
3219
floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
3225
a = float64_squash_input_denormal(a STATUS_VAR);
3226
aSig = extractFloat64Frac( a );
3227
aExp = extractFloat64Exp( a );
3228
aSign = extractFloat64Sign( a );
3229
if ( aExp == 0x7FF ) {
3230
if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3231
return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3234
if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3235
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3239
aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3243
/*----------------------------------------------------------------------------
3244
| Returns the result of converting the double-precision floating-point value
3245
| `a' to the quadruple-precision floating-point format. The conversion is
3246
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3248
*----------------------------------------------------------------------------*/
3250
float128 float64_to_float128( float64 a STATUS_PARAM )
3254
uint64_t aSig, zSig0, zSig1;
3256
a = float64_squash_input_denormal(a STATUS_VAR);
3257
aSig = extractFloat64Frac( a );
3258
aExp = extractFloat64Exp( a );
3259
aSign = extractFloat64Sign( a );
3260
if ( aExp == 0x7FF ) {
3261
if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3262
return packFloat128( aSign, 0x7FFF, 0, 0 );
3265
if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3266
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3269
shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3270
return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3274
/*----------------------------------------------------------------------------
3275
| Rounds the double-precision floating-point value `a' to an integer, and
3276
| returns the result as a double-precision floating-point value. The
3277
| operation is performed according to the IEC/IEEE Standard for Binary
3278
| Floating-Point Arithmetic.
3279
*----------------------------------------------------------------------------*/
3281
float64 float64_round_to_int( float64 a STATUS_PARAM )
3285
uint64_t lastBitMask, roundBitsMask;
3288
a = float64_squash_input_denormal(a STATUS_VAR);
3290
aExp = extractFloat64Exp( a );
3291
if ( 0x433 <= aExp ) {
3292
if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3293
return propagateFloat64NaN( a, a STATUS_VAR );
3297
if ( aExp < 0x3FF ) {
3298
if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3299
STATUS(float_exception_flags) |= float_flag_inexact;
3300
aSign = extractFloat64Sign( a );
3301
switch ( STATUS(float_rounding_mode) ) {
3302
case float_round_nearest_even:
3303
if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3304
return packFloat64( aSign, 0x3FF, 0 );
3307
case float_round_down:
3308
return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3309
case float_round_up:
3310
return make_float64(
3311
aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3313
return packFloat64( aSign, 0, 0 );
3316
lastBitMask <<= 0x433 - aExp;
3317
roundBitsMask = lastBitMask - 1;
3319
roundingMode = STATUS(float_rounding_mode);
3320
if ( roundingMode == float_round_nearest_even ) {
3321
z += lastBitMask>>1;
3322
if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3324
else if ( roundingMode != float_round_to_zero ) {
3325
if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
3329
z &= ~ roundBitsMask;
3330
if ( z != float64_val(a) )
3331
STATUS(float_exception_flags) |= float_flag_inexact;
3332
return make_float64(z);
3336
float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3340
oldmode = STATUS(float_rounding_mode);
3341
STATUS(float_rounding_mode) = float_round_to_zero;
3342
res = float64_round_to_int(a STATUS_VAR);
3343
STATUS(float_rounding_mode) = oldmode;
3347
/*----------------------------------------------------------------------------
3348
| Returns the result of adding the absolute values of the double-precision
3349
| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3350
| before being returned. `zSign' is ignored if the result is a NaN.
3351
| The addition is performed according to the IEC/IEEE Standard for Binary
3352
| Floating-Point Arithmetic.
3353
*----------------------------------------------------------------------------*/
3355
static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3357
int_fast16_t aExp, bExp, zExp;
3358
uint64_t aSig, bSig, zSig;
3359
int_fast16_t expDiff;
3361
aSig = extractFloat64Frac( a );
3362
aExp = extractFloat64Exp( a );
3363
bSig = extractFloat64Frac( b );
3364
bExp = extractFloat64Exp( b );
3365
expDiff = aExp - bExp;
3368
if ( 0 < expDiff ) {
3369
if ( aExp == 0x7FF ) {
3370
if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3377
bSig |= LIT64( 0x2000000000000000 );
3379
shift64RightJamming( bSig, expDiff, &bSig );
3382
else if ( expDiff < 0 ) {
3383
if ( bExp == 0x7FF ) {
3384
if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3385
return packFloat64( zSign, 0x7FF, 0 );
3391
aSig |= LIT64( 0x2000000000000000 );
3393
shift64RightJamming( aSig, - expDiff, &aSig );
3397
if ( aExp == 0x7FF ) {
3398
if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3402
if (STATUS(flush_to_zero)) {
3404
float_raise(float_flag_output_denormal STATUS_VAR);
3406
return packFloat64(zSign, 0, 0);
3408
return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3410
zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3414
aSig |= LIT64( 0x2000000000000000 );
3415
zSig = ( aSig + bSig )<<1;
3417
if ( (int64_t) zSig < 0 ) {
3422
return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3426
/*----------------------------------------------------------------------------
3427
| Returns the result of subtracting the absolute values of the double-
3428
| precision floating-point values `a' and `b'. If `zSign' is 1, the
3429
| difference is negated before being returned. `zSign' is ignored if the
3430
| result is a NaN. The subtraction is performed according to the IEC/IEEE
3431
| Standard for Binary Floating-Point Arithmetic.
3432
*----------------------------------------------------------------------------*/
3434
static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3436
int_fast16_t aExp, bExp, zExp;
3437
uint64_t aSig, bSig, zSig;
3438
int_fast16_t expDiff;
3440
aSig = extractFloat64Frac( a );
3441
aExp = extractFloat64Exp( a );
3442
bSig = extractFloat64Frac( b );
3443
bExp = extractFloat64Exp( b );
3444
expDiff = aExp - bExp;
3447
if ( 0 < expDiff ) goto aExpBigger;
3448
if ( expDiff < 0 ) goto bExpBigger;
3449
if ( aExp == 0x7FF ) {
3450
if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3451
float_raise( float_flag_invalid STATUS_VAR);
3452
return float64_default_nan;
3458
if ( bSig < aSig ) goto aBigger;
3459
if ( aSig < bSig ) goto bBigger;
3460
return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3462
if ( bExp == 0x7FF ) {
3463
if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3464
return packFloat64( zSign ^ 1, 0x7FF, 0 );
3470
aSig |= LIT64( 0x4000000000000000 );
3472
shift64RightJamming( aSig, - expDiff, &aSig );
3473
bSig |= LIT64( 0x4000000000000000 );
3478
goto normalizeRoundAndPack;
3480
if ( aExp == 0x7FF ) {
3481
if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3488
bSig |= LIT64( 0x4000000000000000 );
3490
shift64RightJamming( bSig, expDiff, &bSig );
3491
aSig |= LIT64( 0x4000000000000000 );
3495
normalizeRoundAndPack:
3497
return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3501
/*----------------------------------------------------------------------------
3502
| Returns the result of adding the double-precision floating-point values `a'
3503
| and `b'. The operation is performed according to the IEC/IEEE Standard for
3504
| Binary Floating-Point Arithmetic.
3505
*----------------------------------------------------------------------------*/
3507
float64 float64_add( float64 a, float64 b STATUS_PARAM )
3510
a = float64_squash_input_denormal(a STATUS_VAR);
3511
b = float64_squash_input_denormal(b STATUS_VAR);
3513
aSign = extractFloat64Sign( a );
3514
bSign = extractFloat64Sign( b );
3515
if ( aSign == bSign ) {
3516
return addFloat64Sigs( a, b, aSign STATUS_VAR );
3519
return subFloat64Sigs( a, b, aSign STATUS_VAR );
3524
/*----------------------------------------------------------------------------
3525
| Returns the result of subtracting the double-precision floating-point values
3526
| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3527
| for Binary Floating-Point Arithmetic.
3528
*----------------------------------------------------------------------------*/
3530
float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3533
a = float64_squash_input_denormal(a STATUS_VAR);
3534
b = float64_squash_input_denormal(b STATUS_VAR);
3536
aSign = extractFloat64Sign( a );
3537
bSign = extractFloat64Sign( b );
3538
if ( aSign == bSign ) {
3539
return subFloat64Sigs( a, b, aSign STATUS_VAR );
3542
return addFloat64Sigs( a, b, aSign STATUS_VAR );
3547
/*----------------------------------------------------------------------------
3548
| Returns the result of multiplying the double-precision floating-point values
3549
| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3550
| for Binary Floating-Point Arithmetic.
3551
*----------------------------------------------------------------------------*/
3553
float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3555
flag aSign, bSign, zSign;
3556
int_fast16_t aExp, bExp, zExp;
3557
uint64_t aSig, bSig, zSig0, zSig1;
3559
a = float64_squash_input_denormal(a STATUS_VAR);
3560
b = float64_squash_input_denormal(b STATUS_VAR);
3562
aSig = extractFloat64Frac( a );
3563
aExp = extractFloat64Exp( a );
3564
aSign = extractFloat64Sign( a );
3565
bSig = extractFloat64Frac( b );
3566
bExp = extractFloat64Exp( b );
3567
bSign = extractFloat64Sign( b );
3568
zSign = aSign ^ bSign;
3569
if ( aExp == 0x7FF ) {
3570
if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3571
return propagateFloat64NaN( a, b STATUS_VAR );
3573
if ( ( bExp | bSig ) == 0 ) {
3574
float_raise( float_flag_invalid STATUS_VAR);
3575
return float64_default_nan;
3577
return packFloat64( zSign, 0x7FF, 0 );
3579
if ( bExp == 0x7FF ) {
3580
if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3581
if ( ( aExp | aSig ) == 0 ) {
3582
float_raise( float_flag_invalid STATUS_VAR);
3583
return float64_default_nan;
3585
return packFloat64( zSign, 0x7FF, 0 );
3588
if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3589
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3592
if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3593
normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3595
zExp = aExp + bExp - 0x3FF;
3596
aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3597
bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3598
mul64To128( aSig, bSig, &zSig0, &zSig1 );
3599
zSig0 |= ( zSig1 != 0 );
3600
if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
3604
return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3608
/*----------------------------------------------------------------------------
3609
| Returns the result of dividing the double-precision floating-point value `a'
3610
| by the corresponding value `b'. The operation is performed according to
3611
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3612
*----------------------------------------------------------------------------*/
3614
float64 float64_div( float64 a, float64 b STATUS_PARAM )
3616
flag aSign, bSign, zSign;
3617
int_fast16_t aExp, bExp, zExp;
3618
uint64_t aSig, bSig, zSig;
3619
uint64_t rem0, rem1;
3620
uint64_t term0, term1;
3621
a = float64_squash_input_denormal(a STATUS_VAR);
3622
b = float64_squash_input_denormal(b STATUS_VAR);
3624
aSig = extractFloat64Frac( a );
3625
aExp = extractFloat64Exp( a );
3626
aSign = extractFloat64Sign( a );
3627
bSig = extractFloat64Frac( b );
3628
bExp = extractFloat64Exp( b );
3629
bSign = extractFloat64Sign( b );
3630
zSign = aSign ^ bSign;
3631
if ( aExp == 0x7FF ) {
3632
if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3633
if ( bExp == 0x7FF ) {
3634
if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3635
float_raise( float_flag_invalid STATUS_VAR);
3636
return float64_default_nan;
3638
return packFloat64( zSign, 0x7FF, 0 );
3640
if ( bExp == 0x7FF ) {
3641
if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3642
return packFloat64( zSign, 0, 0 );
3646
if ( ( aExp | aSig ) == 0 ) {
3647
float_raise( float_flag_invalid STATUS_VAR);
3648
return float64_default_nan;
3650
float_raise( float_flag_divbyzero STATUS_VAR);
3651
return packFloat64( zSign, 0x7FF, 0 );
3653
normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3656
if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3657
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3659
zExp = aExp - bExp + 0x3FD;
3660
aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3661
bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3662
if ( bSig <= ( aSig + aSig ) ) {
3666
zSig = estimateDiv128To64( aSig, 0, bSig );
3667
if ( ( zSig & 0x1FF ) <= 2 ) {
3668
mul64To128( bSig, zSig, &term0, &term1 );
3669
sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3670
while ( (int64_t) rem0 < 0 ) {
3672
add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3674
zSig |= ( rem1 != 0 );
3676
return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3680
/*----------------------------------------------------------------------------
3681
| Returns the remainder of the double-precision floating-point value `a'
3682
| with respect to the corresponding value `b'. The operation is performed
3683
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3684
*----------------------------------------------------------------------------*/
3686
float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3689
int_fast16_t aExp, bExp, expDiff;
3690
uint64_t aSig, bSig;
3691
uint64_t q, alternateASig;
3694
a = float64_squash_input_denormal(a STATUS_VAR);
3695
b = float64_squash_input_denormal(b STATUS_VAR);
3696
aSig = extractFloat64Frac( a );
3697
aExp = extractFloat64Exp( a );
3698
aSign = extractFloat64Sign( a );
3699
bSig = extractFloat64Frac( b );
3700
bExp = extractFloat64Exp( b );
3701
if ( aExp == 0x7FF ) {
3702
if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3703
return propagateFloat64NaN( a, b STATUS_VAR );
3705
float_raise( float_flag_invalid STATUS_VAR);
3706
return float64_default_nan;
3708
if ( bExp == 0x7FF ) {
3709
if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3714
float_raise( float_flag_invalid STATUS_VAR);
3715
return float64_default_nan;
3717
normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3720
if ( aSig == 0 ) return a;
3721
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3723
expDiff = aExp - bExp;
3724
aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3725
bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3726
if ( expDiff < 0 ) {
3727
if ( expDiff < -1 ) return a;
3730
q = ( bSig <= aSig );
3731
if ( q ) aSig -= bSig;
3733
while ( 0 < expDiff ) {
3734
q = estimateDiv128To64( aSig, 0, bSig );
3735
q = ( 2 < q ) ? q - 2 : 0;
3736
aSig = - ( ( bSig>>2 ) * q );
3740
if ( 0 < expDiff ) {
3741
q = estimateDiv128To64( aSig, 0, bSig );
3742
q = ( 2 < q ) ? q - 2 : 0;
3745
aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3752
alternateASig = aSig;
3755
} while ( 0 <= (int64_t) aSig );
3756
sigMean = aSig + alternateASig;
3757
if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3758
aSig = alternateASig;
3760
zSign = ( (int64_t) aSig < 0 );
3761
if ( zSign ) aSig = - aSig;
3762
return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3766
/*----------------------------------------------------------------------------
3767
| Returns the result of multiplying the double-precision floating-point values
3768
| `a' and `b' then adding 'c', with no intermediate rounding step after the
3769
| multiplication. The operation is performed according to the IEC/IEEE
3770
| Standard for Binary Floating-Point Arithmetic 754-2008.
3771
| The flags argument allows the caller to select negation of the
3772
| addend, the intermediate product, or the final result. (The difference
3773
| between this and having the caller do a separate negation is that negating
3774
| externally will flip the sign bit on NaNs.)
3775
*----------------------------------------------------------------------------*/
3777
float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM)
3779
flag aSign, bSign, cSign, zSign;
3780
int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
3781
uint64_t aSig, bSig, cSig;
3782
flag pInf, pZero, pSign;
3783
uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
3785
flag signflip, infzero;
3787
a = float64_squash_input_denormal(a STATUS_VAR);
3788
b = float64_squash_input_denormal(b STATUS_VAR);
3789
c = float64_squash_input_denormal(c STATUS_VAR);
3790
aSig = extractFloat64Frac(a);
3791
aExp = extractFloat64Exp(a);
3792
aSign = extractFloat64Sign(a);
3793
bSig = extractFloat64Frac(b);
3794
bExp = extractFloat64Exp(b);
3795
bSign = extractFloat64Sign(b);
3796
cSig = extractFloat64Frac(c);
3797
cExp = extractFloat64Exp(c);
3798
cSign = extractFloat64Sign(c);
3800
infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
3801
(aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
3803
/* It is implementation-defined whether the cases of (0,inf,qnan)
3804
* and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
3805
* they return if they do), so we have to hand this information
3806
* off to the target-specific pick-a-NaN routine.
3808
if (((aExp == 0x7ff) && aSig) ||
3809
((bExp == 0x7ff) && bSig) ||
3810
((cExp == 0x7ff) && cSig)) {
3811
return propagateFloat64MulAddNaN(a, b, c, infzero STATUS_VAR);
3815
float_raise(float_flag_invalid STATUS_VAR);
3816
return float64_default_nan;
3819
if (flags & float_muladd_negate_c) {
3823
signflip = (flags & float_muladd_negate_result) ? 1 : 0;
3825
/* Work out the sign and type of the product */
3826
pSign = aSign ^ bSign;
3827
if (flags & float_muladd_negate_product) {
3830
pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
3831
pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
3833
if (cExp == 0x7ff) {
3834
if (pInf && (pSign ^ cSign)) {
3835
/* addition of opposite-signed infinities => InvalidOperation */
3836
float_raise(float_flag_invalid STATUS_VAR);
3837
return float64_default_nan;
3839
/* Otherwise generate an infinity of the same sign */
3840
return packFloat64(cSign ^ signflip, 0x7ff, 0);
3844
return packFloat64(pSign ^ signflip, 0x7ff, 0);
3850
/* Adding two exact zeroes */
3851
if (pSign == cSign) {
3853
} else if (STATUS(float_rounding_mode) == float_round_down) {
3858
return packFloat64(zSign ^ signflip, 0, 0);
3860
/* Exact zero plus a denorm */
3861
if (STATUS(flush_to_zero)) {
3862
float_raise(float_flag_output_denormal STATUS_VAR);
3863
return packFloat64(cSign ^ signflip, 0, 0);
3866
/* Zero plus something non-zero : just return the something */
3867
return packFloat64(cSign ^ signflip, cExp, cSig);
3871
normalizeFloat64Subnormal(aSig, &aExp, &aSig);
3874
normalizeFloat64Subnormal(bSig, &bExp, &bSig);
3877
/* Calculate the actual result a * b + c */
3879
/* Multiply first; this is easy. */
3880
/* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
3881
* because we want the true exponent, not the "one-less-than"
3882
* flavour that roundAndPackFloat64() takes.
3884
pExp = aExp + bExp - 0x3fe;
3885
aSig = (aSig | LIT64(0x0010000000000000))<<10;
3886
bSig = (bSig | LIT64(0x0010000000000000))<<11;
3887
mul64To128(aSig, bSig, &pSig0, &pSig1);
3888
if ((int64_t)(pSig0 << 1) >= 0) {
3889
shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
3893
zSign = pSign ^ signflip;
3895
/* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
3896
* bit in position 126.
3900
/* Throw out the special case of c being an exact zero now */
3901
shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
3902
return roundAndPackFloat64(zSign, pExp - 1,
3905
normalizeFloat64Subnormal(cSig, &cExp, &cSig);
3908
/* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
3909
* significand of the addend, with the explicit bit in position 126.
3911
cSig0 = cSig << (126 - 64 - 52);
3913
cSig0 |= LIT64(0x4000000000000000);
3914
expDiff = pExp - cExp;
3916
if (pSign == cSign) {
3919
/* scale c to match p */
3920
shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
3922
} else if (expDiff < 0) {
3923
/* scale p to match c */
3924
shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
3927
/* no scaling needed */
3930
/* Add significands and make sure explicit bit ends up in posn 126 */
3931
add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3932
if ((int64_t)zSig0 < 0) {
3933
shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
3937
shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
3938
return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR);
3942
shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
3943
sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3945
} else if (expDiff < 0) {
3946
shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
3947
sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
3952
if (lt128(cSig0, cSig1, pSig0, pSig1)) {
3953
sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3954
} else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
3955
sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
3960
if (STATUS(float_rounding_mode) == float_round_down) {
3963
return packFloat64(zSign, 0, 0);
3967
/* Do the equivalent of normalizeRoundAndPackFloat64() but
3968
* starting with the significand in a pair of uint64_t.
3971
shiftcount = countLeadingZeros64(zSig0) - 1;
3972
shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
3978
shiftcount = countLeadingZeros64(zSig1);
3979
if (shiftcount == 0) {
3980
zSig0 = (zSig1 >> 1) | (zSig1 & 1);
3984
zSig0 = zSig1 << shiftcount;
3985
zExp -= (shiftcount + 64);
3988
return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR);
3992
/*----------------------------------------------------------------------------
3993
| Returns the square root of the double-precision floating-point value `a'.
3994
| The operation is performed according to the IEC/IEEE Standard for Binary
3995
| Floating-Point Arithmetic.
3996
*----------------------------------------------------------------------------*/
3998
float64 float64_sqrt( float64 a STATUS_PARAM )
4001
int_fast16_t aExp, zExp;
4002
uint64_t aSig, zSig, doubleZSig;
4003
uint64_t rem0, rem1, term0, term1;
4004
a = float64_squash_input_denormal(a STATUS_VAR);
4006
aSig = extractFloat64Frac( a );
4007
aExp = extractFloat64Exp( a );
4008
aSign = extractFloat64Sign( a );
4009
if ( aExp == 0x7FF ) {
4010
if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
4011
if ( ! aSign ) return a;
4012
float_raise( float_flag_invalid STATUS_VAR);
4013
return float64_default_nan;
4016
if ( ( aExp | aSig ) == 0 ) return a;
4017
float_raise( float_flag_invalid STATUS_VAR);
4018
return float64_default_nan;
4021
if ( aSig == 0 ) return float64_zero;
4022
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4024
zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4025
aSig |= LIT64( 0x0010000000000000 );
4026
zSig = estimateSqrt32( aExp, aSig>>21 );
4027
aSig <<= 9 - ( aExp & 1 );
4028
zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4029
if ( ( zSig & 0x1FF ) <= 5 ) {
4030
doubleZSig = zSig<<1;
4031
mul64To128( zSig, zSig, &term0, &term1 );
4032
sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4033
while ( (int64_t) rem0 < 0 ) {
4036
add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4038
zSig |= ( ( rem0 | rem1 ) != 0 );
4040
return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
4044
/*----------------------------------------------------------------------------
4045
| Returns the binary log of the double-precision floating-point value `a'.
4046
| The operation is performed according to the IEC/IEEE Standard for Binary
4047
| Floating-Point Arithmetic.
4048
*----------------------------------------------------------------------------*/
4049
float64 float64_log2( float64 a STATUS_PARAM )
4053
uint64_t aSig, aSig0, aSig1, zSig, i;
4054
a = float64_squash_input_denormal(a STATUS_VAR);
4056
aSig = extractFloat64Frac( a );
4057
aExp = extractFloat64Exp( a );
4058
aSign = extractFloat64Sign( a );
4061
if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4062
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4065
float_raise( float_flag_invalid STATUS_VAR);
4066
return float64_default_nan;
4068
if ( aExp == 0x7FF ) {
4069
if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
4074
aSig |= LIT64( 0x0010000000000000 );
4076
zSig = (uint64_t)aExp << 52;
4077
for (i = 1LL << 51; i > 0; i >>= 1) {
4078
mul64To128( aSig, aSig, &aSig0, &aSig1 );
4079
aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4080
if ( aSig & LIT64( 0x0020000000000000 ) ) {
4088
return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
4091
/*----------------------------------------------------------------------------
4092
| Returns 1 if the double-precision floating-point value `a' is equal to the
4093
| corresponding value `b', and 0 otherwise. The invalid exception is raised
4094
| if either operand is a NaN. Otherwise, the comparison is performed
4095
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4096
*----------------------------------------------------------------------------*/
4098
int float64_eq( float64 a, float64 b STATUS_PARAM )
4101
a = float64_squash_input_denormal(a STATUS_VAR);
4102
b = float64_squash_input_denormal(b STATUS_VAR);
4104
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4105
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4107
float_raise( float_flag_invalid STATUS_VAR);
4110
av = float64_val(a);
4111
bv = float64_val(b);
4112
return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4116
/*----------------------------------------------------------------------------
4117
| Returns 1 if the double-precision floating-point value `a' is less than or
4118
| equal to the corresponding value `b', and 0 otherwise. The invalid
4119
| exception is raised if either operand is a NaN. The comparison is performed
4120
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4121
*----------------------------------------------------------------------------*/
4123
int float64_le( float64 a, float64 b STATUS_PARAM )
4127
a = float64_squash_input_denormal(a STATUS_VAR);
4128
b = float64_squash_input_denormal(b STATUS_VAR);
4130
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4131
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4133
float_raise( float_flag_invalid STATUS_VAR);
4136
aSign = extractFloat64Sign( a );
4137
bSign = extractFloat64Sign( b );
4138
av = float64_val(a);
4139
bv = float64_val(b);
4140
if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4141
return ( av == bv ) || ( aSign ^ ( av < bv ) );
4145
/*----------------------------------------------------------------------------
4146
| Returns 1 if the double-precision floating-point value `a' is less than
4147
| the corresponding value `b', and 0 otherwise. The invalid exception is
4148
| raised if either operand is a NaN. The comparison is performed according
4149
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4150
*----------------------------------------------------------------------------*/
4152
int float64_lt( float64 a, float64 b STATUS_PARAM )
4157
a = float64_squash_input_denormal(a STATUS_VAR);
4158
b = float64_squash_input_denormal(b STATUS_VAR);
4159
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4160
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4162
float_raise( float_flag_invalid STATUS_VAR);
4165
aSign = extractFloat64Sign( a );
4166
bSign = extractFloat64Sign( b );
4167
av = float64_val(a);
4168
bv = float64_val(b);
4169
if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4170
return ( av != bv ) && ( aSign ^ ( av < bv ) );
4174
/*----------------------------------------------------------------------------
4175
| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4176
| be compared, and 0 otherwise. The invalid exception is raised if either
4177
| operand is a NaN. The comparison is performed according to the IEC/IEEE
4178
| Standard for Binary Floating-Point Arithmetic.
4179
*----------------------------------------------------------------------------*/
4181
int float64_unordered( float64 a, float64 b STATUS_PARAM )
4183
a = float64_squash_input_denormal(a STATUS_VAR);
4184
b = float64_squash_input_denormal(b STATUS_VAR);
4186
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4187
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4189
float_raise( float_flag_invalid STATUS_VAR);
4195
/*----------------------------------------------------------------------------
4196
| Returns 1 if the double-precision floating-point value `a' is equal to the
4197
| corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4198
| exception.The comparison is performed according to the IEC/IEEE Standard
4199
| for Binary Floating-Point Arithmetic.
4200
*----------------------------------------------------------------------------*/
4202
int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
4205
a = float64_squash_input_denormal(a STATUS_VAR);
4206
b = float64_squash_input_denormal(b STATUS_VAR);
4208
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4209
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4211
if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4212
float_raise( float_flag_invalid STATUS_VAR);
4216
av = float64_val(a);
4217
bv = float64_val(b);
4218
return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4222
/*----------------------------------------------------------------------------
4223
| Returns 1 if the double-precision floating-point value `a' is less than or
4224
| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4225
| cause an exception. Otherwise, the comparison is performed according to the
4226
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4227
*----------------------------------------------------------------------------*/
4229
int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
4233
a = float64_squash_input_denormal(a STATUS_VAR);
4234
b = float64_squash_input_denormal(b STATUS_VAR);
4236
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4237
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4239
if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4240
float_raise( float_flag_invalid STATUS_VAR);
4244
aSign = extractFloat64Sign( a );
4245
bSign = extractFloat64Sign( b );
4246
av = float64_val(a);
4247
bv = float64_val(b);
4248
if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4249
return ( av == bv ) || ( aSign ^ ( av < bv ) );
4253
/*----------------------------------------------------------------------------
4254
| Returns 1 if the double-precision floating-point value `a' is less than
4255
| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4256
| exception. Otherwise, the comparison is performed according to the IEC/IEEE
4257
| Standard for Binary Floating-Point Arithmetic.
4258
*----------------------------------------------------------------------------*/
4260
int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
4264
a = float64_squash_input_denormal(a STATUS_VAR);
4265
b = float64_squash_input_denormal(b STATUS_VAR);
4267
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4268
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4270
if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4271
float_raise( float_flag_invalid STATUS_VAR);
4275
aSign = extractFloat64Sign( a );
4276
bSign = extractFloat64Sign( b );
4277
av = float64_val(a);
4278
bv = float64_val(b);
4279
if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4280
return ( av != bv ) && ( aSign ^ ( av < bv ) );
4284
/*----------------------------------------------------------------------------
4285
| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4286
| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4287
| comparison is performed according to the IEC/IEEE Standard for Binary
4288
| Floating-Point Arithmetic.
4289
*----------------------------------------------------------------------------*/
4291
int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
4293
a = float64_squash_input_denormal(a STATUS_VAR);
4294
b = float64_squash_input_denormal(b STATUS_VAR);
4296
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4297
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4299
if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4300
float_raise( float_flag_invalid STATUS_VAR);
4307
/*----------------------------------------------------------------------------
4308
| Returns the result of converting the extended double-precision floating-
4309
| point value `a' to the 32-bit two's complement integer format. The
4310
| conversion is performed according to the IEC/IEEE Standard for Binary
4311
| Floating-Point Arithmetic---which means in particular that the conversion
4312
| is rounded according to the current rounding mode. If `a' is a NaN, the
4313
| largest positive integer is returned. Otherwise, if the conversion
4314
| overflows, the largest integer with the same sign as `a' is returned.
4315
*----------------------------------------------------------------------------*/
4317
int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
4320
int32 aExp, shiftCount;
4323
aSig = extractFloatx80Frac( a );
4324
aExp = extractFloatx80Exp( a );
4325
aSign = extractFloatx80Sign( a );
4326
if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4327
shiftCount = 0x4037 - aExp;
4328
if ( shiftCount <= 0 ) shiftCount = 1;
4329
shift64RightJamming( aSig, shiftCount, &aSig );
4330
return roundAndPackInt32( aSign, aSig STATUS_VAR );
4334
/*----------------------------------------------------------------------------
4335
| Returns the result of converting the extended double-precision floating-
4336
| point value `a' to the 32-bit two's complement integer format. The
4337
| conversion is performed according to the IEC/IEEE Standard for Binary
4338
| Floating-Point Arithmetic, except that the conversion is always rounded
4339
| toward zero. If `a' is a NaN, the largest positive integer is returned.
4340
| Otherwise, if the conversion overflows, the largest integer with the same
4341
| sign as `a' is returned.
4342
*----------------------------------------------------------------------------*/
4344
int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
4347
int32 aExp, shiftCount;
4348
uint64_t aSig, savedASig;
4351
aSig = extractFloatx80Frac( a );
4352
aExp = extractFloatx80Exp( a );
4353
aSign = extractFloatx80Sign( a );
4354
if ( 0x401E < aExp ) {
4355
if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4358
else if ( aExp < 0x3FFF ) {
4359
if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4362
shiftCount = 0x403E - aExp;
4364
aSig >>= shiftCount;
4366
if ( aSign ) z = - z;
4367
if ( ( z < 0 ) ^ aSign ) {
4369
float_raise( float_flag_invalid STATUS_VAR);
4370
return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4372
if ( ( aSig<<shiftCount ) != savedASig ) {
4373
STATUS(float_exception_flags) |= float_flag_inexact;
4379
/*----------------------------------------------------------------------------
4380
| Returns the result of converting the extended double-precision floating-
4381
| point value `a' to the 64-bit two's complement integer format. The
4382
| conversion is performed according to the IEC/IEEE Standard for Binary
4383
| Floating-Point Arithmetic---which means in particular that the conversion
4384
| is rounded according to the current rounding mode. If `a' is a NaN,
4385
| the largest positive integer is returned. Otherwise, if the conversion
4386
| overflows, the largest integer with the same sign as `a' is returned.
4387
*----------------------------------------------------------------------------*/
4389
int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
4392
int32 aExp, shiftCount;
4393
uint64_t aSig, aSigExtra;
4395
aSig = extractFloatx80Frac( a );
4396
aExp = extractFloatx80Exp( a );
4397
aSign = extractFloatx80Sign( a );
4398
shiftCount = 0x403E - aExp;
4399
if ( shiftCount <= 0 ) {
4401
float_raise( float_flag_invalid STATUS_VAR);
4403
|| ( ( aExp == 0x7FFF )
4404
&& ( aSig != LIT64( 0x8000000000000000 ) ) )
4406
return LIT64( 0x7FFFFFFFFFFFFFFF );
4408
return (int64_t) LIT64( 0x8000000000000000 );
4413
shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4415
return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
4419
/*----------------------------------------------------------------------------
4420
| Returns the result of converting the extended double-precision floating-
4421
| point value `a' to the 64-bit two's complement integer format. The
4422
| conversion is performed according to the IEC/IEEE Standard for Binary
4423
| Floating-Point Arithmetic, except that the conversion is always rounded
4424
| toward zero. If `a' is a NaN, the largest positive integer is returned.
4425
| Otherwise, if the conversion overflows, the largest integer with the same
4426
| sign as `a' is returned.
4427
*----------------------------------------------------------------------------*/
4429
int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
4432
int32 aExp, shiftCount;
4436
aSig = extractFloatx80Frac( a );
4437
aExp = extractFloatx80Exp( a );
4438
aSign = extractFloatx80Sign( a );
4439
shiftCount = aExp - 0x403E;
4440
if ( 0 <= shiftCount ) {
4441
aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4442
if ( ( a.high != 0xC03E ) || aSig ) {
4443
float_raise( float_flag_invalid STATUS_VAR);
4444
if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4445
return LIT64( 0x7FFFFFFFFFFFFFFF );
4448
return (int64_t) LIT64( 0x8000000000000000 );
4450
else if ( aExp < 0x3FFF ) {
4451
if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4454
z = aSig>>( - shiftCount );
4455
if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4456
STATUS(float_exception_flags) |= float_flag_inexact;
4458
if ( aSign ) z = - z;
4463
/*----------------------------------------------------------------------------
4464
| Returns the result of converting the extended double-precision floating-
4465
| point value `a' to the single-precision floating-point format. The
4466
| conversion is performed according to the IEC/IEEE Standard for Binary
4467
| Floating-Point Arithmetic.
4468
*----------------------------------------------------------------------------*/
4470
float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
4476
aSig = extractFloatx80Frac( a );
4477
aExp = extractFloatx80Exp( a );
4478
aSign = extractFloatx80Sign( a );
4479
if ( aExp == 0x7FFF ) {
4480
if ( (uint64_t) ( aSig<<1 ) ) {
4481
return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4483
return packFloat32( aSign, 0xFF, 0 );
4485
shift64RightJamming( aSig, 33, &aSig );
4486
if ( aExp || aSig ) aExp -= 0x3F81;
4487
return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
4491
/*----------------------------------------------------------------------------
4492
| Returns the result of converting the extended double-precision floating-
4493
| point value `a' to the double-precision floating-point format. The
4494
| conversion is performed according to the IEC/IEEE Standard for Binary
4495
| Floating-Point Arithmetic.
4496
*----------------------------------------------------------------------------*/
4498
float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
4502
uint64_t aSig, zSig;
4504
aSig = extractFloatx80Frac( a );
4505
aExp = extractFloatx80Exp( a );
4506
aSign = extractFloatx80Sign( a );
4507
if ( aExp == 0x7FFF ) {
4508
if ( (uint64_t) ( aSig<<1 ) ) {
4509
return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4511
return packFloat64( aSign, 0x7FF, 0 );
4513
shift64RightJamming( aSig, 1, &zSig );
4514
if ( aExp || aSig ) aExp -= 0x3C01;
4515
return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
4519
/*----------------------------------------------------------------------------
4520
| Returns the result of converting the extended double-precision floating-
4521
| point value `a' to the quadruple-precision floating-point format. The
4522
| conversion is performed according to the IEC/IEEE Standard for Binary
4523
| Floating-Point Arithmetic.
4524
*----------------------------------------------------------------------------*/
4526
float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4530
uint64_t aSig, zSig0, zSig1;
4532
aSig = extractFloatx80Frac( a );
4533
aExp = extractFloatx80Exp( a );
4534
aSign = extractFloatx80Sign( a );
4535
if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4536
return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4538
shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4539
return packFloat128( aSign, aExp, zSig0, zSig1 );
4543
/*----------------------------------------------------------------------------
4544
| Rounds the extended double-precision floating-point value `a' to an integer,
4545
| and returns the result as an extended quadruple-precision floating-point
4546
| value. The operation is performed according to the IEC/IEEE Standard for
4547
| Binary Floating-Point Arithmetic.
4548
*----------------------------------------------------------------------------*/
4550
floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4554
uint64_t lastBitMask, roundBitsMask;
4558
aExp = extractFloatx80Exp( a );
4559
if ( 0x403E <= aExp ) {
4560
if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4561
return propagateFloatx80NaN( a, a STATUS_VAR );
4565
if ( aExp < 0x3FFF ) {
4567
&& ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4570
STATUS(float_exception_flags) |= float_flag_inexact;
4571
aSign = extractFloatx80Sign( a );
4572
switch ( STATUS(float_rounding_mode) ) {
4573
case float_round_nearest_even:
4574
if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4577
packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4580
case float_round_down:
4583
packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4584
: packFloatx80( 0, 0, 0 );
4585
case float_round_up:
4587
aSign ? packFloatx80( 1, 0, 0 )
4588
: packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4590
return packFloatx80( aSign, 0, 0 );
4593
lastBitMask <<= 0x403E - aExp;
4594
roundBitsMask = lastBitMask - 1;
4596
roundingMode = STATUS(float_rounding_mode);
4597
if ( roundingMode == float_round_nearest_even ) {
4598
z.low += lastBitMask>>1;
4599
if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4601
else if ( roundingMode != float_round_to_zero ) {
4602
if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4603
z.low += roundBitsMask;
4606
z.low &= ~ roundBitsMask;
4609
z.low = LIT64( 0x8000000000000000 );
4611
if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4616
/*----------------------------------------------------------------------------
4617
| Returns the result of adding the absolute values of the extended double-
4618
| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4619
| negated before being returned. `zSign' is ignored if the result is a NaN.
4620
| The addition is performed according to the IEC/IEEE Standard for Binary
4621
| Floating-Point Arithmetic.
4622
*----------------------------------------------------------------------------*/
4624
static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4626
int32 aExp, bExp, zExp;
4627
uint64_t aSig, bSig, zSig0, zSig1;
4630
aSig = extractFloatx80Frac( a );
4631
aExp = extractFloatx80Exp( a );
4632
bSig = extractFloatx80Frac( b );
4633
bExp = extractFloatx80Exp( b );
4634
expDiff = aExp - bExp;
4635
if ( 0 < expDiff ) {
4636
if ( aExp == 0x7FFF ) {
4637
if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4640
if ( bExp == 0 ) --expDiff;
4641
shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4644
else if ( expDiff < 0 ) {
4645
if ( bExp == 0x7FFF ) {
4646
if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4647
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4649
if ( aExp == 0 ) ++expDiff;
4650
shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4654
if ( aExp == 0x7FFF ) {
4655
if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4656
return propagateFloatx80NaN( a, b STATUS_VAR );
4661
zSig0 = aSig + bSig;
4663
normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4669
zSig0 = aSig + bSig;
4670
if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4672
shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4673
zSig0 |= LIT64( 0x8000000000000000 );
4677
roundAndPackFloatx80(
4678
STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4682
/*----------------------------------------------------------------------------
4683
| Returns the result of subtracting the absolute values of the extended
4684
| double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4685
| difference is negated before being returned. `zSign' is ignored if the
4686
| result is a NaN. The subtraction is performed according to the IEC/IEEE
4687
| Standard for Binary Floating-Point Arithmetic.
4688
*----------------------------------------------------------------------------*/
4690
static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4692
int32 aExp, bExp, zExp;
4693
uint64_t aSig, bSig, zSig0, zSig1;
4697
aSig = extractFloatx80Frac( a );
4698
aExp = extractFloatx80Exp( a );
4699
bSig = extractFloatx80Frac( b );
4700
bExp = extractFloatx80Exp( b );
4701
expDiff = aExp - bExp;
4702
if ( 0 < expDiff ) goto aExpBigger;
4703
if ( expDiff < 0 ) goto bExpBigger;
4704
if ( aExp == 0x7FFF ) {
4705
if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4706
return propagateFloatx80NaN( a, b STATUS_VAR );
4708
float_raise( float_flag_invalid STATUS_VAR);
4709
z.low = floatx80_default_nan_low;
4710
z.high = floatx80_default_nan_high;
4718
if ( bSig < aSig ) goto aBigger;
4719
if ( aSig < bSig ) goto bBigger;
4720
return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4722
if ( bExp == 0x7FFF ) {
4723
if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4724
return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4726
if ( aExp == 0 ) ++expDiff;
4727
shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4729
sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4732
goto normalizeRoundAndPack;
4734
if ( aExp == 0x7FFF ) {
4735
if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4738
if ( bExp == 0 ) --expDiff;
4739
shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4741
sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4743
normalizeRoundAndPack:
4745
normalizeRoundAndPackFloatx80(
4746
STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4750
/*----------------------------------------------------------------------------
4751
| Returns the result of adding the extended double-precision floating-point
4752
| values `a' and `b'. The operation is performed according to the IEC/IEEE
4753
| Standard for Binary Floating-Point Arithmetic.
4754
*----------------------------------------------------------------------------*/
4756
floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4760
aSign = extractFloatx80Sign( a );
4761
bSign = extractFloatx80Sign( b );
4762
if ( aSign == bSign ) {
4763
return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4766
return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4771
/*----------------------------------------------------------------------------
4772
| Returns the result of subtracting the extended double-precision floating-
4773
| point values `a' and `b'. The operation is performed according to the
4774
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4775
*----------------------------------------------------------------------------*/
4777
floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4781
aSign = extractFloatx80Sign( a );
4782
bSign = extractFloatx80Sign( b );
4783
if ( aSign == bSign ) {
4784
return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4787
return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4792
/*----------------------------------------------------------------------------
4793
| Returns the result of multiplying the extended double-precision floating-
4794
| point values `a' and `b'. The operation is performed according to the
4795
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4796
*----------------------------------------------------------------------------*/
4798
floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4800
flag aSign, bSign, zSign;
4801
int32 aExp, bExp, zExp;
4802
uint64_t aSig, bSig, zSig0, zSig1;
4805
aSig = extractFloatx80Frac( a );
4806
aExp = extractFloatx80Exp( a );
4807
aSign = extractFloatx80Sign( a );
4808
bSig = extractFloatx80Frac( b );
4809
bExp = extractFloatx80Exp( b );
4810
bSign = extractFloatx80Sign( b );
4811
zSign = aSign ^ bSign;
4812
if ( aExp == 0x7FFF ) {
4813
if ( (uint64_t) ( aSig<<1 )
4814
|| ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4815
return propagateFloatx80NaN( a, b STATUS_VAR );
4817
if ( ( bExp | bSig ) == 0 ) goto invalid;
4818
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4820
if ( bExp == 0x7FFF ) {
4821
if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4822
if ( ( aExp | aSig ) == 0 ) {
4824
float_raise( float_flag_invalid STATUS_VAR);
4825
z.low = floatx80_default_nan_low;
4826
z.high = floatx80_default_nan_high;
4829
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4832
if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4833
normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4836
if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4837
normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4839
zExp = aExp + bExp - 0x3FFE;
4840
mul64To128( aSig, bSig, &zSig0, &zSig1 );
4841
if ( 0 < (int64_t) zSig0 ) {
4842
shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4846
roundAndPackFloatx80(
4847
STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4851
/*----------------------------------------------------------------------------
4852
| Returns the result of dividing the extended double-precision floating-point
4853
| value `a' by the corresponding value `b'. The operation is performed
4854
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4855
*----------------------------------------------------------------------------*/
4857
floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4859
flag aSign, bSign, zSign;
4860
int32 aExp, bExp, zExp;
4861
uint64_t aSig, bSig, zSig0, zSig1;
4862
uint64_t rem0, rem1, rem2, term0, term1, term2;
4865
aSig = extractFloatx80Frac( a );
4866
aExp = extractFloatx80Exp( a );
4867
aSign = extractFloatx80Sign( a );
4868
bSig = extractFloatx80Frac( b );
4869
bExp = extractFloatx80Exp( b );
4870
bSign = extractFloatx80Sign( b );
4871
zSign = aSign ^ bSign;
4872
if ( aExp == 0x7FFF ) {
4873
if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4874
if ( bExp == 0x7FFF ) {
4875
if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4878
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4880
if ( bExp == 0x7FFF ) {
4881
if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4882
return packFloatx80( zSign, 0, 0 );
4886
if ( ( aExp | aSig ) == 0 ) {
4888
float_raise( float_flag_invalid STATUS_VAR);
4889
z.low = floatx80_default_nan_low;
4890
z.high = floatx80_default_nan_high;
4893
float_raise( float_flag_divbyzero STATUS_VAR);
4894
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4896
normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4899
if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4900
normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4902
zExp = aExp - bExp + 0x3FFE;
4904
if ( bSig <= aSig ) {
4905
shift128Right( aSig, 0, 1, &aSig, &rem1 );
4908
zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4909
mul64To128( bSig, zSig0, &term0, &term1 );
4910
sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4911
while ( (int64_t) rem0 < 0 ) {
4913
add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4915
zSig1 = estimateDiv128To64( rem1, 0, bSig );
4916
if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
4917
mul64To128( bSig, zSig1, &term1, &term2 );
4918
sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4919
while ( (int64_t) rem1 < 0 ) {
4921
add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4923
zSig1 |= ( ( rem1 | rem2 ) != 0 );
4926
roundAndPackFloatx80(
4927
STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4931
/*----------------------------------------------------------------------------
4932
| Returns the remainder of the extended double-precision floating-point value
4933
| `a' with respect to the corresponding value `b'. The operation is performed
4934
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4935
*----------------------------------------------------------------------------*/
4937
floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4940
int32 aExp, bExp, expDiff;
4941
uint64_t aSig0, aSig1, bSig;
4942
uint64_t q, term0, term1, alternateASig0, alternateASig1;
4945
aSig0 = extractFloatx80Frac( a );
4946
aExp = extractFloatx80Exp( a );
4947
aSign = extractFloatx80Sign( a );
4948
bSig = extractFloatx80Frac( b );
4949
bExp = extractFloatx80Exp( b );
4950
if ( aExp == 0x7FFF ) {
4951
if ( (uint64_t) ( aSig0<<1 )
4952
|| ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4953
return propagateFloatx80NaN( a, b STATUS_VAR );
4957
if ( bExp == 0x7FFF ) {
4958
if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4964
float_raise( float_flag_invalid STATUS_VAR);
4965
z.low = floatx80_default_nan_low;
4966
z.high = floatx80_default_nan_high;
4969
normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4972
if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
4973
normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4975
bSig |= LIT64( 0x8000000000000000 );
4977
expDiff = aExp - bExp;
4979
if ( expDiff < 0 ) {
4980
if ( expDiff < -1 ) return a;
4981
shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4984
q = ( bSig <= aSig0 );
4985
if ( q ) aSig0 -= bSig;
4987
while ( 0 < expDiff ) {
4988
q = estimateDiv128To64( aSig0, aSig1, bSig );
4989
q = ( 2 < q ) ? q - 2 : 0;
4990
mul64To128( bSig, q, &term0, &term1 );
4991
sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4992
shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4996
if ( 0 < expDiff ) {
4997
q = estimateDiv128To64( aSig0, aSig1, bSig );
4998
q = ( 2 < q ) ? q - 2 : 0;
5000
mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5001
sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5002
shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5003
while ( le128( term0, term1, aSig0, aSig1 ) ) {
5005
sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5012
sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5013
if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5014
|| ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5017
aSig0 = alternateASig0;
5018
aSig1 = alternateASig1;
5022
normalizeRoundAndPackFloatx80(
5023
80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
5027
/*----------------------------------------------------------------------------
5028
| Returns the square root of the extended double-precision floating-point
5029
| value `a'. The operation is performed according to the IEC/IEEE Standard
5030
| for Binary Floating-Point Arithmetic.
5031
*----------------------------------------------------------------------------*/
5033
floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
5037
uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5038
uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5041
aSig0 = extractFloatx80Frac( a );
5042
aExp = extractFloatx80Exp( a );
5043
aSign = extractFloatx80Sign( a );
5044
if ( aExp == 0x7FFF ) {
5045
if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
5046
if ( ! aSign ) return a;
5050
if ( ( aExp | aSig0 ) == 0 ) return a;
5052
float_raise( float_flag_invalid STATUS_VAR);
5053
z.low = floatx80_default_nan_low;
5054
z.high = floatx80_default_nan_high;
5058
if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5059
normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5061
zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5062
zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5063
shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5064
zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5065
doubleZSig0 = zSig0<<1;
5066
mul64To128( zSig0, zSig0, &term0, &term1 );
5067
sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5068
while ( (int64_t) rem0 < 0 ) {
5071
add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5073
zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5074
if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5075
if ( zSig1 == 0 ) zSig1 = 1;
5076
mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5077
sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5078
mul64To128( zSig1, zSig1, &term2, &term3 );
5079
sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5080
while ( (int64_t) rem1 < 0 ) {
5082
shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5084
term2 |= doubleZSig0;
5085
add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5087
zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5089
shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5090
zSig0 |= doubleZSig0;
5092
roundAndPackFloatx80(
5093
STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
5097
/*----------------------------------------------------------------------------
5098
| Returns 1 if the extended double-precision floating-point value `a' is equal
5099
| to the corresponding value `b', and 0 otherwise. The invalid exception is
5100
| raised if either operand is a NaN. Otherwise, the comparison is performed
5101
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5102
*----------------------------------------------------------------------------*/
5104
int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
5107
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5108
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5109
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
5110
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5112
float_raise( float_flag_invalid STATUS_VAR);
5117
&& ( ( a.high == b.high )
5119
&& ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5124
/*----------------------------------------------------------------------------
5125
| Returns 1 if the extended double-precision floating-point value `a' is
5126
| less than or equal to the corresponding value `b', and 0 otherwise. The
5127
| invalid exception is raised if either operand is a NaN. The comparison is
5128
| performed according to the IEC/IEEE Standard for Binary Floating-Point
5130
*----------------------------------------------------------------------------*/
5132
int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
5136
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5137
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5138
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
5139
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5141
float_raise( float_flag_invalid STATUS_VAR);
5144
aSign = extractFloatx80Sign( a );
5145
bSign = extractFloatx80Sign( b );
5146
if ( aSign != bSign ) {
5149
|| ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5153
aSign ? le128( b.high, b.low, a.high, a.low )
5154
: le128( a.high, a.low, b.high, b.low );
5158
/*----------------------------------------------------------------------------
5159
| Returns 1 if the extended double-precision floating-point value `a' is
5160
| less than the corresponding value `b', and 0 otherwise. The invalid
5161
| exception is raised if either operand is a NaN. The comparison is performed
5162
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5163
*----------------------------------------------------------------------------*/
5165
int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
5169
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5170
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5171
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
5172
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5174
float_raise( float_flag_invalid STATUS_VAR);
5177
aSign = extractFloatx80Sign( a );
5178
bSign = extractFloatx80Sign( b );
5179
if ( aSign != bSign ) {
5182
&& ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5186
aSign ? lt128( b.high, b.low, a.high, a.low )
5187
: lt128( a.high, a.low, b.high, b.low );
5191
/*----------------------------------------------------------------------------
5192
| Returns 1 if the extended double-precision floating-point values `a' and `b'
5193
| cannot be compared, and 0 otherwise. The invalid exception is raised if
5194
| either operand is a NaN. The comparison is performed according to the
5195
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5196
*----------------------------------------------------------------------------*/
5197
int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
5199
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5200
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5201
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
5202
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5204
float_raise( float_flag_invalid STATUS_VAR);
5210
/*----------------------------------------------------------------------------
5211
| Returns 1 if the extended double-precision floating-point value `a' is
5212
| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5213
| cause an exception. The comparison is performed according to the IEC/IEEE
5214
| Standard for Binary Floating-Point Arithmetic.
5215
*----------------------------------------------------------------------------*/
5217
int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5220
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5221
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5222
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
5223
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5225
if ( floatx80_is_signaling_nan( a )
5226
|| floatx80_is_signaling_nan( b ) ) {
5227
float_raise( float_flag_invalid STATUS_VAR);
5233
&& ( ( a.high == b.high )
5235
&& ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5240
/*----------------------------------------------------------------------------
5241
| Returns 1 if the extended double-precision floating-point value `a' is less
5242
| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5243
| do not cause an exception. Otherwise, the comparison is performed according
5244
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5245
*----------------------------------------------------------------------------*/
5247
int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5251
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5252
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5253
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
5254
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5256
if ( floatx80_is_signaling_nan( a )
5257
|| floatx80_is_signaling_nan( b ) ) {
5258
float_raise( float_flag_invalid STATUS_VAR);
5262
aSign = extractFloatx80Sign( a );
5263
bSign = extractFloatx80Sign( b );
5264
if ( aSign != bSign ) {
5267
|| ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5271
aSign ? le128( b.high, b.low, a.high, a.low )
5272
: le128( a.high, a.low, b.high, b.low );
5276
/*----------------------------------------------------------------------------
5277
| Returns 1 if the extended double-precision floating-point value `a' is less
5278
| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5279
| an exception. Otherwise, the comparison is performed according to the
5280
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5281
*----------------------------------------------------------------------------*/
5283
int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5287
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5288
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5289
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
5290
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5292
if ( floatx80_is_signaling_nan( a )
5293
|| floatx80_is_signaling_nan( b ) ) {
5294
float_raise( float_flag_invalid STATUS_VAR);
5298
aSign = extractFloatx80Sign( a );
5299
bSign = extractFloatx80Sign( b );
5300
if ( aSign != bSign ) {
5303
&& ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5307
aSign ? lt128( b.high, b.low, a.high, a.low )
5308
: lt128( a.high, a.low, b.high, b.low );
5312
/*----------------------------------------------------------------------------
5313
| Returns 1 if the extended double-precision floating-point values `a' and `b'
5314
| cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5315
| The comparison is performed according to the IEC/IEEE Standard for Binary
5316
| Floating-Point Arithmetic.
5317
*----------------------------------------------------------------------------*/
5318
int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5320
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5321
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5322
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
5323
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5325
if ( floatx80_is_signaling_nan( a )
5326
|| floatx80_is_signaling_nan( b ) ) {
5327
float_raise( float_flag_invalid STATUS_VAR);
5334
/*----------------------------------------------------------------------------
5335
| Returns the result of converting the quadruple-precision floating-point
5336
| value `a' to the 32-bit two's complement integer format. The conversion
5337
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5338
| Arithmetic---which means in particular that the conversion is rounded
5339
| according to the current rounding mode. If `a' is a NaN, the largest
5340
| positive integer is returned. Otherwise, if the conversion overflows, the
5341
| largest integer with the same sign as `a' is returned.
5342
*----------------------------------------------------------------------------*/
5344
int32 float128_to_int32( float128 a STATUS_PARAM )
5347
int32 aExp, shiftCount;
5348
uint64_t aSig0, aSig1;
5350
aSig1 = extractFloat128Frac1( a );
5351
aSig0 = extractFloat128Frac0( a );
5352
aExp = extractFloat128Exp( a );
5353
aSign = extractFloat128Sign( a );
5354
if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5355
if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5356
aSig0 |= ( aSig1 != 0 );
5357
shiftCount = 0x4028 - aExp;
5358
if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5359
return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
5363
/*----------------------------------------------------------------------------
5364
| Returns the result of converting the quadruple-precision floating-point
5365
| value `a' to the 32-bit two's complement integer format. The conversion
5366
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5367
| Arithmetic, except that the conversion is always rounded toward zero. If
5368
| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5369
| conversion overflows, the largest integer with the same sign as `a' is
5371
*----------------------------------------------------------------------------*/
5373
int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
5376
int32 aExp, shiftCount;
5377
uint64_t aSig0, aSig1, savedASig;
5380
aSig1 = extractFloat128Frac1( a );
5381
aSig0 = extractFloat128Frac0( a );
5382
aExp = extractFloat128Exp( a );
5383
aSign = extractFloat128Sign( a );
5384
aSig0 |= ( aSig1 != 0 );
5385
if ( 0x401E < aExp ) {
5386
if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5389
else if ( aExp < 0x3FFF ) {
5390
if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
5393
aSig0 |= LIT64( 0x0001000000000000 );
5394
shiftCount = 0x402F - aExp;
5396
aSig0 >>= shiftCount;
5398
if ( aSign ) z = - z;
5399
if ( ( z < 0 ) ^ aSign ) {
5401
float_raise( float_flag_invalid STATUS_VAR);
5402
return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5404
if ( ( aSig0<<shiftCount ) != savedASig ) {
5405
STATUS(float_exception_flags) |= float_flag_inexact;
5411
/*----------------------------------------------------------------------------
5412
| Returns the result of converting the quadruple-precision floating-point
5413
| value `a' to the 64-bit two's complement integer format. The conversion
5414
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5415
| Arithmetic---which means in particular that the conversion is rounded
5416
| according to the current rounding mode. If `a' is a NaN, the largest
5417
| positive integer is returned. Otherwise, if the conversion overflows, the
5418
| largest integer with the same sign as `a' is returned.
5419
*----------------------------------------------------------------------------*/
5421
int64 float128_to_int64( float128 a STATUS_PARAM )
5424
int32 aExp, shiftCount;
5425
uint64_t aSig0, aSig1;
5427
aSig1 = extractFloat128Frac1( a );
5428
aSig0 = extractFloat128Frac0( a );
5429
aExp = extractFloat128Exp( a );
5430
aSign = extractFloat128Sign( a );
5431
if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5432
shiftCount = 0x402F - aExp;
5433
if ( shiftCount <= 0 ) {
5434
if ( 0x403E < aExp ) {
5435
float_raise( float_flag_invalid STATUS_VAR);
5437
|| ( ( aExp == 0x7FFF )
5438
&& ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5441
return LIT64( 0x7FFFFFFFFFFFFFFF );
5443
return (int64_t) LIT64( 0x8000000000000000 );
5445
shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5448
shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5450
return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
5454
/*----------------------------------------------------------------------------
5455
| Returns the result of converting the quadruple-precision floating-point
5456
| value `a' to the 64-bit two's complement integer format. The conversion
5457
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5458
| Arithmetic, except that the conversion is always rounded toward zero.
5459
| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5460
| the conversion overflows, the largest integer with the same sign as `a' is
5462
*----------------------------------------------------------------------------*/
5464
int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
5467
int32 aExp, shiftCount;
5468
uint64_t aSig0, aSig1;
5471
aSig1 = extractFloat128Frac1( a );
5472
aSig0 = extractFloat128Frac0( a );
5473
aExp = extractFloat128Exp( a );
5474
aSign = extractFloat128Sign( a );
5475
if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5476
shiftCount = aExp - 0x402F;
5477
if ( 0 < shiftCount ) {
5478
if ( 0x403E <= aExp ) {
5479
aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5480
if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5481
&& ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5482
if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
5485
float_raise( float_flag_invalid STATUS_VAR);
5486
if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5487
return LIT64( 0x7FFFFFFFFFFFFFFF );
5490
return (int64_t) LIT64( 0x8000000000000000 );
5492
z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5493
if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5494
STATUS(float_exception_flags) |= float_flag_inexact;
5498
if ( aExp < 0x3FFF ) {
5499
if ( aExp | aSig0 | aSig1 ) {
5500
STATUS(float_exception_flags) |= float_flag_inexact;
5504
z = aSig0>>( - shiftCount );
5506
|| ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5507
STATUS(float_exception_flags) |= float_flag_inexact;
5510
if ( aSign ) z = - z;
5515
/*----------------------------------------------------------------------------
5516
| Returns the result of converting the quadruple-precision floating-point
5517
| value `a' to the single-precision floating-point format. The conversion
5518
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5520
*----------------------------------------------------------------------------*/
5522
float32 float128_to_float32( float128 a STATUS_PARAM )
5526
uint64_t aSig0, aSig1;
5529
aSig1 = extractFloat128Frac1( a );
5530
aSig0 = extractFloat128Frac0( a );
5531
aExp = extractFloat128Exp( a );
5532
aSign = extractFloat128Sign( a );
5533
if ( aExp == 0x7FFF ) {
5534
if ( aSig0 | aSig1 ) {
5535
return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5537
return packFloat32( aSign, 0xFF, 0 );
5539
aSig0 |= ( aSig1 != 0 );
5540
shift64RightJamming( aSig0, 18, &aSig0 );
5542
if ( aExp || zSig ) {
5546
return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5550
/*----------------------------------------------------------------------------
5551
| Returns the result of converting the quadruple-precision floating-point
5552
| value `a' to the double-precision floating-point format. The conversion
5553
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5555
*----------------------------------------------------------------------------*/
5557
float64 float128_to_float64( float128 a STATUS_PARAM )
5561
uint64_t aSig0, aSig1;
5563
aSig1 = extractFloat128Frac1( a );
5564
aSig0 = extractFloat128Frac0( a );
5565
aExp = extractFloat128Exp( a );
5566
aSign = extractFloat128Sign( a );
5567
if ( aExp == 0x7FFF ) {
5568
if ( aSig0 | aSig1 ) {
5569
return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5571
return packFloat64( aSign, 0x7FF, 0 );
5573
shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5574
aSig0 |= ( aSig1 != 0 );
5575
if ( aExp || aSig0 ) {
5576
aSig0 |= LIT64( 0x4000000000000000 );
5579
return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5583
/*----------------------------------------------------------------------------
5584
| Returns the result of converting the quadruple-precision floating-point
5585
| value `a' to the extended double-precision floating-point format. The
5586
| conversion is performed according to the IEC/IEEE Standard for Binary
5587
| Floating-Point Arithmetic.
5588
*----------------------------------------------------------------------------*/
5590
floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5594
uint64_t aSig0, aSig1;
5596
aSig1 = extractFloat128Frac1( a );
5597
aSig0 = extractFloat128Frac0( a );
5598
aExp = extractFloat128Exp( a );
5599
aSign = extractFloat128Sign( a );
5600
if ( aExp == 0x7FFF ) {
5601
if ( aSig0 | aSig1 ) {
5602
return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5604
return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5607
if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5608
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5611
aSig0 |= LIT64( 0x0001000000000000 );
5613
shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5614
return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5618
/*----------------------------------------------------------------------------
5619
| Rounds the quadruple-precision floating-point value `a' to an integer, and
5620
| returns the result as a quadruple-precision floating-point value. The
5621
| operation is performed according to the IEC/IEEE Standard for Binary
5622
| Floating-Point Arithmetic.
5623
*----------------------------------------------------------------------------*/
5625
float128 float128_round_to_int( float128 a STATUS_PARAM )
5629
uint64_t lastBitMask, roundBitsMask;
5633
aExp = extractFloat128Exp( a );
5634
if ( 0x402F <= aExp ) {
5635
if ( 0x406F <= aExp ) {
5636
if ( ( aExp == 0x7FFF )
5637
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5639
return propagateFloat128NaN( a, a STATUS_VAR );
5644
lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5645
roundBitsMask = lastBitMask - 1;
5647
roundingMode = STATUS(float_rounding_mode);
5648
if ( roundingMode == float_round_nearest_even ) {
5649
if ( lastBitMask ) {
5650
add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5651
if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5654
if ( (int64_t) z.low < 0 ) {
5656
if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5660
else if ( roundingMode != float_round_to_zero ) {
5661
if ( extractFloat128Sign( z )
5662
^ ( roundingMode == float_round_up ) ) {
5663
add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5666
z.low &= ~ roundBitsMask;
5669
if ( aExp < 0x3FFF ) {
5670
if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5671
STATUS(float_exception_flags) |= float_flag_inexact;
5672
aSign = extractFloat128Sign( a );
5673
switch ( STATUS(float_rounding_mode) ) {
5674
case float_round_nearest_even:
5675
if ( ( aExp == 0x3FFE )
5676
&& ( extractFloat128Frac0( a )
5677
| extractFloat128Frac1( a ) )
5679
return packFloat128( aSign, 0x3FFF, 0, 0 );
5682
case float_round_down:
5684
aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5685
: packFloat128( 0, 0, 0, 0 );
5686
case float_round_up:
5688
aSign ? packFloat128( 1, 0, 0, 0 )
5689
: packFloat128( 0, 0x3FFF, 0, 0 );
5691
return packFloat128( aSign, 0, 0, 0 );
5694
lastBitMask <<= 0x402F - aExp;
5695
roundBitsMask = lastBitMask - 1;
5698
roundingMode = STATUS(float_rounding_mode);
5699
if ( roundingMode == float_round_nearest_even ) {
5700
z.high += lastBitMask>>1;
5701
if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5702
z.high &= ~ lastBitMask;
5705
else if ( roundingMode != float_round_to_zero ) {
5706
if ( extractFloat128Sign( z )
5707
^ ( roundingMode == float_round_up ) ) {
5708
z.high |= ( a.low != 0 );
5709
z.high += roundBitsMask;
5712
z.high &= ~ roundBitsMask;
5714
if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5715
STATUS(float_exception_flags) |= float_flag_inexact;
5721
/*----------------------------------------------------------------------------
5722
| Returns the result of adding the absolute values of the quadruple-precision
5723
| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
5724
| before being returned. `zSign' is ignored if the result is a NaN.
5725
| The addition is performed according to the IEC/IEEE Standard for Binary
5726
| Floating-Point Arithmetic.
5727
*----------------------------------------------------------------------------*/
5729
static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5731
int32 aExp, bExp, zExp;
5732
uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5735
aSig1 = extractFloat128Frac1( a );
5736
aSig0 = extractFloat128Frac0( a );
5737
aExp = extractFloat128Exp( a );
5738
bSig1 = extractFloat128Frac1( b );
5739
bSig0 = extractFloat128Frac0( b );
5740
bExp = extractFloat128Exp( b );
5741
expDiff = aExp - bExp;
5742
if ( 0 < expDiff ) {
5743
if ( aExp == 0x7FFF ) {
5744
if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5751
bSig0 |= LIT64( 0x0001000000000000 );
5753
shift128ExtraRightJamming(
5754
bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5757
else if ( expDiff < 0 ) {
5758
if ( bExp == 0x7FFF ) {
5759
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5760
return packFloat128( zSign, 0x7FFF, 0, 0 );
5766
aSig0 |= LIT64( 0x0001000000000000 );
5768
shift128ExtraRightJamming(
5769
aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5773
if ( aExp == 0x7FFF ) {
5774
if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5775
return propagateFloat128NaN( a, b STATUS_VAR );
5779
add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5781
if (STATUS(flush_to_zero)) {
5782
if (zSig0 | zSig1) {
5783
float_raise(float_flag_output_denormal STATUS_VAR);
5785
return packFloat128(zSign, 0, 0, 0);
5787
return packFloat128( zSign, 0, zSig0, zSig1 );
5790
zSig0 |= LIT64( 0x0002000000000000 );
5794
aSig0 |= LIT64( 0x0001000000000000 );
5795
add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5797
if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5800
shift128ExtraRightJamming(
5801
zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5803
return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5807
/*----------------------------------------------------------------------------
5808
| Returns the result of subtracting the absolute values of the quadruple-
5809
| precision floating-point values `a' and `b'. If `zSign' is 1, the
5810
| difference is negated before being returned. `zSign' is ignored if the
5811
| result is a NaN. The subtraction is performed according to the IEC/IEEE
5812
| Standard for Binary Floating-Point Arithmetic.
5813
*----------------------------------------------------------------------------*/
5815
static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5817
int32 aExp, bExp, zExp;
5818
uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5822
aSig1 = extractFloat128Frac1( a );
5823
aSig0 = extractFloat128Frac0( a );
5824
aExp = extractFloat128Exp( a );
5825
bSig1 = extractFloat128Frac1( b );
5826
bSig0 = extractFloat128Frac0( b );
5827
bExp = extractFloat128Exp( b );
5828
expDiff = aExp - bExp;
5829
shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5830
shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5831
if ( 0 < expDiff ) goto aExpBigger;
5832
if ( expDiff < 0 ) goto bExpBigger;
5833
if ( aExp == 0x7FFF ) {
5834
if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5835
return propagateFloat128NaN( a, b STATUS_VAR );
5837
float_raise( float_flag_invalid STATUS_VAR);
5838
z.low = float128_default_nan_low;
5839
z.high = float128_default_nan_high;
5846
if ( bSig0 < aSig0 ) goto aBigger;
5847
if ( aSig0 < bSig0 ) goto bBigger;
5848
if ( bSig1 < aSig1 ) goto aBigger;
5849
if ( aSig1 < bSig1 ) goto bBigger;
5850
return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5852
if ( bExp == 0x7FFF ) {
5853
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5854
return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5860
aSig0 |= LIT64( 0x4000000000000000 );
5862
shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5863
bSig0 |= LIT64( 0x4000000000000000 );
5865
sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5868
goto normalizeRoundAndPack;
5870
if ( aExp == 0x7FFF ) {
5871
if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5878
bSig0 |= LIT64( 0x4000000000000000 );
5880
shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5881
aSig0 |= LIT64( 0x4000000000000000 );
5883
sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5885
normalizeRoundAndPack:
5887
return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5891
/*----------------------------------------------------------------------------
5892
| Returns the result of adding the quadruple-precision floating-point values
5893
| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
5894
| for Binary Floating-Point Arithmetic.
5895
*----------------------------------------------------------------------------*/
5897
float128 float128_add( float128 a, float128 b STATUS_PARAM )
5901
aSign = extractFloat128Sign( a );
5902
bSign = extractFloat128Sign( b );
5903
if ( aSign == bSign ) {
5904
return addFloat128Sigs( a, b, aSign STATUS_VAR );
5907
return subFloat128Sigs( a, b, aSign STATUS_VAR );
5912
/*----------------------------------------------------------------------------
5913
| Returns the result of subtracting the quadruple-precision floating-point
5914
| values `a' and `b'. The operation is performed according to the IEC/IEEE
5915
| Standard for Binary Floating-Point Arithmetic.
5916
*----------------------------------------------------------------------------*/
5918
float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5922
aSign = extractFloat128Sign( a );
5923
bSign = extractFloat128Sign( b );
5924
if ( aSign == bSign ) {
5925
return subFloat128Sigs( a, b, aSign STATUS_VAR );
5928
return addFloat128Sigs( a, b, aSign STATUS_VAR );
5933
/*----------------------------------------------------------------------------
5934
| Returns the result of multiplying the quadruple-precision floating-point
5935
| values `a' and `b'. The operation is performed according to the IEC/IEEE
5936
| Standard for Binary Floating-Point Arithmetic.
5937
*----------------------------------------------------------------------------*/
5939
float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5941
flag aSign, bSign, zSign;
5942
int32 aExp, bExp, zExp;
5943
uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5946
aSig1 = extractFloat128Frac1( a );
5947
aSig0 = extractFloat128Frac0( a );
5948
aExp = extractFloat128Exp( a );
5949
aSign = extractFloat128Sign( a );
5950
bSig1 = extractFloat128Frac1( b );
5951
bSig0 = extractFloat128Frac0( b );
5952
bExp = extractFloat128Exp( b );
5953
bSign = extractFloat128Sign( b );
5954
zSign = aSign ^ bSign;
5955
if ( aExp == 0x7FFF ) {
5956
if ( ( aSig0 | aSig1 )
5957
|| ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5958
return propagateFloat128NaN( a, b STATUS_VAR );
5960
if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5961
return packFloat128( zSign, 0x7FFF, 0, 0 );
5963
if ( bExp == 0x7FFF ) {
5964
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5965
if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5967
float_raise( float_flag_invalid STATUS_VAR);
5968
z.low = float128_default_nan_low;
5969
z.high = float128_default_nan_high;
5972
return packFloat128( zSign, 0x7FFF, 0, 0 );
5975
if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5976
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5979
if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5980
normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5982
zExp = aExp + bExp - 0x4000;
5983
aSig0 |= LIT64( 0x0001000000000000 );
5984
shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5985
mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5986
add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5987
zSig2 |= ( zSig3 != 0 );
5988
if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5989
shift128ExtraRightJamming(
5990
zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5993
return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5997
/*----------------------------------------------------------------------------
5998
| Returns the result of dividing the quadruple-precision floating-point value
5999
| `a' by the corresponding value `b'. The operation is performed according to
6000
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6001
*----------------------------------------------------------------------------*/
6003
float128 float128_div( float128 a, float128 b STATUS_PARAM )
6005
flag aSign, bSign, zSign;
6006
int32 aExp, bExp, zExp;
6007
uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6008
uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6011
aSig1 = extractFloat128Frac1( a );
6012
aSig0 = extractFloat128Frac0( a );
6013
aExp = extractFloat128Exp( a );
6014
aSign = extractFloat128Sign( a );
6015
bSig1 = extractFloat128Frac1( b );
6016
bSig0 = extractFloat128Frac0( b );
6017
bExp = extractFloat128Exp( b );
6018
bSign = extractFloat128Sign( b );
6019
zSign = aSign ^ bSign;
6020
if ( aExp == 0x7FFF ) {
6021
if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6022
if ( bExp == 0x7FFF ) {
6023
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6026
return packFloat128( zSign, 0x7FFF, 0, 0 );
6028
if ( bExp == 0x7FFF ) {
6029
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6030
return packFloat128( zSign, 0, 0, 0 );
6033
if ( ( bSig0 | bSig1 ) == 0 ) {
6034
if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6036
float_raise( float_flag_invalid STATUS_VAR);
6037
z.low = float128_default_nan_low;
6038
z.high = float128_default_nan_high;
6041
float_raise( float_flag_divbyzero STATUS_VAR);
6042
return packFloat128( zSign, 0x7FFF, 0, 0 );
6044
normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6047
if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6048
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6050
zExp = aExp - bExp + 0x3FFD;
6052
aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6054
bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6055
if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6056
shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6059
zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6060
mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6061
sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6062
while ( (int64_t) rem0 < 0 ) {
6064
add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6066
zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6067
if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6068
mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6069
sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6070
while ( (int64_t) rem1 < 0 ) {
6072
add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6074
zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6076
shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6077
return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6081
/*----------------------------------------------------------------------------
6082
| Returns the remainder of the quadruple-precision floating-point value `a'
6083
| with respect to the corresponding value `b'. The operation is performed
6084
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6085
*----------------------------------------------------------------------------*/
6087
float128 float128_rem( float128 a, float128 b STATUS_PARAM )
6090
int32 aExp, bExp, expDiff;
6091
uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6092
uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6096
aSig1 = extractFloat128Frac1( a );
6097
aSig0 = extractFloat128Frac0( a );
6098
aExp = extractFloat128Exp( a );
6099
aSign = extractFloat128Sign( a );
6100
bSig1 = extractFloat128Frac1( b );
6101
bSig0 = extractFloat128Frac0( b );
6102
bExp = extractFloat128Exp( b );
6103
if ( aExp == 0x7FFF ) {
6104
if ( ( aSig0 | aSig1 )
6105
|| ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6106
return propagateFloat128NaN( a, b STATUS_VAR );
6110
if ( bExp == 0x7FFF ) {
6111
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6115
if ( ( bSig0 | bSig1 ) == 0 ) {
6117
float_raise( float_flag_invalid STATUS_VAR);
6118
z.low = float128_default_nan_low;
6119
z.high = float128_default_nan_high;
6122
normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6125
if ( ( aSig0 | aSig1 ) == 0 ) return a;
6126
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6128
expDiff = aExp - bExp;
6129
if ( expDiff < -1 ) return a;
6131
aSig0 | LIT64( 0x0001000000000000 ),
6133
15 - ( expDiff < 0 ),
6138
bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6139
q = le128( bSig0, bSig1, aSig0, aSig1 );
6140
if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6142
while ( 0 < expDiff ) {
6143
q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6144
q = ( 4 < q ) ? q - 4 : 0;
6145
mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6146
shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6147
shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6148
sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6151
if ( -64 < expDiff ) {
6152
q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6153
q = ( 4 < q ) ? q - 4 : 0;
6155
shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6157
if ( expDiff < 0 ) {
6158
shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6161
shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6163
mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6164
sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6167
shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6168
shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6171
alternateASig0 = aSig0;
6172
alternateASig1 = aSig1;
6174
sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6175
} while ( 0 <= (int64_t) aSig0 );
6177
aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6178
if ( ( sigMean0 < 0 )
6179
|| ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6180
aSig0 = alternateASig0;
6181
aSig1 = alternateASig1;
6183
zSign = ( (int64_t) aSig0 < 0 );
6184
if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6186
normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
6190
/*----------------------------------------------------------------------------
6191
| Returns the square root of the quadruple-precision floating-point value `a'.
6192
| The operation is performed according to the IEC/IEEE Standard for Binary
6193
| Floating-Point Arithmetic.
6194
*----------------------------------------------------------------------------*/
6196
float128 float128_sqrt( float128 a STATUS_PARAM )
6200
uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6201
uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6204
aSig1 = extractFloat128Frac1( a );
6205
aSig0 = extractFloat128Frac0( a );
6206
aExp = extractFloat128Exp( a );
6207
aSign = extractFloat128Sign( a );
6208
if ( aExp == 0x7FFF ) {
6209
if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
6210
if ( ! aSign ) return a;
6214
if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6216
float_raise( float_flag_invalid STATUS_VAR);
6217
z.low = float128_default_nan_low;
6218
z.high = float128_default_nan_high;
6222
if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6223
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6225
zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6226
aSig0 |= LIT64( 0x0001000000000000 );
6227
zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6228
shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6229
zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6230
doubleZSig0 = zSig0<<1;
6231
mul64To128( zSig0, zSig0, &term0, &term1 );
6232
sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6233
while ( (int64_t) rem0 < 0 ) {
6236
add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6238
zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6239
if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6240
if ( zSig1 == 0 ) zSig1 = 1;
6241
mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6242
sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6243
mul64To128( zSig1, zSig1, &term2, &term3 );
6244
sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6245
while ( (int64_t) rem1 < 0 ) {
6247
shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6249
term2 |= doubleZSig0;
6250
add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6252
zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6254
shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6255
return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6259
/*----------------------------------------------------------------------------
6260
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
6261
| the corresponding value `b', and 0 otherwise. The invalid exception is
6262
| raised if either operand is a NaN. Otherwise, the comparison is performed
6263
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6264
*----------------------------------------------------------------------------*/
6266
int float128_eq( float128 a, float128 b STATUS_PARAM )
6269
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6270
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6271
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
6272
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6274
float_raise( float_flag_invalid STATUS_VAR);
6279
&& ( ( a.high == b.high )
6281
&& ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6286
/*----------------------------------------------------------------------------
6287
| Returns 1 if the quadruple-precision floating-point value `a' is less than
6288
| or equal to the corresponding value `b', and 0 otherwise. The invalid
6289
| exception is raised if either operand is a NaN. The comparison is performed
6290
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6291
*----------------------------------------------------------------------------*/
6293
int float128_le( float128 a, float128 b STATUS_PARAM )
6297
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6298
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6299
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
6300
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6302
float_raise( float_flag_invalid STATUS_VAR);
6305
aSign = extractFloat128Sign( a );
6306
bSign = extractFloat128Sign( b );
6307
if ( aSign != bSign ) {
6310
|| ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6314
aSign ? le128( b.high, b.low, a.high, a.low )
6315
: le128( a.high, a.low, b.high, b.low );
6319
/*----------------------------------------------------------------------------
6320
| Returns 1 if the quadruple-precision floating-point value `a' is less than
6321
| the corresponding value `b', and 0 otherwise. The invalid exception is
6322
| raised if either operand is a NaN. The comparison is performed according
6323
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6324
*----------------------------------------------------------------------------*/
6326
int float128_lt( float128 a, float128 b STATUS_PARAM )
6330
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6331
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6332
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
6333
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6335
float_raise( float_flag_invalid STATUS_VAR);
6338
aSign = extractFloat128Sign( a );
6339
bSign = extractFloat128Sign( b );
6340
if ( aSign != bSign ) {
6343
&& ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6347
aSign ? lt128( b.high, b.low, a.high, a.low )
6348
: lt128( a.high, a.low, b.high, b.low );
6352
/*----------------------------------------------------------------------------
6353
| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6354
| be compared, and 0 otherwise. The invalid exception is raised if either
6355
| operand is a NaN. The comparison is performed according to the IEC/IEEE
6356
| Standard for Binary Floating-Point Arithmetic.
6357
*----------------------------------------------------------------------------*/
6359
int float128_unordered( float128 a, float128 b STATUS_PARAM )
6361
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6362
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6363
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
6364
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6366
float_raise( float_flag_invalid STATUS_VAR);
6372
/*----------------------------------------------------------------------------
6373
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
6374
| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6375
| exception. The comparison is performed according to the IEC/IEEE Standard
6376
| for Binary Floating-Point Arithmetic.
6377
*----------------------------------------------------------------------------*/
6379
int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
6382
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6383
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6384
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
6385
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6387
if ( float128_is_signaling_nan( a )
6388
|| float128_is_signaling_nan( b ) ) {
6389
float_raise( float_flag_invalid STATUS_VAR);
6395
&& ( ( a.high == b.high )
6397
&& ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6402
/*----------------------------------------------------------------------------
6403
| Returns 1 if the quadruple-precision floating-point value `a' is less than
6404
| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6405
| cause an exception. Otherwise, the comparison is performed according to the
6406
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6407
*----------------------------------------------------------------------------*/
6409
int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
6413
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6414
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6415
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
6416
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6418
if ( float128_is_signaling_nan( a )
6419
|| float128_is_signaling_nan( b ) ) {
6420
float_raise( float_flag_invalid STATUS_VAR);
6424
aSign = extractFloat128Sign( a );
6425
bSign = extractFloat128Sign( b );
6426
if ( aSign != bSign ) {
6429
|| ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6433
aSign ? le128( b.high, b.low, a.high, a.low )
6434
: le128( a.high, a.low, b.high, b.low );
6438
/*----------------------------------------------------------------------------
6439
| Returns 1 if the quadruple-precision floating-point value `a' is less than
6440
| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6441
| exception. Otherwise, the comparison is performed according to the IEC/IEEE
6442
| Standard for Binary Floating-Point Arithmetic.
6443
*----------------------------------------------------------------------------*/
6445
int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
6449
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6450
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6451
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
6452
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6454
if ( float128_is_signaling_nan( a )
6455
|| float128_is_signaling_nan( b ) ) {
6456
float_raise( float_flag_invalid STATUS_VAR);
6460
aSign = extractFloat128Sign( a );
6461
bSign = extractFloat128Sign( b );
6462
if ( aSign != bSign ) {
6465
&& ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6469
aSign ? lt128( b.high, b.low, a.high, a.low )
6470
: lt128( a.high, a.low, b.high, b.low );
6474
/*----------------------------------------------------------------------------
6475
| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6476
| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6477
| comparison is performed according to the IEC/IEEE Standard for Binary
6478
| Floating-Point Arithmetic.
6479
*----------------------------------------------------------------------------*/
6481
int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
6483
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6484
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6485
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
6486
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6488
if ( float128_is_signaling_nan( a )
6489
|| float128_is_signaling_nan( b ) ) {
6490
float_raise( float_flag_invalid STATUS_VAR);
6497
/* misc functions */
6498
float32 uint32_to_float32(uint32_t a STATUS_PARAM)
6500
return int64_to_float32(a STATUS_VAR);
6503
float64 uint32_to_float64(uint32_t a STATUS_PARAM)
6505
return int64_to_float64(a STATUS_VAR);
6508
uint32 float32_to_uint32( float32 a STATUS_PARAM )
6512
int old_exc_flags = get_float_exception_flags(status);
6514
v = float32_to_int64(a STATUS_VAR);
6517
} else if (v > 0xffffffff) {
6522
set_float_exception_flags(old_exc_flags, status);
6523
float_raise(float_flag_invalid STATUS_VAR);
6527
uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
6531
int old_exc_flags = get_float_exception_flags(status);
6533
v = float32_to_int64_round_to_zero(a STATUS_VAR);
6536
} else if (v > 0xffffffff) {
6541
set_float_exception_flags(old_exc_flags, status);
6542
float_raise(float_flag_invalid STATUS_VAR);
6546
int_fast16_t float32_to_int16(float32 a STATUS_PARAM)
6550
int old_exc_flags = get_float_exception_flags(status);
6552
v = float32_to_int32(a STATUS_VAR);
6555
} else if (v > 0x7fff) {
6561
set_float_exception_flags(old_exc_flags, status);
6562
float_raise(float_flag_invalid STATUS_VAR);
6566
uint_fast16_t float32_to_uint16(float32 a STATUS_PARAM)
6570
int old_exc_flags = get_float_exception_flags(status);
6572
v = float32_to_int32(a STATUS_VAR);
6575
} else if (v > 0xffff) {
6581
set_float_exception_flags(old_exc_flags, status);
6582
float_raise(float_flag_invalid STATUS_VAR);
6586
uint_fast16_t float32_to_uint16_round_to_zero(float32 a STATUS_PARAM)
6590
int old_exc_flags = get_float_exception_flags(status);
6592
v = float32_to_int64_round_to_zero(a STATUS_VAR);
6595
} else if (v > 0xffff) {
6600
set_float_exception_flags(old_exc_flags, status);
6601
float_raise(float_flag_invalid STATUS_VAR);
6605
uint32 float64_to_uint32( float64 a STATUS_PARAM )
6610
v = float64_to_int64(a STATUS_VAR);
6613
float_raise( float_flag_invalid STATUS_VAR);
6614
} else if (v > 0xffffffff) {
6616
float_raise( float_flag_invalid STATUS_VAR);
6623
uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
6628
v = float64_to_int64_round_to_zero(a STATUS_VAR);
6631
float_raise( float_flag_invalid STATUS_VAR);
6632
} else if (v > 0xffffffff) {
6634
float_raise( float_flag_invalid STATUS_VAR);
6641
int_fast16_t float64_to_int16(float64 a STATUS_PARAM)
6645
int old_exc_flags = get_float_exception_flags(status);
6647
v = float64_to_int32(a STATUS_VAR);
6650
} else if (v > 0x7fff) {
6656
set_float_exception_flags(old_exc_flags, status);
6657
float_raise(float_flag_invalid STATUS_VAR);
6661
uint_fast16_t float64_to_uint16(float64 a STATUS_PARAM)
6665
int old_exc_flags = get_float_exception_flags(status);
6667
v = float64_to_int32(a STATUS_VAR);
6670
} else if (v > 0xffff) {
6676
set_float_exception_flags(old_exc_flags, status);
6677
float_raise(float_flag_invalid STATUS_VAR);
6681
uint_fast16_t float64_to_uint16_round_to_zero(float64 a STATUS_PARAM)
6685
int old_exc_flags = get_float_exception_flags(status);
6687
v = float64_to_int64_round_to_zero(a STATUS_VAR);
6690
} else if (v > 0xffff) {
6695
set_float_exception_flags(old_exc_flags, status);
6696
float_raise(float_flag_invalid STATUS_VAR);
6700
/*----------------------------------------------------------------------------
6701
| Returns the result of converting the double-precision floating-point value
6702
| `a' to the 64-bit unsigned integer format. The conversion is
6703
| performed according to the IEC/IEEE Standard for Binary Floating-Point
6704
| Arithmetic---which means in particular that the conversion is rounded
6705
| according to the current rounding mode. If `a' is a NaN, the largest
6706
| positive integer is returned. If the conversion overflows, the
6707
| largest unsigned integer is returned. If 'a' is negative, the value is
6708
| rounded and zero is returned; negative values that do not round to zero
6709
| will raise the inexact exception.
6710
*----------------------------------------------------------------------------*/
6712
uint64_t float64_to_uint64(float64 a STATUS_PARAM)
6715
int_fast16_t aExp, shiftCount;
6716
uint64_t aSig, aSigExtra;
6717
a = float64_squash_input_denormal(a STATUS_VAR);
6719
aSig = extractFloat64Frac(a);
6720
aExp = extractFloat64Exp(a);
6721
aSign = extractFloat64Sign(a);
6722
if (aSign && (aExp > 1022)) {
6723
float_raise(float_flag_invalid STATUS_VAR);
6724
if (float64_is_any_nan(a)) {
6725
return LIT64(0xFFFFFFFFFFFFFFFF);
6731
aSig |= LIT64(0x0010000000000000);
6733
shiftCount = 0x433 - aExp;
6734
if (shiftCount <= 0) {
6736
float_raise(float_flag_invalid STATUS_VAR);
6737
return LIT64(0xFFFFFFFFFFFFFFFF);
6740
aSig <<= -shiftCount;
6742
shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
6744
return roundAndPackUint64(aSign, aSig, aSigExtra STATUS_VAR);
6747
uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
6751
v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6752
v += float64_val(a);
6753
v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
6755
return v - INT64_MIN;
6758
#define COMPARE(s, nan_exp) \
6759
INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
6760
int is_quiet STATUS_PARAM ) \
6762
flag aSign, bSign; \
6763
uint ## s ## _t av, bv; \
6764
a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6765
b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
6767
if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
6768
extractFloat ## s ## Frac( a ) ) || \
6769
( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
6770
extractFloat ## s ## Frac( b ) )) { \
6772
float ## s ## _is_signaling_nan( a ) || \
6773
float ## s ## _is_signaling_nan( b ) ) { \
6774
float_raise( float_flag_invalid STATUS_VAR); \
6776
return float_relation_unordered; \
6778
aSign = extractFloat ## s ## Sign( a ); \
6779
bSign = extractFloat ## s ## Sign( b ); \
6780
av = float ## s ## _val(a); \
6781
bv = float ## s ## _val(b); \
6782
if ( aSign != bSign ) { \
6783
if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
6785
return float_relation_equal; \
6787
return 1 - (2 * aSign); \
6791
return float_relation_equal; \
6793
return 1 - 2 * (aSign ^ ( av < bv )); \
6798
int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
6800
return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
6803
int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
6805
return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
6811
INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,
6812
int is_quiet STATUS_PARAM )
6816
if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6817
( extractFloatx80Frac( a )<<1 ) ) ||
6818
( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6819
( extractFloatx80Frac( b )<<1 ) )) {
6821
floatx80_is_signaling_nan( a ) ||
6822
floatx80_is_signaling_nan( b ) ) {
6823
float_raise( float_flag_invalid STATUS_VAR);
6825
return float_relation_unordered;
6827
aSign = extractFloatx80Sign( a );
6828
bSign = extractFloatx80Sign( b );
6829
if ( aSign != bSign ) {
6831
if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6832
( ( a.low | b.low ) == 0 ) ) {
6834
return float_relation_equal;
6836
return 1 - (2 * aSign);
6839
if (a.low == b.low && a.high == b.high) {
6840
return float_relation_equal;
6842
return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6847
int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
6849
return floatx80_compare_internal(a, b, 0 STATUS_VAR);
6852
int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
6854
return floatx80_compare_internal(a, b, 1 STATUS_VAR);
6857
INLINE int float128_compare_internal( float128 a, float128 b,
6858
int is_quiet STATUS_PARAM )
6862
if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6863
( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6864
( ( extractFloat128Exp( b ) == 0x7fff ) &&
6865
( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6867
float128_is_signaling_nan( a ) ||
6868
float128_is_signaling_nan( b ) ) {
6869
float_raise( float_flag_invalid STATUS_VAR);
6871
return float_relation_unordered;
6873
aSign = extractFloat128Sign( a );
6874
bSign = extractFloat128Sign( b );
6875
if ( aSign != bSign ) {
6876
if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6878
return float_relation_equal;
6880
return 1 - (2 * aSign);
6883
if (a.low == b.low && a.high == b.high) {
6884
return float_relation_equal;
6886
return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6891
int float128_compare( float128 a, float128 b STATUS_PARAM )
6893
return float128_compare_internal(a, b, 0 STATUS_VAR);
6896
int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6898
return float128_compare_internal(a, b, 1 STATUS_VAR);
6901
/* min() and max() functions. These can't be implemented as
6902
* 'compare and pick one input' because that would mishandle
6903
* NaNs and +0 vs -0.
6905
* minnum() and maxnum() functions. These are similar to the min()
6906
* and max() functions but if one of the arguments is a QNaN and
6907
* the other is numerical then the numerical argument is returned.
6908
* minnum() and maxnum correspond to the IEEE 754-2008 minNum()
6909
* and maxNum() operations. min() and max() are the typical min/max
6910
* semantics provided by many CPUs which predate that specification.
6912
#define MINMAX(s, nan_exp) \
6913
INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b, \
6914
int ismin, int isieee STATUS_PARAM) \
6916
flag aSign, bSign; \
6917
uint ## s ## _t av, bv; \
6918
a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6919
b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
6920
if (float ## s ## _is_any_nan(a) || \
6921
float ## s ## _is_any_nan(b)) { \
6923
if (float ## s ## _is_quiet_nan(a) && \
6924
!float ## s ##_is_any_nan(b)) { \
6926
} else if (float ## s ## _is_quiet_nan(b) && \
6927
!float ## s ## _is_any_nan(a)) { \
6931
return propagateFloat ## s ## NaN(a, b STATUS_VAR); \
6933
aSign = extractFloat ## s ## Sign(a); \
6934
bSign = extractFloat ## s ## Sign(b); \
6935
av = float ## s ## _val(a); \
6936
bv = float ## s ## _val(b); \
6937
if (aSign != bSign) { \
6939
return aSign ? a : b; \
6941
return aSign ? b : a; \
6945
return (aSign ^ (av < bv)) ? a : b; \
6947
return (aSign ^ (av < bv)) ? b : a; \
6952
float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM) \
6954
return float ## s ## _minmax(a, b, 1, 0 STATUS_VAR); \
6957
float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM) \
6959
return float ## s ## _minmax(a, b, 0, 0 STATUS_VAR); \
6962
float ## s float ## s ## _minnum(float ## s a, float ## s b STATUS_PARAM) \
6964
return float ## s ## _minmax(a, b, 1, 1 STATUS_VAR); \
6967
float ## s float ## s ## _maxnum(float ## s a, float ## s b STATUS_PARAM) \
6969
return float ## s ## _minmax(a, b, 0, 1 STATUS_VAR); \
6976
/* Multiply A by 2 raised to the power N. */
6977
float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6983
a = float32_squash_input_denormal(a STATUS_VAR);
6984
aSig = extractFloat32Frac( a );
6985
aExp = extractFloat32Exp( a );
6986
aSign = extractFloat32Sign( a );
6988
if ( aExp == 0xFF ) {
6990
return propagateFloat32NaN( a, a STATUS_VAR );
6996
} else if (aSig == 0) {
7004
} else if (n < -0x200) {
7010
return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
7013
float64 float64_scalbn( float64 a, int n STATUS_PARAM )
7019
a = float64_squash_input_denormal(a STATUS_VAR);
7020
aSig = extractFloat64Frac( a );
7021
aExp = extractFloat64Exp( a );
7022
aSign = extractFloat64Sign( a );
7024
if ( aExp == 0x7FF ) {
7026
return propagateFloat64NaN( a, a STATUS_VAR );
7031
aSig |= LIT64( 0x0010000000000000 );
7032
} else if (aSig == 0) {
7040
} else if (n < -0x1000) {
7046
return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
7049
floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
7055
aSig = extractFloatx80Frac( a );
7056
aExp = extractFloatx80Exp( a );
7057
aSign = extractFloatx80Sign( a );
7059
if ( aExp == 0x7FFF ) {
7061
return propagateFloatx80NaN( a, a STATUS_VAR );
7075
} else if (n < -0x10000) {
7080
return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
7081
aSign, aExp, aSig, 0 STATUS_VAR );
7084
float128 float128_scalbn( float128 a, int n STATUS_PARAM )
7088
uint64_t aSig0, aSig1;
7090
aSig1 = extractFloat128Frac1( a );
7091
aSig0 = extractFloat128Frac0( a );
7092
aExp = extractFloat128Exp( a );
7093
aSign = extractFloat128Sign( a );
7094
if ( aExp == 0x7FFF ) {
7095
if ( aSig0 | aSig1 ) {
7096
return propagateFloat128NaN( a, a STATUS_VAR );
7101
aSig0 |= LIT64( 0x0001000000000000 );
7102
} else if (aSig0 == 0 && aSig1 == 0) {
7110
} else if (n < -0x10000) {
7115
return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1