4
* Derived from SoftFloat.
7
/*============================================================================
9
This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
12
Written by John R. Hauser. This work was made possible in part by the
13
International Computer Science Institute, located at Suite 600, 1947 Center
14
Street, Berkeley, California 94704. Funding was partially provided by the
15
National Science Foundation under grant MIP-9311980. The original version
16
of this code was written as part of a project to build a fixed-point vector
17
processor in collaboration with the University of California at Berkeley,
18
overseen by Profs. Nelson Morgan and John Wawrzynek. More information
19
is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
20
arithmetic/SoftFloat.html'.
22
THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
23
been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
24
RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
25
AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
26
COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
27
EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
28
INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
29
OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
31
Derivative works are acceptable, even for commercial purposes, so long as
32
(1) the source code for the derivative work includes prominent notice that
33
the work is derivative, and (2) the source code includes prominent notice with
34
these four paragraphs for those parts of this code that are retained.
36
=============================================================================*/
38
/* softfloat (and in particular the code in softfloat-specialize.h) is
39
* target-dependent and needs the TARGET_* macros.
43
#include "fpu/softfloat.h"
45
/*----------------------------------------------------------------------------
46
| Primitive arithmetic functions, including multi-word arithmetic, and
47
| division and square root approximations. (Can be specialized to target if
49
*----------------------------------------------------------------------------*/
50
#include "softfloat-macros.h"
52
/*----------------------------------------------------------------------------
53
| Functions and definitions to determine: (1) whether tininess for underflow
54
| is detected before or after rounding by default, (2) what (if anything)
55
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
56
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
57
| are propagated from function inputs to output. These details are target-
59
*----------------------------------------------------------------------------*/
60
#include "softfloat-specialize.h"
62
void set_float_rounding_mode(int val STATUS_PARAM)
64
STATUS(float_rounding_mode) = val;
67
void set_float_exception_flags(int val STATUS_PARAM)
69
STATUS(float_exception_flags) = val;
72
void set_floatx80_rounding_precision(int val STATUS_PARAM)
74
STATUS(floatx80_rounding_precision) = val;
77
/*----------------------------------------------------------------------------
78
| Returns the fraction bits of the half-precision floating-point value `a'.
79
*----------------------------------------------------------------------------*/
81
INLINE uint32_t extractFloat16Frac(float16 a)
83
return float16_val(a) & 0x3ff;
86
/*----------------------------------------------------------------------------
87
| Returns the exponent bits of the half-precision floating-point value `a'.
88
*----------------------------------------------------------------------------*/
90
INLINE int_fast16_t extractFloat16Exp(float16 a)
92
return (float16_val(a) >> 10) & 0x1f;
95
/*----------------------------------------------------------------------------
96
| Returns the sign bit of the single-precision floating-point value `a'.
97
*----------------------------------------------------------------------------*/
99
INLINE flag extractFloat16Sign(float16 a)
101
return float16_val(a)>>15;
104
/*----------------------------------------------------------------------------
105
| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
106
| and 7, and returns the properly rounded 32-bit integer corresponding to the
107
| input. If `zSign' is 1, the input is negated before being converted to an
108
| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
109
| is simply rounded to an integer, with the inexact exception raised if the
110
| input cannot be represented exactly as an integer. However, if the fixed-
111
| point input is too large, the invalid exception is raised and the largest
112
| positive or negative integer is returned.
113
*----------------------------------------------------------------------------*/
115
static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
118
flag roundNearestEven;
119
int8 roundIncrement, roundBits;
122
roundingMode = STATUS(float_rounding_mode);
123
roundNearestEven = ( roundingMode == float_round_nearest_even );
124
roundIncrement = 0x40;
125
if ( ! roundNearestEven ) {
126
if ( roundingMode == float_round_to_zero ) {
130
roundIncrement = 0x7F;
132
if ( roundingMode == float_round_up ) roundIncrement = 0;
135
if ( roundingMode == float_round_down ) roundIncrement = 0;
139
roundBits = absZ & 0x7F;
140
absZ = ( absZ + roundIncrement )>>7;
141
absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
143
if ( zSign ) z = - z;
144
if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
145
float_raise( float_flag_invalid STATUS_VAR);
146
return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
148
if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
153
/*----------------------------------------------------------------------------
154
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
155
| `absZ1', with binary point between bits 63 and 64 (between the input words),
156
| and returns the properly rounded 64-bit integer corresponding to the input.
157
| If `zSign' is 1, the input is negated before being converted to an integer.
158
| Ordinarily, the fixed-point input is simply rounded to an integer, with
159
| the inexact exception raised if the input cannot be represented exactly as
160
| an integer. However, if the fixed-point input is too large, the invalid
161
| exception is raised and the largest positive or negative integer is
163
*----------------------------------------------------------------------------*/
165
static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
168
flag roundNearestEven, increment;
171
roundingMode = STATUS(float_rounding_mode);
172
roundNearestEven = ( roundingMode == float_round_nearest_even );
173
increment = ( (int64_t) absZ1 < 0 );
174
if ( ! roundNearestEven ) {
175
if ( roundingMode == float_round_to_zero ) {
180
increment = ( roundingMode == float_round_down ) && absZ1;
183
increment = ( roundingMode == float_round_up ) && absZ1;
189
if ( absZ0 == 0 ) goto overflow;
190
absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
193
if ( zSign ) z = - z;
194
if ( z && ( ( z < 0 ) ^ zSign ) ) {
196
float_raise( float_flag_invalid STATUS_VAR);
198
zSign ? (int64_t) LIT64( 0x8000000000000000 )
199
: LIT64( 0x7FFFFFFFFFFFFFFF );
201
if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
206
/*----------------------------------------------------------------------------
207
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
208
| `absZ1', with binary point between bits 63 and 64 (between the input words),
209
| and returns the properly rounded 64-bit unsigned integer corresponding to the
210
| input. Ordinarily, the fixed-point input is simply rounded to an integer,
211
| with the inexact exception raised if the input cannot be represented exactly
212
| as an integer. However, if the fixed-point input is too large, the invalid
213
| exception is raised and the largest unsigned integer is returned.
214
*----------------------------------------------------------------------------*/
216
static int64 roundAndPackUint64(flag zSign, uint64_t absZ0,
217
uint64_t absZ1 STATUS_PARAM)
220
flag roundNearestEven, increment;
222
roundingMode = STATUS(float_rounding_mode);
223
roundNearestEven = (roundingMode == float_round_nearest_even);
224
increment = ((int64_t)absZ1 < 0);
225
if (!roundNearestEven) {
226
if (roundingMode == float_round_to_zero) {
230
increment = (roundingMode == float_round_down) && absZ1;
232
increment = (roundingMode == float_round_up) && absZ1;
239
float_raise(float_flag_invalid STATUS_VAR);
240
return LIT64(0xFFFFFFFFFFFFFFFF);
242
absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
245
if (zSign && absZ0) {
246
float_raise(float_flag_invalid STATUS_VAR);
251
STATUS(float_exception_flags) |= float_flag_inexact;
256
/*----------------------------------------------------------------------------
257
| Returns the fraction bits of the single-precision floating-point value `a'.
258
*----------------------------------------------------------------------------*/
260
INLINE uint32_t extractFloat32Frac( float32 a )
263
return float32_val(a) & 0x007FFFFF;
267
/*----------------------------------------------------------------------------
268
| Returns the exponent bits of the single-precision floating-point value `a'.
269
*----------------------------------------------------------------------------*/
271
INLINE int_fast16_t extractFloat32Exp(float32 a)
274
return ( float32_val(a)>>23 ) & 0xFF;
278
/*----------------------------------------------------------------------------
279
| Returns the sign bit of the single-precision floating-point value `a'.
280
*----------------------------------------------------------------------------*/
282
INLINE flag extractFloat32Sign( float32 a )
285
return float32_val(a)>>31;
289
/*----------------------------------------------------------------------------
290
| If `a' is denormal and we are in flush-to-zero mode then set the
291
| input-denormal exception and return zero. Otherwise just return the value.
292
*----------------------------------------------------------------------------*/
293
static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
295
if (STATUS(flush_inputs_to_zero)) {
296
if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
297
float_raise(float_flag_input_denormal STATUS_VAR);
298
return make_float32(float32_val(a) & 0x80000000);
304
/*----------------------------------------------------------------------------
305
| Normalizes the subnormal single-precision floating-point value represented
306
| by the denormalized significand `aSig'. The normalized exponent and
307
| significand are stored at the locations pointed to by `zExpPtr' and
308
| `zSigPtr', respectively.
309
*----------------------------------------------------------------------------*/
312
normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
316
shiftCount = countLeadingZeros32( aSig ) - 8;
317
*zSigPtr = aSig<<shiftCount;
318
*zExpPtr = 1 - shiftCount;
322
/*----------------------------------------------------------------------------
323
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
324
| single-precision floating-point value, returning the result. After being
325
| shifted into the proper positions, the three fields are simply added
326
| together to form the result. This means that any integer portion of `zSig'
327
| will be added into the exponent. Since a properly normalized significand
328
| will have an integer portion equal to 1, the `zExp' input should be 1 less
329
| than the desired result exponent whenever `zSig' is a complete, normalized
331
*----------------------------------------------------------------------------*/
333
INLINE float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
337
( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
341
/*----------------------------------------------------------------------------
342
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
343
| and significand `zSig', and returns the proper single-precision floating-
344
| point value corresponding to the abstract input. Ordinarily, the abstract
345
| value is simply rounded and packed into the single-precision format, with
346
| the inexact exception raised if the abstract input cannot be represented
347
| exactly. However, if the abstract value is too large, the overflow and
348
| inexact exceptions are raised and an infinity or maximal finite value is
349
| returned. If the abstract value is too small, the input value is rounded to
350
| a subnormal number, and the underflow and inexact exceptions are raised if
351
| the abstract input cannot be represented exactly as a subnormal single-
352
| precision floating-point number.
353
| The input significand `zSig' has its binary point between bits 30
354
| and 29, which is 7 bits to the left of the usual location. This shifted
355
| significand must be normalized or smaller. If `zSig' is not normalized,
356
| `zExp' must be 0; in that case, the result returned is a subnormal number,
357
| and it must not require rounding. In the usual case that `zSig' is
358
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
359
| The handling of underflow and overflow follows the IEC/IEEE Standard for
360
| Binary Floating-Point Arithmetic.
361
*----------------------------------------------------------------------------*/
363
static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
366
flag roundNearestEven;
367
int8 roundIncrement, roundBits;
370
roundingMode = STATUS(float_rounding_mode);
371
roundNearestEven = ( roundingMode == float_round_nearest_even );
372
roundIncrement = 0x40;
373
if ( ! roundNearestEven ) {
374
if ( roundingMode == float_round_to_zero ) {
378
roundIncrement = 0x7F;
380
if ( roundingMode == float_round_up ) roundIncrement = 0;
383
if ( roundingMode == float_round_down ) roundIncrement = 0;
387
roundBits = zSig & 0x7F;
388
if ( 0xFD <= (uint16_t) zExp ) {
390
|| ( ( zExp == 0xFD )
391
&& ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
393
float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
394
return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
397
if (STATUS(flush_to_zero)) {
398
float_raise(float_flag_output_denormal STATUS_VAR);
399
return packFloat32(zSign, 0, 0);
402
( STATUS(float_detect_tininess) == float_tininess_before_rounding )
404
|| ( zSig + roundIncrement < 0x80000000 );
405
shift32RightJamming( zSig, - zExp, &zSig );
407
roundBits = zSig & 0x7F;
408
if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
411
if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
412
zSig = ( zSig + roundIncrement )>>7;
413
zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
414
if ( zSig == 0 ) zExp = 0;
415
return packFloat32( zSign, zExp, zSig );
419
/*----------------------------------------------------------------------------
420
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
421
| and significand `zSig', and returns the proper single-precision floating-
422
| point value corresponding to the abstract input. This routine is just like
423
| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
424
| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
425
| floating-point exponent.
426
*----------------------------------------------------------------------------*/
429
normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
433
shiftCount = countLeadingZeros32( zSig ) - 1;
434
return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
438
/*----------------------------------------------------------------------------
439
| Returns the fraction bits of the double-precision floating-point value `a'.
440
*----------------------------------------------------------------------------*/
442
INLINE uint64_t extractFloat64Frac( float64 a )
445
return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
449
/*----------------------------------------------------------------------------
450
| Returns the exponent bits of the double-precision floating-point value `a'.
451
*----------------------------------------------------------------------------*/
453
INLINE int_fast16_t extractFloat64Exp(float64 a)
456
return ( float64_val(a)>>52 ) & 0x7FF;
460
/*----------------------------------------------------------------------------
461
| Returns the sign bit of the double-precision floating-point value `a'.
462
*----------------------------------------------------------------------------*/
464
INLINE flag extractFloat64Sign( float64 a )
467
return float64_val(a)>>63;
471
/*----------------------------------------------------------------------------
472
| If `a' is denormal and we are in flush-to-zero mode then set the
473
| input-denormal exception and return zero. Otherwise just return the value.
474
*----------------------------------------------------------------------------*/
475
static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
477
if (STATUS(flush_inputs_to_zero)) {
478
if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
479
float_raise(float_flag_input_denormal STATUS_VAR);
480
return make_float64(float64_val(a) & (1ULL << 63));
486
/*----------------------------------------------------------------------------
487
| Normalizes the subnormal double-precision floating-point value represented
488
| by the denormalized significand `aSig'. The normalized exponent and
489
| significand are stored at the locations pointed to by `zExpPtr' and
490
| `zSigPtr', respectively.
491
*----------------------------------------------------------------------------*/
494
normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
498
shiftCount = countLeadingZeros64( aSig ) - 11;
499
*zSigPtr = aSig<<shiftCount;
500
*zExpPtr = 1 - shiftCount;
504
/*----------------------------------------------------------------------------
505
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
506
| double-precision floating-point value, returning the result. After being
507
| shifted into the proper positions, the three fields are simply added
508
| together to form the result. This means that any integer portion of `zSig'
509
| will be added into the exponent. Since a properly normalized significand
510
| will have an integer portion equal to 1, the `zExp' input should be 1 less
511
| than the desired result exponent whenever `zSig' is a complete, normalized
513
*----------------------------------------------------------------------------*/
515
INLINE float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
519
( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
523
/*----------------------------------------------------------------------------
524
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
525
| and significand `zSig', and returns the proper double-precision floating-
526
| point value corresponding to the abstract input. Ordinarily, the abstract
527
| value is simply rounded and packed into the double-precision format, with
528
| the inexact exception raised if the abstract input cannot be represented
529
| exactly. However, if the abstract value is too large, the overflow and
530
| inexact exceptions are raised and an infinity or maximal finite value is
531
| returned. If the abstract value is too small, the input value is rounded
532
| to a subnormal number, and the underflow and inexact exceptions are raised
533
| if the abstract input cannot be represented exactly as a subnormal double-
534
| precision floating-point number.
535
| The input significand `zSig' has its binary point between bits 62
536
| and 61, which is 10 bits to the left of the usual location. This shifted
537
| significand must be normalized or smaller. If `zSig' is not normalized,
538
| `zExp' must be 0; in that case, the result returned is a subnormal number,
539
| and it must not require rounding. In the usual case that `zSig' is
540
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
541
| The handling of underflow and overflow follows the IEC/IEEE Standard for
542
| Binary Floating-Point Arithmetic.
543
*----------------------------------------------------------------------------*/
545
static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
548
flag roundNearestEven;
549
int_fast16_t roundIncrement, roundBits;
552
roundingMode = STATUS(float_rounding_mode);
553
roundNearestEven = ( roundingMode == float_round_nearest_even );
554
roundIncrement = 0x200;
555
if ( ! roundNearestEven ) {
556
if ( roundingMode == float_round_to_zero ) {
560
roundIncrement = 0x3FF;
562
if ( roundingMode == float_round_up ) roundIncrement = 0;
565
if ( roundingMode == float_round_down ) roundIncrement = 0;
569
roundBits = zSig & 0x3FF;
570
if ( 0x7FD <= (uint16_t) zExp ) {
571
if ( ( 0x7FD < zExp )
572
|| ( ( zExp == 0x7FD )
573
&& ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
575
float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
576
return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
579
if (STATUS(flush_to_zero)) {
580
float_raise(float_flag_output_denormal STATUS_VAR);
581
return packFloat64(zSign, 0, 0);
584
( STATUS(float_detect_tininess) == float_tininess_before_rounding )
586
|| ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
587
shift64RightJamming( zSig, - zExp, &zSig );
589
roundBits = zSig & 0x3FF;
590
if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
593
if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
594
zSig = ( zSig + roundIncrement )>>10;
595
zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
596
if ( zSig == 0 ) zExp = 0;
597
return packFloat64( zSign, zExp, zSig );
601
/*----------------------------------------------------------------------------
602
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
603
| and significand `zSig', and returns the proper double-precision floating-
604
| point value corresponding to the abstract input. This routine is just like
605
| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
606
| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
607
| floating-point exponent.
608
*----------------------------------------------------------------------------*/
611
normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
615
shiftCount = countLeadingZeros64( zSig ) - 1;
616
return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
620
/*----------------------------------------------------------------------------
621
| Returns the fraction bits of the extended double-precision floating-point
623
*----------------------------------------------------------------------------*/
625
INLINE uint64_t extractFloatx80Frac( floatx80 a )
632
/*----------------------------------------------------------------------------
633
| Returns the exponent bits of the extended double-precision floating-point
635
*----------------------------------------------------------------------------*/
637
INLINE int32 extractFloatx80Exp( floatx80 a )
640
return a.high & 0x7FFF;
644
/*----------------------------------------------------------------------------
645
| Returns the sign bit of the extended double-precision floating-point value
647
*----------------------------------------------------------------------------*/
649
INLINE flag extractFloatx80Sign( floatx80 a )
656
/*----------------------------------------------------------------------------
657
| Normalizes the subnormal extended double-precision floating-point value
658
| represented by the denormalized significand `aSig'. The normalized exponent
659
| and significand are stored at the locations pointed to by `zExpPtr' and
660
| `zSigPtr', respectively.
661
*----------------------------------------------------------------------------*/
664
normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
668
shiftCount = countLeadingZeros64( aSig );
669
*zSigPtr = aSig<<shiftCount;
670
*zExpPtr = 1 - shiftCount;
674
/*----------------------------------------------------------------------------
675
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
676
| extended double-precision floating-point value, returning the result.
677
*----------------------------------------------------------------------------*/
679
INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
684
z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
689
/*----------------------------------------------------------------------------
690
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
691
| and extended significand formed by the concatenation of `zSig0' and `zSig1',
692
| and returns the proper extended double-precision floating-point value
693
| corresponding to the abstract input. Ordinarily, the abstract value is
694
| rounded and packed into the extended double-precision format, with the
695
| inexact exception raised if the abstract input cannot be represented
696
| exactly. However, if the abstract value is too large, the overflow and
697
| inexact exceptions are raised and an infinity or maximal finite value is
698
| returned. If the abstract value is too small, the input value is rounded to
699
| a subnormal number, and the underflow and inexact exceptions are raised if
700
| the abstract input cannot be represented exactly as a subnormal extended
701
| double-precision floating-point number.
702
| If `roundingPrecision' is 32 or 64, the result is rounded to the same
703
| number of bits as single or double precision, respectively. Otherwise, the
704
| result is rounded to the full precision of the extended double-precision
706
| The input significand must be normalized or smaller. If the input
707
| significand is not normalized, `zExp' must be 0; in that case, the result
708
| returned is a subnormal number, and it must not require rounding. The
709
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
710
| Floating-Point Arithmetic.
711
*----------------------------------------------------------------------------*/
714
roundAndPackFloatx80(
715
int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
719
flag roundNearestEven, increment, isTiny;
720
int64 roundIncrement, roundMask, roundBits;
722
roundingMode = STATUS(float_rounding_mode);
723
roundNearestEven = ( roundingMode == float_round_nearest_even );
724
if ( roundingPrecision == 80 ) goto precision80;
725
if ( roundingPrecision == 64 ) {
726
roundIncrement = LIT64( 0x0000000000000400 );
727
roundMask = LIT64( 0x00000000000007FF );
729
else if ( roundingPrecision == 32 ) {
730
roundIncrement = LIT64( 0x0000008000000000 );
731
roundMask = LIT64( 0x000000FFFFFFFFFF );
736
zSig0 |= ( zSig1 != 0 );
737
if ( ! roundNearestEven ) {
738
if ( roundingMode == float_round_to_zero ) {
742
roundIncrement = roundMask;
744
if ( roundingMode == float_round_up ) roundIncrement = 0;
747
if ( roundingMode == float_round_down ) roundIncrement = 0;
751
roundBits = zSig0 & roundMask;
752
if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
753
if ( ( 0x7FFE < zExp )
754
|| ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
759
if (STATUS(flush_to_zero)) {
760
float_raise(float_flag_output_denormal STATUS_VAR);
761
return packFloatx80(zSign, 0, 0);
764
( STATUS(float_detect_tininess) == float_tininess_before_rounding )
766
|| ( zSig0 <= zSig0 + roundIncrement );
767
shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
769
roundBits = zSig0 & roundMask;
770
if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
771
if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
772
zSig0 += roundIncrement;
773
if ( (int64_t) zSig0 < 0 ) zExp = 1;
774
roundIncrement = roundMask + 1;
775
if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
776
roundMask |= roundIncrement;
778
zSig0 &= ~ roundMask;
779
return packFloatx80( zSign, zExp, zSig0 );
782
if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
783
zSig0 += roundIncrement;
784
if ( zSig0 < roundIncrement ) {
786
zSig0 = LIT64( 0x8000000000000000 );
788
roundIncrement = roundMask + 1;
789
if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
790
roundMask |= roundIncrement;
792
zSig0 &= ~ roundMask;
793
if ( zSig0 == 0 ) zExp = 0;
794
return packFloatx80( zSign, zExp, zSig0 );
796
increment = ( (int64_t) zSig1 < 0 );
797
if ( ! roundNearestEven ) {
798
if ( roundingMode == float_round_to_zero ) {
803
increment = ( roundingMode == float_round_down ) && zSig1;
806
increment = ( roundingMode == float_round_up ) && zSig1;
810
if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
811
if ( ( 0x7FFE < zExp )
812
|| ( ( zExp == 0x7FFE )
813
&& ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
819
float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
820
if ( ( roundingMode == float_round_to_zero )
821
|| ( zSign && ( roundingMode == float_round_up ) )
822
|| ( ! zSign && ( roundingMode == float_round_down ) )
824
return packFloatx80( zSign, 0x7FFE, ~ roundMask );
826
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
830
( STATUS(float_detect_tininess) == float_tininess_before_rounding )
833
|| ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
834
shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
836
if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
837
if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
838
if ( roundNearestEven ) {
839
increment = ( (int64_t) zSig1 < 0 );
843
increment = ( roundingMode == float_round_down ) && zSig1;
846
increment = ( roundingMode == float_round_up ) && zSig1;
852
~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
853
if ( (int64_t) zSig0 < 0 ) zExp = 1;
855
return packFloatx80( zSign, zExp, zSig0 );
858
if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
863
zSig0 = LIT64( 0x8000000000000000 );
866
zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
870
if ( zSig0 == 0 ) zExp = 0;
872
return packFloatx80( zSign, zExp, zSig0 );
876
/*----------------------------------------------------------------------------
877
| Takes an abstract floating-point value having sign `zSign', exponent
878
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
879
| and returns the proper extended double-precision floating-point value
880
| corresponding to the abstract input. This routine is just like
881
| `roundAndPackFloatx80' except that the input significand does not have to be
883
*----------------------------------------------------------------------------*/
886
normalizeRoundAndPackFloatx80(
887
int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
897
shiftCount = countLeadingZeros64( zSig0 );
898
shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
901
roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
905
/*----------------------------------------------------------------------------
906
| Returns the least-significant 64 fraction bits of the quadruple-precision
907
| floating-point value `a'.
908
*----------------------------------------------------------------------------*/
910
INLINE uint64_t extractFloat128Frac1( float128 a )
917
/*----------------------------------------------------------------------------
918
| Returns the most-significant 48 fraction bits of the quadruple-precision
919
| floating-point value `a'.
920
*----------------------------------------------------------------------------*/
922
INLINE uint64_t extractFloat128Frac0( float128 a )
925
return a.high & LIT64( 0x0000FFFFFFFFFFFF );
929
/*----------------------------------------------------------------------------
930
| Returns the exponent bits of the quadruple-precision floating-point value
932
*----------------------------------------------------------------------------*/
934
INLINE int32 extractFloat128Exp( float128 a )
937
return ( a.high>>48 ) & 0x7FFF;
941
/*----------------------------------------------------------------------------
942
| Returns the sign bit of the quadruple-precision floating-point value `a'.
943
*----------------------------------------------------------------------------*/
945
INLINE flag extractFloat128Sign( float128 a )
952
/*----------------------------------------------------------------------------
953
| Normalizes the subnormal quadruple-precision floating-point value
954
| represented by the denormalized significand formed by the concatenation of
955
| `aSig0' and `aSig1'. The normalized exponent is stored at the location
956
| pointed to by `zExpPtr'. The most significant 49 bits of the normalized
957
| significand are stored at the location pointed to by `zSig0Ptr', and the
958
| least significant 64 bits of the normalized significand are stored at the
959
| location pointed to by `zSig1Ptr'.
960
*----------------------------------------------------------------------------*/
963
normalizeFloat128Subnormal(
974
shiftCount = countLeadingZeros64( aSig1 ) - 15;
975
if ( shiftCount < 0 ) {
976
*zSig0Ptr = aSig1>>( - shiftCount );
977
*zSig1Ptr = aSig1<<( shiftCount & 63 );
980
*zSig0Ptr = aSig1<<shiftCount;
983
*zExpPtr = - shiftCount - 63;
986
shiftCount = countLeadingZeros64( aSig0 ) - 15;
987
shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
988
*zExpPtr = 1 - shiftCount;
993
/*----------------------------------------------------------------------------
994
| Packs the sign `zSign', the exponent `zExp', and the significand formed
995
| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
996
| floating-point value, returning the result. After being shifted into the
997
| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
998
| added together to form the most significant 32 bits of the result. This
999
| means that any integer portion of `zSig0' will be added into the exponent.
1000
| Since a properly normalized significand will have an integer portion equal
1001
| to 1, the `zExp' input should be 1 less than the desired result exponent
1002
| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1004
*----------------------------------------------------------------------------*/
1007
packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
1012
z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
1017
/*----------------------------------------------------------------------------
1018
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1019
| and extended significand formed by the concatenation of `zSig0', `zSig1',
1020
| and `zSig2', and returns the proper quadruple-precision floating-point value
1021
| corresponding to the abstract input. Ordinarily, the abstract value is
1022
| simply rounded and packed into the quadruple-precision format, with the
1023
| inexact exception raised if the abstract input cannot be represented
1024
| exactly. However, if the abstract value is too large, the overflow and
1025
| inexact exceptions are raised and an infinity or maximal finite value is
1026
| returned. If the abstract value is too small, the input value is rounded to
1027
| a subnormal number, and the underflow and inexact exceptions are raised if
1028
| the abstract input cannot be represented exactly as a subnormal quadruple-
1029
| precision floating-point number.
1030
| The input significand must be normalized or smaller. If the input
1031
| significand is not normalized, `zExp' must be 0; in that case, the result
1032
| returned is a subnormal number, and it must not require rounding. In the
1033
| usual case that the input significand is normalized, `zExp' must be 1 less
1034
| than the ``true'' floating-point exponent. The handling of underflow and
1035
| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1036
*----------------------------------------------------------------------------*/
1039
roundAndPackFloat128(
1040
flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
1043
flag roundNearestEven, increment, isTiny;
1045
roundingMode = STATUS(float_rounding_mode);
1046
roundNearestEven = ( roundingMode == float_round_nearest_even );
1047
increment = ( (int64_t) zSig2 < 0 );
1048
if ( ! roundNearestEven ) {
1049
if ( roundingMode == float_round_to_zero ) {
1054
increment = ( roundingMode == float_round_down ) && zSig2;
1057
increment = ( roundingMode == float_round_up ) && zSig2;
1061
if ( 0x7FFD <= (uint32_t) zExp ) {
1062
if ( ( 0x7FFD < zExp )
1063
|| ( ( zExp == 0x7FFD )
1065
LIT64( 0x0001FFFFFFFFFFFF ),
1066
LIT64( 0xFFFFFFFFFFFFFFFF ),
1073
float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1074
if ( ( roundingMode == float_round_to_zero )
1075
|| ( zSign && ( roundingMode == float_round_up ) )
1076
|| ( ! zSign && ( roundingMode == float_round_down ) )
1082
LIT64( 0x0000FFFFFFFFFFFF ),
1083
LIT64( 0xFFFFFFFFFFFFFFFF )
1086
return packFloat128( zSign, 0x7FFF, 0, 0 );
1089
if (STATUS(flush_to_zero)) {
1090
float_raise(float_flag_output_denormal STATUS_VAR);
1091
return packFloat128(zSign, 0, 0, 0);
1094
( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1100
LIT64( 0x0001FFFFFFFFFFFF ),
1101
LIT64( 0xFFFFFFFFFFFFFFFF )
1103
shift128ExtraRightJamming(
1104
zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1106
if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1107
if ( roundNearestEven ) {
1108
increment = ( (int64_t) zSig2 < 0 );
1112
increment = ( roundingMode == float_round_down ) && zSig2;
1115
increment = ( roundingMode == float_round_up ) && zSig2;
1120
if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1122
add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1123
zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1126
if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1128
return packFloat128( zSign, zExp, zSig0, zSig1 );
1132
/*----------------------------------------------------------------------------
1133
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1134
| and significand formed by the concatenation of `zSig0' and `zSig1', and
1135
| returns the proper quadruple-precision floating-point value corresponding
1136
| to the abstract input. This routine is just like `roundAndPackFloat128'
1137
| except that the input significand has fewer bits and does not have to be
1138
| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1140
*----------------------------------------------------------------------------*/
1143
normalizeRoundAndPackFloat128(
1144
flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1154
shiftCount = countLeadingZeros64( zSig0 ) - 15;
1155
if ( 0 <= shiftCount ) {
1157
shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1160
shift128ExtraRightJamming(
1161
zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1164
return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1168
/*----------------------------------------------------------------------------
1169
| Returns the result of converting the 32-bit two's complement integer `a'
1170
| to the single-precision floating-point format. The conversion is performed
1171
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1172
*----------------------------------------------------------------------------*/
1174
float32 int32_to_float32(int32_t a STATUS_PARAM)
1178
if ( a == 0 ) return float32_zero;
1179
if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1181
return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1185
/*----------------------------------------------------------------------------
1186
| Returns the result of converting the 32-bit two's complement integer `a'
1187
| to the double-precision floating-point format. The conversion is performed
1188
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1189
*----------------------------------------------------------------------------*/
1191
float64 int32_to_float64(int32_t a STATUS_PARAM)
1198
if ( a == 0 ) return float64_zero;
1200
absA = zSign ? - a : a;
1201
shiftCount = countLeadingZeros32( absA ) + 21;
1203
return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1207
/*----------------------------------------------------------------------------
1208
| Returns the result of converting the 32-bit two's complement integer `a'
1209
| to the extended double-precision floating-point format. The conversion
1210
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1212
*----------------------------------------------------------------------------*/
1214
floatx80 int32_to_floatx80(int32_t a STATUS_PARAM)
1221
if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1223
absA = zSign ? - a : a;
1224
shiftCount = countLeadingZeros32( absA ) + 32;
1226
return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1230
/*----------------------------------------------------------------------------
1231
| Returns the result of converting the 32-bit two's complement integer `a' to
1232
| the quadruple-precision floating-point format. The conversion is performed
1233
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1234
*----------------------------------------------------------------------------*/
1236
float128 int32_to_float128(int32_t a STATUS_PARAM)
1243
if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1245
absA = zSign ? - a : a;
1246
shiftCount = countLeadingZeros32( absA ) + 17;
1248
return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1252
/*----------------------------------------------------------------------------
1253
| Returns the result of converting the 64-bit two's complement integer `a'
1254
| to the single-precision floating-point format. The conversion is performed
1255
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1256
*----------------------------------------------------------------------------*/
1258
float32 int64_to_float32(int64_t a STATUS_PARAM)
1264
if ( a == 0 ) return float32_zero;
1266
absA = zSign ? - a : a;
1267
shiftCount = countLeadingZeros64( absA ) - 40;
1268
if ( 0 <= shiftCount ) {
1269
return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1273
if ( shiftCount < 0 ) {
1274
shift64RightJamming( absA, - shiftCount, &absA );
1277
absA <<= shiftCount;
1279
return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1284
float32 uint64_to_float32(uint64_t a STATUS_PARAM)
1288
if ( a == 0 ) return float32_zero;
1289
shiftCount = countLeadingZeros64( a ) - 40;
1290
if ( 0 <= shiftCount ) {
1291
return packFloat32(0, 0x95 - shiftCount, a<<shiftCount);
1295
if ( shiftCount < 0 ) {
1296
shift64RightJamming( a, - shiftCount, &a );
1301
return roundAndPackFloat32(0, 0x9C - shiftCount, a STATUS_VAR);
1305
/*----------------------------------------------------------------------------
1306
| Returns the result of converting the 64-bit two's complement integer `a'
1307
| to the double-precision floating-point format. The conversion is performed
1308
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1309
*----------------------------------------------------------------------------*/
1311
float64 int64_to_float64(int64_t a STATUS_PARAM)
1315
if ( a == 0 ) return float64_zero;
1316
if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1317
return packFloat64( 1, 0x43E, 0 );
1320
return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1324
float64 uint64_to_float64(uint64_t a STATUS_PARAM)
1329
return float64_zero;
1331
if ((int64_t)a < 0) {
1332
shift64RightJamming(a, 1, &a);
1335
return normalizeRoundAndPackFloat64(0, exp, a STATUS_VAR);
1338
/*----------------------------------------------------------------------------
1339
| Returns the result of converting the 64-bit two's complement integer `a'
1340
| to the extended double-precision floating-point format. The conversion
1341
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1343
*----------------------------------------------------------------------------*/
1345
floatx80 int64_to_floatx80(int64_t a STATUS_PARAM)
1351
if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1353
absA = zSign ? - a : a;
1354
shiftCount = countLeadingZeros64( absA );
1355
return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1359
/*----------------------------------------------------------------------------
1360
| Returns the result of converting the 64-bit two's complement integer `a' to
1361
| the quadruple-precision floating-point format. The conversion is performed
1362
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1363
*----------------------------------------------------------------------------*/
1365
float128 int64_to_float128(int64_t a STATUS_PARAM)
1371
uint64_t zSig0, zSig1;
1373
if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1375
absA = zSign ? - a : a;
1376
shiftCount = countLeadingZeros64( absA ) + 49;
1377
zExp = 0x406E - shiftCount;
1378
if ( 64 <= shiftCount ) {
1387
shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1388
return packFloat128( zSign, zExp, zSig0, zSig1 );
1392
float128 uint64_to_float128(uint64_t a STATUS_PARAM)
1395
return float128_zero;
1397
return normalizeRoundAndPackFloat128(0, 0x406E, a, 0 STATUS_VAR);
1400
/*----------------------------------------------------------------------------
1401
| Returns the result of converting the single-precision floating-point value
1402
| `a' to the 32-bit two's complement integer format. The conversion is
1403
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1404
| Arithmetic---which means in particular that the conversion is rounded
1405
| according to the current rounding mode. If `a' is a NaN, the largest
1406
| positive integer is returned. Otherwise, if the conversion overflows, the
1407
| largest integer with the same sign as `a' is returned.
1408
*----------------------------------------------------------------------------*/
1410
int32 float32_to_int32( float32 a STATUS_PARAM )
1413
int_fast16_t aExp, shiftCount;
1417
a = float32_squash_input_denormal(a STATUS_VAR);
1418
aSig = extractFloat32Frac( a );
1419
aExp = extractFloat32Exp( a );
1420
aSign = extractFloat32Sign( a );
1421
if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1422
if ( aExp ) aSig |= 0x00800000;
1423
shiftCount = 0xAF - aExp;
1426
if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1427
return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1431
/*----------------------------------------------------------------------------
1432
| Returns the result of converting the single-precision floating-point value
1433
| `a' to the 32-bit two's complement integer format. The conversion is
1434
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1435
| Arithmetic, except that the conversion is always rounded toward zero.
1436
| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1437
| the conversion overflows, the largest integer with the same sign as `a' is
1439
*----------------------------------------------------------------------------*/
1441
int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1444
int_fast16_t aExp, shiftCount;
1447
a = float32_squash_input_denormal(a STATUS_VAR);
1449
aSig = extractFloat32Frac( a );
1450
aExp = extractFloat32Exp( a );
1451
aSign = extractFloat32Sign( a );
1452
shiftCount = aExp - 0x9E;
1453
if ( 0 <= shiftCount ) {
1454
if ( float32_val(a) != 0xCF000000 ) {
1455
float_raise( float_flag_invalid STATUS_VAR);
1456
if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1458
return (int32_t) 0x80000000;
1460
else if ( aExp <= 0x7E ) {
1461
if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1464
aSig = ( aSig | 0x00800000 )<<8;
1465
z = aSig>>( - shiftCount );
1466
if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1467
STATUS(float_exception_flags) |= float_flag_inexact;
1469
if ( aSign ) z = - z;
1474
/*----------------------------------------------------------------------------
1475
| Returns the result of converting the single-precision floating-point value
1476
| `a' to the 16-bit two's complement integer format. The conversion is
1477
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1478
| Arithmetic, except that the conversion is always rounded toward zero.
1479
| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1480
| the conversion overflows, the largest integer with the same sign as `a' is
1482
*----------------------------------------------------------------------------*/
1484
int_fast16_t float32_to_int16_round_to_zero(float32 a STATUS_PARAM)
1487
int_fast16_t aExp, shiftCount;
1491
aSig = extractFloat32Frac( a );
1492
aExp = extractFloat32Exp( a );
1493
aSign = extractFloat32Sign( a );
1494
shiftCount = aExp - 0x8E;
1495
if ( 0 <= shiftCount ) {
1496
if ( float32_val(a) != 0xC7000000 ) {
1497
float_raise( float_flag_invalid STATUS_VAR);
1498
if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1502
return (int32_t) 0xffff8000;
1504
else if ( aExp <= 0x7E ) {
1505
if ( aExp | aSig ) {
1506
STATUS(float_exception_flags) |= float_flag_inexact;
1511
aSig = ( aSig | 0x00800000 )<<8;
1512
z = aSig>>( - shiftCount );
1513
if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1514
STATUS(float_exception_flags) |= float_flag_inexact;
1523
/*----------------------------------------------------------------------------
1524
| Returns the result of converting the single-precision floating-point value
1525
| `a' to the 64-bit two's complement integer format. The conversion is
1526
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1527
| Arithmetic---which means in particular that the conversion is rounded
1528
| according to the current rounding mode. If `a' is a NaN, the largest
1529
| positive integer is returned. Otherwise, if the conversion overflows, the
1530
| largest integer with the same sign as `a' is returned.
1531
*----------------------------------------------------------------------------*/
1533
int64 float32_to_int64( float32 a STATUS_PARAM )
1536
int_fast16_t aExp, shiftCount;
1538
uint64_t aSig64, aSigExtra;
1539
a = float32_squash_input_denormal(a STATUS_VAR);
1541
aSig = extractFloat32Frac( a );
1542
aExp = extractFloat32Exp( a );
1543
aSign = extractFloat32Sign( a );
1544
shiftCount = 0xBE - aExp;
1545
if ( shiftCount < 0 ) {
1546
float_raise( float_flag_invalid STATUS_VAR);
1547
if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1548
return LIT64( 0x7FFFFFFFFFFFFFFF );
1550
return (int64_t) LIT64( 0x8000000000000000 );
1552
if ( aExp ) aSig |= 0x00800000;
1555
shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1556
return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1560
/*----------------------------------------------------------------------------
1561
| Returns the result of converting the single-precision floating-point value
1562
| `a' to the 64-bit unsigned integer format. The conversion is
1563
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1564
| Arithmetic---which means in particular that the conversion is rounded
1565
| according to the current rounding mode. If `a' is a NaN, the largest
1566
| unsigned integer is returned. Otherwise, if the conversion overflows, the
1567
| largest unsigned integer is returned. If the 'a' is negative, the result
1568
| is rounded and zero is returned; values that do not round to zero will
1569
| raise the inexact exception flag.
1570
*----------------------------------------------------------------------------*/
1572
uint64 float32_to_uint64(float32 a STATUS_PARAM)
1575
int_fast16_t aExp, shiftCount;
1577
uint64_t aSig64, aSigExtra;
1578
a = float32_squash_input_denormal(a STATUS_VAR);
1580
aSig = extractFloat32Frac(a);
1581
aExp = extractFloat32Exp(a);
1582
aSign = extractFloat32Sign(a);
1583
if ((aSign) && (aExp > 126)) {
1584
float_raise(float_flag_invalid STATUS_VAR);
1585
if (float32_is_any_nan(a)) {
1586
return LIT64(0xFFFFFFFFFFFFFFFF);
1591
shiftCount = 0xBE - aExp;
1595
if (shiftCount < 0) {
1596
float_raise(float_flag_invalid STATUS_VAR);
1597
return LIT64(0xFFFFFFFFFFFFFFFF);
1602
shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
1603
return roundAndPackUint64(aSign, aSig64, aSigExtra STATUS_VAR);
1606
/*----------------------------------------------------------------------------
1607
| Returns the result of converting the single-precision floating-point value
1608
| `a' to the 64-bit two's complement integer format. The conversion is
1609
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1610
| Arithmetic, except that the conversion is always rounded toward zero. If
1611
| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1612
| conversion overflows, the largest integer with the same sign as `a' is
1614
*----------------------------------------------------------------------------*/
1616
int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1619
int_fast16_t aExp, shiftCount;
1623
a = float32_squash_input_denormal(a STATUS_VAR);
1625
aSig = extractFloat32Frac( a );
1626
aExp = extractFloat32Exp( a );
1627
aSign = extractFloat32Sign( a );
1628
shiftCount = aExp - 0xBE;
1629
if ( 0 <= shiftCount ) {
1630
if ( float32_val(a) != 0xDF000000 ) {
1631
float_raise( float_flag_invalid STATUS_VAR);
1632
if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1633
return LIT64( 0x7FFFFFFFFFFFFFFF );
1636
return (int64_t) LIT64( 0x8000000000000000 );
1638
else if ( aExp <= 0x7E ) {
1639
if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1642
aSig64 = aSig | 0x00800000;
1644
z = aSig64>>( - shiftCount );
1645
if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1646
STATUS(float_exception_flags) |= float_flag_inexact;
1648
if ( aSign ) z = - z;
1653
/*----------------------------------------------------------------------------
1654
| Returns the result of converting the single-precision floating-point value
1655
| `a' to the double-precision floating-point format. The conversion is
1656
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1658
*----------------------------------------------------------------------------*/
1660
float64 float32_to_float64( float32 a STATUS_PARAM )
1665
a = float32_squash_input_denormal(a STATUS_VAR);
1667
aSig = extractFloat32Frac( a );
1668
aExp = extractFloat32Exp( a );
1669
aSign = extractFloat32Sign( a );
1670
if ( aExp == 0xFF ) {
1671
if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1672
return packFloat64( aSign, 0x7FF, 0 );
1675
if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1676
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1679
return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1683
/*----------------------------------------------------------------------------
1684
| Returns the result of converting the single-precision floating-point value
1685
| `a' to the extended double-precision floating-point format. The conversion
1686
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1688
*----------------------------------------------------------------------------*/
1690
floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1696
a = float32_squash_input_denormal(a STATUS_VAR);
1697
aSig = extractFloat32Frac( a );
1698
aExp = extractFloat32Exp( a );
1699
aSign = extractFloat32Sign( a );
1700
if ( aExp == 0xFF ) {
1701
if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1702
return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1705
if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1706
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1709
return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1713
/*----------------------------------------------------------------------------
1714
| Returns the result of converting the single-precision floating-point value
1715
| `a' to the double-precision floating-point format. The conversion is
1716
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1718
*----------------------------------------------------------------------------*/
1720
float128 float32_to_float128( float32 a STATUS_PARAM )
1726
a = float32_squash_input_denormal(a STATUS_VAR);
1727
aSig = extractFloat32Frac( a );
1728
aExp = extractFloat32Exp( a );
1729
aSign = extractFloat32Sign( a );
1730
if ( aExp == 0xFF ) {
1731
if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1732
return packFloat128( aSign, 0x7FFF, 0, 0 );
1735
if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1736
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1739
return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1743
/*----------------------------------------------------------------------------
1744
| Rounds the single-precision floating-point value `a' to an integer, and
1745
| returns the result as a single-precision floating-point value. The
1746
| operation is performed according to the IEC/IEEE Standard for Binary
1747
| Floating-Point Arithmetic.
1748
*----------------------------------------------------------------------------*/
1750
float32 float32_round_to_int( float32 a STATUS_PARAM)
1754
uint32_t lastBitMask, roundBitsMask;
1757
a = float32_squash_input_denormal(a STATUS_VAR);
1759
aExp = extractFloat32Exp( a );
1760
if ( 0x96 <= aExp ) {
1761
if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1762
return propagateFloat32NaN( a, a STATUS_VAR );
1766
if ( aExp <= 0x7E ) {
1767
if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1768
STATUS(float_exception_flags) |= float_flag_inexact;
1769
aSign = extractFloat32Sign( a );
1770
switch ( STATUS(float_rounding_mode) ) {
1771
case float_round_nearest_even:
1772
if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1773
return packFloat32( aSign, 0x7F, 0 );
1776
case float_round_down:
1777
return make_float32(aSign ? 0xBF800000 : 0);
1778
case float_round_up:
1779
return make_float32(aSign ? 0x80000000 : 0x3F800000);
1781
return packFloat32( aSign, 0, 0 );
1784
lastBitMask <<= 0x96 - aExp;
1785
roundBitsMask = lastBitMask - 1;
1787
roundingMode = STATUS(float_rounding_mode);
1788
if ( roundingMode == float_round_nearest_even ) {
1789
z += lastBitMask>>1;
1790
if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1792
else if ( roundingMode != float_round_to_zero ) {
1793
if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1797
z &= ~ roundBitsMask;
1798
if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1799
return make_float32(z);
1803
/*----------------------------------------------------------------------------
1804
| Returns the result of adding the absolute values of the single-precision
1805
| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1806
| before being returned. `zSign' is ignored if the result is a NaN.
1807
| The addition is performed according to the IEC/IEEE Standard for Binary
1808
| Floating-Point Arithmetic.
1809
*----------------------------------------------------------------------------*/
1811
static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1813
int_fast16_t aExp, bExp, zExp;
1814
uint32_t aSig, bSig, zSig;
1815
int_fast16_t expDiff;
1817
aSig = extractFloat32Frac( a );
1818
aExp = extractFloat32Exp( a );
1819
bSig = extractFloat32Frac( b );
1820
bExp = extractFloat32Exp( b );
1821
expDiff = aExp - bExp;
1824
if ( 0 < expDiff ) {
1825
if ( aExp == 0xFF ) {
1826
if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1835
shift32RightJamming( bSig, expDiff, &bSig );
1838
else if ( expDiff < 0 ) {
1839
if ( bExp == 0xFF ) {
1840
if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1841
return packFloat32( zSign, 0xFF, 0 );
1849
shift32RightJamming( aSig, - expDiff, &aSig );
1853
if ( aExp == 0xFF ) {
1854
if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1858
if (STATUS(flush_to_zero)) {
1860
float_raise(float_flag_output_denormal STATUS_VAR);
1862
return packFloat32(zSign, 0, 0);
1864
return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1866
zSig = 0x40000000 + aSig + bSig;
1871
zSig = ( aSig + bSig )<<1;
1873
if ( (int32_t) zSig < 0 ) {
1878
return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1882
/*----------------------------------------------------------------------------
1883
| Returns the result of subtracting the absolute values of the single-
1884
| precision floating-point values `a' and `b'. If `zSign' is 1, the
1885
| difference is negated before being returned. `zSign' is ignored if the
1886
| result is a NaN. The subtraction is performed according to the IEC/IEEE
1887
| Standard for Binary Floating-Point Arithmetic.
1888
*----------------------------------------------------------------------------*/
1890
static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1892
int_fast16_t aExp, bExp, zExp;
1893
uint32_t aSig, bSig, zSig;
1894
int_fast16_t expDiff;
1896
aSig = extractFloat32Frac( a );
1897
aExp = extractFloat32Exp( a );
1898
bSig = extractFloat32Frac( b );
1899
bExp = extractFloat32Exp( b );
1900
expDiff = aExp - bExp;
1903
if ( 0 < expDiff ) goto aExpBigger;
1904
if ( expDiff < 0 ) goto bExpBigger;
1905
if ( aExp == 0xFF ) {
1906
if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1907
float_raise( float_flag_invalid STATUS_VAR);
1908
return float32_default_nan;
1914
if ( bSig < aSig ) goto aBigger;
1915
if ( aSig < bSig ) goto bBigger;
1916
return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1918
if ( bExp == 0xFF ) {
1919
if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1920
return packFloat32( zSign ^ 1, 0xFF, 0 );
1928
shift32RightJamming( aSig, - expDiff, &aSig );
1934
goto normalizeRoundAndPack;
1936
if ( aExp == 0xFF ) {
1937
if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1946
shift32RightJamming( bSig, expDiff, &bSig );
1951
normalizeRoundAndPack:
1953
return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1957
/*----------------------------------------------------------------------------
1958
| Returns the result of adding the single-precision floating-point values `a'
1959
| and `b'. The operation is performed according to the IEC/IEEE Standard for
1960
| Binary Floating-Point Arithmetic.
1961
*----------------------------------------------------------------------------*/
1963
float32 float32_add( float32 a, float32 b STATUS_PARAM )
1966
a = float32_squash_input_denormal(a STATUS_VAR);
1967
b = float32_squash_input_denormal(b STATUS_VAR);
1969
aSign = extractFloat32Sign( a );
1970
bSign = extractFloat32Sign( b );
1971
if ( aSign == bSign ) {
1972
return addFloat32Sigs( a, b, aSign STATUS_VAR);
1975
return subFloat32Sigs( a, b, aSign STATUS_VAR );
1980
/*----------------------------------------------------------------------------
1981
| Returns the result of subtracting the single-precision floating-point values
1982
| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1983
| for Binary Floating-Point Arithmetic.
1984
*----------------------------------------------------------------------------*/
1986
float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1989
a = float32_squash_input_denormal(a STATUS_VAR);
1990
b = float32_squash_input_denormal(b STATUS_VAR);
1992
aSign = extractFloat32Sign( a );
1993
bSign = extractFloat32Sign( b );
1994
if ( aSign == bSign ) {
1995
return subFloat32Sigs( a, b, aSign STATUS_VAR );
1998
return addFloat32Sigs( a, b, aSign STATUS_VAR );
2003
/*----------------------------------------------------------------------------
2004
| Returns the result of multiplying the single-precision floating-point values
2005
| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2006
| for Binary Floating-Point Arithmetic.
2007
*----------------------------------------------------------------------------*/
2009
float32 float32_mul( float32 a, float32 b STATUS_PARAM )
2011
flag aSign, bSign, zSign;
2012
int_fast16_t aExp, bExp, zExp;
2013
uint32_t aSig, bSig;
2017
a = float32_squash_input_denormal(a STATUS_VAR);
2018
b = float32_squash_input_denormal(b STATUS_VAR);
2020
aSig = extractFloat32Frac( a );
2021
aExp = extractFloat32Exp( a );
2022
aSign = extractFloat32Sign( a );
2023
bSig = extractFloat32Frac( b );
2024
bExp = extractFloat32Exp( b );
2025
bSign = extractFloat32Sign( b );
2026
zSign = aSign ^ bSign;
2027
if ( aExp == 0xFF ) {
2028
if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2029
return propagateFloat32NaN( a, b STATUS_VAR );
2031
if ( ( bExp | bSig ) == 0 ) {
2032
float_raise( float_flag_invalid STATUS_VAR);
2033
return float32_default_nan;
2035
return packFloat32( zSign, 0xFF, 0 );
2037
if ( bExp == 0xFF ) {
2038
if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2039
if ( ( aExp | aSig ) == 0 ) {
2040
float_raise( float_flag_invalid STATUS_VAR);
2041
return float32_default_nan;
2043
return packFloat32( zSign, 0xFF, 0 );
2046
if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2047
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2050
if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2051
normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2053
zExp = aExp + bExp - 0x7F;
2054
aSig = ( aSig | 0x00800000 )<<7;
2055
bSig = ( bSig | 0x00800000 )<<8;
2056
shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
2058
if ( 0 <= (int32_t) ( zSig<<1 ) ) {
2062
return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2066
/*----------------------------------------------------------------------------
2067
| Returns the result of dividing the single-precision floating-point value `a'
2068
| by the corresponding value `b'. The operation is performed according to the
2069
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2070
*----------------------------------------------------------------------------*/
2072
float32 float32_div( float32 a, float32 b STATUS_PARAM )
2074
flag aSign, bSign, zSign;
2075
int_fast16_t aExp, bExp, zExp;
2076
uint32_t aSig, bSig, zSig;
2077
a = float32_squash_input_denormal(a STATUS_VAR);
2078
b = float32_squash_input_denormal(b STATUS_VAR);
2080
aSig = extractFloat32Frac( a );
2081
aExp = extractFloat32Exp( a );
2082
aSign = extractFloat32Sign( a );
2083
bSig = extractFloat32Frac( b );
2084
bExp = extractFloat32Exp( b );
2085
bSign = extractFloat32Sign( b );
2086
zSign = aSign ^ bSign;
2087
if ( aExp == 0xFF ) {
2088
if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2089
if ( bExp == 0xFF ) {
2090
if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2091
float_raise( float_flag_invalid STATUS_VAR);
2092
return float32_default_nan;
2094
return packFloat32( zSign, 0xFF, 0 );
2096
if ( bExp == 0xFF ) {
2097
if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2098
return packFloat32( zSign, 0, 0 );
2102
if ( ( aExp | aSig ) == 0 ) {
2103
float_raise( float_flag_invalid STATUS_VAR);
2104
return float32_default_nan;
2106
float_raise( float_flag_divbyzero STATUS_VAR);
2107
return packFloat32( zSign, 0xFF, 0 );
2109
normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2112
if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2113
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2115
zExp = aExp - bExp + 0x7D;
2116
aSig = ( aSig | 0x00800000 )<<7;
2117
bSig = ( bSig | 0x00800000 )<<8;
2118
if ( bSig <= ( aSig + aSig ) ) {
2122
zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2123
if ( ( zSig & 0x3F ) == 0 ) {
2124
zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2126
return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2130
/*----------------------------------------------------------------------------
2131
| Returns the remainder of the single-precision floating-point value `a'
2132
| with respect to the corresponding value `b'. The operation is performed
2133
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2134
*----------------------------------------------------------------------------*/
2136
float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2139
int_fast16_t aExp, bExp, expDiff;
2140
uint32_t aSig, bSig;
2142
uint64_t aSig64, bSig64, q64;
2143
uint32_t alternateASig;
2145
a = float32_squash_input_denormal(a STATUS_VAR);
2146
b = float32_squash_input_denormal(b STATUS_VAR);
2148
aSig = extractFloat32Frac( a );
2149
aExp = extractFloat32Exp( a );
2150
aSign = extractFloat32Sign( a );
2151
bSig = extractFloat32Frac( b );
2152
bExp = extractFloat32Exp( b );
2153
if ( aExp == 0xFF ) {
2154
if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2155
return propagateFloat32NaN( a, b STATUS_VAR );
2157
float_raise( float_flag_invalid STATUS_VAR);
2158
return float32_default_nan;
2160
if ( bExp == 0xFF ) {
2161
if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2166
float_raise( float_flag_invalid STATUS_VAR);
2167
return float32_default_nan;
2169
normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2172
if ( aSig == 0 ) return a;
2173
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2175
expDiff = aExp - bExp;
2178
if ( expDiff < 32 ) {
2181
if ( expDiff < 0 ) {
2182
if ( expDiff < -1 ) return a;
2185
q = ( bSig <= aSig );
2186
if ( q ) aSig -= bSig;
2187
if ( 0 < expDiff ) {
2188
q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2191
aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2199
if ( bSig <= aSig ) aSig -= bSig;
2200
aSig64 = ( (uint64_t) aSig )<<40;
2201
bSig64 = ( (uint64_t) bSig )<<40;
2203
while ( 0 < expDiff ) {
2204
q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2205
q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2206
aSig64 = - ( ( bSig * q64 )<<38 );
2210
q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2211
q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2212
q = q64>>( 64 - expDiff );
2214
aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2217
alternateASig = aSig;
2220
} while ( 0 <= (int32_t) aSig );
2221
sigMean = aSig + alternateASig;
2222
if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2223
aSig = alternateASig;
2225
zSign = ( (int32_t) aSig < 0 );
2226
if ( zSign ) aSig = - aSig;
2227
return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2231
/*----------------------------------------------------------------------------
2232
| Returns the result of multiplying the single-precision floating-point values
2233
| `a' and `b' then adding 'c', with no intermediate rounding step after the
2234
| multiplication. The operation is performed according to the IEC/IEEE
2235
| Standard for Binary Floating-Point Arithmetic 754-2008.
2236
| The flags argument allows the caller to select negation of the
2237
| addend, the intermediate product, or the final result. (The difference
2238
| between this and having the caller do a separate negation is that negating
2239
| externally will flip the sign bit on NaNs.)
2240
*----------------------------------------------------------------------------*/
2242
float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)
2244
flag aSign, bSign, cSign, zSign;
2245
int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
2246
uint32_t aSig, bSig, cSig;
2247
flag pInf, pZero, pSign;
2248
uint64_t pSig64, cSig64, zSig64;
2251
flag signflip, infzero;
2253
a = float32_squash_input_denormal(a STATUS_VAR);
2254
b = float32_squash_input_denormal(b STATUS_VAR);
2255
c = float32_squash_input_denormal(c STATUS_VAR);
2256
aSig = extractFloat32Frac(a);
2257
aExp = extractFloat32Exp(a);
2258
aSign = extractFloat32Sign(a);
2259
bSig = extractFloat32Frac(b);
2260
bExp = extractFloat32Exp(b);
2261
bSign = extractFloat32Sign(b);
2262
cSig = extractFloat32Frac(c);
2263
cExp = extractFloat32Exp(c);
2264
cSign = extractFloat32Sign(c);
2266
infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2267
(aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2269
/* It is implementation-defined whether the cases of (0,inf,qnan)
2270
* and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2271
* they return if they do), so we have to hand this information
2272
* off to the target-specific pick-a-NaN routine.
2274
if (((aExp == 0xff) && aSig) ||
2275
((bExp == 0xff) && bSig) ||
2276
((cExp == 0xff) && cSig)) {
2277
return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR);
2281
float_raise(float_flag_invalid STATUS_VAR);
2282
return float32_default_nan;
2285
if (flags & float_muladd_negate_c) {
2289
signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2291
/* Work out the sign and type of the product */
2292
pSign = aSign ^ bSign;
2293
if (flags & float_muladd_negate_product) {
2296
pInf = (aExp == 0xff) || (bExp == 0xff);
2297
pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2300
if (pInf && (pSign ^ cSign)) {
2301
/* addition of opposite-signed infinities => InvalidOperation */
2302
float_raise(float_flag_invalid STATUS_VAR);
2303
return float32_default_nan;
2305
/* Otherwise generate an infinity of the same sign */
2306
return packFloat32(cSign ^ signflip, 0xff, 0);
2310
return packFloat32(pSign ^ signflip, 0xff, 0);
2316
/* Adding two exact zeroes */
2317
if (pSign == cSign) {
2319
} else if (STATUS(float_rounding_mode) == float_round_down) {
2324
return packFloat32(zSign ^ signflip, 0, 0);
2326
/* Exact zero plus a denorm */
2327
if (STATUS(flush_to_zero)) {
2328
float_raise(float_flag_output_denormal STATUS_VAR);
2329
return packFloat32(cSign ^ signflip, 0, 0);
2332
/* Zero plus something non-zero : just return the something */
2333
return packFloat32(cSign ^ signflip, cExp, cSig);
2337
normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2340
normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2343
/* Calculate the actual result a * b + c */
2345
/* Multiply first; this is easy. */
2346
/* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2347
* because we want the true exponent, not the "one-less-than"
2348
* flavour that roundAndPackFloat32() takes.
2350
pExp = aExp + bExp - 0x7e;
2351
aSig = (aSig | 0x00800000) << 7;
2352
bSig = (bSig | 0x00800000) << 8;
2353
pSig64 = (uint64_t)aSig * bSig;
2354
if ((int64_t)(pSig64 << 1) >= 0) {
2359
zSign = pSign ^ signflip;
2361
/* Now pSig64 is the significand of the multiply, with the explicit bit in
2366
/* Throw out the special case of c being an exact zero now */
2367
shift64RightJamming(pSig64, 32, &pSig64);
2369
return roundAndPackFloat32(zSign, pExp - 1,
2372
normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2375
cSig64 = (uint64_t)cSig << (62 - 23);
2376
cSig64 |= LIT64(0x4000000000000000);
2377
expDiff = pExp - cExp;
2379
if (pSign == cSign) {
2382
/* scale c to match p */
2383
shift64RightJamming(cSig64, expDiff, &cSig64);
2385
} else if (expDiff < 0) {
2386
/* scale p to match c */
2387
shift64RightJamming(pSig64, -expDiff, &pSig64);
2390
/* no scaling needed */
2393
/* Add significands and make sure explicit bit ends up in posn 62 */
2394
zSig64 = pSig64 + cSig64;
2395
if ((int64_t)zSig64 < 0) {
2396
shift64RightJamming(zSig64, 1, &zSig64);
2403
shift64RightJamming(cSig64, expDiff, &cSig64);
2404
zSig64 = pSig64 - cSig64;
2406
} else if (expDiff < 0) {
2407
shift64RightJamming(pSig64, -expDiff, &pSig64);
2408
zSig64 = cSig64 - pSig64;
2413
if (cSig64 < pSig64) {
2414
zSig64 = pSig64 - cSig64;
2415
} else if (pSig64 < cSig64) {
2416
zSig64 = cSig64 - pSig64;
2421
if (STATUS(float_rounding_mode) == float_round_down) {
2424
return packFloat32(zSign, 0, 0);
2428
/* Normalize to put the explicit bit back into bit 62. */
2429
shiftcount = countLeadingZeros64(zSig64) - 1;
2430
zSig64 <<= shiftcount;
2433
shift64RightJamming(zSig64, 32, &zSig64);
2434
return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);
2438
/*----------------------------------------------------------------------------
2439
| Returns the square root of the single-precision floating-point value `a'.
2440
| The operation is performed according to the IEC/IEEE Standard for Binary
2441
| Floating-Point Arithmetic.
2442
*----------------------------------------------------------------------------*/
2444
float32 float32_sqrt( float32 a STATUS_PARAM )
2447
int_fast16_t aExp, zExp;
2448
uint32_t aSig, zSig;
2450
a = float32_squash_input_denormal(a STATUS_VAR);
2452
aSig = extractFloat32Frac( a );
2453
aExp = extractFloat32Exp( a );
2454
aSign = extractFloat32Sign( a );
2455
if ( aExp == 0xFF ) {
2456
if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2457
if ( ! aSign ) return a;
2458
float_raise( float_flag_invalid STATUS_VAR);
2459
return float32_default_nan;
2462
if ( ( aExp | aSig ) == 0 ) return a;
2463
float_raise( float_flag_invalid STATUS_VAR);
2464
return float32_default_nan;
2467
if ( aSig == 0 ) return float32_zero;
2468
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2470
zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2471
aSig = ( aSig | 0x00800000 )<<8;
2472
zSig = estimateSqrt32( aExp, aSig ) + 2;
2473
if ( ( zSig & 0x7F ) <= 5 ) {
2479
term = ( (uint64_t) zSig ) * zSig;
2480
rem = ( ( (uint64_t) aSig )<<32 ) - term;
2481
while ( (int64_t) rem < 0 ) {
2483
rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2485
zSig |= ( rem != 0 );
2487
shift32RightJamming( zSig, 1, &zSig );
2489
return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2493
/*----------------------------------------------------------------------------
2494
| Returns the binary exponential of the single-precision floating-point value
2495
| `a'. The operation is performed according to the IEC/IEEE Standard for
2496
| Binary Floating-Point Arithmetic.
2498
| Uses the following identities:
2500
| 1. -------------------------------------------------------------------------
2504
| 2. -------------------------------------------------------------------------
2507
| e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2509
*----------------------------------------------------------------------------*/
2511
static const float64 float32_exp2_coefficients[15] =
2513
const_float64( 0x3ff0000000000000ll ), /* 1 */
2514
const_float64( 0x3fe0000000000000ll ), /* 2 */
2515
const_float64( 0x3fc5555555555555ll ), /* 3 */
2516
const_float64( 0x3fa5555555555555ll ), /* 4 */
2517
const_float64( 0x3f81111111111111ll ), /* 5 */
2518
const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2519
const_float64( 0x3f2a01a01a01a01all ), /* 7 */
2520
const_float64( 0x3efa01a01a01a01all ), /* 8 */
2521
const_float64( 0x3ec71de3a556c734ll ), /* 9 */
2522
const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2523
const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2524
const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2525
const_float64( 0x3de6124613a86d09ll ), /* 13 */
2526
const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2527
const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2530
float32 float32_exp2( float32 a STATUS_PARAM )
2537
a = float32_squash_input_denormal(a STATUS_VAR);
2539
aSig = extractFloat32Frac( a );
2540
aExp = extractFloat32Exp( a );
2541
aSign = extractFloat32Sign( a );
2543
if ( aExp == 0xFF) {
2544
if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2545
return (aSign) ? float32_zero : a;
2548
if (aSig == 0) return float32_one;
2551
float_raise( float_flag_inexact STATUS_VAR);
2553
/* ******************************* */
2554
/* using float64 for approximation */
2555
/* ******************************* */
2556
x = float32_to_float64(a STATUS_VAR);
2557
x = float64_mul(x, float64_ln2 STATUS_VAR);
2561
for (i = 0 ; i < 15 ; i++) {
2564
f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2565
r = float64_add(r, f STATUS_VAR);
2567
xn = float64_mul(xn, x STATUS_VAR);
2570
return float64_to_float32(r, status);
2573
/*----------------------------------------------------------------------------
2574
| Returns the binary log of the single-precision floating-point value `a'.
2575
| The operation is performed according to the IEC/IEEE Standard for Binary
2576
| Floating-Point Arithmetic.
2577
*----------------------------------------------------------------------------*/
2578
float32 float32_log2( float32 a STATUS_PARAM )
2582
uint32_t aSig, zSig, i;
2584
a = float32_squash_input_denormal(a STATUS_VAR);
2585
aSig = extractFloat32Frac( a );
2586
aExp = extractFloat32Exp( a );
2587
aSign = extractFloat32Sign( a );
2590
if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2591
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2594
float_raise( float_flag_invalid STATUS_VAR);
2595
return float32_default_nan;
2597
if ( aExp == 0xFF ) {
2598
if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2607
for (i = 1 << 22; i > 0; i >>= 1) {
2608
aSig = ( (uint64_t)aSig * aSig ) >> 23;
2609
if ( aSig & 0x01000000 ) {
2618
return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2621
/*----------------------------------------------------------------------------
2622
| Returns 1 if the single-precision floating-point value `a' is equal to
2623
| the corresponding value `b', and 0 otherwise. The invalid exception is
2624
| raised if either operand is a NaN. Otherwise, the comparison is performed
2625
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2626
*----------------------------------------------------------------------------*/
2628
int float32_eq( float32 a, float32 b STATUS_PARAM )
2631
a = float32_squash_input_denormal(a STATUS_VAR);
2632
b = float32_squash_input_denormal(b STATUS_VAR);
2634
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2635
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2637
float_raise( float_flag_invalid STATUS_VAR);
2640
av = float32_val(a);
2641
bv = float32_val(b);
2642
return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2645
/*----------------------------------------------------------------------------
2646
| Returns 1 if the single-precision floating-point value `a' is less than
2647
| or equal to the corresponding value `b', and 0 otherwise. The invalid
2648
| exception is raised if either operand is a NaN. The comparison is performed
2649
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2650
*----------------------------------------------------------------------------*/
2652
int float32_le( float32 a, float32 b STATUS_PARAM )
2656
a = float32_squash_input_denormal(a STATUS_VAR);
2657
b = float32_squash_input_denormal(b STATUS_VAR);
2659
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2660
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2662
float_raise( float_flag_invalid STATUS_VAR);
2665
aSign = extractFloat32Sign( a );
2666
bSign = extractFloat32Sign( b );
2667
av = float32_val(a);
2668
bv = float32_val(b);
2669
if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2670
return ( av == bv ) || ( aSign ^ ( av < bv ) );
2674
/*----------------------------------------------------------------------------
2675
| Returns 1 if the single-precision floating-point value `a' is less than
2676
| the corresponding value `b', and 0 otherwise. The invalid exception is
2677
| raised if either operand is a NaN. The comparison is performed according
2678
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2679
*----------------------------------------------------------------------------*/
2681
int float32_lt( float32 a, float32 b STATUS_PARAM )
2685
a = float32_squash_input_denormal(a STATUS_VAR);
2686
b = float32_squash_input_denormal(b STATUS_VAR);
2688
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2689
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2691
float_raise( float_flag_invalid STATUS_VAR);
2694
aSign = extractFloat32Sign( a );
2695
bSign = extractFloat32Sign( b );
2696
av = float32_val(a);
2697
bv = float32_val(b);
2698
if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2699
return ( av != bv ) && ( aSign ^ ( av < bv ) );
2703
/*----------------------------------------------------------------------------
2704
| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2705
| be compared, and 0 otherwise. The invalid exception is raised if either
2706
| operand is a NaN. The comparison is performed according to the IEC/IEEE
2707
| Standard for Binary Floating-Point Arithmetic.
2708
*----------------------------------------------------------------------------*/
2710
int float32_unordered( float32 a, float32 b STATUS_PARAM )
2712
a = float32_squash_input_denormal(a STATUS_VAR);
2713
b = float32_squash_input_denormal(b STATUS_VAR);
2715
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2716
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2718
float_raise( float_flag_invalid STATUS_VAR);
2724
/*----------------------------------------------------------------------------
2725
| Returns 1 if the single-precision floating-point value `a' is equal to
2726
| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2727
| exception. The comparison is performed according to the IEC/IEEE Standard
2728
| for Binary Floating-Point Arithmetic.
2729
*----------------------------------------------------------------------------*/
2731
int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
2733
a = float32_squash_input_denormal(a STATUS_VAR);
2734
b = float32_squash_input_denormal(b STATUS_VAR);
2736
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2737
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2739
if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2740
float_raise( float_flag_invalid STATUS_VAR);
2744
return ( float32_val(a) == float32_val(b) ) ||
2745
( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2748
/*----------------------------------------------------------------------------
2749
| Returns 1 if the single-precision floating-point value `a' is less than or
2750
| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2751
| cause an exception. Otherwise, the comparison is performed according to the
2752
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2753
*----------------------------------------------------------------------------*/
2755
int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2759
a = float32_squash_input_denormal(a STATUS_VAR);
2760
b = float32_squash_input_denormal(b STATUS_VAR);
2762
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2763
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2765
if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2766
float_raise( float_flag_invalid STATUS_VAR);
2770
aSign = extractFloat32Sign( a );
2771
bSign = extractFloat32Sign( b );
2772
av = float32_val(a);
2773
bv = float32_val(b);
2774
if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2775
return ( av == bv ) || ( aSign ^ ( av < bv ) );
2779
/*----------------------------------------------------------------------------
2780
| Returns 1 if the single-precision floating-point value `a' is less than
2781
| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2782
| exception. Otherwise, the comparison is performed according to the IEC/IEEE
2783
| Standard for Binary Floating-Point Arithmetic.
2784
*----------------------------------------------------------------------------*/
2786
int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2790
a = float32_squash_input_denormal(a STATUS_VAR);
2791
b = float32_squash_input_denormal(b STATUS_VAR);
2793
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2794
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2796
if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2797
float_raise( float_flag_invalid STATUS_VAR);
2801
aSign = extractFloat32Sign( a );
2802
bSign = extractFloat32Sign( b );
2803
av = float32_val(a);
2804
bv = float32_val(b);
2805
if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2806
return ( av != bv ) && ( aSign ^ ( av < bv ) );
2810
/*----------------------------------------------------------------------------
2811
| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2812
| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
2813
| comparison is performed according to the IEC/IEEE Standard for Binary
2814
| Floating-Point Arithmetic.
2815
*----------------------------------------------------------------------------*/
2817
int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2819
a = float32_squash_input_denormal(a STATUS_VAR);
2820
b = float32_squash_input_denormal(b STATUS_VAR);
2822
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2823
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2825
if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2826
float_raise( float_flag_invalid STATUS_VAR);
2833
/*----------------------------------------------------------------------------
2834
| Returns the result of converting the double-precision floating-point value
2835
| `a' to the 32-bit two's complement integer format. The conversion is
2836
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2837
| Arithmetic---which means in particular that the conversion is rounded
2838
| according to the current rounding mode. If `a' is a NaN, the largest
2839
| positive integer is returned. Otherwise, if the conversion overflows, the
2840
| largest integer with the same sign as `a' is returned.
2841
*----------------------------------------------------------------------------*/
2843
int32 float64_to_int32( float64 a STATUS_PARAM )
2846
int_fast16_t aExp, shiftCount;
2848
a = float64_squash_input_denormal(a STATUS_VAR);
2850
aSig = extractFloat64Frac( a );
2851
aExp = extractFloat64Exp( a );
2852
aSign = extractFloat64Sign( a );
2853
if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2854
if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2855
shiftCount = 0x42C - aExp;
2856
if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2857
return roundAndPackInt32( aSign, aSig STATUS_VAR );
2861
/*----------------------------------------------------------------------------
2862
| Returns the result of converting the double-precision floating-point value
2863
| `a' to the 32-bit two's complement integer format. The conversion is
2864
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2865
| Arithmetic, except that the conversion is always rounded toward zero.
2866
| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2867
| the conversion overflows, the largest integer with the same sign as `a' is
2869
*----------------------------------------------------------------------------*/
2871
int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2874
int_fast16_t aExp, shiftCount;
2875
uint64_t aSig, savedASig;
2877
a = float64_squash_input_denormal(a STATUS_VAR);
2879
aSig = extractFloat64Frac( a );
2880
aExp = extractFloat64Exp( a );
2881
aSign = extractFloat64Sign( a );
2882
if ( 0x41E < aExp ) {
2883
if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2886
else if ( aExp < 0x3FF ) {
2887
if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2890
aSig |= LIT64( 0x0010000000000000 );
2891
shiftCount = 0x433 - aExp;
2893
aSig >>= shiftCount;
2895
if ( aSign ) z = - z;
2896
if ( ( z < 0 ) ^ aSign ) {
2898
float_raise( float_flag_invalid STATUS_VAR);
2899
return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2901
if ( ( aSig<<shiftCount ) != savedASig ) {
2902
STATUS(float_exception_flags) |= float_flag_inexact;
2908
/*----------------------------------------------------------------------------
2909
| Returns the result of converting the double-precision floating-point value
2910
| `a' to the 16-bit two's complement integer format. The conversion is
2911
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2912
| Arithmetic, except that the conversion is always rounded toward zero.
2913
| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2914
| the conversion overflows, the largest integer with the same sign as `a' is
2916
*----------------------------------------------------------------------------*/
2918
int_fast16_t float64_to_int16_round_to_zero(float64 a STATUS_PARAM)
2921
int_fast16_t aExp, shiftCount;
2922
uint64_t aSig, savedASig;
2925
aSig = extractFloat64Frac( a );
2926
aExp = extractFloat64Exp( a );
2927
aSign = extractFloat64Sign( a );
2928
if ( 0x40E < aExp ) {
2929
if ( ( aExp == 0x7FF ) && aSig ) {
2934
else if ( aExp < 0x3FF ) {
2935
if ( aExp || aSig ) {
2936
STATUS(float_exception_flags) |= float_flag_inexact;
2940
aSig |= LIT64( 0x0010000000000000 );
2941
shiftCount = 0x433 - aExp;
2943
aSig >>= shiftCount;
2948
if ( ( (int16_t)z < 0 ) ^ aSign ) {
2950
float_raise( float_flag_invalid STATUS_VAR);
2951
return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
2953
if ( ( aSig<<shiftCount ) != savedASig ) {
2954
STATUS(float_exception_flags) |= float_flag_inexact;
2959
/*----------------------------------------------------------------------------
2960
| Returns the result of converting the double-precision floating-point value
2961
| `a' to the 64-bit two's complement integer format. The conversion is
2962
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2963
| Arithmetic---which means in particular that the conversion is rounded
2964
| according to the current rounding mode. If `a' is a NaN, the largest
2965
| positive integer is returned. Otherwise, if the conversion overflows, the
2966
| largest integer with the same sign as `a' is returned.
2967
*----------------------------------------------------------------------------*/
2969
int64 float64_to_int64( float64 a STATUS_PARAM )
2972
int_fast16_t aExp, shiftCount;
2973
uint64_t aSig, aSigExtra;
2974
a = float64_squash_input_denormal(a STATUS_VAR);
2976
aSig = extractFloat64Frac( a );
2977
aExp = extractFloat64Exp( a );
2978
aSign = extractFloat64Sign( a );
2979
if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2980
shiftCount = 0x433 - aExp;
2981
if ( shiftCount <= 0 ) {
2982
if ( 0x43E < aExp ) {
2983
float_raise( float_flag_invalid STATUS_VAR);
2985
|| ( ( aExp == 0x7FF )
2986
&& ( aSig != LIT64( 0x0010000000000000 ) ) )
2988
return LIT64( 0x7FFFFFFFFFFFFFFF );
2990
return (int64_t) LIT64( 0x8000000000000000 );
2993
aSig <<= - shiftCount;
2996
shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2998
return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3002
/*----------------------------------------------------------------------------
3003
| Returns the result of converting the double-precision floating-point value
3004
| `a' to the 64-bit two's complement integer format. The conversion is
3005
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3006
| Arithmetic, except that the conversion is always rounded toward zero.
3007
| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3008
| the conversion overflows, the largest integer with the same sign as `a' is
3010
*----------------------------------------------------------------------------*/
3012
int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
3015
int_fast16_t aExp, shiftCount;
3018
a = float64_squash_input_denormal(a STATUS_VAR);
3020
aSig = extractFloat64Frac( a );
3021
aExp = extractFloat64Exp( a );
3022
aSign = extractFloat64Sign( a );
3023
if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3024
shiftCount = aExp - 0x433;
3025
if ( 0 <= shiftCount ) {
3026
if ( 0x43E <= aExp ) {
3027
if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
3028
float_raise( float_flag_invalid STATUS_VAR);
3030
|| ( ( aExp == 0x7FF )
3031
&& ( aSig != LIT64( 0x0010000000000000 ) ) )
3033
return LIT64( 0x7FFFFFFFFFFFFFFF );
3036
return (int64_t) LIT64( 0x8000000000000000 );
3038
z = aSig<<shiftCount;
3041
if ( aExp < 0x3FE ) {
3042
if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3045
z = aSig>>( - shiftCount );
3046
if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3047
STATUS(float_exception_flags) |= float_flag_inexact;
3050
if ( aSign ) z = - z;
3055
/*----------------------------------------------------------------------------
3056
| Returns the result of converting the double-precision floating-point value
3057
| `a' to the single-precision floating-point format. The conversion is
3058
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3060
*----------------------------------------------------------------------------*/
3062
float32 float64_to_float32( float64 a STATUS_PARAM )
3068
a = float64_squash_input_denormal(a STATUS_VAR);
3070
aSig = extractFloat64Frac( a );
3071
aExp = extractFloat64Exp( a );
3072
aSign = extractFloat64Sign( a );
3073
if ( aExp == 0x7FF ) {
3074
if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3075
return packFloat32( aSign, 0xFF, 0 );
3077
shift64RightJamming( aSig, 22, &aSig );
3079
if ( aExp || zSig ) {
3083
return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
3088
/*----------------------------------------------------------------------------
3089
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3090
| half-precision floating-point value, returning the result. After being
3091
| shifted into the proper positions, the three fields are simply added
3092
| together to form the result. This means that any integer portion of `zSig'
3093
| will be added into the exponent. Since a properly normalized significand
3094
| will have an integer portion equal to 1, the `zExp' input should be 1 less
3095
| than the desired result exponent whenever `zSig' is a complete, normalized
3097
*----------------------------------------------------------------------------*/
3098
static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
3100
return make_float16(
3101
(((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3104
/* Half precision floats come in two formats: standard IEEE and "ARM" format.
3105
The latter gains extra exponent range by omitting the NaN/Inf encodings. */
3107
float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
3113
aSign = extractFloat16Sign(a);
3114
aExp = extractFloat16Exp(a);
3115
aSig = extractFloat16Frac(a);
3117
if (aExp == 0x1f && ieee) {
3119
return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3121
return packFloat32(aSign, 0xff, 0);
3127
return packFloat32(aSign, 0, 0);
3130
shiftCount = countLeadingZeros32( aSig ) - 21;
3131
aSig = aSig << shiftCount;
3134
return packFloat32( aSign, aExp + 0x70, aSig << 13);
3137
float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
3145
int maxexp = ieee ? 15 : 16;
3146
bool rounding_bumps_exp;
3147
bool is_tiny = false;
3149
a = float32_squash_input_denormal(a STATUS_VAR);
3151
aSig = extractFloat32Frac( a );
3152
aExp = extractFloat32Exp( a );
3153
aSign = extractFloat32Sign( a );
3154
if ( aExp == 0xFF ) {
3156
/* Input is a NaN */
3158
float_raise(float_flag_invalid STATUS_VAR);
3159
return packFloat16(aSign, 0, 0);
3161
return commonNaNToFloat16(
3162
float32ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3166
float_raise(float_flag_invalid STATUS_VAR);
3167
return packFloat16(aSign, 0x1f, 0x3ff);
3169
return packFloat16(aSign, 0x1f, 0);
3171
if (aExp == 0 && aSig == 0) {
3172
return packFloat16(aSign, 0, 0);
3174
/* Decimal point between bits 22 and 23. Note that we add the 1 bit
3175
* even if the input is denormal; however this is harmless because
3176
* the largest possible single-precision denormal is still smaller
3177
* than the smallest representable half-precision denormal, and so we
3178
* will end up ignoring aSig and returning via the "always return zero"
3183
/* Calculate the mask of bits of the mantissa which are not
3184
* representable in half-precision and will be lost.
3187
/* Will be denormal in halfprec */
3193
/* Normal number in halfprec */
3197
roundingMode = STATUS(float_rounding_mode);
3198
switch (roundingMode) {
3199
case float_round_nearest_even:
3200
increment = (mask + 1) >> 1;
3201
if ((aSig & mask) == increment) {
3202
increment = aSig & (increment << 1);
3205
case float_round_up:
3206
increment = aSign ? 0 : mask;
3208
case float_round_down:
3209
increment = aSign ? mask : 0;
3211
default: /* round_to_zero */
3216
rounding_bumps_exp = (aSig + increment >= 0x01000000);
3218
if (aExp > maxexp || (aExp == maxexp && rounding_bumps_exp)) {
3220
float_raise(float_flag_overflow | float_flag_inexact STATUS_VAR);
3221
return packFloat16(aSign, 0x1f, 0);
3223
float_raise(float_flag_invalid STATUS_VAR);
3224
return packFloat16(aSign, 0x1f, 0x3ff);
3229
/* Note that flush-to-zero does not affect half-precision results */
3231
(STATUS(float_detect_tininess) == float_tininess_before_rounding)
3233
|| (!rounding_bumps_exp);
3236
float_raise(float_flag_inexact STATUS_VAR);
3238
float_raise(float_flag_underflow STATUS_VAR);
3243
if (rounding_bumps_exp) {
3249
return packFloat16(aSign, 0, 0);
3252
aSig >>= -14 - aExp;
3255
return packFloat16(aSign, aExp + 14, aSig >> 13);
3258
/*----------------------------------------------------------------------------
3259
| Returns the result of converting the double-precision floating-point value
3260
| `a' to the extended double-precision floating-point format. The conversion
3261
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3263
*----------------------------------------------------------------------------*/
3265
floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
3271
a = float64_squash_input_denormal(a STATUS_VAR);
3272
aSig = extractFloat64Frac( a );
3273
aExp = extractFloat64Exp( a );
3274
aSign = extractFloat64Sign( a );
3275
if ( aExp == 0x7FF ) {
3276
if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3277
return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3280
if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3281
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3285
aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3289
/*----------------------------------------------------------------------------
3290
| Returns the result of converting the double-precision floating-point value
3291
| `a' to the quadruple-precision floating-point format. The conversion is
3292
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3294
*----------------------------------------------------------------------------*/
3296
float128 float64_to_float128( float64 a STATUS_PARAM )
3300
uint64_t aSig, zSig0, zSig1;
3302
a = float64_squash_input_denormal(a STATUS_VAR);
3303
aSig = extractFloat64Frac( a );
3304
aExp = extractFloat64Exp( a );
3305
aSign = extractFloat64Sign( a );
3306
if ( aExp == 0x7FF ) {
3307
if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3308
return packFloat128( aSign, 0x7FFF, 0, 0 );
3311
if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3312
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3315
shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3316
return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3320
/*----------------------------------------------------------------------------
3321
| Rounds the double-precision floating-point value `a' to an integer, and
3322
| returns the result as a double-precision floating-point value. The
3323
| operation is performed according to the IEC/IEEE Standard for Binary
3324
| Floating-Point Arithmetic.
3325
*----------------------------------------------------------------------------*/
3327
float64 float64_round_to_int( float64 a STATUS_PARAM )
3331
uint64_t lastBitMask, roundBitsMask;
3334
a = float64_squash_input_denormal(a STATUS_VAR);
3336
aExp = extractFloat64Exp( a );
3337
if ( 0x433 <= aExp ) {
3338
if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3339
return propagateFloat64NaN( a, a STATUS_VAR );
3343
if ( aExp < 0x3FF ) {
3344
if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3345
STATUS(float_exception_flags) |= float_flag_inexact;
3346
aSign = extractFloat64Sign( a );
3347
switch ( STATUS(float_rounding_mode) ) {
3348
case float_round_nearest_even:
3349
if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3350
return packFloat64( aSign, 0x3FF, 0 );
3353
case float_round_down:
3354
return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3355
case float_round_up:
3356
return make_float64(
3357
aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3359
return packFloat64( aSign, 0, 0 );
3362
lastBitMask <<= 0x433 - aExp;
3363
roundBitsMask = lastBitMask - 1;
3365
roundingMode = STATUS(float_rounding_mode);
3366
if ( roundingMode == float_round_nearest_even ) {
3367
z += lastBitMask>>1;
3368
if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3370
else if ( roundingMode != float_round_to_zero ) {
3371
if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
3375
z &= ~ roundBitsMask;
3376
if ( z != float64_val(a) )
3377
STATUS(float_exception_flags) |= float_flag_inexact;
3378
return make_float64(z);
3382
float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3386
oldmode = STATUS(float_rounding_mode);
3387
STATUS(float_rounding_mode) = float_round_to_zero;
3388
res = float64_round_to_int(a STATUS_VAR);
3389
STATUS(float_rounding_mode) = oldmode;
3393
/*----------------------------------------------------------------------------
3394
| Returns the result of adding the absolute values of the double-precision
3395
| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3396
| before being returned. `zSign' is ignored if the result is a NaN.
3397
| The addition is performed according to the IEC/IEEE Standard for Binary
3398
| Floating-Point Arithmetic.
3399
*----------------------------------------------------------------------------*/
3401
static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3403
int_fast16_t aExp, bExp, zExp;
3404
uint64_t aSig, bSig, zSig;
3405
int_fast16_t expDiff;
3407
aSig = extractFloat64Frac( a );
3408
aExp = extractFloat64Exp( a );
3409
bSig = extractFloat64Frac( b );
3410
bExp = extractFloat64Exp( b );
3411
expDiff = aExp - bExp;
3414
if ( 0 < expDiff ) {
3415
if ( aExp == 0x7FF ) {
3416
if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3423
bSig |= LIT64( 0x2000000000000000 );
3425
shift64RightJamming( bSig, expDiff, &bSig );
3428
else if ( expDiff < 0 ) {
3429
if ( bExp == 0x7FF ) {
3430
if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3431
return packFloat64( zSign, 0x7FF, 0 );
3437
aSig |= LIT64( 0x2000000000000000 );
3439
shift64RightJamming( aSig, - expDiff, &aSig );
3443
if ( aExp == 0x7FF ) {
3444
if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3448
if (STATUS(flush_to_zero)) {
3450
float_raise(float_flag_output_denormal STATUS_VAR);
3452
return packFloat64(zSign, 0, 0);
3454
return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3456
zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3460
aSig |= LIT64( 0x2000000000000000 );
3461
zSig = ( aSig + bSig )<<1;
3463
if ( (int64_t) zSig < 0 ) {
3468
return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3472
/*----------------------------------------------------------------------------
3473
| Returns the result of subtracting the absolute values of the double-
3474
| precision floating-point values `a' and `b'. If `zSign' is 1, the
3475
| difference is negated before being returned. `zSign' is ignored if the
3476
| result is a NaN. The subtraction is performed according to the IEC/IEEE
3477
| Standard for Binary Floating-Point Arithmetic.
3478
*----------------------------------------------------------------------------*/
3480
static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3482
int_fast16_t aExp, bExp, zExp;
3483
uint64_t aSig, bSig, zSig;
3484
int_fast16_t expDiff;
3486
aSig = extractFloat64Frac( a );
3487
aExp = extractFloat64Exp( a );
3488
bSig = extractFloat64Frac( b );
3489
bExp = extractFloat64Exp( b );
3490
expDiff = aExp - bExp;
3493
if ( 0 < expDiff ) goto aExpBigger;
3494
if ( expDiff < 0 ) goto bExpBigger;
3495
if ( aExp == 0x7FF ) {
3496
if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3497
float_raise( float_flag_invalid STATUS_VAR);
3498
return float64_default_nan;
3504
if ( bSig < aSig ) goto aBigger;
3505
if ( aSig < bSig ) goto bBigger;
3506
return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3508
if ( bExp == 0x7FF ) {
3509
if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3510
return packFloat64( zSign ^ 1, 0x7FF, 0 );
3516
aSig |= LIT64( 0x4000000000000000 );
3518
shift64RightJamming( aSig, - expDiff, &aSig );
3519
bSig |= LIT64( 0x4000000000000000 );
3524
goto normalizeRoundAndPack;
3526
if ( aExp == 0x7FF ) {
3527
if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3534
bSig |= LIT64( 0x4000000000000000 );
3536
shift64RightJamming( bSig, expDiff, &bSig );
3537
aSig |= LIT64( 0x4000000000000000 );
3541
normalizeRoundAndPack:
3543
return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3547
/*----------------------------------------------------------------------------
3548
| Returns the result of adding the double-precision floating-point values `a'
3549
| and `b'. The operation is performed according to the IEC/IEEE Standard for
3550
| Binary Floating-Point Arithmetic.
3551
*----------------------------------------------------------------------------*/
3553
float64 float64_add( float64 a, float64 b STATUS_PARAM )
3556
a = float64_squash_input_denormal(a STATUS_VAR);
3557
b = float64_squash_input_denormal(b STATUS_VAR);
3559
aSign = extractFloat64Sign( a );
3560
bSign = extractFloat64Sign( b );
3561
if ( aSign == bSign ) {
3562
return addFloat64Sigs( a, b, aSign STATUS_VAR );
3565
return subFloat64Sigs( a, b, aSign STATUS_VAR );
3570
/*----------------------------------------------------------------------------
3571
| Returns the result of subtracting the double-precision floating-point values
3572
| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3573
| for Binary Floating-Point Arithmetic.
3574
*----------------------------------------------------------------------------*/
3576
float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3579
a = float64_squash_input_denormal(a STATUS_VAR);
3580
b = float64_squash_input_denormal(b STATUS_VAR);
3582
aSign = extractFloat64Sign( a );
3583
bSign = extractFloat64Sign( b );
3584
if ( aSign == bSign ) {
3585
return subFloat64Sigs( a, b, aSign STATUS_VAR );
3588
return addFloat64Sigs( a, b, aSign STATUS_VAR );
3593
/*----------------------------------------------------------------------------
3594
| Returns the result of multiplying the double-precision floating-point values
3595
| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3596
| for Binary Floating-Point Arithmetic.
3597
*----------------------------------------------------------------------------*/
3599
float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3601
flag aSign, bSign, zSign;
3602
int_fast16_t aExp, bExp, zExp;
3603
uint64_t aSig, bSig, zSig0, zSig1;
3605
a = float64_squash_input_denormal(a STATUS_VAR);
3606
b = float64_squash_input_denormal(b STATUS_VAR);
3608
aSig = extractFloat64Frac( a );
3609
aExp = extractFloat64Exp( a );
3610
aSign = extractFloat64Sign( a );
3611
bSig = extractFloat64Frac( b );
3612
bExp = extractFloat64Exp( b );
3613
bSign = extractFloat64Sign( b );
3614
zSign = aSign ^ bSign;
3615
if ( aExp == 0x7FF ) {
3616
if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3617
return propagateFloat64NaN( a, b STATUS_VAR );
3619
if ( ( bExp | bSig ) == 0 ) {
3620
float_raise( float_flag_invalid STATUS_VAR);
3621
return float64_default_nan;
3623
return packFloat64( zSign, 0x7FF, 0 );
3625
if ( bExp == 0x7FF ) {
3626
if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3627
if ( ( aExp | aSig ) == 0 ) {
3628
float_raise( float_flag_invalid STATUS_VAR);
3629
return float64_default_nan;
3631
return packFloat64( zSign, 0x7FF, 0 );
3634
if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3635
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3638
if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3639
normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3641
zExp = aExp + bExp - 0x3FF;
3642
aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3643
bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3644
mul64To128( aSig, bSig, &zSig0, &zSig1 );
3645
zSig0 |= ( zSig1 != 0 );
3646
if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
3650
return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3654
/*----------------------------------------------------------------------------
3655
| Returns the result of dividing the double-precision floating-point value `a'
3656
| by the corresponding value `b'. The operation is performed according to
3657
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3658
*----------------------------------------------------------------------------*/
3660
float64 float64_div( float64 a, float64 b STATUS_PARAM )
3662
flag aSign, bSign, zSign;
3663
int_fast16_t aExp, bExp, zExp;
3664
uint64_t aSig, bSig, zSig;
3665
uint64_t rem0, rem1;
3666
uint64_t term0, term1;
3667
a = float64_squash_input_denormal(a STATUS_VAR);
3668
b = float64_squash_input_denormal(b STATUS_VAR);
3670
aSig = extractFloat64Frac( a );
3671
aExp = extractFloat64Exp( a );
3672
aSign = extractFloat64Sign( a );
3673
bSig = extractFloat64Frac( b );
3674
bExp = extractFloat64Exp( b );
3675
bSign = extractFloat64Sign( b );
3676
zSign = aSign ^ bSign;
3677
if ( aExp == 0x7FF ) {
3678
if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3679
if ( bExp == 0x7FF ) {
3680
if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3681
float_raise( float_flag_invalid STATUS_VAR);
3682
return float64_default_nan;
3684
return packFloat64( zSign, 0x7FF, 0 );
3686
if ( bExp == 0x7FF ) {
3687
if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3688
return packFloat64( zSign, 0, 0 );
3692
if ( ( aExp | aSig ) == 0 ) {
3693
float_raise( float_flag_invalid STATUS_VAR);
3694
return float64_default_nan;
3696
float_raise( float_flag_divbyzero STATUS_VAR);
3697
return packFloat64( zSign, 0x7FF, 0 );
3699
normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3702
if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3703
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3705
zExp = aExp - bExp + 0x3FD;
3706
aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3707
bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3708
if ( bSig <= ( aSig + aSig ) ) {
3712
zSig = estimateDiv128To64( aSig, 0, bSig );
3713
if ( ( zSig & 0x1FF ) <= 2 ) {
3714
mul64To128( bSig, zSig, &term0, &term1 );
3715
sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3716
while ( (int64_t) rem0 < 0 ) {
3718
add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3720
zSig |= ( rem1 != 0 );
3722
return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3726
/*----------------------------------------------------------------------------
3727
| Returns the remainder of the double-precision floating-point value `a'
3728
| with respect to the corresponding value `b'. The operation is performed
3729
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3730
*----------------------------------------------------------------------------*/
3732
float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3735
int_fast16_t aExp, bExp, expDiff;
3736
uint64_t aSig, bSig;
3737
uint64_t q, alternateASig;
3740
a = float64_squash_input_denormal(a STATUS_VAR);
3741
b = float64_squash_input_denormal(b STATUS_VAR);
3742
aSig = extractFloat64Frac( a );
3743
aExp = extractFloat64Exp( a );
3744
aSign = extractFloat64Sign( a );
3745
bSig = extractFloat64Frac( b );
3746
bExp = extractFloat64Exp( b );
3747
if ( aExp == 0x7FF ) {
3748
if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3749
return propagateFloat64NaN( a, b STATUS_VAR );
3751
float_raise( float_flag_invalid STATUS_VAR);
3752
return float64_default_nan;
3754
if ( bExp == 0x7FF ) {
3755
if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3760
float_raise( float_flag_invalid STATUS_VAR);
3761
return float64_default_nan;
3763
normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3766
if ( aSig == 0 ) return a;
3767
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3769
expDiff = aExp - bExp;
3770
aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3771
bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3772
if ( expDiff < 0 ) {
3773
if ( expDiff < -1 ) return a;
3776
q = ( bSig <= aSig );
3777
if ( q ) aSig -= bSig;
3779
while ( 0 < expDiff ) {
3780
q = estimateDiv128To64( aSig, 0, bSig );
3781
q = ( 2 < q ) ? q - 2 : 0;
3782
aSig = - ( ( bSig>>2 ) * q );
3786
if ( 0 < expDiff ) {
3787
q = estimateDiv128To64( aSig, 0, bSig );
3788
q = ( 2 < q ) ? q - 2 : 0;
3791
aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3798
alternateASig = aSig;
3801
} while ( 0 <= (int64_t) aSig );
3802
sigMean = aSig + alternateASig;
3803
if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3804
aSig = alternateASig;
3806
zSign = ( (int64_t) aSig < 0 );
3807
if ( zSign ) aSig = - aSig;
3808
return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3812
/*----------------------------------------------------------------------------
3813
| Returns the result of multiplying the double-precision floating-point values
3814
| `a' and `b' then adding 'c', with no intermediate rounding step after the
3815
| multiplication. The operation is performed according to the IEC/IEEE
3816
| Standard for Binary Floating-Point Arithmetic 754-2008.
3817
| The flags argument allows the caller to select negation of the
3818
| addend, the intermediate product, or the final result. (The difference
3819
| between this and having the caller do a separate negation is that negating
3820
| externally will flip the sign bit on NaNs.)
3821
*----------------------------------------------------------------------------*/
3823
float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM)
3825
flag aSign, bSign, cSign, zSign;
3826
int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
3827
uint64_t aSig, bSig, cSig;
3828
flag pInf, pZero, pSign;
3829
uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
3831
flag signflip, infzero;
3833
a = float64_squash_input_denormal(a STATUS_VAR);
3834
b = float64_squash_input_denormal(b STATUS_VAR);
3835
c = float64_squash_input_denormal(c STATUS_VAR);
3836
aSig = extractFloat64Frac(a);
3837
aExp = extractFloat64Exp(a);
3838
aSign = extractFloat64Sign(a);
3839
bSig = extractFloat64Frac(b);
3840
bExp = extractFloat64Exp(b);
3841
bSign = extractFloat64Sign(b);
3842
cSig = extractFloat64Frac(c);
3843
cExp = extractFloat64Exp(c);
3844
cSign = extractFloat64Sign(c);
3846
infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
3847
(aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
3849
/* It is implementation-defined whether the cases of (0,inf,qnan)
3850
* and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
3851
* they return if they do), so we have to hand this information
3852
* off to the target-specific pick-a-NaN routine.
3854
if (((aExp == 0x7ff) && aSig) ||
3855
((bExp == 0x7ff) && bSig) ||
3856
((cExp == 0x7ff) && cSig)) {
3857
return propagateFloat64MulAddNaN(a, b, c, infzero STATUS_VAR);
3861
float_raise(float_flag_invalid STATUS_VAR);
3862
return float64_default_nan;
3865
if (flags & float_muladd_negate_c) {
3869
signflip = (flags & float_muladd_negate_result) ? 1 : 0;
3871
/* Work out the sign and type of the product */
3872
pSign = aSign ^ bSign;
3873
if (flags & float_muladd_negate_product) {
3876
pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
3877
pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
3879
if (cExp == 0x7ff) {
3880
if (pInf && (pSign ^ cSign)) {
3881
/* addition of opposite-signed infinities => InvalidOperation */
3882
float_raise(float_flag_invalid STATUS_VAR);
3883
return float64_default_nan;
3885
/* Otherwise generate an infinity of the same sign */
3886
return packFloat64(cSign ^ signflip, 0x7ff, 0);
3890
return packFloat64(pSign ^ signflip, 0x7ff, 0);
3896
/* Adding two exact zeroes */
3897
if (pSign == cSign) {
3899
} else if (STATUS(float_rounding_mode) == float_round_down) {
3904
return packFloat64(zSign ^ signflip, 0, 0);
3906
/* Exact zero plus a denorm */
3907
if (STATUS(flush_to_zero)) {
3908
float_raise(float_flag_output_denormal STATUS_VAR);
3909
return packFloat64(cSign ^ signflip, 0, 0);
3912
/* Zero plus something non-zero : just return the something */
3913
return packFloat64(cSign ^ signflip, cExp, cSig);
3917
normalizeFloat64Subnormal(aSig, &aExp, &aSig);
3920
normalizeFloat64Subnormal(bSig, &bExp, &bSig);
3923
/* Calculate the actual result a * b + c */
3925
/* Multiply first; this is easy. */
3926
/* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
3927
* because we want the true exponent, not the "one-less-than"
3928
* flavour that roundAndPackFloat64() takes.
3930
pExp = aExp + bExp - 0x3fe;
3931
aSig = (aSig | LIT64(0x0010000000000000))<<10;
3932
bSig = (bSig | LIT64(0x0010000000000000))<<11;
3933
mul64To128(aSig, bSig, &pSig0, &pSig1);
3934
if ((int64_t)(pSig0 << 1) >= 0) {
3935
shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
3939
zSign = pSign ^ signflip;
3941
/* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
3942
* bit in position 126.
3946
/* Throw out the special case of c being an exact zero now */
3947
shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
3948
return roundAndPackFloat64(zSign, pExp - 1,
3951
normalizeFloat64Subnormal(cSig, &cExp, &cSig);
3954
/* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
3955
* significand of the addend, with the explicit bit in position 126.
3957
cSig0 = cSig << (126 - 64 - 52);
3959
cSig0 |= LIT64(0x4000000000000000);
3960
expDiff = pExp - cExp;
3962
if (pSign == cSign) {
3965
/* scale c to match p */
3966
shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
3968
} else if (expDiff < 0) {
3969
/* scale p to match c */
3970
shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
3973
/* no scaling needed */
3976
/* Add significands and make sure explicit bit ends up in posn 126 */
3977
add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3978
if ((int64_t)zSig0 < 0) {
3979
shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
3983
shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
3984
return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR);
3988
shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
3989
sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3991
} else if (expDiff < 0) {
3992
shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
3993
sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
3998
if (lt128(cSig0, cSig1, pSig0, pSig1)) {
3999
sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4000
} else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
4001
sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4006
if (STATUS(float_rounding_mode) == float_round_down) {
4009
return packFloat64(zSign, 0, 0);
4013
/* Do the equivalent of normalizeRoundAndPackFloat64() but
4014
* starting with the significand in a pair of uint64_t.
4017
shiftcount = countLeadingZeros64(zSig0) - 1;
4018
shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
4024
shiftcount = countLeadingZeros64(zSig1);
4025
if (shiftcount == 0) {
4026
zSig0 = (zSig1 >> 1) | (zSig1 & 1);
4030
zSig0 = zSig1 << shiftcount;
4031
zExp -= (shiftcount + 64);
4034
return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR);
4038
/*----------------------------------------------------------------------------
4039
| Returns the square root of the double-precision floating-point value `a'.
4040
| The operation is performed according to the IEC/IEEE Standard for Binary
4041
| Floating-Point Arithmetic.
4042
*----------------------------------------------------------------------------*/
4044
float64 float64_sqrt( float64 a STATUS_PARAM )
4047
int_fast16_t aExp, zExp;
4048
uint64_t aSig, zSig, doubleZSig;
4049
uint64_t rem0, rem1, term0, term1;
4050
a = float64_squash_input_denormal(a STATUS_VAR);
4052
aSig = extractFloat64Frac( a );
4053
aExp = extractFloat64Exp( a );
4054
aSign = extractFloat64Sign( a );
4055
if ( aExp == 0x7FF ) {
4056
if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
4057
if ( ! aSign ) return a;
4058
float_raise( float_flag_invalid STATUS_VAR);
4059
return float64_default_nan;
4062
if ( ( aExp | aSig ) == 0 ) return a;
4063
float_raise( float_flag_invalid STATUS_VAR);
4064
return float64_default_nan;
4067
if ( aSig == 0 ) return float64_zero;
4068
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4070
zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4071
aSig |= LIT64( 0x0010000000000000 );
4072
zSig = estimateSqrt32( aExp, aSig>>21 );
4073
aSig <<= 9 - ( aExp & 1 );
4074
zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4075
if ( ( zSig & 0x1FF ) <= 5 ) {
4076
doubleZSig = zSig<<1;
4077
mul64To128( zSig, zSig, &term0, &term1 );
4078
sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4079
while ( (int64_t) rem0 < 0 ) {
4082
add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4084
zSig |= ( ( rem0 | rem1 ) != 0 );
4086
return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
4090
/*----------------------------------------------------------------------------
4091
| Returns the binary log of the double-precision floating-point value `a'.
4092
| The operation is performed according to the IEC/IEEE Standard for Binary
4093
| Floating-Point Arithmetic.
4094
*----------------------------------------------------------------------------*/
4095
float64 float64_log2( float64 a STATUS_PARAM )
4099
uint64_t aSig, aSig0, aSig1, zSig, i;
4100
a = float64_squash_input_denormal(a STATUS_VAR);
4102
aSig = extractFloat64Frac( a );
4103
aExp = extractFloat64Exp( a );
4104
aSign = extractFloat64Sign( a );
4107
if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4108
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4111
float_raise( float_flag_invalid STATUS_VAR);
4112
return float64_default_nan;
4114
if ( aExp == 0x7FF ) {
4115
if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
4120
aSig |= LIT64( 0x0010000000000000 );
4122
zSig = (uint64_t)aExp << 52;
4123
for (i = 1LL << 51; i > 0; i >>= 1) {
4124
mul64To128( aSig, aSig, &aSig0, &aSig1 );
4125
aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4126
if ( aSig & LIT64( 0x0020000000000000 ) ) {
4134
return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
4137
/*----------------------------------------------------------------------------
4138
| Returns 1 if the double-precision floating-point value `a' is equal to the
4139
| corresponding value `b', and 0 otherwise. The invalid exception is raised
4140
| if either operand is a NaN. Otherwise, the comparison is performed
4141
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4142
*----------------------------------------------------------------------------*/
4144
int float64_eq( float64 a, float64 b STATUS_PARAM )
4147
a = float64_squash_input_denormal(a STATUS_VAR);
4148
b = float64_squash_input_denormal(b STATUS_VAR);
4150
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4151
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4153
float_raise( float_flag_invalid STATUS_VAR);
4156
av = float64_val(a);
4157
bv = float64_val(b);
4158
return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4162
/*----------------------------------------------------------------------------
4163
| Returns 1 if the double-precision floating-point value `a' is less than or
4164
| equal to the corresponding value `b', and 0 otherwise. The invalid
4165
| exception is raised if either operand is a NaN. The comparison is performed
4166
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4167
*----------------------------------------------------------------------------*/
4169
int float64_le( float64 a, float64 b STATUS_PARAM )
4173
a = float64_squash_input_denormal(a STATUS_VAR);
4174
b = float64_squash_input_denormal(b STATUS_VAR);
4176
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4177
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4179
float_raise( float_flag_invalid STATUS_VAR);
4182
aSign = extractFloat64Sign( a );
4183
bSign = extractFloat64Sign( b );
4184
av = float64_val(a);
4185
bv = float64_val(b);
4186
if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4187
return ( av == bv ) || ( aSign ^ ( av < bv ) );
4191
/*----------------------------------------------------------------------------
4192
| Returns 1 if the double-precision floating-point value `a' is less than
4193
| the corresponding value `b', and 0 otherwise. The invalid exception is
4194
| raised if either operand is a NaN. The comparison is performed according
4195
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4196
*----------------------------------------------------------------------------*/
4198
int float64_lt( float64 a, float64 b STATUS_PARAM )
4203
a = float64_squash_input_denormal(a STATUS_VAR);
4204
b = float64_squash_input_denormal(b STATUS_VAR);
4205
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4206
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4208
float_raise( float_flag_invalid STATUS_VAR);
4211
aSign = extractFloat64Sign( a );
4212
bSign = extractFloat64Sign( b );
4213
av = float64_val(a);
4214
bv = float64_val(b);
4215
if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4216
return ( av != bv ) && ( aSign ^ ( av < bv ) );
4220
/*----------------------------------------------------------------------------
4221
| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4222
| be compared, and 0 otherwise. The invalid exception is raised if either
4223
| operand is a NaN. The comparison is performed according to the IEC/IEEE
4224
| Standard for Binary Floating-Point Arithmetic.
4225
*----------------------------------------------------------------------------*/
4227
int float64_unordered( float64 a, float64 b STATUS_PARAM )
4229
a = float64_squash_input_denormal(a STATUS_VAR);
4230
b = float64_squash_input_denormal(b STATUS_VAR);
4232
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4233
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4235
float_raise( float_flag_invalid STATUS_VAR);
4241
/*----------------------------------------------------------------------------
4242
| Returns 1 if the double-precision floating-point value `a' is equal to the
4243
| corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4244
| exception.The comparison is performed according to the IEC/IEEE Standard
4245
| for Binary Floating-Point Arithmetic.
4246
*----------------------------------------------------------------------------*/
4248
int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
4251
a = float64_squash_input_denormal(a STATUS_VAR);
4252
b = float64_squash_input_denormal(b STATUS_VAR);
4254
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4255
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4257
if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4258
float_raise( float_flag_invalid STATUS_VAR);
4262
av = float64_val(a);
4263
bv = float64_val(b);
4264
return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4268
/*----------------------------------------------------------------------------
4269
| Returns 1 if the double-precision floating-point value `a' is less than or
4270
| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4271
| cause an exception. Otherwise, the comparison is performed according to the
4272
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4273
*----------------------------------------------------------------------------*/
4275
int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
4279
a = float64_squash_input_denormal(a STATUS_VAR);
4280
b = float64_squash_input_denormal(b STATUS_VAR);
4282
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4283
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4285
if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4286
float_raise( float_flag_invalid STATUS_VAR);
4290
aSign = extractFloat64Sign( a );
4291
bSign = extractFloat64Sign( b );
4292
av = float64_val(a);
4293
bv = float64_val(b);
4294
if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4295
return ( av == bv ) || ( aSign ^ ( av < bv ) );
4299
/*----------------------------------------------------------------------------
4300
| Returns 1 if the double-precision floating-point value `a' is less than
4301
| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4302
| exception. Otherwise, the comparison is performed according to the IEC/IEEE
4303
| Standard for Binary Floating-Point Arithmetic.
4304
*----------------------------------------------------------------------------*/
4306
int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
4310
a = float64_squash_input_denormal(a STATUS_VAR);
4311
b = float64_squash_input_denormal(b STATUS_VAR);
4313
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4314
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4316
if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4317
float_raise( float_flag_invalid STATUS_VAR);
4321
aSign = extractFloat64Sign( a );
4322
bSign = extractFloat64Sign( b );
4323
av = float64_val(a);
4324
bv = float64_val(b);
4325
if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4326
return ( av != bv ) && ( aSign ^ ( av < bv ) );
4330
/*----------------------------------------------------------------------------
4331
| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4332
| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4333
| comparison is performed according to the IEC/IEEE Standard for Binary
4334
| Floating-Point Arithmetic.
4335
*----------------------------------------------------------------------------*/
4337
int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
4339
a = float64_squash_input_denormal(a STATUS_VAR);
4340
b = float64_squash_input_denormal(b STATUS_VAR);
4342
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4343
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4345
if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4346
float_raise( float_flag_invalid STATUS_VAR);
4353
/*----------------------------------------------------------------------------
4354
| Returns the result of converting the extended double-precision floating-
4355
| point value `a' to the 32-bit two's complement integer format. The
4356
| conversion is performed according to the IEC/IEEE Standard for Binary
4357
| Floating-Point Arithmetic---which means in particular that the conversion
4358
| is rounded according to the current rounding mode. If `a' is a NaN, the
4359
| largest positive integer is returned. Otherwise, if the conversion
4360
| overflows, the largest integer with the same sign as `a' is returned.
4361
*----------------------------------------------------------------------------*/
4363
int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
4366
int32 aExp, shiftCount;
4369
aSig = extractFloatx80Frac( a );
4370
aExp = extractFloatx80Exp( a );
4371
aSign = extractFloatx80Sign( a );
4372
if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4373
shiftCount = 0x4037 - aExp;
4374
if ( shiftCount <= 0 ) shiftCount = 1;
4375
shift64RightJamming( aSig, shiftCount, &aSig );
4376
return roundAndPackInt32( aSign, aSig STATUS_VAR );
4380
/*----------------------------------------------------------------------------
4381
| Returns the result of converting the extended double-precision floating-
4382
| point value `a' to the 32-bit two's complement integer format. The
4383
| conversion is performed according to the IEC/IEEE Standard for Binary
4384
| Floating-Point Arithmetic, except that the conversion is always rounded
4385
| toward zero. If `a' is a NaN, the largest positive integer is returned.
4386
| Otherwise, if the conversion overflows, the largest integer with the same
4387
| sign as `a' is returned.
4388
*----------------------------------------------------------------------------*/
4390
int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
4393
int32 aExp, shiftCount;
4394
uint64_t aSig, savedASig;
4397
aSig = extractFloatx80Frac( a );
4398
aExp = extractFloatx80Exp( a );
4399
aSign = extractFloatx80Sign( a );
4400
if ( 0x401E < aExp ) {
4401
if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4404
else if ( aExp < 0x3FFF ) {
4405
if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4408
shiftCount = 0x403E - aExp;
4410
aSig >>= shiftCount;
4412
if ( aSign ) z = - z;
4413
if ( ( z < 0 ) ^ aSign ) {
4415
float_raise( float_flag_invalid STATUS_VAR);
4416
return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4418
if ( ( aSig<<shiftCount ) != savedASig ) {
4419
STATUS(float_exception_flags) |= float_flag_inexact;
4425
/*----------------------------------------------------------------------------
4426
| Returns the result of converting the extended double-precision floating-
4427
| point value `a' to the 64-bit two's complement integer format. The
4428
| conversion is performed according to the IEC/IEEE Standard for Binary
4429
| Floating-Point Arithmetic---which means in particular that the conversion
4430
| is rounded according to the current rounding mode. If `a' is a NaN,
4431
| the largest positive integer is returned. Otherwise, if the conversion
4432
| overflows, the largest integer with the same sign as `a' is returned.
4433
*----------------------------------------------------------------------------*/
4435
int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
4438
int32 aExp, shiftCount;
4439
uint64_t aSig, aSigExtra;
4441
aSig = extractFloatx80Frac( a );
4442
aExp = extractFloatx80Exp( a );
4443
aSign = extractFloatx80Sign( a );
4444
shiftCount = 0x403E - aExp;
4445
if ( shiftCount <= 0 ) {
4447
float_raise( float_flag_invalid STATUS_VAR);
4449
|| ( ( aExp == 0x7FFF )
4450
&& ( aSig != LIT64( 0x8000000000000000 ) ) )
4452
return LIT64( 0x7FFFFFFFFFFFFFFF );
4454
return (int64_t) LIT64( 0x8000000000000000 );
4459
shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4461
return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
4465
/*----------------------------------------------------------------------------
4466
| Returns the result of converting the extended double-precision floating-
4467
| point value `a' to the 64-bit two's complement integer format. The
4468
| conversion is performed according to the IEC/IEEE Standard for Binary
4469
| Floating-Point Arithmetic, except that the conversion is always rounded
4470
| toward zero. If `a' is a NaN, the largest positive integer is returned.
4471
| Otherwise, if the conversion overflows, the largest integer with the same
4472
| sign as `a' is returned.
4473
*----------------------------------------------------------------------------*/
4475
int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
4478
int32 aExp, shiftCount;
4482
aSig = extractFloatx80Frac( a );
4483
aExp = extractFloatx80Exp( a );
4484
aSign = extractFloatx80Sign( a );
4485
shiftCount = aExp - 0x403E;
4486
if ( 0 <= shiftCount ) {
4487
aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4488
if ( ( a.high != 0xC03E ) || aSig ) {
4489
float_raise( float_flag_invalid STATUS_VAR);
4490
if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4491
return LIT64( 0x7FFFFFFFFFFFFFFF );
4494
return (int64_t) LIT64( 0x8000000000000000 );
4496
else if ( aExp < 0x3FFF ) {
4497
if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4500
z = aSig>>( - shiftCount );
4501
if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4502
STATUS(float_exception_flags) |= float_flag_inexact;
4504
if ( aSign ) z = - z;
4509
/*----------------------------------------------------------------------------
4510
| Returns the result of converting the extended double-precision floating-
4511
| point value `a' to the single-precision floating-point format. The
4512
| conversion is performed according to the IEC/IEEE Standard for Binary
4513
| Floating-Point Arithmetic.
4514
*----------------------------------------------------------------------------*/
4516
float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
4522
aSig = extractFloatx80Frac( a );
4523
aExp = extractFloatx80Exp( a );
4524
aSign = extractFloatx80Sign( a );
4525
if ( aExp == 0x7FFF ) {
4526
if ( (uint64_t) ( aSig<<1 ) ) {
4527
return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4529
return packFloat32( aSign, 0xFF, 0 );
4531
shift64RightJamming( aSig, 33, &aSig );
4532
if ( aExp || aSig ) aExp -= 0x3F81;
4533
return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
4537
/*----------------------------------------------------------------------------
4538
| Returns the result of converting the extended double-precision floating-
4539
| point value `a' to the double-precision floating-point format. The
4540
| conversion is performed according to the IEC/IEEE Standard for Binary
4541
| Floating-Point Arithmetic.
4542
*----------------------------------------------------------------------------*/
4544
float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
4548
uint64_t aSig, zSig;
4550
aSig = extractFloatx80Frac( a );
4551
aExp = extractFloatx80Exp( a );
4552
aSign = extractFloatx80Sign( a );
4553
if ( aExp == 0x7FFF ) {
4554
if ( (uint64_t) ( aSig<<1 ) ) {
4555
return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4557
return packFloat64( aSign, 0x7FF, 0 );
4559
shift64RightJamming( aSig, 1, &zSig );
4560
if ( aExp || aSig ) aExp -= 0x3C01;
4561
return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
4565
/*----------------------------------------------------------------------------
4566
| Returns the result of converting the extended double-precision floating-
4567
| point value `a' to the quadruple-precision floating-point format. The
4568
| conversion is performed according to the IEC/IEEE Standard for Binary
4569
| Floating-Point Arithmetic.
4570
*----------------------------------------------------------------------------*/
4572
float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4576
uint64_t aSig, zSig0, zSig1;
4578
aSig = extractFloatx80Frac( a );
4579
aExp = extractFloatx80Exp( a );
4580
aSign = extractFloatx80Sign( a );
4581
if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4582
return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4584
shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4585
return packFloat128( aSign, aExp, zSig0, zSig1 );
4589
/*----------------------------------------------------------------------------
4590
| Rounds the extended double-precision floating-point value `a' to an integer,
4591
| and returns the result as an extended quadruple-precision floating-point
4592
| value. The operation is performed according to the IEC/IEEE Standard for
4593
| Binary Floating-Point Arithmetic.
4594
*----------------------------------------------------------------------------*/
4596
floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4600
uint64_t lastBitMask, roundBitsMask;
4604
aExp = extractFloatx80Exp( a );
4605
if ( 0x403E <= aExp ) {
4606
if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4607
return propagateFloatx80NaN( a, a STATUS_VAR );
4611
if ( aExp < 0x3FFF ) {
4613
&& ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4616
STATUS(float_exception_flags) |= float_flag_inexact;
4617
aSign = extractFloatx80Sign( a );
4618
switch ( STATUS(float_rounding_mode) ) {
4619
case float_round_nearest_even:
4620
if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4623
packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4626
case float_round_down:
4629
packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4630
: packFloatx80( 0, 0, 0 );
4631
case float_round_up:
4633
aSign ? packFloatx80( 1, 0, 0 )
4634
: packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4636
return packFloatx80( aSign, 0, 0 );
4639
lastBitMask <<= 0x403E - aExp;
4640
roundBitsMask = lastBitMask - 1;
4642
roundingMode = STATUS(float_rounding_mode);
4643
if ( roundingMode == float_round_nearest_even ) {
4644
z.low += lastBitMask>>1;
4645
if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4647
else if ( roundingMode != float_round_to_zero ) {
4648
if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4649
z.low += roundBitsMask;
4652
z.low &= ~ roundBitsMask;
4655
z.low = LIT64( 0x8000000000000000 );
4657
if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4662
/*----------------------------------------------------------------------------
4663
| Returns the result of adding the absolute values of the extended double-
4664
| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4665
| negated before being returned. `zSign' is ignored if the result is a NaN.
4666
| The addition is performed according to the IEC/IEEE Standard for Binary
4667
| Floating-Point Arithmetic.
4668
*----------------------------------------------------------------------------*/
4670
static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4672
int32 aExp, bExp, zExp;
4673
uint64_t aSig, bSig, zSig0, zSig1;
4676
aSig = extractFloatx80Frac( a );
4677
aExp = extractFloatx80Exp( a );
4678
bSig = extractFloatx80Frac( b );
4679
bExp = extractFloatx80Exp( b );
4680
expDiff = aExp - bExp;
4681
if ( 0 < expDiff ) {
4682
if ( aExp == 0x7FFF ) {
4683
if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4686
if ( bExp == 0 ) --expDiff;
4687
shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4690
else if ( expDiff < 0 ) {
4691
if ( bExp == 0x7FFF ) {
4692
if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4693
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4695
if ( aExp == 0 ) ++expDiff;
4696
shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4700
if ( aExp == 0x7FFF ) {
4701
if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4702
return propagateFloatx80NaN( a, b STATUS_VAR );
4707
zSig0 = aSig + bSig;
4709
normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4715
zSig0 = aSig + bSig;
4716
if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4718
shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4719
zSig0 |= LIT64( 0x8000000000000000 );
4723
roundAndPackFloatx80(
4724
STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4728
/*----------------------------------------------------------------------------
4729
| Returns the result of subtracting the absolute values of the extended
4730
| double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4731
| difference is negated before being returned. `zSign' is ignored if the
4732
| result is a NaN. The subtraction is performed according to the IEC/IEEE
4733
| Standard for Binary Floating-Point Arithmetic.
4734
*----------------------------------------------------------------------------*/
4736
static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4738
int32 aExp, bExp, zExp;
4739
uint64_t aSig, bSig, zSig0, zSig1;
4743
aSig = extractFloatx80Frac( a );
4744
aExp = extractFloatx80Exp( a );
4745
bSig = extractFloatx80Frac( b );
4746
bExp = extractFloatx80Exp( b );
4747
expDiff = aExp - bExp;
4748
if ( 0 < expDiff ) goto aExpBigger;
4749
if ( expDiff < 0 ) goto bExpBigger;
4750
if ( aExp == 0x7FFF ) {
4751
if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4752
return propagateFloatx80NaN( a, b STATUS_VAR );
4754
float_raise( float_flag_invalid STATUS_VAR);
4755
z.low = floatx80_default_nan_low;
4756
z.high = floatx80_default_nan_high;
4764
if ( bSig < aSig ) goto aBigger;
4765
if ( aSig < bSig ) goto bBigger;
4766
return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4768
if ( bExp == 0x7FFF ) {
4769
if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4770
return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4772
if ( aExp == 0 ) ++expDiff;
4773
shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4775
sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4778
goto normalizeRoundAndPack;
4780
if ( aExp == 0x7FFF ) {
4781
if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4784
if ( bExp == 0 ) --expDiff;
4785
shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4787
sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4789
normalizeRoundAndPack:
4791
normalizeRoundAndPackFloatx80(
4792
STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4796
/*----------------------------------------------------------------------------
4797
| Returns the result of adding the extended double-precision floating-point
4798
| values `a' and `b'. The operation is performed according to the IEC/IEEE
4799
| Standard for Binary Floating-Point Arithmetic.
4800
*----------------------------------------------------------------------------*/
4802
floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4806
aSign = extractFloatx80Sign( a );
4807
bSign = extractFloatx80Sign( b );
4808
if ( aSign == bSign ) {
4809
return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4812
return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4817
/*----------------------------------------------------------------------------
4818
| Returns the result of subtracting the extended double-precision floating-
4819
| point values `a' and `b'. The operation is performed according to the
4820
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4821
*----------------------------------------------------------------------------*/
4823
floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4827
aSign = extractFloatx80Sign( a );
4828
bSign = extractFloatx80Sign( b );
4829
if ( aSign == bSign ) {
4830
return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4833
return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4838
/*----------------------------------------------------------------------------
4839
| Returns the result of multiplying the extended double-precision floating-
4840
| point values `a' and `b'. The operation is performed according to the
4841
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4842
*----------------------------------------------------------------------------*/
4844
floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4846
flag aSign, bSign, zSign;
4847
int32 aExp, bExp, zExp;
4848
uint64_t aSig, bSig, zSig0, zSig1;
4851
aSig = extractFloatx80Frac( a );
4852
aExp = extractFloatx80Exp( a );
4853
aSign = extractFloatx80Sign( a );
4854
bSig = extractFloatx80Frac( b );
4855
bExp = extractFloatx80Exp( b );
4856
bSign = extractFloatx80Sign( b );
4857
zSign = aSign ^ bSign;
4858
if ( aExp == 0x7FFF ) {
4859
if ( (uint64_t) ( aSig<<1 )
4860
|| ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4861
return propagateFloatx80NaN( a, b STATUS_VAR );
4863
if ( ( bExp | bSig ) == 0 ) goto invalid;
4864
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4866
if ( bExp == 0x7FFF ) {
4867
if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4868
if ( ( aExp | aSig ) == 0 ) {
4870
float_raise( float_flag_invalid STATUS_VAR);
4871
z.low = floatx80_default_nan_low;
4872
z.high = floatx80_default_nan_high;
4875
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4878
if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4879
normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4882
if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4883
normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4885
zExp = aExp + bExp - 0x3FFE;
4886
mul64To128( aSig, bSig, &zSig0, &zSig1 );
4887
if ( 0 < (int64_t) zSig0 ) {
4888
shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4892
roundAndPackFloatx80(
4893
STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4897
/*----------------------------------------------------------------------------
4898
| Returns the result of dividing the extended double-precision floating-point
4899
| value `a' by the corresponding value `b'. The operation is performed
4900
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4901
*----------------------------------------------------------------------------*/
4903
floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4905
flag aSign, bSign, zSign;
4906
int32 aExp, bExp, zExp;
4907
uint64_t aSig, bSig, zSig0, zSig1;
4908
uint64_t rem0, rem1, rem2, term0, term1, term2;
4911
aSig = extractFloatx80Frac( a );
4912
aExp = extractFloatx80Exp( a );
4913
aSign = extractFloatx80Sign( a );
4914
bSig = extractFloatx80Frac( b );
4915
bExp = extractFloatx80Exp( b );
4916
bSign = extractFloatx80Sign( b );
4917
zSign = aSign ^ bSign;
4918
if ( aExp == 0x7FFF ) {
4919
if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4920
if ( bExp == 0x7FFF ) {
4921
if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4924
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4926
if ( bExp == 0x7FFF ) {
4927
if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4928
return packFloatx80( zSign, 0, 0 );
4932
if ( ( aExp | aSig ) == 0 ) {
4934
float_raise( float_flag_invalid STATUS_VAR);
4935
z.low = floatx80_default_nan_low;
4936
z.high = floatx80_default_nan_high;
4939
float_raise( float_flag_divbyzero STATUS_VAR);
4940
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4942
normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4945
if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4946
normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4948
zExp = aExp - bExp + 0x3FFE;
4950
if ( bSig <= aSig ) {
4951
shift128Right( aSig, 0, 1, &aSig, &rem1 );
4954
zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4955
mul64To128( bSig, zSig0, &term0, &term1 );
4956
sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4957
while ( (int64_t) rem0 < 0 ) {
4959
add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4961
zSig1 = estimateDiv128To64( rem1, 0, bSig );
4962
if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
4963
mul64To128( bSig, zSig1, &term1, &term2 );
4964
sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4965
while ( (int64_t) rem1 < 0 ) {
4967
add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4969
zSig1 |= ( ( rem1 | rem2 ) != 0 );
4972
roundAndPackFloatx80(
4973
STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4977
/*----------------------------------------------------------------------------
4978
| Returns the remainder of the extended double-precision floating-point value
4979
| `a' with respect to the corresponding value `b'. The operation is performed
4980
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4981
*----------------------------------------------------------------------------*/
4983
floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4986
int32 aExp, bExp, expDiff;
4987
uint64_t aSig0, aSig1, bSig;
4988
uint64_t q, term0, term1, alternateASig0, alternateASig1;
4991
aSig0 = extractFloatx80Frac( a );
4992
aExp = extractFloatx80Exp( a );
4993
aSign = extractFloatx80Sign( a );
4994
bSig = extractFloatx80Frac( b );
4995
bExp = extractFloatx80Exp( b );
4996
if ( aExp == 0x7FFF ) {
4997
if ( (uint64_t) ( aSig0<<1 )
4998
|| ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4999
return propagateFloatx80NaN( a, b STATUS_VAR );
5003
if ( bExp == 0x7FFF ) {
5004
if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5010
float_raise( float_flag_invalid STATUS_VAR);
5011
z.low = floatx80_default_nan_low;
5012
z.high = floatx80_default_nan_high;
5015
normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5018
if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5019
normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5021
bSig |= LIT64( 0x8000000000000000 );
5023
expDiff = aExp - bExp;
5025
if ( expDiff < 0 ) {
5026
if ( expDiff < -1 ) return a;
5027
shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5030
q = ( bSig <= aSig0 );
5031
if ( q ) aSig0 -= bSig;
5033
while ( 0 < expDiff ) {
5034
q = estimateDiv128To64( aSig0, aSig1, bSig );
5035
q = ( 2 < q ) ? q - 2 : 0;
5036
mul64To128( bSig, q, &term0, &term1 );
5037
sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5038
shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5042
if ( 0 < expDiff ) {
5043
q = estimateDiv128To64( aSig0, aSig1, bSig );
5044
q = ( 2 < q ) ? q - 2 : 0;
5046
mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5047
sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5048
shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5049
while ( le128( term0, term1, aSig0, aSig1 ) ) {
5051
sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5058
sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5059
if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5060
|| ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5063
aSig0 = alternateASig0;
5064
aSig1 = alternateASig1;
5068
normalizeRoundAndPackFloatx80(
5069
80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
5073
/*----------------------------------------------------------------------------
5074
| Returns the square root of the extended double-precision floating-point
5075
| value `a'. The operation is performed according to the IEC/IEEE Standard
5076
| for Binary Floating-Point Arithmetic.
5077
*----------------------------------------------------------------------------*/
5079
floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
5083
uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5084
uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5087
aSig0 = extractFloatx80Frac( a );
5088
aExp = extractFloatx80Exp( a );
5089
aSign = extractFloatx80Sign( a );
5090
if ( aExp == 0x7FFF ) {
5091
if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
5092
if ( ! aSign ) return a;
5096
if ( ( aExp | aSig0 ) == 0 ) return a;
5098
float_raise( float_flag_invalid STATUS_VAR);
5099
z.low = floatx80_default_nan_low;
5100
z.high = floatx80_default_nan_high;
5104
if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5105
normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5107
zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5108
zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5109
shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5110
zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5111
doubleZSig0 = zSig0<<1;
5112
mul64To128( zSig0, zSig0, &term0, &term1 );
5113
sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5114
while ( (int64_t) rem0 < 0 ) {
5117
add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5119
zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5120
if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5121
if ( zSig1 == 0 ) zSig1 = 1;
5122
mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5123
sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5124
mul64To128( zSig1, zSig1, &term2, &term3 );
5125
sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5126
while ( (int64_t) rem1 < 0 ) {
5128
shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5130
term2 |= doubleZSig0;
5131
add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5133
zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5135
shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5136
zSig0 |= doubleZSig0;
5138
roundAndPackFloatx80(
5139
STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
5143
/*----------------------------------------------------------------------------
5144
| Returns 1 if the extended double-precision floating-point value `a' is equal
5145
| to the corresponding value `b', and 0 otherwise. The invalid exception is
5146
| raised if either operand is a NaN. Otherwise, the comparison is performed
5147
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5148
*----------------------------------------------------------------------------*/
5150
int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
5153
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5154
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5155
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
5156
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5158
float_raise( float_flag_invalid STATUS_VAR);
5163
&& ( ( a.high == b.high )
5165
&& ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5170
/*----------------------------------------------------------------------------
5171
| Returns 1 if the extended double-precision floating-point value `a' is
5172
| less than or equal to the corresponding value `b', and 0 otherwise. The
5173
| invalid exception is raised if either operand is a NaN. The comparison is
5174
| performed according to the IEC/IEEE Standard for Binary Floating-Point
5176
*----------------------------------------------------------------------------*/
5178
int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
5182
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5183
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5184
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
5185
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5187
float_raise( float_flag_invalid STATUS_VAR);
5190
aSign = extractFloatx80Sign( a );
5191
bSign = extractFloatx80Sign( b );
5192
if ( aSign != bSign ) {
5195
|| ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5199
aSign ? le128( b.high, b.low, a.high, a.low )
5200
: le128( a.high, a.low, b.high, b.low );
5204
/*----------------------------------------------------------------------------
5205
| Returns 1 if the extended double-precision floating-point value `a' is
5206
| less than the corresponding value `b', and 0 otherwise. The invalid
5207
| exception is raised if either operand is a NaN. The comparison is performed
5208
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5209
*----------------------------------------------------------------------------*/
5211
int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
5215
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5216
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5217
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
5218
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5220
float_raise( float_flag_invalid STATUS_VAR);
5223
aSign = extractFloatx80Sign( a );
5224
bSign = extractFloatx80Sign( b );
5225
if ( aSign != bSign ) {
5228
&& ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5232
aSign ? lt128( b.high, b.low, a.high, a.low )
5233
: lt128( a.high, a.low, b.high, b.low );
5237
/*----------------------------------------------------------------------------
5238
| Returns 1 if the extended double-precision floating-point values `a' and `b'
5239
| cannot be compared, and 0 otherwise. The invalid exception is raised if
5240
| either operand is a NaN. The comparison is performed according to the
5241
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5242
*----------------------------------------------------------------------------*/
5243
int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
5245
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5246
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5247
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
5248
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5250
float_raise( float_flag_invalid STATUS_VAR);
5256
/*----------------------------------------------------------------------------
5257
| Returns 1 if the extended double-precision floating-point value `a' is
5258
| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5259
| cause an exception. The comparison is performed according to the IEC/IEEE
5260
| Standard for Binary Floating-Point Arithmetic.
5261
*----------------------------------------------------------------------------*/
5263
int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5266
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5267
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5268
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
5269
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5271
if ( floatx80_is_signaling_nan( a )
5272
|| floatx80_is_signaling_nan( b ) ) {
5273
float_raise( float_flag_invalid STATUS_VAR);
5279
&& ( ( a.high == b.high )
5281
&& ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5286
/*----------------------------------------------------------------------------
5287
| Returns 1 if the extended double-precision floating-point value `a' is less
5288
| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5289
| do not cause an exception. Otherwise, the comparison is performed according
5290
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5291
*----------------------------------------------------------------------------*/
5293
int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5297
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5298
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5299
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
5300
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5302
if ( floatx80_is_signaling_nan( a )
5303
|| floatx80_is_signaling_nan( b ) ) {
5304
float_raise( float_flag_invalid STATUS_VAR);
5308
aSign = extractFloatx80Sign( a );
5309
bSign = extractFloatx80Sign( b );
5310
if ( aSign != bSign ) {
5313
|| ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5317
aSign ? le128( b.high, b.low, a.high, a.low )
5318
: le128( a.high, a.low, b.high, b.low );
5322
/*----------------------------------------------------------------------------
5323
| Returns 1 if the extended double-precision floating-point value `a' is less
5324
| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5325
| an exception. Otherwise, the comparison is performed according to the
5326
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5327
*----------------------------------------------------------------------------*/
5329
int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5333
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5334
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5335
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
5336
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5338
if ( floatx80_is_signaling_nan( a )
5339
|| floatx80_is_signaling_nan( b ) ) {
5340
float_raise( float_flag_invalid STATUS_VAR);
5344
aSign = extractFloatx80Sign( a );
5345
bSign = extractFloatx80Sign( b );
5346
if ( aSign != bSign ) {
5349
&& ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5353
aSign ? lt128( b.high, b.low, a.high, a.low )
5354
: lt128( a.high, a.low, b.high, b.low );
5358
/*----------------------------------------------------------------------------
5359
| Returns 1 if the extended double-precision floating-point values `a' and `b'
5360
| cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5361
| The comparison is performed according to the IEC/IEEE Standard for Binary
5362
| Floating-Point Arithmetic.
5363
*----------------------------------------------------------------------------*/
5364
int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5366
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5367
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5368
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
5369
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5371
if ( floatx80_is_signaling_nan( a )
5372
|| floatx80_is_signaling_nan( b ) ) {
5373
float_raise( float_flag_invalid STATUS_VAR);
5380
/*----------------------------------------------------------------------------
5381
| Returns the result of converting the quadruple-precision floating-point
5382
| value `a' to the 32-bit two's complement integer format. The conversion
5383
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5384
| Arithmetic---which means in particular that the conversion is rounded
5385
| according to the current rounding mode. If `a' is a NaN, the largest
5386
| positive integer is returned. Otherwise, if the conversion overflows, the
5387
| largest integer with the same sign as `a' is returned.
5388
*----------------------------------------------------------------------------*/
5390
int32 float128_to_int32( float128 a STATUS_PARAM )
5393
int32 aExp, shiftCount;
5394
uint64_t aSig0, aSig1;
5396
aSig1 = extractFloat128Frac1( a );
5397
aSig0 = extractFloat128Frac0( a );
5398
aExp = extractFloat128Exp( a );
5399
aSign = extractFloat128Sign( a );
5400
if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5401
if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5402
aSig0 |= ( aSig1 != 0 );
5403
shiftCount = 0x4028 - aExp;
5404
if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5405
return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
5409
/*----------------------------------------------------------------------------
5410
| Returns the result of converting the quadruple-precision floating-point
5411
| value `a' to the 32-bit two's complement integer format. The conversion
5412
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5413
| Arithmetic, except that the conversion is always rounded toward zero. If
5414
| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5415
| conversion overflows, the largest integer with the same sign as `a' is
5417
*----------------------------------------------------------------------------*/
5419
int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
5422
int32 aExp, shiftCount;
5423
uint64_t aSig0, aSig1, savedASig;
5426
aSig1 = extractFloat128Frac1( a );
5427
aSig0 = extractFloat128Frac0( a );
5428
aExp = extractFloat128Exp( a );
5429
aSign = extractFloat128Sign( a );
5430
aSig0 |= ( aSig1 != 0 );
5431
if ( 0x401E < aExp ) {
5432
if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5435
else if ( aExp < 0x3FFF ) {
5436
if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
5439
aSig0 |= LIT64( 0x0001000000000000 );
5440
shiftCount = 0x402F - aExp;
5442
aSig0 >>= shiftCount;
5444
if ( aSign ) z = - z;
5445
if ( ( z < 0 ) ^ aSign ) {
5447
float_raise( float_flag_invalid STATUS_VAR);
5448
return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5450
if ( ( aSig0<<shiftCount ) != savedASig ) {
5451
STATUS(float_exception_flags) |= float_flag_inexact;
5457
/*----------------------------------------------------------------------------
5458
| Returns the result of converting the quadruple-precision floating-point
5459
| value `a' to the 64-bit two's complement integer format. The conversion
5460
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5461
| Arithmetic---which means in particular that the conversion is rounded
5462
| according to the current rounding mode. If `a' is a NaN, the largest
5463
| positive integer is returned. Otherwise, if the conversion overflows, the
5464
| largest integer with the same sign as `a' is returned.
5465
*----------------------------------------------------------------------------*/
5467
int64 float128_to_int64( float128 a STATUS_PARAM )
5470
int32 aExp, shiftCount;
5471
uint64_t aSig0, aSig1;
5473
aSig1 = extractFloat128Frac1( a );
5474
aSig0 = extractFloat128Frac0( a );
5475
aExp = extractFloat128Exp( a );
5476
aSign = extractFloat128Sign( a );
5477
if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5478
shiftCount = 0x402F - aExp;
5479
if ( shiftCount <= 0 ) {
5480
if ( 0x403E < aExp ) {
5481
float_raise( float_flag_invalid STATUS_VAR);
5483
|| ( ( aExp == 0x7FFF )
5484
&& ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5487
return LIT64( 0x7FFFFFFFFFFFFFFF );
5489
return (int64_t) LIT64( 0x8000000000000000 );
5491
shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5494
shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5496
return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
5500
/*----------------------------------------------------------------------------
5501
| Returns the result of converting the quadruple-precision floating-point
5502
| value `a' to the 64-bit two's complement integer format. The conversion
5503
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5504
| Arithmetic, except that the conversion is always rounded toward zero.
5505
| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5506
| the conversion overflows, the largest integer with the same sign as `a' is
5508
*----------------------------------------------------------------------------*/
5510
int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
5513
int32 aExp, shiftCount;
5514
uint64_t aSig0, aSig1;
5517
aSig1 = extractFloat128Frac1( a );
5518
aSig0 = extractFloat128Frac0( a );
5519
aExp = extractFloat128Exp( a );
5520
aSign = extractFloat128Sign( a );
5521
if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5522
shiftCount = aExp - 0x402F;
5523
if ( 0 < shiftCount ) {
5524
if ( 0x403E <= aExp ) {
5525
aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5526
if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5527
&& ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5528
if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
5531
float_raise( float_flag_invalid STATUS_VAR);
5532
if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5533
return LIT64( 0x7FFFFFFFFFFFFFFF );
5536
return (int64_t) LIT64( 0x8000000000000000 );
5538
z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5539
if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5540
STATUS(float_exception_flags) |= float_flag_inexact;
5544
if ( aExp < 0x3FFF ) {
5545
if ( aExp | aSig0 | aSig1 ) {
5546
STATUS(float_exception_flags) |= float_flag_inexact;
5550
z = aSig0>>( - shiftCount );
5552
|| ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5553
STATUS(float_exception_flags) |= float_flag_inexact;
5556
if ( aSign ) z = - z;
5561
/*----------------------------------------------------------------------------
5562
| Returns the result of converting the quadruple-precision floating-point
5563
| value `a' to the single-precision floating-point format. The conversion
5564
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5566
*----------------------------------------------------------------------------*/
5568
float32 float128_to_float32( float128 a STATUS_PARAM )
5572
uint64_t aSig0, aSig1;
5575
aSig1 = extractFloat128Frac1( a );
5576
aSig0 = extractFloat128Frac0( a );
5577
aExp = extractFloat128Exp( a );
5578
aSign = extractFloat128Sign( a );
5579
if ( aExp == 0x7FFF ) {
5580
if ( aSig0 | aSig1 ) {
5581
return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5583
return packFloat32( aSign, 0xFF, 0 );
5585
aSig0 |= ( aSig1 != 0 );
5586
shift64RightJamming( aSig0, 18, &aSig0 );
5588
if ( aExp || zSig ) {
5592
return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5596
/*----------------------------------------------------------------------------
5597
| Returns the result of converting the quadruple-precision floating-point
5598
| value `a' to the double-precision floating-point format. The conversion
5599
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5601
*----------------------------------------------------------------------------*/
5603
float64 float128_to_float64( float128 a STATUS_PARAM )
5607
uint64_t aSig0, aSig1;
5609
aSig1 = extractFloat128Frac1( a );
5610
aSig0 = extractFloat128Frac0( a );
5611
aExp = extractFloat128Exp( a );
5612
aSign = extractFloat128Sign( a );
5613
if ( aExp == 0x7FFF ) {
5614
if ( aSig0 | aSig1 ) {
5615
return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5617
return packFloat64( aSign, 0x7FF, 0 );
5619
shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5620
aSig0 |= ( aSig1 != 0 );
5621
if ( aExp || aSig0 ) {
5622
aSig0 |= LIT64( 0x4000000000000000 );
5625
return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5629
/*----------------------------------------------------------------------------
5630
| Returns the result of converting the quadruple-precision floating-point
5631
| value `a' to the extended double-precision floating-point format. The
5632
| conversion is performed according to the IEC/IEEE Standard for Binary
5633
| Floating-Point Arithmetic.
5634
*----------------------------------------------------------------------------*/
5636
floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5640
uint64_t aSig0, aSig1;
5642
aSig1 = extractFloat128Frac1( a );
5643
aSig0 = extractFloat128Frac0( a );
5644
aExp = extractFloat128Exp( a );
5645
aSign = extractFloat128Sign( a );
5646
if ( aExp == 0x7FFF ) {
5647
if ( aSig0 | aSig1 ) {
5648
return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5650
return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5653
if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5654
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5657
aSig0 |= LIT64( 0x0001000000000000 );
5659
shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5660
return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5664
/*----------------------------------------------------------------------------
5665
| Rounds the quadruple-precision floating-point value `a' to an integer, and
5666
| returns the result as a quadruple-precision floating-point value. The
5667
| operation is performed according to the IEC/IEEE Standard for Binary
5668
| Floating-Point Arithmetic.
5669
*----------------------------------------------------------------------------*/
5671
float128 float128_round_to_int( float128 a STATUS_PARAM )
5675
uint64_t lastBitMask, roundBitsMask;
5679
aExp = extractFloat128Exp( a );
5680
if ( 0x402F <= aExp ) {
5681
if ( 0x406F <= aExp ) {
5682
if ( ( aExp == 0x7FFF )
5683
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5685
return propagateFloat128NaN( a, a STATUS_VAR );
5690
lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5691
roundBitsMask = lastBitMask - 1;
5693
roundingMode = STATUS(float_rounding_mode);
5694
if ( roundingMode == float_round_nearest_even ) {
5695
if ( lastBitMask ) {
5696
add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5697
if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5700
if ( (int64_t) z.low < 0 ) {
5702
if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5706
else if ( roundingMode != float_round_to_zero ) {
5707
if ( extractFloat128Sign( z )
5708
^ ( roundingMode == float_round_up ) ) {
5709
add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5712
z.low &= ~ roundBitsMask;
5715
if ( aExp < 0x3FFF ) {
5716
if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5717
STATUS(float_exception_flags) |= float_flag_inexact;
5718
aSign = extractFloat128Sign( a );
5719
switch ( STATUS(float_rounding_mode) ) {
5720
case float_round_nearest_even:
5721
if ( ( aExp == 0x3FFE )
5722
&& ( extractFloat128Frac0( a )
5723
| extractFloat128Frac1( a ) )
5725
return packFloat128( aSign, 0x3FFF, 0, 0 );
5728
case float_round_down:
5730
aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5731
: packFloat128( 0, 0, 0, 0 );
5732
case float_round_up:
5734
aSign ? packFloat128( 1, 0, 0, 0 )
5735
: packFloat128( 0, 0x3FFF, 0, 0 );
5737
return packFloat128( aSign, 0, 0, 0 );
5740
lastBitMask <<= 0x402F - aExp;
5741
roundBitsMask = lastBitMask - 1;
5744
roundingMode = STATUS(float_rounding_mode);
5745
if ( roundingMode == float_round_nearest_even ) {
5746
z.high += lastBitMask>>1;
5747
if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5748
z.high &= ~ lastBitMask;
5751
else if ( roundingMode != float_round_to_zero ) {
5752
if ( extractFloat128Sign( z )
5753
^ ( roundingMode == float_round_up ) ) {
5754
z.high |= ( a.low != 0 );
5755
z.high += roundBitsMask;
5758
z.high &= ~ roundBitsMask;
5760
if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5761
STATUS(float_exception_flags) |= float_flag_inexact;
5767
/*----------------------------------------------------------------------------
5768
| Returns the result of adding the absolute values of the quadruple-precision
5769
| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
5770
| before being returned. `zSign' is ignored if the result is a NaN.
5771
| The addition is performed according to the IEC/IEEE Standard for Binary
5772
| Floating-Point Arithmetic.
5773
*----------------------------------------------------------------------------*/
5775
static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5777
int32 aExp, bExp, zExp;
5778
uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5781
aSig1 = extractFloat128Frac1( a );
5782
aSig0 = extractFloat128Frac0( a );
5783
aExp = extractFloat128Exp( a );
5784
bSig1 = extractFloat128Frac1( b );
5785
bSig0 = extractFloat128Frac0( b );
5786
bExp = extractFloat128Exp( b );
5787
expDiff = aExp - bExp;
5788
if ( 0 < expDiff ) {
5789
if ( aExp == 0x7FFF ) {
5790
if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5797
bSig0 |= LIT64( 0x0001000000000000 );
5799
shift128ExtraRightJamming(
5800
bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5803
else if ( expDiff < 0 ) {
5804
if ( bExp == 0x7FFF ) {
5805
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5806
return packFloat128( zSign, 0x7FFF, 0, 0 );
5812
aSig0 |= LIT64( 0x0001000000000000 );
5814
shift128ExtraRightJamming(
5815
aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5819
if ( aExp == 0x7FFF ) {
5820
if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5821
return propagateFloat128NaN( a, b STATUS_VAR );
5825
add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5827
if (STATUS(flush_to_zero)) {
5828
if (zSig0 | zSig1) {
5829
float_raise(float_flag_output_denormal STATUS_VAR);
5831
return packFloat128(zSign, 0, 0, 0);
5833
return packFloat128( zSign, 0, zSig0, zSig1 );
5836
zSig0 |= LIT64( 0x0002000000000000 );
5840
aSig0 |= LIT64( 0x0001000000000000 );
5841
add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5843
if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5846
shift128ExtraRightJamming(
5847
zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5849
return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5853
/*----------------------------------------------------------------------------
5854
| Returns the result of subtracting the absolute values of the quadruple-
5855
| precision floating-point values `a' and `b'. If `zSign' is 1, the
5856
| difference is negated before being returned. `zSign' is ignored if the
5857
| result is a NaN. The subtraction is performed according to the IEC/IEEE
5858
| Standard for Binary Floating-Point Arithmetic.
5859
*----------------------------------------------------------------------------*/
5861
static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5863
int32 aExp, bExp, zExp;
5864
uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5868
aSig1 = extractFloat128Frac1( a );
5869
aSig0 = extractFloat128Frac0( a );
5870
aExp = extractFloat128Exp( a );
5871
bSig1 = extractFloat128Frac1( b );
5872
bSig0 = extractFloat128Frac0( b );
5873
bExp = extractFloat128Exp( b );
5874
expDiff = aExp - bExp;
5875
shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5876
shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5877
if ( 0 < expDiff ) goto aExpBigger;
5878
if ( expDiff < 0 ) goto bExpBigger;
5879
if ( aExp == 0x7FFF ) {
5880
if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5881
return propagateFloat128NaN( a, b STATUS_VAR );
5883
float_raise( float_flag_invalid STATUS_VAR);
5884
z.low = float128_default_nan_low;
5885
z.high = float128_default_nan_high;
5892
if ( bSig0 < aSig0 ) goto aBigger;
5893
if ( aSig0 < bSig0 ) goto bBigger;
5894
if ( bSig1 < aSig1 ) goto aBigger;
5895
if ( aSig1 < bSig1 ) goto bBigger;
5896
return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5898
if ( bExp == 0x7FFF ) {
5899
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5900
return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5906
aSig0 |= LIT64( 0x4000000000000000 );
5908
shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5909
bSig0 |= LIT64( 0x4000000000000000 );
5911
sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5914
goto normalizeRoundAndPack;
5916
if ( aExp == 0x7FFF ) {
5917
if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5924
bSig0 |= LIT64( 0x4000000000000000 );
5926
shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5927
aSig0 |= LIT64( 0x4000000000000000 );
5929
sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5931
normalizeRoundAndPack:
5933
return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5937
/*----------------------------------------------------------------------------
5938
| Returns the result of adding the quadruple-precision floating-point values
5939
| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
5940
| for Binary Floating-Point Arithmetic.
5941
*----------------------------------------------------------------------------*/
5943
float128 float128_add( float128 a, float128 b STATUS_PARAM )
5947
aSign = extractFloat128Sign( a );
5948
bSign = extractFloat128Sign( b );
5949
if ( aSign == bSign ) {
5950
return addFloat128Sigs( a, b, aSign STATUS_VAR );
5953
return subFloat128Sigs( a, b, aSign STATUS_VAR );
5958
/*----------------------------------------------------------------------------
5959
| Returns the result of subtracting the quadruple-precision floating-point
5960
| values `a' and `b'. The operation is performed according to the IEC/IEEE
5961
| Standard for Binary Floating-Point Arithmetic.
5962
*----------------------------------------------------------------------------*/
5964
float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5968
aSign = extractFloat128Sign( a );
5969
bSign = extractFloat128Sign( b );
5970
if ( aSign == bSign ) {
5971
return subFloat128Sigs( a, b, aSign STATUS_VAR );
5974
return addFloat128Sigs( a, b, aSign STATUS_VAR );
5979
/*----------------------------------------------------------------------------
5980
| Returns the result of multiplying the quadruple-precision floating-point
5981
| values `a' and `b'. The operation is performed according to the IEC/IEEE
5982
| Standard for Binary Floating-Point Arithmetic.
5983
*----------------------------------------------------------------------------*/
5985
float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5987
flag aSign, bSign, zSign;
5988
int32 aExp, bExp, zExp;
5989
uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5992
aSig1 = extractFloat128Frac1( a );
5993
aSig0 = extractFloat128Frac0( a );
5994
aExp = extractFloat128Exp( a );
5995
aSign = extractFloat128Sign( a );
5996
bSig1 = extractFloat128Frac1( b );
5997
bSig0 = extractFloat128Frac0( b );
5998
bExp = extractFloat128Exp( b );
5999
bSign = extractFloat128Sign( b );
6000
zSign = aSign ^ bSign;
6001
if ( aExp == 0x7FFF ) {
6002
if ( ( aSig0 | aSig1 )
6003
|| ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6004
return propagateFloat128NaN( a, b STATUS_VAR );
6006
if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6007
return packFloat128( zSign, 0x7FFF, 0, 0 );
6009
if ( bExp == 0x7FFF ) {
6010
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6011
if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6013
float_raise( float_flag_invalid STATUS_VAR);
6014
z.low = float128_default_nan_low;
6015
z.high = float128_default_nan_high;
6018
return packFloat128( zSign, 0x7FFF, 0, 0 );
6021
if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6022
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6025
if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6026
normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6028
zExp = aExp + bExp - 0x4000;
6029
aSig0 |= LIT64( 0x0001000000000000 );
6030
shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6031
mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6032
add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6033
zSig2 |= ( zSig3 != 0 );
6034
if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6035
shift128ExtraRightJamming(
6036
zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6039
return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6043
/*----------------------------------------------------------------------------
6044
| Returns the result of dividing the quadruple-precision floating-point value
6045
| `a' by the corresponding value `b'. The operation is performed according to
6046
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6047
*----------------------------------------------------------------------------*/
6049
float128 float128_div( float128 a, float128 b STATUS_PARAM )
6051
flag aSign, bSign, zSign;
6052
int32 aExp, bExp, zExp;
6053
uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6054
uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6057
aSig1 = extractFloat128Frac1( a );
6058
aSig0 = extractFloat128Frac0( a );
6059
aExp = extractFloat128Exp( a );
6060
aSign = extractFloat128Sign( a );
6061
bSig1 = extractFloat128Frac1( b );
6062
bSig0 = extractFloat128Frac0( b );
6063
bExp = extractFloat128Exp( b );
6064
bSign = extractFloat128Sign( b );
6065
zSign = aSign ^ bSign;
6066
if ( aExp == 0x7FFF ) {
6067
if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6068
if ( bExp == 0x7FFF ) {
6069
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6072
return packFloat128( zSign, 0x7FFF, 0, 0 );
6074
if ( bExp == 0x7FFF ) {
6075
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6076
return packFloat128( zSign, 0, 0, 0 );
6079
if ( ( bSig0 | bSig1 ) == 0 ) {
6080
if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6082
float_raise( float_flag_invalid STATUS_VAR);
6083
z.low = float128_default_nan_low;
6084
z.high = float128_default_nan_high;
6087
float_raise( float_flag_divbyzero STATUS_VAR);
6088
return packFloat128( zSign, 0x7FFF, 0, 0 );
6090
normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6093
if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6094
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6096
zExp = aExp - bExp + 0x3FFD;
6098
aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6100
bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6101
if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6102
shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6105
zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6106
mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6107
sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6108
while ( (int64_t) rem0 < 0 ) {
6110
add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6112
zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6113
if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6114
mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6115
sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6116
while ( (int64_t) rem1 < 0 ) {
6118
add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6120
zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6122
shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6123
return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6127
/*----------------------------------------------------------------------------
6128
| Returns the remainder of the quadruple-precision floating-point value `a'
6129
| with respect to the corresponding value `b'. The operation is performed
6130
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6131
*----------------------------------------------------------------------------*/
6133
float128 float128_rem( float128 a, float128 b STATUS_PARAM )
6136
int32 aExp, bExp, expDiff;
6137
uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6138
uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6142
aSig1 = extractFloat128Frac1( a );
6143
aSig0 = extractFloat128Frac0( a );
6144
aExp = extractFloat128Exp( a );
6145
aSign = extractFloat128Sign( a );
6146
bSig1 = extractFloat128Frac1( b );
6147
bSig0 = extractFloat128Frac0( b );
6148
bExp = extractFloat128Exp( b );
6149
if ( aExp == 0x7FFF ) {
6150
if ( ( aSig0 | aSig1 )
6151
|| ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6152
return propagateFloat128NaN( a, b STATUS_VAR );
6156
if ( bExp == 0x7FFF ) {
6157
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6161
if ( ( bSig0 | bSig1 ) == 0 ) {
6163
float_raise( float_flag_invalid STATUS_VAR);
6164
z.low = float128_default_nan_low;
6165
z.high = float128_default_nan_high;
6168
normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6171
if ( ( aSig0 | aSig1 ) == 0 ) return a;
6172
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6174
expDiff = aExp - bExp;
6175
if ( expDiff < -1 ) return a;
6177
aSig0 | LIT64( 0x0001000000000000 ),
6179
15 - ( expDiff < 0 ),
6184
bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6185
q = le128( bSig0, bSig1, aSig0, aSig1 );
6186
if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6188
while ( 0 < expDiff ) {
6189
q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6190
q = ( 4 < q ) ? q - 4 : 0;
6191
mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6192
shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6193
shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6194
sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6197
if ( -64 < expDiff ) {
6198
q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6199
q = ( 4 < q ) ? q - 4 : 0;
6201
shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6203
if ( expDiff < 0 ) {
6204
shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6207
shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6209
mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6210
sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6213
shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6214
shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6217
alternateASig0 = aSig0;
6218
alternateASig1 = aSig1;
6220
sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6221
} while ( 0 <= (int64_t) aSig0 );
6223
aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6224
if ( ( sigMean0 < 0 )
6225
|| ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6226
aSig0 = alternateASig0;
6227
aSig1 = alternateASig1;
6229
zSign = ( (int64_t) aSig0 < 0 );
6230
if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6232
normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
6236
/*----------------------------------------------------------------------------
6237
| Returns the square root of the quadruple-precision floating-point value `a'.
6238
| The operation is performed according to the IEC/IEEE Standard for Binary
6239
| Floating-Point Arithmetic.
6240
*----------------------------------------------------------------------------*/
6242
float128 float128_sqrt( float128 a STATUS_PARAM )
6246
uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6247
uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6250
aSig1 = extractFloat128Frac1( a );
6251
aSig0 = extractFloat128Frac0( a );
6252
aExp = extractFloat128Exp( a );
6253
aSign = extractFloat128Sign( a );
6254
if ( aExp == 0x7FFF ) {
6255
if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
6256
if ( ! aSign ) return a;
6260
if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6262
float_raise( float_flag_invalid STATUS_VAR);
6263
z.low = float128_default_nan_low;
6264
z.high = float128_default_nan_high;
6268
if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6269
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6271
zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6272
aSig0 |= LIT64( 0x0001000000000000 );
6273
zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6274
shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6275
zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6276
doubleZSig0 = zSig0<<1;
6277
mul64To128( zSig0, zSig0, &term0, &term1 );
6278
sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6279
while ( (int64_t) rem0 < 0 ) {
6282
add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6284
zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6285
if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6286
if ( zSig1 == 0 ) zSig1 = 1;
6287
mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6288
sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6289
mul64To128( zSig1, zSig1, &term2, &term3 );
6290
sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6291
while ( (int64_t) rem1 < 0 ) {
6293
shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6295
term2 |= doubleZSig0;
6296
add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6298
zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6300
shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6301
return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6305
/*----------------------------------------------------------------------------
6306
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
6307
| the corresponding value `b', and 0 otherwise. The invalid exception is
6308
| raised if either operand is a NaN. Otherwise, the comparison is performed
6309
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6310
*----------------------------------------------------------------------------*/
6312
int float128_eq( float128 a, float128 b STATUS_PARAM )
6315
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6316
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6317
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
6318
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6320
float_raise( float_flag_invalid STATUS_VAR);
6325
&& ( ( a.high == b.high )
6327
&& ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6332
/*----------------------------------------------------------------------------
6333
| Returns 1 if the quadruple-precision floating-point value `a' is less than
6334
| or equal to the corresponding value `b', and 0 otherwise. The invalid
6335
| exception is raised if either operand is a NaN. The comparison is performed
6336
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6337
*----------------------------------------------------------------------------*/
6339
int float128_le( float128 a, float128 b STATUS_PARAM )
6343
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6344
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6345
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
6346
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6348
float_raise( float_flag_invalid STATUS_VAR);
6351
aSign = extractFloat128Sign( a );
6352
bSign = extractFloat128Sign( b );
6353
if ( aSign != bSign ) {
6356
|| ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6360
aSign ? le128( b.high, b.low, a.high, a.low )
6361
: le128( a.high, a.low, b.high, b.low );
6365
/*----------------------------------------------------------------------------
6366
| Returns 1 if the quadruple-precision floating-point value `a' is less than
6367
| the corresponding value `b', and 0 otherwise. The invalid exception is
6368
| raised if either operand is a NaN. The comparison is performed according
6369
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6370
*----------------------------------------------------------------------------*/
6372
int float128_lt( float128 a, float128 b STATUS_PARAM )
6376
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6377
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6378
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
6379
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6381
float_raise( float_flag_invalid STATUS_VAR);
6384
aSign = extractFloat128Sign( a );
6385
bSign = extractFloat128Sign( b );
6386
if ( aSign != bSign ) {
6389
&& ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6393
aSign ? lt128( b.high, b.low, a.high, a.low )
6394
: lt128( a.high, a.low, b.high, b.low );
6398
/*----------------------------------------------------------------------------
6399
| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6400
| be compared, and 0 otherwise. The invalid exception is raised if either
6401
| operand is a NaN. The comparison is performed according to the IEC/IEEE
6402
| Standard for Binary Floating-Point Arithmetic.
6403
*----------------------------------------------------------------------------*/
6405
int float128_unordered( float128 a, float128 b STATUS_PARAM )
6407
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6408
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6409
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
6410
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6412
float_raise( float_flag_invalid STATUS_VAR);
6418
/*----------------------------------------------------------------------------
6419
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
6420
| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6421
| exception. The comparison is performed according to the IEC/IEEE Standard
6422
| for Binary Floating-Point Arithmetic.
6423
*----------------------------------------------------------------------------*/
6425
int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
6428
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6429
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6430
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
6431
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6433
if ( float128_is_signaling_nan( a )
6434
|| float128_is_signaling_nan( b ) ) {
6435
float_raise( float_flag_invalid STATUS_VAR);
6441
&& ( ( a.high == b.high )
6443
&& ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6448
/*----------------------------------------------------------------------------
6449
| Returns 1 if the quadruple-precision floating-point value `a' is less than
6450
| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6451
| cause an exception. Otherwise, the comparison is performed according to the
6452
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6453
*----------------------------------------------------------------------------*/
6455
int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
6459
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6460
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6461
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
6462
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6464
if ( float128_is_signaling_nan( a )
6465
|| float128_is_signaling_nan( b ) ) {
6466
float_raise( float_flag_invalid STATUS_VAR);
6470
aSign = extractFloat128Sign( a );
6471
bSign = extractFloat128Sign( b );
6472
if ( aSign != bSign ) {
6475
|| ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6479
aSign ? le128( b.high, b.low, a.high, a.low )
6480
: le128( a.high, a.low, b.high, b.low );
6484
/*----------------------------------------------------------------------------
6485
| Returns 1 if the quadruple-precision floating-point value `a' is less than
6486
| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6487
| exception. Otherwise, the comparison is performed according to the IEC/IEEE
6488
| Standard for Binary Floating-Point Arithmetic.
6489
*----------------------------------------------------------------------------*/
6491
int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
6495
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6496
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6497
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
6498
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6500
if ( float128_is_signaling_nan( a )
6501
|| float128_is_signaling_nan( b ) ) {
6502
float_raise( float_flag_invalid STATUS_VAR);
6506
aSign = extractFloat128Sign( a );
6507
bSign = extractFloat128Sign( b );
6508
if ( aSign != bSign ) {
6511
&& ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6515
aSign ? lt128( b.high, b.low, a.high, a.low )
6516
: lt128( a.high, a.low, b.high, b.low );
6520
/*----------------------------------------------------------------------------
6521
| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6522
| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6523
| comparison is performed according to the IEC/IEEE Standard for Binary
6524
| Floating-Point Arithmetic.
6525
*----------------------------------------------------------------------------*/
6527
int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
6529
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6530
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6531
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
6532
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6534
if ( float128_is_signaling_nan( a )
6535
|| float128_is_signaling_nan( b ) ) {
6536
float_raise( float_flag_invalid STATUS_VAR);
6543
/* misc functions */
6544
float32 uint32_to_float32(uint32_t a STATUS_PARAM)
6546
return int64_to_float32(a STATUS_VAR);
6549
float64 uint32_to_float64(uint32_t a STATUS_PARAM)
6551
return int64_to_float64(a STATUS_VAR);
6554
uint32 float32_to_uint32( float32 a STATUS_PARAM )
6558
int old_exc_flags = get_float_exception_flags(status);
6560
v = float32_to_int64(a STATUS_VAR);
6563
} else if (v > 0xffffffff) {
6568
set_float_exception_flags(old_exc_flags, status);
6569
float_raise(float_flag_invalid STATUS_VAR);
6573
uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
6577
int old_exc_flags = get_float_exception_flags(status);
6579
v = float32_to_int64_round_to_zero(a STATUS_VAR);
6582
} else if (v > 0xffffffff) {
6587
set_float_exception_flags(old_exc_flags, status);
6588
float_raise(float_flag_invalid STATUS_VAR);
6592
int_fast16_t float32_to_int16(float32 a STATUS_PARAM)
6596
int old_exc_flags = get_float_exception_flags(status);
6598
v = float32_to_int32(a STATUS_VAR);
6601
} else if (v > 0x7fff) {
6607
set_float_exception_flags(old_exc_flags, status);
6608
float_raise(float_flag_invalid STATUS_VAR);
6612
uint_fast16_t float32_to_uint16(float32 a STATUS_PARAM)
6616
int old_exc_flags = get_float_exception_flags(status);
6618
v = float32_to_int32(a STATUS_VAR);
6621
} else if (v > 0xffff) {
6627
set_float_exception_flags(old_exc_flags, status);
6628
float_raise(float_flag_invalid STATUS_VAR);
6632
uint_fast16_t float32_to_uint16_round_to_zero(float32 a STATUS_PARAM)
6636
int old_exc_flags = get_float_exception_flags(status);
6638
v = float32_to_int64_round_to_zero(a STATUS_VAR);
6641
} else if (v > 0xffff) {
6646
set_float_exception_flags(old_exc_flags, status);
6647
float_raise(float_flag_invalid STATUS_VAR);
6651
uint32 float64_to_uint32( float64 a STATUS_PARAM )
6656
v = float64_to_int64(a STATUS_VAR);
6659
float_raise( float_flag_invalid STATUS_VAR);
6660
} else if (v > 0xffffffff) {
6662
float_raise( float_flag_invalid STATUS_VAR);
6669
uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
6674
v = float64_to_int64_round_to_zero(a STATUS_VAR);
6677
float_raise( float_flag_invalid STATUS_VAR);
6678
} else if (v > 0xffffffff) {
6680
float_raise( float_flag_invalid STATUS_VAR);
6687
int_fast16_t float64_to_int16(float64 a STATUS_PARAM)
6691
int old_exc_flags = get_float_exception_flags(status);
6693
v = float64_to_int32(a STATUS_VAR);
6696
} else if (v > 0x7fff) {
6702
set_float_exception_flags(old_exc_flags, status);
6703
float_raise(float_flag_invalid STATUS_VAR);
6707
uint_fast16_t float64_to_uint16(float64 a STATUS_PARAM)
6711
int old_exc_flags = get_float_exception_flags(status);
6713
v = float64_to_int32(a STATUS_VAR);
6716
} else if (v > 0xffff) {
6722
set_float_exception_flags(old_exc_flags, status);
6723
float_raise(float_flag_invalid STATUS_VAR);
6727
uint_fast16_t float64_to_uint16_round_to_zero(float64 a STATUS_PARAM)
6731
int old_exc_flags = get_float_exception_flags(status);
6733
v = float64_to_int64_round_to_zero(a STATUS_VAR);
6736
} else if (v > 0xffff) {
6741
set_float_exception_flags(old_exc_flags, status);
6742
float_raise(float_flag_invalid STATUS_VAR);
6746
/*----------------------------------------------------------------------------
6747
| Returns the result of converting the double-precision floating-point value
6748
| `a' to the 64-bit unsigned integer format. The conversion is
6749
| performed according to the IEC/IEEE Standard for Binary Floating-Point
6750
| Arithmetic---which means in particular that the conversion is rounded
6751
| according to the current rounding mode. If `a' is a NaN, the largest
6752
| positive integer is returned. If the conversion overflows, the
6753
| largest unsigned integer is returned. If 'a' is negative, the value is
6754
| rounded and zero is returned; negative values that do not round to zero
6755
| will raise the inexact exception.
6756
*----------------------------------------------------------------------------*/
6758
uint64_t float64_to_uint64(float64 a STATUS_PARAM)
6761
int_fast16_t aExp, shiftCount;
6762
uint64_t aSig, aSigExtra;
6763
a = float64_squash_input_denormal(a STATUS_VAR);
6765
aSig = extractFloat64Frac(a);
6766
aExp = extractFloat64Exp(a);
6767
aSign = extractFloat64Sign(a);
6768
if (aSign && (aExp > 1022)) {
6769
float_raise(float_flag_invalid STATUS_VAR);
6770
if (float64_is_any_nan(a)) {
6771
return LIT64(0xFFFFFFFFFFFFFFFF);
6777
aSig |= LIT64(0x0010000000000000);
6779
shiftCount = 0x433 - aExp;
6780
if (shiftCount <= 0) {
6782
float_raise(float_flag_invalid STATUS_VAR);
6783
return LIT64(0xFFFFFFFFFFFFFFFF);
6786
aSig <<= -shiftCount;
6788
shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
6790
return roundAndPackUint64(aSign, aSig, aSigExtra STATUS_VAR);
6793
uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
6797
v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6798
v += float64_val(a);
6799
v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
6801
return v - INT64_MIN;
6804
#define COMPARE(s, nan_exp) \
6805
INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
6806
int is_quiet STATUS_PARAM ) \
6808
flag aSign, bSign; \
6809
uint ## s ## _t av, bv; \
6810
a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6811
b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
6813
if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
6814
extractFloat ## s ## Frac( a ) ) || \
6815
( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
6816
extractFloat ## s ## Frac( b ) )) { \
6818
float ## s ## _is_signaling_nan( a ) || \
6819
float ## s ## _is_signaling_nan( b ) ) { \
6820
float_raise( float_flag_invalid STATUS_VAR); \
6822
return float_relation_unordered; \
6824
aSign = extractFloat ## s ## Sign( a ); \
6825
bSign = extractFloat ## s ## Sign( b ); \
6826
av = float ## s ## _val(a); \
6827
bv = float ## s ## _val(b); \
6828
if ( aSign != bSign ) { \
6829
if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
6831
return float_relation_equal; \
6833
return 1 - (2 * aSign); \
6837
return float_relation_equal; \
6839
return 1 - 2 * (aSign ^ ( av < bv )); \
6844
int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
6846
return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
6849
int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
6851
return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
6857
INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,
6858
int is_quiet STATUS_PARAM )
6862
if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6863
( extractFloatx80Frac( a )<<1 ) ) ||
6864
( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6865
( extractFloatx80Frac( b )<<1 ) )) {
6867
floatx80_is_signaling_nan( a ) ||
6868
floatx80_is_signaling_nan( b ) ) {
6869
float_raise( float_flag_invalid STATUS_VAR);
6871
return float_relation_unordered;
6873
aSign = extractFloatx80Sign( a );
6874
bSign = extractFloatx80Sign( b );
6875
if ( aSign != bSign ) {
6877
if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6878
( ( a.low | b.low ) == 0 ) ) {
6880
return float_relation_equal;
6882
return 1 - (2 * aSign);
6885
if (a.low == b.low && a.high == b.high) {
6886
return float_relation_equal;
6888
return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6893
int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
6895
return floatx80_compare_internal(a, b, 0 STATUS_VAR);
6898
int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
6900
return floatx80_compare_internal(a, b, 1 STATUS_VAR);
6903
INLINE int float128_compare_internal( float128 a, float128 b,
6904
int is_quiet STATUS_PARAM )
6908
if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6909
( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6910
( ( extractFloat128Exp( b ) == 0x7fff ) &&
6911
( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6913
float128_is_signaling_nan( a ) ||
6914
float128_is_signaling_nan( b ) ) {
6915
float_raise( float_flag_invalid STATUS_VAR);
6917
return float_relation_unordered;
6919
aSign = extractFloat128Sign( a );
6920
bSign = extractFloat128Sign( b );
6921
if ( aSign != bSign ) {
6922
if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6924
return float_relation_equal;
6926
return 1 - (2 * aSign);
6929
if (a.low == b.low && a.high == b.high) {
6930
return float_relation_equal;
6932
return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6937
int float128_compare( float128 a, float128 b STATUS_PARAM )
6939
return float128_compare_internal(a, b, 0 STATUS_VAR);
6942
int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6944
return float128_compare_internal(a, b, 1 STATUS_VAR);
6947
/* min() and max() functions. These can't be implemented as
6948
* 'compare and pick one input' because that would mishandle
6949
* NaNs and +0 vs -0.
6951
* minnum() and maxnum() functions. These are similar to the min()
6952
* and max() functions but if one of the arguments is a QNaN and
6953
* the other is numerical then the numerical argument is returned.
6954
* minnum() and maxnum correspond to the IEEE 754-2008 minNum()
6955
* and maxNum() operations. min() and max() are the typical min/max
6956
* semantics provided by many CPUs which predate that specification.
6958
#define MINMAX(s, nan_exp) \
6959
INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b, \
6960
int ismin, int isieee STATUS_PARAM) \
6962
flag aSign, bSign; \
6963
uint ## s ## _t av, bv; \
6964
a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6965
b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
6966
if (float ## s ## _is_any_nan(a) || \
6967
float ## s ## _is_any_nan(b)) { \
6969
if (float ## s ## _is_quiet_nan(a) && \
6970
!float ## s ##_is_any_nan(b)) { \
6972
} else if (float ## s ## _is_quiet_nan(b) && \
6973
!float ## s ## _is_any_nan(a)) { \
6977
return propagateFloat ## s ## NaN(a, b STATUS_VAR); \
6979
aSign = extractFloat ## s ## Sign(a); \
6980
bSign = extractFloat ## s ## Sign(b); \
6981
av = float ## s ## _val(a); \
6982
bv = float ## s ## _val(b); \
6983
if (aSign != bSign) { \
6985
return aSign ? a : b; \
6987
return aSign ? b : a; \
6991
return (aSign ^ (av < bv)) ? a : b; \
6993
return (aSign ^ (av < bv)) ? b : a; \
6998
float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM) \
7000
return float ## s ## _minmax(a, b, 1, 0 STATUS_VAR); \
7003
float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM) \
7005
return float ## s ## _minmax(a, b, 0, 0 STATUS_VAR); \
7008
float ## s float ## s ## _minnum(float ## s a, float ## s b STATUS_PARAM) \
7010
return float ## s ## _minmax(a, b, 1, 1 STATUS_VAR); \
7013
float ## s float ## s ## _maxnum(float ## s a, float ## s b STATUS_PARAM) \
7015
return float ## s ## _minmax(a, b, 0, 1 STATUS_VAR); \
7022
/* Multiply A by 2 raised to the power N. */
7023
float32 float32_scalbn( float32 a, int n STATUS_PARAM )
7029
a = float32_squash_input_denormal(a STATUS_VAR);
7030
aSig = extractFloat32Frac( a );
7031
aExp = extractFloat32Exp( a );
7032
aSign = extractFloat32Sign( a );
7034
if ( aExp == 0xFF ) {
7036
return propagateFloat32NaN( a, a STATUS_VAR );
7042
} else if (aSig == 0) {
7050
} else if (n < -0x200) {
7056
return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
7059
float64 float64_scalbn( float64 a, int n STATUS_PARAM )
7065
a = float64_squash_input_denormal(a STATUS_VAR);
7066
aSig = extractFloat64Frac( a );
7067
aExp = extractFloat64Exp( a );
7068
aSign = extractFloat64Sign( a );
7070
if ( aExp == 0x7FF ) {
7072
return propagateFloat64NaN( a, a STATUS_VAR );
7077
aSig |= LIT64( 0x0010000000000000 );
7078
} else if (aSig == 0) {
7086
} else if (n < -0x1000) {
7092
return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
7095
floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
7101
aSig = extractFloatx80Frac( a );
7102
aExp = extractFloatx80Exp( a );
7103
aSign = extractFloatx80Sign( a );
7105
if ( aExp == 0x7FFF ) {
7107
return propagateFloatx80NaN( a, a STATUS_VAR );
7121
} else if (n < -0x10000) {
7126
return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
7127
aSign, aExp, aSig, 0 STATUS_VAR );
7130
float128 float128_scalbn( float128 a, int n STATUS_PARAM )
7134
uint64_t aSig0, aSig1;
7136
aSig1 = extractFloat128Frac1( a );
7137
aSig0 = extractFloat128Frac0( a );
7138
aExp = extractFloat128Exp( a );
7139
aSign = extractFloat128Sign( a );
7140
if ( aExp == 0x7FFF ) {
7141
if ( aSig0 | aSig1 ) {
7142
return propagateFloat128NaN( a, a STATUS_VAR );
7147
aSig0 |= LIT64( 0x0001000000000000 );
7148
} else if (aSig0 == 0 && aSig1 == 0) {
7156
} else if (n < -0x10000) {
7161
return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1