4
* Derived from SoftFloat.
7
/*============================================================================
9
This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
12
Written by John R. Hauser. This work was made possible in part by the
13
International Computer Science Institute, located at Suite 600, 1947 Center
14
Street, Berkeley, California 94704. Funding was partially provided by the
15
National Science Foundation under grant MIP-9311980. The original version
16
of this code was written as part of a project to build a fixed-point vector
17
processor in collaboration with the University of California at Berkeley,
18
overseen by Profs. Nelson Morgan and John Wawrzynek. More information
19
is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
20
arithmetic/SoftFloat.html'.
22
THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
23
been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
24
RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
25
AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
26
COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
27
EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
28
INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
29
OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
31
Derivative works are acceptable, even for commercial purposes, so long as
32
(1) the source code for the derivative work includes prominent notice that
33
the work is derivative, and (2) the source code includes prominent notice with
34
these four paragraphs for those parts of this code that are retained.
36
=============================================================================*/
38
/* softfloat (and in particular the code in softfloat-specialize.h) is
39
* target-dependent and needs the TARGET_* macros.
43
#include "fpu/softfloat.h"
45
/*----------------------------------------------------------------------------
46
| Primitive arithmetic functions, including multi-word arithmetic, and
47
| division and square root approximations. (Can be specialized to target if
49
*----------------------------------------------------------------------------*/
50
#include "softfloat-macros.h"
52
/*----------------------------------------------------------------------------
53
| Functions and definitions to determine: (1) whether tininess for underflow
54
| is detected before or after rounding by default, (2) what (if anything)
55
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
56
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
57
| are propagated from function inputs to output. These details are target-
59
*----------------------------------------------------------------------------*/
60
#include "softfloat-specialize.h"
62
/*----------------------------------------------------------------------------
63
| Returns the fraction bits of the half-precision floating-point value `a'.
64
*----------------------------------------------------------------------------*/
66
INLINE uint32_t extractFloat16Frac(float16 a)
68
return float16_val(a) & 0x3ff;
71
/*----------------------------------------------------------------------------
72
| Returns the exponent bits of the half-precision floating-point value `a'.
73
*----------------------------------------------------------------------------*/
75
INLINE int_fast16_t extractFloat16Exp(float16 a)
77
return (float16_val(a) >> 10) & 0x1f;
80
/*----------------------------------------------------------------------------
81
| Returns the sign bit of the single-precision floating-point value `a'.
82
*----------------------------------------------------------------------------*/
84
INLINE flag extractFloat16Sign(float16 a)
86
return float16_val(a)>>15;
89
/*----------------------------------------------------------------------------
90
| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
91
| and 7, and returns the properly rounded 32-bit integer corresponding to the
92
| input. If `zSign' is 1, the input is negated before being converted to an
93
| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
94
| is simply rounded to an integer, with the inexact exception raised if the
95
| input cannot be represented exactly as an integer. However, if the fixed-
96
| point input is too large, the invalid exception is raised and the largest
97
| positive or negative integer is returned.
98
*----------------------------------------------------------------------------*/
100
static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
103
flag roundNearestEven;
104
int8 roundIncrement, roundBits;
107
roundingMode = STATUS(float_rounding_mode);
108
roundNearestEven = ( roundingMode == float_round_nearest_even );
109
roundIncrement = 0x40;
110
if ( ! roundNearestEven ) {
111
if ( roundingMode == float_round_to_zero ) {
115
roundIncrement = 0x7F;
117
if ( roundingMode == float_round_up ) roundIncrement = 0;
120
if ( roundingMode == float_round_down ) roundIncrement = 0;
124
roundBits = absZ & 0x7F;
125
absZ = ( absZ + roundIncrement )>>7;
126
absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
128
if ( zSign ) z = - z;
129
if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
130
float_raise( float_flag_invalid STATUS_VAR);
131
return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
133
if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
138
/*----------------------------------------------------------------------------
139
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
140
| `absZ1', with binary point between bits 63 and 64 (between the input words),
141
| and returns the properly rounded 64-bit integer corresponding to the input.
142
| If `zSign' is 1, the input is negated before being converted to an integer.
143
| Ordinarily, the fixed-point input is simply rounded to an integer, with
144
| the inexact exception raised if the input cannot be represented exactly as
145
| an integer. However, if the fixed-point input is too large, the invalid
146
| exception is raised and the largest positive or negative integer is
148
*----------------------------------------------------------------------------*/
150
static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
153
flag roundNearestEven, increment;
156
roundingMode = STATUS(float_rounding_mode);
157
roundNearestEven = ( roundingMode == float_round_nearest_even );
158
increment = ( (int64_t) absZ1 < 0 );
159
if ( ! roundNearestEven ) {
160
if ( roundingMode == float_round_to_zero ) {
165
increment = ( roundingMode == float_round_down ) && absZ1;
168
increment = ( roundingMode == float_round_up ) && absZ1;
174
if ( absZ0 == 0 ) goto overflow;
175
absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
178
if ( zSign ) z = - z;
179
if ( z && ( ( z < 0 ) ^ zSign ) ) {
181
float_raise( float_flag_invalid STATUS_VAR);
183
zSign ? (int64_t) LIT64( 0x8000000000000000 )
184
: LIT64( 0x7FFFFFFFFFFFFFFF );
186
if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
191
/*----------------------------------------------------------------------------
192
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
193
| `absZ1', with binary point between bits 63 and 64 (between the input words),
194
| and returns the properly rounded 64-bit unsigned integer corresponding to the
195
| input. Ordinarily, the fixed-point input is simply rounded to an integer,
196
| with the inexact exception raised if the input cannot be represented exactly
197
| as an integer. However, if the fixed-point input is too large, the invalid
198
| exception is raised and the largest unsigned integer is returned.
199
*----------------------------------------------------------------------------*/
201
static int64 roundAndPackUint64(flag zSign, uint64_t absZ0,
202
uint64_t absZ1 STATUS_PARAM)
205
flag roundNearestEven, increment;
207
roundingMode = STATUS(float_rounding_mode);
208
roundNearestEven = (roundingMode == float_round_nearest_even);
209
increment = ((int64_t)absZ1 < 0);
210
if (!roundNearestEven) {
211
if (roundingMode == float_round_to_zero) {
215
increment = (roundingMode == float_round_down) && absZ1;
217
increment = (roundingMode == float_round_up) && absZ1;
224
float_raise(float_flag_invalid STATUS_VAR);
225
return LIT64(0xFFFFFFFFFFFFFFFF);
227
absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
230
if (zSign && absZ0) {
231
float_raise(float_flag_invalid STATUS_VAR);
236
STATUS(float_exception_flags) |= float_flag_inexact;
241
/*----------------------------------------------------------------------------
242
| Returns the fraction bits of the single-precision floating-point value `a'.
243
*----------------------------------------------------------------------------*/
245
INLINE uint32_t extractFloat32Frac( float32 a )
248
return float32_val(a) & 0x007FFFFF;
252
/*----------------------------------------------------------------------------
253
| Returns the exponent bits of the single-precision floating-point value `a'.
254
*----------------------------------------------------------------------------*/
256
INLINE int_fast16_t extractFloat32Exp(float32 a)
259
return ( float32_val(a)>>23 ) & 0xFF;
263
/*----------------------------------------------------------------------------
264
| Returns the sign bit of the single-precision floating-point value `a'.
265
*----------------------------------------------------------------------------*/
267
INLINE flag extractFloat32Sign( float32 a )
270
return float32_val(a)>>31;
274
/*----------------------------------------------------------------------------
275
| If `a' is denormal and we are in flush-to-zero mode then set the
276
| input-denormal exception and return zero. Otherwise just return the value.
277
*----------------------------------------------------------------------------*/
278
static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
280
if (STATUS(flush_inputs_to_zero)) {
281
if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
282
float_raise(float_flag_input_denormal STATUS_VAR);
283
return make_float32(float32_val(a) & 0x80000000);
289
/*----------------------------------------------------------------------------
290
| Normalizes the subnormal single-precision floating-point value represented
291
| by the denormalized significand `aSig'. The normalized exponent and
292
| significand are stored at the locations pointed to by `zExpPtr' and
293
| `zSigPtr', respectively.
294
*----------------------------------------------------------------------------*/
297
normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
301
shiftCount = countLeadingZeros32( aSig ) - 8;
302
*zSigPtr = aSig<<shiftCount;
303
*zExpPtr = 1 - shiftCount;
307
/*----------------------------------------------------------------------------
308
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
309
| single-precision floating-point value, returning the result. After being
310
| shifted into the proper positions, the three fields are simply added
311
| together to form the result. This means that any integer portion of `zSig'
312
| will be added into the exponent. Since a properly normalized significand
313
| will have an integer portion equal to 1, the `zExp' input should be 1 less
314
| than the desired result exponent whenever `zSig' is a complete, normalized
316
*----------------------------------------------------------------------------*/
318
INLINE float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
322
( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
326
/*----------------------------------------------------------------------------
327
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
328
| and significand `zSig', and returns the proper single-precision floating-
329
| point value corresponding to the abstract input. Ordinarily, the abstract
330
| value is simply rounded and packed into the single-precision format, with
331
| the inexact exception raised if the abstract input cannot be represented
332
| exactly. However, if the abstract value is too large, the overflow and
333
| inexact exceptions are raised and an infinity or maximal finite value is
334
| returned. If the abstract value is too small, the input value is rounded to
335
| a subnormal number, and the underflow and inexact exceptions are raised if
336
| the abstract input cannot be represented exactly as a subnormal single-
337
| precision floating-point number.
338
| The input significand `zSig' has its binary point between bits 30
339
| and 29, which is 7 bits to the left of the usual location. This shifted
340
| significand must be normalized or smaller. If `zSig' is not normalized,
341
| `zExp' must be 0; in that case, the result returned is a subnormal number,
342
| and it must not require rounding. In the usual case that `zSig' is
343
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
344
| The handling of underflow and overflow follows the IEC/IEEE Standard for
345
| Binary Floating-Point Arithmetic.
346
*----------------------------------------------------------------------------*/
348
static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
351
flag roundNearestEven;
352
int8 roundIncrement, roundBits;
355
roundingMode = STATUS(float_rounding_mode);
356
roundNearestEven = ( roundingMode == float_round_nearest_even );
357
roundIncrement = 0x40;
358
if ( ! roundNearestEven ) {
359
if ( roundingMode == float_round_to_zero ) {
363
roundIncrement = 0x7F;
365
if ( roundingMode == float_round_up ) roundIncrement = 0;
368
if ( roundingMode == float_round_down ) roundIncrement = 0;
372
roundBits = zSig & 0x7F;
373
if ( 0xFD <= (uint16_t) zExp ) {
375
|| ( ( zExp == 0xFD )
376
&& ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
378
float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
379
return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
382
if (STATUS(flush_to_zero)) {
383
float_raise(float_flag_output_denormal STATUS_VAR);
384
return packFloat32(zSign, 0, 0);
387
( STATUS(float_detect_tininess) == float_tininess_before_rounding )
389
|| ( zSig + roundIncrement < 0x80000000 );
390
shift32RightJamming( zSig, - zExp, &zSig );
392
roundBits = zSig & 0x7F;
393
if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
396
if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
397
zSig = ( zSig + roundIncrement )>>7;
398
zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
399
if ( zSig == 0 ) zExp = 0;
400
return packFloat32( zSign, zExp, zSig );
404
/*----------------------------------------------------------------------------
405
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
406
| and significand `zSig', and returns the proper single-precision floating-
407
| point value corresponding to the abstract input. This routine is just like
408
| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
409
| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
410
| floating-point exponent.
411
*----------------------------------------------------------------------------*/
414
normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
418
shiftCount = countLeadingZeros32( zSig ) - 1;
419
return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
423
/*----------------------------------------------------------------------------
424
| Returns the fraction bits of the double-precision floating-point value `a'.
425
*----------------------------------------------------------------------------*/
427
INLINE uint64_t extractFloat64Frac( float64 a )
430
return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
434
/*----------------------------------------------------------------------------
435
| Returns the exponent bits of the double-precision floating-point value `a'.
436
*----------------------------------------------------------------------------*/
438
INLINE int_fast16_t extractFloat64Exp(float64 a)
441
return ( float64_val(a)>>52 ) & 0x7FF;
445
/*----------------------------------------------------------------------------
446
| Returns the sign bit of the double-precision floating-point value `a'.
447
*----------------------------------------------------------------------------*/
449
INLINE flag extractFloat64Sign( float64 a )
452
return float64_val(a)>>63;
456
/*----------------------------------------------------------------------------
457
| If `a' is denormal and we are in flush-to-zero mode then set the
458
| input-denormal exception and return zero. Otherwise just return the value.
459
*----------------------------------------------------------------------------*/
460
static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
462
if (STATUS(flush_inputs_to_zero)) {
463
if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
464
float_raise(float_flag_input_denormal STATUS_VAR);
465
return make_float64(float64_val(a) & (1ULL << 63));
471
/*----------------------------------------------------------------------------
472
| Normalizes the subnormal double-precision floating-point value represented
473
| by the denormalized significand `aSig'. The normalized exponent and
474
| significand are stored at the locations pointed to by `zExpPtr' and
475
| `zSigPtr', respectively.
476
*----------------------------------------------------------------------------*/
479
normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
483
shiftCount = countLeadingZeros64( aSig ) - 11;
484
*zSigPtr = aSig<<shiftCount;
485
*zExpPtr = 1 - shiftCount;
489
/*----------------------------------------------------------------------------
490
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
491
| double-precision floating-point value, returning the result. After being
492
| shifted into the proper positions, the three fields are simply added
493
| together to form the result. This means that any integer portion of `zSig'
494
| will be added into the exponent. Since a properly normalized significand
495
| will have an integer portion equal to 1, the `zExp' input should be 1 less
496
| than the desired result exponent whenever `zSig' is a complete, normalized
498
*----------------------------------------------------------------------------*/
500
INLINE float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
504
( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
508
/*----------------------------------------------------------------------------
509
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
510
| and significand `zSig', and returns the proper double-precision floating-
511
| point value corresponding to the abstract input. Ordinarily, the abstract
512
| value is simply rounded and packed into the double-precision format, with
513
| the inexact exception raised if the abstract input cannot be represented
514
| exactly. However, if the abstract value is too large, the overflow and
515
| inexact exceptions are raised and an infinity or maximal finite value is
516
| returned. If the abstract value is too small, the input value is rounded
517
| to a subnormal number, and the underflow and inexact exceptions are raised
518
| if the abstract input cannot be represented exactly as a subnormal double-
519
| precision floating-point number.
520
| The input significand `zSig' has its binary point between bits 62
521
| and 61, which is 10 bits to the left of the usual location. This shifted
522
| significand must be normalized or smaller. If `zSig' is not normalized,
523
| `zExp' must be 0; in that case, the result returned is a subnormal number,
524
| and it must not require rounding. In the usual case that `zSig' is
525
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
526
| The handling of underflow and overflow follows the IEC/IEEE Standard for
527
| Binary Floating-Point Arithmetic.
528
*----------------------------------------------------------------------------*/
530
static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
533
flag roundNearestEven;
534
int_fast16_t roundIncrement, roundBits;
537
roundingMode = STATUS(float_rounding_mode);
538
roundNearestEven = ( roundingMode == float_round_nearest_even );
539
roundIncrement = 0x200;
540
if ( ! roundNearestEven ) {
541
if ( roundingMode == float_round_to_zero ) {
545
roundIncrement = 0x3FF;
547
if ( roundingMode == float_round_up ) roundIncrement = 0;
550
if ( roundingMode == float_round_down ) roundIncrement = 0;
554
roundBits = zSig & 0x3FF;
555
if ( 0x7FD <= (uint16_t) zExp ) {
556
if ( ( 0x7FD < zExp )
557
|| ( ( zExp == 0x7FD )
558
&& ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
560
float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
561
return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
564
if (STATUS(flush_to_zero)) {
565
float_raise(float_flag_output_denormal STATUS_VAR);
566
return packFloat64(zSign, 0, 0);
569
( STATUS(float_detect_tininess) == float_tininess_before_rounding )
571
|| ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
572
shift64RightJamming( zSig, - zExp, &zSig );
574
roundBits = zSig & 0x3FF;
575
if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
578
if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
579
zSig = ( zSig + roundIncrement )>>10;
580
zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
581
if ( zSig == 0 ) zExp = 0;
582
return packFloat64( zSign, zExp, zSig );
586
/*----------------------------------------------------------------------------
587
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
588
| and significand `zSig', and returns the proper double-precision floating-
589
| point value corresponding to the abstract input. This routine is just like
590
| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
591
| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
592
| floating-point exponent.
593
*----------------------------------------------------------------------------*/
596
normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
600
shiftCount = countLeadingZeros64( zSig ) - 1;
601
return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
605
/*----------------------------------------------------------------------------
606
| Returns the fraction bits of the extended double-precision floating-point
608
*----------------------------------------------------------------------------*/
610
INLINE uint64_t extractFloatx80Frac( floatx80 a )
617
/*----------------------------------------------------------------------------
618
| Returns the exponent bits of the extended double-precision floating-point
620
*----------------------------------------------------------------------------*/
622
INLINE int32 extractFloatx80Exp( floatx80 a )
625
return a.high & 0x7FFF;
629
/*----------------------------------------------------------------------------
630
| Returns the sign bit of the extended double-precision floating-point value
632
*----------------------------------------------------------------------------*/
634
INLINE flag extractFloatx80Sign( floatx80 a )
641
/*----------------------------------------------------------------------------
642
| Normalizes the subnormal extended double-precision floating-point value
643
| represented by the denormalized significand `aSig'. The normalized exponent
644
| and significand are stored at the locations pointed to by `zExpPtr' and
645
| `zSigPtr', respectively.
646
*----------------------------------------------------------------------------*/
649
normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
653
shiftCount = countLeadingZeros64( aSig );
654
*zSigPtr = aSig<<shiftCount;
655
*zExpPtr = 1 - shiftCount;
659
/*----------------------------------------------------------------------------
660
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
661
| extended double-precision floating-point value, returning the result.
662
*----------------------------------------------------------------------------*/
664
INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
669
z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
674
/*----------------------------------------------------------------------------
675
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
676
| and extended significand formed by the concatenation of `zSig0' and `zSig1',
677
| and returns the proper extended double-precision floating-point value
678
| corresponding to the abstract input. Ordinarily, the abstract value is
679
| rounded and packed into the extended double-precision format, with the
680
| inexact exception raised if the abstract input cannot be represented
681
| exactly. However, if the abstract value is too large, the overflow and
682
| inexact exceptions are raised and an infinity or maximal finite value is
683
| returned. If the abstract value is too small, the input value is rounded to
684
| a subnormal number, and the underflow and inexact exceptions are raised if
685
| the abstract input cannot be represented exactly as a subnormal extended
686
| double-precision floating-point number.
687
| If `roundingPrecision' is 32 or 64, the result is rounded to the same
688
| number of bits as single or double precision, respectively. Otherwise, the
689
| result is rounded to the full precision of the extended double-precision
691
| The input significand must be normalized or smaller. If the input
692
| significand is not normalized, `zExp' must be 0; in that case, the result
693
| returned is a subnormal number, and it must not require rounding. The
694
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
695
| Floating-Point Arithmetic.
696
*----------------------------------------------------------------------------*/
699
roundAndPackFloatx80(
700
int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
704
flag roundNearestEven, increment, isTiny;
705
int64 roundIncrement, roundMask, roundBits;
707
roundingMode = STATUS(float_rounding_mode);
708
roundNearestEven = ( roundingMode == float_round_nearest_even );
709
if ( roundingPrecision == 80 ) goto precision80;
710
if ( roundingPrecision == 64 ) {
711
roundIncrement = LIT64( 0x0000000000000400 );
712
roundMask = LIT64( 0x00000000000007FF );
714
else if ( roundingPrecision == 32 ) {
715
roundIncrement = LIT64( 0x0000008000000000 );
716
roundMask = LIT64( 0x000000FFFFFFFFFF );
721
zSig0 |= ( zSig1 != 0 );
722
if ( ! roundNearestEven ) {
723
if ( roundingMode == float_round_to_zero ) {
727
roundIncrement = roundMask;
729
if ( roundingMode == float_round_up ) roundIncrement = 0;
732
if ( roundingMode == float_round_down ) roundIncrement = 0;
736
roundBits = zSig0 & roundMask;
737
if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
738
if ( ( 0x7FFE < zExp )
739
|| ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
744
if (STATUS(flush_to_zero)) {
745
float_raise(float_flag_output_denormal STATUS_VAR);
746
return packFloatx80(zSign, 0, 0);
749
( STATUS(float_detect_tininess) == float_tininess_before_rounding )
751
|| ( zSig0 <= zSig0 + roundIncrement );
752
shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
754
roundBits = zSig0 & roundMask;
755
if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
756
if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
757
zSig0 += roundIncrement;
758
if ( (int64_t) zSig0 < 0 ) zExp = 1;
759
roundIncrement = roundMask + 1;
760
if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
761
roundMask |= roundIncrement;
763
zSig0 &= ~ roundMask;
764
return packFloatx80( zSign, zExp, zSig0 );
767
if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
768
zSig0 += roundIncrement;
769
if ( zSig0 < roundIncrement ) {
771
zSig0 = LIT64( 0x8000000000000000 );
773
roundIncrement = roundMask + 1;
774
if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
775
roundMask |= roundIncrement;
777
zSig0 &= ~ roundMask;
778
if ( zSig0 == 0 ) zExp = 0;
779
return packFloatx80( zSign, zExp, zSig0 );
781
increment = ( (int64_t) zSig1 < 0 );
782
if ( ! roundNearestEven ) {
783
if ( roundingMode == float_round_to_zero ) {
788
increment = ( roundingMode == float_round_down ) && zSig1;
791
increment = ( roundingMode == float_round_up ) && zSig1;
795
if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
796
if ( ( 0x7FFE < zExp )
797
|| ( ( zExp == 0x7FFE )
798
&& ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
804
float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
805
if ( ( roundingMode == float_round_to_zero )
806
|| ( zSign && ( roundingMode == float_round_up ) )
807
|| ( ! zSign && ( roundingMode == float_round_down ) )
809
return packFloatx80( zSign, 0x7FFE, ~ roundMask );
811
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
815
( STATUS(float_detect_tininess) == float_tininess_before_rounding )
818
|| ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
819
shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
821
if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
822
if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
823
if ( roundNearestEven ) {
824
increment = ( (int64_t) zSig1 < 0 );
828
increment = ( roundingMode == float_round_down ) && zSig1;
831
increment = ( roundingMode == float_round_up ) && zSig1;
837
~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
838
if ( (int64_t) zSig0 < 0 ) zExp = 1;
840
return packFloatx80( zSign, zExp, zSig0 );
843
if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
848
zSig0 = LIT64( 0x8000000000000000 );
851
zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
855
if ( zSig0 == 0 ) zExp = 0;
857
return packFloatx80( zSign, zExp, zSig0 );
861
/*----------------------------------------------------------------------------
862
| Takes an abstract floating-point value having sign `zSign', exponent
863
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
864
| and returns the proper extended double-precision floating-point value
865
| corresponding to the abstract input. This routine is just like
866
| `roundAndPackFloatx80' except that the input significand does not have to be
868
*----------------------------------------------------------------------------*/
871
normalizeRoundAndPackFloatx80(
872
int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
882
shiftCount = countLeadingZeros64( zSig0 );
883
shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
886
roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
890
/*----------------------------------------------------------------------------
891
| Returns the least-significant 64 fraction bits of the quadruple-precision
892
| floating-point value `a'.
893
*----------------------------------------------------------------------------*/
895
INLINE uint64_t extractFloat128Frac1( float128 a )
902
/*----------------------------------------------------------------------------
903
| Returns the most-significant 48 fraction bits of the quadruple-precision
904
| floating-point value `a'.
905
*----------------------------------------------------------------------------*/
907
INLINE uint64_t extractFloat128Frac0( float128 a )
910
return a.high & LIT64( 0x0000FFFFFFFFFFFF );
914
/*----------------------------------------------------------------------------
915
| Returns the exponent bits of the quadruple-precision floating-point value
917
*----------------------------------------------------------------------------*/
919
INLINE int32 extractFloat128Exp( float128 a )
922
return ( a.high>>48 ) & 0x7FFF;
926
/*----------------------------------------------------------------------------
927
| Returns the sign bit of the quadruple-precision floating-point value `a'.
928
*----------------------------------------------------------------------------*/
930
INLINE flag extractFloat128Sign( float128 a )
937
/*----------------------------------------------------------------------------
938
| Normalizes the subnormal quadruple-precision floating-point value
939
| represented by the denormalized significand formed by the concatenation of
940
| `aSig0' and `aSig1'. The normalized exponent is stored at the location
941
| pointed to by `zExpPtr'. The most significant 49 bits of the normalized
942
| significand are stored at the location pointed to by `zSig0Ptr', and the
943
| least significant 64 bits of the normalized significand are stored at the
944
| location pointed to by `zSig1Ptr'.
945
*----------------------------------------------------------------------------*/
948
normalizeFloat128Subnormal(
959
shiftCount = countLeadingZeros64( aSig1 ) - 15;
960
if ( shiftCount < 0 ) {
961
*zSig0Ptr = aSig1>>( - shiftCount );
962
*zSig1Ptr = aSig1<<( shiftCount & 63 );
965
*zSig0Ptr = aSig1<<shiftCount;
968
*zExpPtr = - shiftCount - 63;
971
shiftCount = countLeadingZeros64( aSig0 ) - 15;
972
shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
973
*zExpPtr = 1 - shiftCount;
978
/*----------------------------------------------------------------------------
979
| Packs the sign `zSign', the exponent `zExp', and the significand formed
980
| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
981
| floating-point value, returning the result. After being shifted into the
982
| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
983
| added together to form the most significant 32 bits of the result. This
984
| means that any integer portion of `zSig0' will be added into the exponent.
985
| Since a properly normalized significand will have an integer portion equal
986
| to 1, the `zExp' input should be 1 less than the desired result exponent
987
| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
989
*----------------------------------------------------------------------------*/
992
packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
997
z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
1002
/*----------------------------------------------------------------------------
1003
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1004
| and extended significand formed by the concatenation of `zSig0', `zSig1',
1005
| and `zSig2', and returns the proper quadruple-precision floating-point value
1006
| corresponding to the abstract input. Ordinarily, the abstract value is
1007
| simply rounded and packed into the quadruple-precision format, with the
1008
| inexact exception raised if the abstract input cannot be represented
1009
| exactly. However, if the abstract value is too large, the overflow and
1010
| inexact exceptions are raised and an infinity or maximal finite value is
1011
| returned. If the abstract value is too small, the input value is rounded to
1012
| a subnormal number, and the underflow and inexact exceptions are raised if
1013
| the abstract input cannot be represented exactly as a subnormal quadruple-
1014
| precision floating-point number.
1015
| The input significand must be normalized or smaller. If the input
1016
| significand is not normalized, `zExp' must be 0; in that case, the result
1017
| returned is a subnormal number, and it must not require rounding. In the
1018
| usual case that the input significand is normalized, `zExp' must be 1 less
1019
| than the ``true'' floating-point exponent. The handling of underflow and
1020
| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1021
*----------------------------------------------------------------------------*/
1024
roundAndPackFloat128(
1025
flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
1028
flag roundNearestEven, increment, isTiny;
1030
roundingMode = STATUS(float_rounding_mode);
1031
roundNearestEven = ( roundingMode == float_round_nearest_even );
1032
increment = ( (int64_t) zSig2 < 0 );
1033
if ( ! roundNearestEven ) {
1034
if ( roundingMode == float_round_to_zero ) {
1039
increment = ( roundingMode == float_round_down ) && zSig2;
1042
increment = ( roundingMode == float_round_up ) && zSig2;
1046
if ( 0x7FFD <= (uint32_t) zExp ) {
1047
if ( ( 0x7FFD < zExp )
1048
|| ( ( zExp == 0x7FFD )
1050
LIT64( 0x0001FFFFFFFFFFFF ),
1051
LIT64( 0xFFFFFFFFFFFFFFFF ),
1058
float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1059
if ( ( roundingMode == float_round_to_zero )
1060
|| ( zSign && ( roundingMode == float_round_up ) )
1061
|| ( ! zSign && ( roundingMode == float_round_down ) )
1067
LIT64( 0x0000FFFFFFFFFFFF ),
1068
LIT64( 0xFFFFFFFFFFFFFFFF )
1071
return packFloat128( zSign, 0x7FFF, 0, 0 );
1074
if (STATUS(flush_to_zero)) {
1075
float_raise(float_flag_output_denormal STATUS_VAR);
1076
return packFloat128(zSign, 0, 0, 0);
1079
( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1085
LIT64( 0x0001FFFFFFFFFFFF ),
1086
LIT64( 0xFFFFFFFFFFFFFFFF )
1088
shift128ExtraRightJamming(
1089
zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1091
if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1092
if ( roundNearestEven ) {
1093
increment = ( (int64_t) zSig2 < 0 );
1097
increment = ( roundingMode == float_round_down ) && zSig2;
1100
increment = ( roundingMode == float_round_up ) && zSig2;
1105
if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1107
add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1108
zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1111
if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1113
return packFloat128( zSign, zExp, zSig0, zSig1 );
1117
/*----------------------------------------------------------------------------
1118
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1119
| and significand formed by the concatenation of `zSig0' and `zSig1', and
1120
| returns the proper quadruple-precision floating-point value corresponding
1121
| to the abstract input. This routine is just like `roundAndPackFloat128'
1122
| except that the input significand has fewer bits and does not have to be
1123
| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1125
*----------------------------------------------------------------------------*/
1128
normalizeRoundAndPackFloat128(
1129
flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1139
shiftCount = countLeadingZeros64( zSig0 ) - 15;
1140
if ( 0 <= shiftCount ) {
1142
shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1145
shift128ExtraRightJamming(
1146
zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1149
return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1153
/*----------------------------------------------------------------------------
1154
| Returns the result of converting the 32-bit two's complement integer `a'
1155
| to the single-precision floating-point format. The conversion is performed
1156
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1157
*----------------------------------------------------------------------------*/
1159
float32 int32_to_float32(int32_t a STATUS_PARAM)
1163
if ( a == 0 ) return float32_zero;
1164
if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1166
return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1170
/*----------------------------------------------------------------------------
1171
| Returns the result of converting the 32-bit two's complement integer `a'
1172
| to the double-precision floating-point format. The conversion is performed
1173
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1174
*----------------------------------------------------------------------------*/
1176
float64 int32_to_float64(int32_t a STATUS_PARAM)
1183
if ( a == 0 ) return float64_zero;
1185
absA = zSign ? - a : a;
1186
shiftCount = countLeadingZeros32( absA ) + 21;
1188
return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1192
/*----------------------------------------------------------------------------
1193
| Returns the result of converting the 32-bit two's complement integer `a'
1194
| to the extended double-precision floating-point format. The conversion
1195
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1197
*----------------------------------------------------------------------------*/
1199
floatx80 int32_to_floatx80(int32_t a STATUS_PARAM)
1206
if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1208
absA = zSign ? - a : a;
1209
shiftCount = countLeadingZeros32( absA ) + 32;
1211
return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1215
/*----------------------------------------------------------------------------
1216
| Returns the result of converting the 32-bit two's complement integer `a' to
1217
| the quadruple-precision floating-point format. The conversion is performed
1218
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1219
*----------------------------------------------------------------------------*/
1221
float128 int32_to_float128(int32_t a STATUS_PARAM)
1228
if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1230
absA = zSign ? - a : a;
1231
shiftCount = countLeadingZeros32( absA ) + 17;
1233
return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1237
/*----------------------------------------------------------------------------
1238
| Returns the result of converting the 64-bit two's complement integer `a'
1239
| to the single-precision floating-point format. The conversion is performed
1240
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1241
*----------------------------------------------------------------------------*/
1243
float32 int64_to_float32(int64_t a STATUS_PARAM)
1249
if ( a == 0 ) return float32_zero;
1251
absA = zSign ? - a : a;
1252
shiftCount = countLeadingZeros64( absA ) - 40;
1253
if ( 0 <= shiftCount ) {
1254
return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1258
if ( shiftCount < 0 ) {
1259
shift64RightJamming( absA, - shiftCount, &absA );
1262
absA <<= shiftCount;
1264
return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1269
float32 uint64_to_float32(uint64_t a STATUS_PARAM)
1273
if ( a == 0 ) return float32_zero;
1274
shiftCount = countLeadingZeros64( a ) - 40;
1275
if ( 0 <= shiftCount ) {
1276
return packFloat32(0, 0x95 - shiftCount, a<<shiftCount);
1280
if ( shiftCount < 0 ) {
1281
shift64RightJamming( a, - shiftCount, &a );
1286
return roundAndPackFloat32(0, 0x9C - shiftCount, a STATUS_VAR);
1290
/*----------------------------------------------------------------------------
1291
| Returns the result of converting the 64-bit two's complement integer `a'
1292
| to the double-precision floating-point format. The conversion is performed
1293
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1294
*----------------------------------------------------------------------------*/
1296
float64 int64_to_float64(int64_t a STATUS_PARAM)
1300
if ( a == 0 ) return float64_zero;
1301
if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1302
return packFloat64( 1, 0x43E, 0 );
1305
return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1309
float64 uint64_to_float64(uint64_t a STATUS_PARAM)
1314
return float64_zero;
1316
if ((int64_t)a < 0) {
1317
shift64RightJamming(a, 1, &a);
1320
return normalizeRoundAndPackFloat64(0, exp, a STATUS_VAR);
1323
/*----------------------------------------------------------------------------
1324
| Returns the result of converting the 64-bit two's complement integer `a'
1325
| to the extended double-precision floating-point format. The conversion
1326
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1328
*----------------------------------------------------------------------------*/
1330
floatx80 int64_to_floatx80(int64_t a STATUS_PARAM)
1336
if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1338
absA = zSign ? - a : a;
1339
shiftCount = countLeadingZeros64( absA );
1340
return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1344
/*----------------------------------------------------------------------------
1345
| Returns the result of converting the 64-bit two's complement integer `a' to
1346
| the quadruple-precision floating-point format. The conversion is performed
1347
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1348
*----------------------------------------------------------------------------*/
1350
float128 int64_to_float128(int64_t a STATUS_PARAM)
1356
uint64_t zSig0, zSig1;
1358
if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1360
absA = zSign ? - a : a;
1361
shiftCount = countLeadingZeros64( absA ) + 49;
1362
zExp = 0x406E - shiftCount;
1363
if ( 64 <= shiftCount ) {
1372
shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1373
return packFloat128( zSign, zExp, zSig0, zSig1 );
1377
float128 uint64_to_float128(uint64_t a STATUS_PARAM)
1380
return float128_zero;
1382
return normalizeRoundAndPackFloat128(0, 0x406E, a, 0 STATUS_VAR);
1385
/*----------------------------------------------------------------------------
1386
| Returns the result of converting the single-precision floating-point value
1387
| `a' to the 32-bit two's complement integer format. The conversion is
1388
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1389
| Arithmetic---which means in particular that the conversion is rounded
1390
| according to the current rounding mode. If `a' is a NaN, the largest
1391
| positive integer is returned. Otherwise, if the conversion overflows, the
1392
| largest integer with the same sign as `a' is returned.
1393
*----------------------------------------------------------------------------*/
1395
int32 float32_to_int32( float32 a STATUS_PARAM )
1398
int_fast16_t aExp, shiftCount;
1402
a = float32_squash_input_denormal(a STATUS_VAR);
1403
aSig = extractFloat32Frac( a );
1404
aExp = extractFloat32Exp( a );
1405
aSign = extractFloat32Sign( a );
1406
if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1407
if ( aExp ) aSig |= 0x00800000;
1408
shiftCount = 0xAF - aExp;
1411
if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1412
return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1416
/*----------------------------------------------------------------------------
1417
| Returns the result of converting the single-precision floating-point value
1418
| `a' to the 32-bit two's complement integer format. The conversion is
1419
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1420
| Arithmetic, except that the conversion is always rounded toward zero.
1421
| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1422
| the conversion overflows, the largest integer with the same sign as `a' is
1424
*----------------------------------------------------------------------------*/
1426
int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1429
int_fast16_t aExp, shiftCount;
1432
a = float32_squash_input_denormal(a STATUS_VAR);
1434
aSig = extractFloat32Frac( a );
1435
aExp = extractFloat32Exp( a );
1436
aSign = extractFloat32Sign( a );
1437
shiftCount = aExp - 0x9E;
1438
if ( 0 <= shiftCount ) {
1439
if ( float32_val(a) != 0xCF000000 ) {
1440
float_raise( float_flag_invalid STATUS_VAR);
1441
if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1443
return (int32_t) 0x80000000;
1445
else if ( aExp <= 0x7E ) {
1446
if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1449
aSig = ( aSig | 0x00800000 )<<8;
1450
z = aSig>>( - shiftCount );
1451
if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1452
STATUS(float_exception_flags) |= float_flag_inexact;
1454
if ( aSign ) z = - z;
1459
/*----------------------------------------------------------------------------
1460
| Returns the result of converting the single-precision floating-point value
1461
| `a' to the 16-bit two's complement integer format. The conversion is
1462
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1463
| Arithmetic, except that the conversion is always rounded toward zero.
1464
| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1465
| the conversion overflows, the largest integer with the same sign as `a' is
1467
*----------------------------------------------------------------------------*/
1469
int_fast16_t float32_to_int16_round_to_zero(float32 a STATUS_PARAM)
1472
int_fast16_t aExp, shiftCount;
1476
aSig = extractFloat32Frac( a );
1477
aExp = extractFloat32Exp( a );
1478
aSign = extractFloat32Sign( a );
1479
shiftCount = aExp - 0x8E;
1480
if ( 0 <= shiftCount ) {
1481
if ( float32_val(a) != 0xC7000000 ) {
1482
float_raise( float_flag_invalid STATUS_VAR);
1483
if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1487
return (int32_t) 0xffff8000;
1489
else if ( aExp <= 0x7E ) {
1490
if ( aExp | aSig ) {
1491
STATUS(float_exception_flags) |= float_flag_inexact;
1496
aSig = ( aSig | 0x00800000 )<<8;
1497
z = aSig>>( - shiftCount );
1498
if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1499
STATUS(float_exception_flags) |= float_flag_inexact;
1508
/*----------------------------------------------------------------------------
1509
| Returns the result of converting the single-precision floating-point value
1510
| `a' to the 64-bit two's complement integer format. The conversion is
1511
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1512
| Arithmetic---which means in particular that the conversion is rounded
1513
| according to the current rounding mode. If `a' is a NaN, the largest
1514
| positive integer is returned. Otherwise, if the conversion overflows, the
1515
| largest integer with the same sign as `a' is returned.
1516
*----------------------------------------------------------------------------*/
1518
int64 float32_to_int64( float32 a STATUS_PARAM )
1521
int_fast16_t aExp, shiftCount;
1523
uint64_t aSig64, aSigExtra;
1524
a = float32_squash_input_denormal(a STATUS_VAR);
1526
aSig = extractFloat32Frac( a );
1527
aExp = extractFloat32Exp( a );
1528
aSign = extractFloat32Sign( a );
1529
shiftCount = 0xBE - aExp;
1530
if ( shiftCount < 0 ) {
1531
float_raise( float_flag_invalid STATUS_VAR);
1532
if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1533
return LIT64( 0x7FFFFFFFFFFFFFFF );
1535
return (int64_t) LIT64( 0x8000000000000000 );
1537
if ( aExp ) aSig |= 0x00800000;
1540
shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1541
return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1545
/*----------------------------------------------------------------------------
1546
| Returns the result of converting the single-precision floating-point value
1547
| `a' to the 64-bit unsigned integer format. The conversion is
1548
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1549
| Arithmetic---which means in particular that the conversion is rounded
1550
| according to the current rounding mode. If `a' is a NaN, the largest
1551
| unsigned integer is returned. Otherwise, if the conversion overflows, the
1552
| largest unsigned integer is returned. If the 'a' is negative, the result
1553
| is rounded and zero is returned; values that do not round to zero will
1554
| raise the inexact exception flag.
1555
*----------------------------------------------------------------------------*/
1557
uint64 float32_to_uint64(float32 a STATUS_PARAM)
1560
int_fast16_t aExp, shiftCount;
1562
uint64_t aSig64, aSigExtra;
1563
a = float32_squash_input_denormal(a STATUS_VAR);
1565
aSig = extractFloat32Frac(a);
1566
aExp = extractFloat32Exp(a);
1567
aSign = extractFloat32Sign(a);
1568
if ((aSign) && (aExp > 126)) {
1569
float_raise(float_flag_invalid STATUS_VAR);
1570
if (float32_is_any_nan(a)) {
1571
return LIT64(0xFFFFFFFFFFFFFFFF);
1576
shiftCount = 0xBE - aExp;
1580
if (shiftCount < 0) {
1581
float_raise(float_flag_invalid STATUS_VAR);
1582
return LIT64(0xFFFFFFFFFFFFFFFF);
1587
shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
1588
return roundAndPackUint64(aSign, aSig64, aSigExtra STATUS_VAR);
1591
/*----------------------------------------------------------------------------
1592
| Returns the result of converting the single-precision floating-point value
1593
| `a' to the 64-bit two's complement integer format. The conversion is
1594
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1595
| Arithmetic, except that the conversion is always rounded toward zero. If
1596
| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1597
| conversion overflows, the largest integer with the same sign as `a' is
1599
*----------------------------------------------------------------------------*/
1601
int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1604
int_fast16_t aExp, shiftCount;
1608
a = float32_squash_input_denormal(a STATUS_VAR);
1610
aSig = extractFloat32Frac( a );
1611
aExp = extractFloat32Exp( a );
1612
aSign = extractFloat32Sign( a );
1613
shiftCount = aExp - 0xBE;
1614
if ( 0 <= shiftCount ) {
1615
if ( float32_val(a) != 0xDF000000 ) {
1616
float_raise( float_flag_invalid STATUS_VAR);
1617
if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1618
return LIT64( 0x7FFFFFFFFFFFFFFF );
1621
return (int64_t) LIT64( 0x8000000000000000 );
1623
else if ( aExp <= 0x7E ) {
1624
if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1627
aSig64 = aSig | 0x00800000;
1629
z = aSig64>>( - shiftCount );
1630
if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1631
STATUS(float_exception_flags) |= float_flag_inexact;
1633
if ( aSign ) z = - z;
1638
/*----------------------------------------------------------------------------
1639
| Returns the result of converting the single-precision floating-point value
1640
| `a' to the double-precision floating-point format. The conversion is
1641
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1643
*----------------------------------------------------------------------------*/
1645
float64 float32_to_float64( float32 a STATUS_PARAM )
1650
a = float32_squash_input_denormal(a STATUS_VAR);
1652
aSig = extractFloat32Frac( a );
1653
aExp = extractFloat32Exp( a );
1654
aSign = extractFloat32Sign( a );
1655
if ( aExp == 0xFF ) {
1656
if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1657
return packFloat64( aSign, 0x7FF, 0 );
1660
if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1661
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1664
return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1668
/*----------------------------------------------------------------------------
1669
| Returns the result of converting the single-precision floating-point value
1670
| `a' to the extended double-precision floating-point format. The conversion
1671
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1673
*----------------------------------------------------------------------------*/
1675
floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1681
a = float32_squash_input_denormal(a STATUS_VAR);
1682
aSig = extractFloat32Frac( a );
1683
aExp = extractFloat32Exp( a );
1684
aSign = extractFloat32Sign( a );
1685
if ( aExp == 0xFF ) {
1686
if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1687
return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1690
if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1691
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1694
return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1698
/*----------------------------------------------------------------------------
1699
| Returns the result of converting the single-precision floating-point value
1700
| `a' to the double-precision floating-point format. The conversion is
1701
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1703
*----------------------------------------------------------------------------*/
1705
float128 float32_to_float128( float32 a STATUS_PARAM )
1711
a = float32_squash_input_denormal(a STATUS_VAR);
1712
aSig = extractFloat32Frac( a );
1713
aExp = extractFloat32Exp( a );
1714
aSign = extractFloat32Sign( a );
1715
if ( aExp == 0xFF ) {
1716
if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1717
return packFloat128( aSign, 0x7FFF, 0, 0 );
1720
if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1721
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1724
return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1728
/*----------------------------------------------------------------------------
1729
| Rounds the single-precision floating-point value `a' to an integer, and
1730
| returns the result as a single-precision floating-point value. The
1731
| operation is performed according to the IEC/IEEE Standard for Binary
1732
| Floating-Point Arithmetic.
1733
*----------------------------------------------------------------------------*/
1735
float32 float32_round_to_int( float32 a STATUS_PARAM)
1739
uint32_t lastBitMask, roundBitsMask;
1742
a = float32_squash_input_denormal(a STATUS_VAR);
1744
aExp = extractFloat32Exp( a );
1745
if ( 0x96 <= aExp ) {
1746
if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1747
return propagateFloat32NaN( a, a STATUS_VAR );
1751
if ( aExp <= 0x7E ) {
1752
if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1753
STATUS(float_exception_flags) |= float_flag_inexact;
1754
aSign = extractFloat32Sign( a );
1755
switch ( STATUS(float_rounding_mode) ) {
1756
case float_round_nearest_even:
1757
if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1758
return packFloat32( aSign, 0x7F, 0 );
1761
case float_round_down:
1762
return make_float32(aSign ? 0xBF800000 : 0);
1763
case float_round_up:
1764
return make_float32(aSign ? 0x80000000 : 0x3F800000);
1766
return packFloat32( aSign, 0, 0 );
1769
lastBitMask <<= 0x96 - aExp;
1770
roundBitsMask = lastBitMask - 1;
1772
roundingMode = STATUS(float_rounding_mode);
1773
if ( roundingMode == float_round_nearest_even ) {
1774
z += lastBitMask>>1;
1775
if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1777
else if ( roundingMode != float_round_to_zero ) {
1778
if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1782
z &= ~ roundBitsMask;
1783
if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1784
return make_float32(z);
1788
/*----------------------------------------------------------------------------
1789
| Returns the result of adding the absolute values of the single-precision
1790
| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1791
| before being returned. `zSign' is ignored if the result is a NaN.
1792
| The addition is performed according to the IEC/IEEE Standard for Binary
1793
| Floating-Point Arithmetic.
1794
*----------------------------------------------------------------------------*/
1796
static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1798
int_fast16_t aExp, bExp, zExp;
1799
uint32_t aSig, bSig, zSig;
1800
int_fast16_t expDiff;
1802
aSig = extractFloat32Frac( a );
1803
aExp = extractFloat32Exp( a );
1804
bSig = extractFloat32Frac( b );
1805
bExp = extractFloat32Exp( b );
1806
expDiff = aExp - bExp;
1809
if ( 0 < expDiff ) {
1810
if ( aExp == 0xFF ) {
1811
if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1820
shift32RightJamming( bSig, expDiff, &bSig );
1823
else if ( expDiff < 0 ) {
1824
if ( bExp == 0xFF ) {
1825
if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1826
return packFloat32( zSign, 0xFF, 0 );
1834
shift32RightJamming( aSig, - expDiff, &aSig );
1838
if ( aExp == 0xFF ) {
1839
if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1843
if (STATUS(flush_to_zero)) {
1845
float_raise(float_flag_output_denormal STATUS_VAR);
1847
return packFloat32(zSign, 0, 0);
1849
return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1851
zSig = 0x40000000 + aSig + bSig;
1856
zSig = ( aSig + bSig )<<1;
1858
if ( (int32_t) zSig < 0 ) {
1863
return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1867
/*----------------------------------------------------------------------------
1868
| Returns the result of subtracting the absolute values of the single-
1869
| precision floating-point values `a' and `b'. If `zSign' is 1, the
1870
| difference is negated before being returned. `zSign' is ignored if the
1871
| result is a NaN. The subtraction is performed according to the IEC/IEEE
1872
| Standard for Binary Floating-Point Arithmetic.
1873
*----------------------------------------------------------------------------*/
1875
static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1877
int_fast16_t aExp, bExp, zExp;
1878
uint32_t aSig, bSig, zSig;
1879
int_fast16_t expDiff;
1881
aSig = extractFloat32Frac( a );
1882
aExp = extractFloat32Exp( a );
1883
bSig = extractFloat32Frac( b );
1884
bExp = extractFloat32Exp( b );
1885
expDiff = aExp - bExp;
1888
if ( 0 < expDiff ) goto aExpBigger;
1889
if ( expDiff < 0 ) goto bExpBigger;
1890
if ( aExp == 0xFF ) {
1891
if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1892
float_raise( float_flag_invalid STATUS_VAR);
1893
return float32_default_nan;
1899
if ( bSig < aSig ) goto aBigger;
1900
if ( aSig < bSig ) goto bBigger;
1901
return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1903
if ( bExp == 0xFF ) {
1904
if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1905
return packFloat32( zSign ^ 1, 0xFF, 0 );
1913
shift32RightJamming( aSig, - expDiff, &aSig );
1919
goto normalizeRoundAndPack;
1921
if ( aExp == 0xFF ) {
1922
if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1931
shift32RightJamming( bSig, expDiff, &bSig );
1936
normalizeRoundAndPack:
1938
return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1942
/*----------------------------------------------------------------------------
1943
| Returns the result of adding the single-precision floating-point values `a'
1944
| and `b'. The operation is performed according to the IEC/IEEE Standard for
1945
| Binary Floating-Point Arithmetic.
1946
*----------------------------------------------------------------------------*/
1948
float32 float32_add( float32 a, float32 b STATUS_PARAM )
1951
a = float32_squash_input_denormal(a STATUS_VAR);
1952
b = float32_squash_input_denormal(b STATUS_VAR);
1954
aSign = extractFloat32Sign( a );
1955
bSign = extractFloat32Sign( b );
1956
if ( aSign == bSign ) {
1957
return addFloat32Sigs( a, b, aSign STATUS_VAR);
1960
return subFloat32Sigs( a, b, aSign STATUS_VAR );
1965
/*----------------------------------------------------------------------------
1966
| Returns the result of subtracting the single-precision floating-point values
1967
| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1968
| for Binary Floating-Point Arithmetic.
1969
*----------------------------------------------------------------------------*/
1971
float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1974
a = float32_squash_input_denormal(a STATUS_VAR);
1975
b = float32_squash_input_denormal(b STATUS_VAR);
1977
aSign = extractFloat32Sign( a );
1978
bSign = extractFloat32Sign( b );
1979
if ( aSign == bSign ) {
1980
return subFloat32Sigs( a, b, aSign STATUS_VAR );
1983
return addFloat32Sigs( a, b, aSign STATUS_VAR );
1988
/*----------------------------------------------------------------------------
1989
| Returns the result of multiplying the single-precision floating-point values
1990
| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1991
| for Binary Floating-Point Arithmetic.
1992
*----------------------------------------------------------------------------*/
1994
float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1996
flag aSign, bSign, zSign;
1997
int_fast16_t aExp, bExp, zExp;
1998
uint32_t aSig, bSig;
2002
a = float32_squash_input_denormal(a STATUS_VAR);
2003
b = float32_squash_input_denormal(b STATUS_VAR);
2005
aSig = extractFloat32Frac( a );
2006
aExp = extractFloat32Exp( a );
2007
aSign = extractFloat32Sign( a );
2008
bSig = extractFloat32Frac( b );
2009
bExp = extractFloat32Exp( b );
2010
bSign = extractFloat32Sign( b );
2011
zSign = aSign ^ bSign;
2012
if ( aExp == 0xFF ) {
2013
if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2014
return propagateFloat32NaN( a, b STATUS_VAR );
2016
if ( ( bExp | bSig ) == 0 ) {
2017
float_raise( float_flag_invalid STATUS_VAR);
2018
return float32_default_nan;
2020
return packFloat32( zSign, 0xFF, 0 );
2022
if ( bExp == 0xFF ) {
2023
if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2024
if ( ( aExp | aSig ) == 0 ) {
2025
float_raise( float_flag_invalid STATUS_VAR);
2026
return float32_default_nan;
2028
return packFloat32( zSign, 0xFF, 0 );
2031
if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2032
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2035
if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2036
normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2038
zExp = aExp + bExp - 0x7F;
2039
aSig = ( aSig | 0x00800000 )<<7;
2040
bSig = ( bSig | 0x00800000 )<<8;
2041
shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
2043
if ( 0 <= (int32_t) ( zSig<<1 ) ) {
2047
return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2051
/*----------------------------------------------------------------------------
2052
| Returns the result of dividing the single-precision floating-point value `a'
2053
| by the corresponding value `b'. The operation is performed according to the
2054
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2055
*----------------------------------------------------------------------------*/
2057
float32 float32_div( float32 a, float32 b STATUS_PARAM )
2059
flag aSign, bSign, zSign;
2060
int_fast16_t aExp, bExp, zExp;
2061
uint32_t aSig, bSig, zSig;
2062
a = float32_squash_input_denormal(a STATUS_VAR);
2063
b = float32_squash_input_denormal(b STATUS_VAR);
2065
aSig = extractFloat32Frac( a );
2066
aExp = extractFloat32Exp( a );
2067
aSign = extractFloat32Sign( a );
2068
bSig = extractFloat32Frac( b );
2069
bExp = extractFloat32Exp( b );
2070
bSign = extractFloat32Sign( b );
2071
zSign = aSign ^ bSign;
2072
if ( aExp == 0xFF ) {
2073
if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2074
if ( bExp == 0xFF ) {
2075
if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2076
float_raise( float_flag_invalid STATUS_VAR);
2077
return float32_default_nan;
2079
return packFloat32( zSign, 0xFF, 0 );
2081
if ( bExp == 0xFF ) {
2082
if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2083
return packFloat32( zSign, 0, 0 );
2087
if ( ( aExp | aSig ) == 0 ) {
2088
float_raise( float_flag_invalid STATUS_VAR);
2089
return float32_default_nan;
2091
float_raise( float_flag_divbyzero STATUS_VAR);
2092
return packFloat32( zSign, 0xFF, 0 );
2094
normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2097
if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2098
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2100
zExp = aExp - bExp + 0x7D;
2101
aSig = ( aSig | 0x00800000 )<<7;
2102
bSig = ( bSig | 0x00800000 )<<8;
2103
if ( bSig <= ( aSig + aSig ) ) {
2107
zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2108
if ( ( zSig & 0x3F ) == 0 ) {
2109
zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2111
return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2115
/*----------------------------------------------------------------------------
2116
| Returns the remainder of the single-precision floating-point value `a'
2117
| with respect to the corresponding value `b'. The operation is performed
2118
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2119
*----------------------------------------------------------------------------*/
2121
float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2124
int_fast16_t aExp, bExp, expDiff;
2125
uint32_t aSig, bSig;
2127
uint64_t aSig64, bSig64, q64;
2128
uint32_t alternateASig;
2130
a = float32_squash_input_denormal(a STATUS_VAR);
2131
b = float32_squash_input_denormal(b STATUS_VAR);
2133
aSig = extractFloat32Frac( a );
2134
aExp = extractFloat32Exp( a );
2135
aSign = extractFloat32Sign( a );
2136
bSig = extractFloat32Frac( b );
2137
bExp = extractFloat32Exp( b );
2138
if ( aExp == 0xFF ) {
2139
if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2140
return propagateFloat32NaN( a, b STATUS_VAR );
2142
float_raise( float_flag_invalid STATUS_VAR);
2143
return float32_default_nan;
2145
if ( bExp == 0xFF ) {
2146
if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2151
float_raise( float_flag_invalid STATUS_VAR);
2152
return float32_default_nan;
2154
normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2157
if ( aSig == 0 ) return a;
2158
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2160
expDiff = aExp - bExp;
2163
if ( expDiff < 32 ) {
2166
if ( expDiff < 0 ) {
2167
if ( expDiff < -1 ) return a;
2170
q = ( bSig <= aSig );
2171
if ( q ) aSig -= bSig;
2172
if ( 0 < expDiff ) {
2173
q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2176
aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2184
if ( bSig <= aSig ) aSig -= bSig;
2185
aSig64 = ( (uint64_t) aSig )<<40;
2186
bSig64 = ( (uint64_t) bSig )<<40;
2188
while ( 0 < expDiff ) {
2189
q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2190
q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2191
aSig64 = - ( ( bSig * q64 )<<38 );
2195
q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2196
q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2197
q = q64>>( 64 - expDiff );
2199
aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2202
alternateASig = aSig;
2205
} while ( 0 <= (int32_t) aSig );
2206
sigMean = aSig + alternateASig;
2207
if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2208
aSig = alternateASig;
2210
zSign = ( (int32_t) aSig < 0 );
2211
if ( zSign ) aSig = - aSig;
2212
return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2216
/*----------------------------------------------------------------------------
2217
| Returns the result of multiplying the single-precision floating-point values
2218
| `a' and `b' then adding 'c', with no intermediate rounding step after the
2219
| multiplication. The operation is performed according to the IEC/IEEE
2220
| Standard for Binary Floating-Point Arithmetic 754-2008.
2221
| The flags argument allows the caller to select negation of the
2222
| addend, the intermediate product, or the final result. (The difference
2223
| between this and having the caller do a separate negation is that negating
2224
| externally will flip the sign bit on NaNs.)
2225
*----------------------------------------------------------------------------*/
2227
float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)
2229
flag aSign, bSign, cSign, zSign;
2230
int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
2231
uint32_t aSig, bSig, cSig;
2232
flag pInf, pZero, pSign;
2233
uint64_t pSig64, cSig64, zSig64;
2236
flag signflip, infzero;
2238
a = float32_squash_input_denormal(a STATUS_VAR);
2239
b = float32_squash_input_denormal(b STATUS_VAR);
2240
c = float32_squash_input_denormal(c STATUS_VAR);
2241
aSig = extractFloat32Frac(a);
2242
aExp = extractFloat32Exp(a);
2243
aSign = extractFloat32Sign(a);
2244
bSig = extractFloat32Frac(b);
2245
bExp = extractFloat32Exp(b);
2246
bSign = extractFloat32Sign(b);
2247
cSig = extractFloat32Frac(c);
2248
cExp = extractFloat32Exp(c);
2249
cSign = extractFloat32Sign(c);
2251
infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2252
(aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2254
/* It is implementation-defined whether the cases of (0,inf,qnan)
2255
* and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2256
* they return if they do), so we have to hand this information
2257
* off to the target-specific pick-a-NaN routine.
2259
if (((aExp == 0xff) && aSig) ||
2260
((bExp == 0xff) && bSig) ||
2261
((cExp == 0xff) && cSig)) {
2262
return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR);
2266
float_raise(float_flag_invalid STATUS_VAR);
2267
return float32_default_nan;
2270
if (flags & float_muladd_negate_c) {
2274
signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2276
/* Work out the sign and type of the product */
2277
pSign = aSign ^ bSign;
2278
if (flags & float_muladd_negate_product) {
2281
pInf = (aExp == 0xff) || (bExp == 0xff);
2282
pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2285
if (pInf && (pSign ^ cSign)) {
2286
/* addition of opposite-signed infinities => InvalidOperation */
2287
float_raise(float_flag_invalid STATUS_VAR);
2288
return float32_default_nan;
2290
/* Otherwise generate an infinity of the same sign */
2291
return packFloat32(cSign ^ signflip, 0xff, 0);
2295
return packFloat32(pSign ^ signflip, 0xff, 0);
2301
/* Adding two exact zeroes */
2302
if (pSign == cSign) {
2304
} else if (STATUS(float_rounding_mode) == float_round_down) {
2309
return packFloat32(zSign ^ signflip, 0, 0);
2311
/* Exact zero plus a denorm */
2312
if (STATUS(flush_to_zero)) {
2313
float_raise(float_flag_output_denormal STATUS_VAR);
2314
return packFloat32(cSign ^ signflip, 0, 0);
2317
/* Zero plus something non-zero : just return the something */
2318
return packFloat32(cSign ^ signflip, cExp, cSig);
2322
normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2325
normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2328
/* Calculate the actual result a * b + c */
2330
/* Multiply first; this is easy. */
2331
/* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2332
* because we want the true exponent, not the "one-less-than"
2333
* flavour that roundAndPackFloat32() takes.
2335
pExp = aExp + bExp - 0x7e;
2336
aSig = (aSig | 0x00800000) << 7;
2337
bSig = (bSig | 0x00800000) << 8;
2338
pSig64 = (uint64_t)aSig * bSig;
2339
if ((int64_t)(pSig64 << 1) >= 0) {
2344
zSign = pSign ^ signflip;
2346
/* Now pSig64 is the significand of the multiply, with the explicit bit in
2351
/* Throw out the special case of c being an exact zero now */
2352
shift64RightJamming(pSig64, 32, &pSig64);
2354
return roundAndPackFloat32(zSign, pExp - 1,
2357
normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2360
cSig64 = (uint64_t)cSig << (62 - 23);
2361
cSig64 |= LIT64(0x4000000000000000);
2362
expDiff = pExp - cExp;
2364
if (pSign == cSign) {
2367
/* scale c to match p */
2368
shift64RightJamming(cSig64, expDiff, &cSig64);
2370
} else if (expDiff < 0) {
2371
/* scale p to match c */
2372
shift64RightJamming(pSig64, -expDiff, &pSig64);
2375
/* no scaling needed */
2378
/* Add significands and make sure explicit bit ends up in posn 62 */
2379
zSig64 = pSig64 + cSig64;
2380
if ((int64_t)zSig64 < 0) {
2381
shift64RightJamming(zSig64, 1, &zSig64);
2388
shift64RightJamming(cSig64, expDiff, &cSig64);
2389
zSig64 = pSig64 - cSig64;
2391
} else if (expDiff < 0) {
2392
shift64RightJamming(pSig64, -expDiff, &pSig64);
2393
zSig64 = cSig64 - pSig64;
2398
if (cSig64 < pSig64) {
2399
zSig64 = pSig64 - cSig64;
2400
} else if (pSig64 < cSig64) {
2401
zSig64 = cSig64 - pSig64;
2406
if (STATUS(float_rounding_mode) == float_round_down) {
2409
return packFloat32(zSign, 0, 0);
2413
/* Normalize to put the explicit bit back into bit 62. */
2414
shiftcount = countLeadingZeros64(zSig64) - 1;
2415
zSig64 <<= shiftcount;
2418
shift64RightJamming(zSig64, 32, &zSig64);
2419
return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);
2423
/*----------------------------------------------------------------------------
2424
| Returns the square root of the single-precision floating-point value `a'.
2425
| The operation is performed according to the IEC/IEEE Standard for Binary
2426
| Floating-Point Arithmetic.
2427
*----------------------------------------------------------------------------*/
2429
float32 float32_sqrt( float32 a STATUS_PARAM )
2432
int_fast16_t aExp, zExp;
2433
uint32_t aSig, zSig;
2435
a = float32_squash_input_denormal(a STATUS_VAR);
2437
aSig = extractFloat32Frac( a );
2438
aExp = extractFloat32Exp( a );
2439
aSign = extractFloat32Sign( a );
2440
if ( aExp == 0xFF ) {
2441
if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2442
if ( ! aSign ) return a;
2443
float_raise( float_flag_invalid STATUS_VAR);
2444
return float32_default_nan;
2447
if ( ( aExp | aSig ) == 0 ) return a;
2448
float_raise( float_flag_invalid STATUS_VAR);
2449
return float32_default_nan;
2452
if ( aSig == 0 ) return float32_zero;
2453
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2455
zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2456
aSig = ( aSig | 0x00800000 )<<8;
2457
zSig = estimateSqrt32( aExp, aSig ) + 2;
2458
if ( ( zSig & 0x7F ) <= 5 ) {
2464
term = ( (uint64_t) zSig ) * zSig;
2465
rem = ( ( (uint64_t) aSig )<<32 ) - term;
2466
while ( (int64_t) rem < 0 ) {
2468
rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2470
zSig |= ( rem != 0 );
2472
shift32RightJamming( zSig, 1, &zSig );
2474
return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2478
/*----------------------------------------------------------------------------
2479
| Returns the binary exponential of the single-precision floating-point value
2480
| `a'. The operation is performed according to the IEC/IEEE Standard for
2481
| Binary Floating-Point Arithmetic.
2483
| Uses the following identities:
2485
| 1. -------------------------------------------------------------------------
2489
| 2. -------------------------------------------------------------------------
2492
| e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2494
*----------------------------------------------------------------------------*/
2496
static const float64 float32_exp2_coefficients[15] =
2498
const_float64( 0x3ff0000000000000ll ), /* 1 */
2499
const_float64( 0x3fe0000000000000ll ), /* 2 */
2500
const_float64( 0x3fc5555555555555ll ), /* 3 */
2501
const_float64( 0x3fa5555555555555ll ), /* 4 */
2502
const_float64( 0x3f81111111111111ll ), /* 5 */
2503
const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2504
const_float64( 0x3f2a01a01a01a01all ), /* 7 */
2505
const_float64( 0x3efa01a01a01a01all ), /* 8 */
2506
const_float64( 0x3ec71de3a556c734ll ), /* 9 */
2507
const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2508
const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2509
const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2510
const_float64( 0x3de6124613a86d09ll ), /* 13 */
2511
const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2512
const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2515
float32 float32_exp2( float32 a STATUS_PARAM )
2522
a = float32_squash_input_denormal(a STATUS_VAR);
2524
aSig = extractFloat32Frac( a );
2525
aExp = extractFloat32Exp( a );
2526
aSign = extractFloat32Sign( a );
2528
if ( aExp == 0xFF) {
2529
if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2530
return (aSign) ? float32_zero : a;
2533
if (aSig == 0) return float32_one;
2536
float_raise( float_flag_inexact STATUS_VAR);
2538
/* ******************************* */
2539
/* using float64 for approximation */
2540
/* ******************************* */
2541
x = float32_to_float64(a STATUS_VAR);
2542
x = float64_mul(x, float64_ln2 STATUS_VAR);
2546
for (i = 0 ; i < 15 ; i++) {
2549
f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2550
r = float64_add(r, f STATUS_VAR);
2552
xn = float64_mul(xn, x STATUS_VAR);
2555
return float64_to_float32(r, status);
2558
/*----------------------------------------------------------------------------
2559
| Returns the binary log of the single-precision floating-point value `a'.
2560
| The operation is performed according to the IEC/IEEE Standard for Binary
2561
| Floating-Point Arithmetic.
2562
*----------------------------------------------------------------------------*/
2563
float32 float32_log2( float32 a STATUS_PARAM )
2567
uint32_t aSig, zSig, i;
2569
a = float32_squash_input_denormal(a STATUS_VAR);
2570
aSig = extractFloat32Frac( a );
2571
aExp = extractFloat32Exp( a );
2572
aSign = extractFloat32Sign( a );
2575
if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2576
normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2579
float_raise( float_flag_invalid STATUS_VAR);
2580
return float32_default_nan;
2582
if ( aExp == 0xFF ) {
2583
if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2592
for (i = 1 << 22; i > 0; i >>= 1) {
2593
aSig = ( (uint64_t)aSig * aSig ) >> 23;
2594
if ( aSig & 0x01000000 ) {
2603
return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2606
/*----------------------------------------------------------------------------
2607
| Returns 1 if the single-precision floating-point value `a' is equal to
2608
| the corresponding value `b', and 0 otherwise. The invalid exception is
2609
| raised if either operand is a NaN. Otherwise, the comparison is performed
2610
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2611
*----------------------------------------------------------------------------*/
2613
int float32_eq( float32 a, float32 b STATUS_PARAM )
2616
a = float32_squash_input_denormal(a STATUS_VAR);
2617
b = float32_squash_input_denormal(b STATUS_VAR);
2619
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2620
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2622
float_raise( float_flag_invalid STATUS_VAR);
2625
av = float32_val(a);
2626
bv = float32_val(b);
2627
return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2630
/*----------------------------------------------------------------------------
2631
| Returns 1 if the single-precision floating-point value `a' is less than
2632
| or equal to the corresponding value `b', and 0 otherwise. The invalid
2633
| exception is raised if either operand is a NaN. The comparison is performed
2634
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2635
*----------------------------------------------------------------------------*/
2637
int float32_le( float32 a, float32 b STATUS_PARAM )
2641
a = float32_squash_input_denormal(a STATUS_VAR);
2642
b = float32_squash_input_denormal(b STATUS_VAR);
2644
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2645
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2647
float_raise( float_flag_invalid STATUS_VAR);
2650
aSign = extractFloat32Sign( a );
2651
bSign = extractFloat32Sign( b );
2652
av = float32_val(a);
2653
bv = float32_val(b);
2654
if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2655
return ( av == bv ) || ( aSign ^ ( av < bv ) );
2659
/*----------------------------------------------------------------------------
2660
| Returns 1 if the single-precision floating-point value `a' is less than
2661
| the corresponding value `b', and 0 otherwise. The invalid exception is
2662
| raised if either operand is a NaN. The comparison is performed according
2663
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2664
*----------------------------------------------------------------------------*/
2666
int float32_lt( float32 a, float32 b STATUS_PARAM )
2670
a = float32_squash_input_denormal(a STATUS_VAR);
2671
b = float32_squash_input_denormal(b STATUS_VAR);
2673
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2674
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2676
float_raise( float_flag_invalid STATUS_VAR);
2679
aSign = extractFloat32Sign( a );
2680
bSign = extractFloat32Sign( b );
2681
av = float32_val(a);
2682
bv = float32_val(b);
2683
if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2684
return ( av != bv ) && ( aSign ^ ( av < bv ) );
2688
/*----------------------------------------------------------------------------
2689
| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2690
| be compared, and 0 otherwise. The invalid exception is raised if either
2691
| operand is a NaN. The comparison is performed according to the IEC/IEEE
2692
| Standard for Binary Floating-Point Arithmetic.
2693
*----------------------------------------------------------------------------*/
2695
int float32_unordered( float32 a, float32 b STATUS_PARAM )
2697
a = float32_squash_input_denormal(a STATUS_VAR);
2698
b = float32_squash_input_denormal(b STATUS_VAR);
2700
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2701
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2703
float_raise( float_flag_invalid STATUS_VAR);
2709
/*----------------------------------------------------------------------------
2710
| Returns 1 if the single-precision floating-point value `a' is equal to
2711
| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2712
| exception. The comparison is performed according to the IEC/IEEE Standard
2713
| for Binary Floating-Point Arithmetic.
2714
*----------------------------------------------------------------------------*/
2716
int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
2718
a = float32_squash_input_denormal(a STATUS_VAR);
2719
b = float32_squash_input_denormal(b STATUS_VAR);
2721
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2722
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2724
if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2725
float_raise( float_flag_invalid STATUS_VAR);
2729
return ( float32_val(a) == float32_val(b) ) ||
2730
( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2733
/*----------------------------------------------------------------------------
2734
| Returns 1 if the single-precision floating-point value `a' is less than or
2735
| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2736
| cause an exception. Otherwise, the comparison is performed according to the
2737
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2738
*----------------------------------------------------------------------------*/
2740
int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2744
a = float32_squash_input_denormal(a STATUS_VAR);
2745
b = float32_squash_input_denormal(b STATUS_VAR);
2747
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2748
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2750
if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2751
float_raise( float_flag_invalid STATUS_VAR);
2755
aSign = extractFloat32Sign( a );
2756
bSign = extractFloat32Sign( b );
2757
av = float32_val(a);
2758
bv = float32_val(b);
2759
if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2760
return ( av == bv ) || ( aSign ^ ( av < bv ) );
2764
/*----------------------------------------------------------------------------
2765
| Returns 1 if the single-precision floating-point value `a' is less than
2766
| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2767
| exception. Otherwise, the comparison is performed according to the IEC/IEEE
2768
| Standard for Binary Floating-Point Arithmetic.
2769
*----------------------------------------------------------------------------*/
2771
int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2775
a = float32_squash_input_denormal(a STATUS_VAR);
2776
b = float32_squash_input_denormal(b STATUS_VAR);
2778
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2779
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2781
if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2782
float_raise( float_flag_invalid STATUS_VAR);
2786
aSign = extractFloat32Sign( a );
2787
bSign = extractFloat32Sign( b );
2788
av = float32_val(a);
2789
bv = float32_val(b);
2790
if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2791
return ( av != bv ) && ( aSign ^ ( av < bv ) );
2795
/*----------------------------------------------------------------------------
2796
| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2797
| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
2798
| comparison is performed according to the IEC/IEEE Standard for Binary
2799
| Floating-Point Arithmetic.
2800
*----------------------------------------------------------------------------*/
2802
int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2804
a = float32_squash_input_denormal(a STATUS_VAR);
2805
b = float32_squash_input_denormal(b STATUS_VAR);
2807
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2808
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2810
if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2811
float_raise( float_flag_invalid STATUS_VAR);
2818
/*----------------------------------------------------------------------------
2819
| Returns the result of converting the double-precision floating-point value
2820
| `a' to the 32-bit two's complement integer format. The conversion is
2821
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2822
| Arithmetic---which means in particular that the conversion is rounded
2823
| according to the current rounding mode. If `a' is a NaN, the largest
2824
| positive integer is returned. Otherwise, if the conversion overflows, the
2825
| largest integer with the same sign as `a' is returned.
2826
*----------------------------------------------------------------------------*/
2828
int32 float64_to_int32( float64 a STATUS_PARAM )
2831
int_fast16_t aExp, shiftCount;
2833
a = float64_squash_input_denormal(a STATUS_VAR);
2835
aSig = extractFloat64Frac( a );
2836
aExp = extractFloat64Exp( a );
2837
aSign = extractFloat64Sign( a );
2838
if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2839
if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2840
shiftCount = 0x42C - aExp;
2841
if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2842
return roundAndPackInt32( aSign, aSig STATUS_VAR );
2846
/*----------------------------------------------------------------------------
2847
| Returns the result of converting the double-precision floating-point value
2848
| `a' to the 32-bit two's complement integer format. The conversion is
2849
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2850
| Arithmetic, except that the conversion is always rounded toward zero.
2851
| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2852
| the conversion overflows, the largest integer with the same sign as `a' is
2854
*----------------------------------------------------------------------------*/
2856
int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2859
int_fast16_t aExp, shiftCount;
2860
uint64_t aSig, savedASig;
2862
a = float64_squash_input_denormal(a STATUS_VAR);
2864
aSig = extractFloat64Frac( a );
2865
aExp = extractFloat64Exp( a );
2866
aSign = extractFloat64Sign( a );
2867
if ( 0x41E < aExp ) {
2868
if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2871
else if ( aExp < 0x3FF ) {
2872
if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2875
aSig |= LIT64( 0x0010000000000000 );
2876
shiftCount = 0x433 - aExp;
2878
aSig >>= shiftCount;
2880
if ( aSign ) z = - z;
2881
if ( ( z < 0 ) ^ aSign ) {
2883
float_raise( float_flag_invalid STATUS_VAR);
2884
return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2886
if ( ( aSig<<shiftCount ) != savedASig ) {
2887
STATUS(float_exception_flags) |= float_flag_inexact;
2893
/*----------------------------------------------------------------------------
2894
| Returns the result of converting the double-precision floating-point value
2895
| `a' to the 16-bit two's complement integer format. The conversion is
2896
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2897
| Arithmetic, except that the conversion is always rounded toward zero.
2898
| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2899
| the conversion overflows, the largest integer with the same sign as `a' is
2901
*----------------------------------------------------------------------------*/
2903
int_fast16_t float64_to_int16_round_to_zero(float64 a STATUS_PARAM)
2906
int_fast16_t aExp, shiftCount;
2907
uint64_t aSig, savedASig;
2910
aSig = extractFloat64Frac( a );
2911
aExp = extractFloat64Exp( a );
2912
aSign = extractFloat64Sign( a );
2913
if ( 0x40E < aExp ) {
2914
if ( ( aExp == 0x7FF ) && aSig ) {
2919
else if ( aExp < 0x3FF ) {
2920
if ( aExp || aSig ) {
2921
STATUS(float_exception_flags) |= float_flag_inexact;
2925
aSig |= LIT64( 0x0010000000000000 );
2926
shiftCount = 0x433 - aExp;
2928
aSig >>= shiftCount;
2933
if ( ( (int16_t)z < 0 ) ^ aSign ) {
2935
float_raise( float_flag_invalid STATUS_VAR);
2936
return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
2938
if ( ( aSig<<shiftCount ) != savedASig ) {
2939
STATUS(float_exception_flags) |= float_flag_inexact;
2944
/*----------------------------------------------------------------------------
2945
| Returns the result of converting the double-precision floating-point value
2946
| `a' to the 64-bit two's complement integer format. The conversion is
2947
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2948
| Arithmetic---which means in particular that the conversion is rounded
2949
| according to the current rounding mode. If `a' is a NaN, the largest
2950
| positive integer is returned. Otherwise, if the conversion overflows, the
2951
| largest integer with the same sign as `a' is returned.
2952
*----------------------------------------------------------------------------*/
2954
int64 float64_to_int64( float64 a STATUS_PARAM )
2957
int_fast16_t aExp, shiftCount;
2958
uint64_t aSig, aSigExtra;
2959
a = float64_squash_input_denormal(a STATUS_VAR);
2961
aSig = extractFloat64Frac( a );
2962
aExp = extractFloat64Exp( a );
2963
aSign = extractFloat64Sign( a );
2964
if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2965
shiftCount = 0x433 - aExp;
2966
if ( shiftCount <= 0 ) {
2967
if ( 0x43E < aExp ) {
2968
float_raise( float_flag_invalid STATUS_VAR);
2970
|| ( ( aExp == 0x7FF )
2971
&& ( aSig != LIT64( 0x0010000000000000 ) ) )
2973
return LIT64( 0x7FFFFFFFFFFFFFFF );
2975
return (int64_t) LIT64( 0x8000000000000000 );
2978
aSig <<= - shiftCount;
2981
shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2983
return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2987
/*----------------------------------------------------------------------------
2988
| Returns the result of converting the double-precision floating-point value
2989
| `a' to the 64-bit two's complement integer format. The conversion is
2990
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2991
| Arithmetic, except that the conversion is always rounded toward zero.
2992
| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2993
| the conversion overflows, the largest integer with the same sign as `a' is
2995
*----------------------------------------------------------------------------*/
2997
int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
3000
int_fast16_t aExp, shiftCount;
3003
a = float64_squash_input_denormal(a STATUS_VAR);
3005
aSig = extractFloat64Frac( a );
3006
aExp = extractFloat64Exp( a );
3007
aSign = extractFloat64Sign( a );
3008
if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3009
shiftCount = aExp - 0x433;
3010
if ( 0 <= shiftCount ) {
3011
if ( 0x43E <= aExp ) {
3012
if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
3013
float_raise( float_flag_invalid STATUS_VAR);
3015
|| ( ( aExp == 0x7FF )
3016
&& ( aSig != LIT64( 0x0010000000000000 ) ) )
3018
return LIT64( 0x7FFFFFFFFFFFFFFF );
3021
return (int64_t) LIT64( 0x8000000000000000 );
3023
z = aSig<<shiftCount;
3026
if ( aExp < 0x3FE ) {
3027
if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3030
z = aSig>>( - shiftCount );
3031
if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3032
STATUS(float_exception_flags) |= float_flag_inexact;
3035
if ( aSign ) z = - z;
3040
/*----------------------------------------------------------------------------
3041
| Returns the result of converting the double-precision floating-point value
3042
| `a' to the single-precision floating-point format. The conversion is
3043
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3045
*----------------------------------------------------------------------------*/
3047
float32 float64_to_float32( float64 a STATUS_PARAM )
3053
a = float64_squash_input_denormal(a STATUS_VAR);
3055
aSig = extractFloat64Frac( a );
3056
aExp = extractFloat64Exp( a );
3057
aSign = extractFloat64Sign( a );
3058
if ( aExp == 0x7FF ) {
3059
if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3060
return packFloat32( aSign, 0xFF, 0 );
3062
shift64RightJamming( aSig, 22, &aSig );
3064
if ( aExp || zSig ) {
3068
return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
3073
/*----------------------------------------------------------------------------
3074
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3075
| half-precision floating-point value, returning the result. After being
3076
| shifted into the proper positions, the three fields are simply added
3077
| together to form the result. This means that any integer portion of `zSig'
3078
| will be added into the exponent. Since a properly normalized significand
3079
| will have an integer portion equal to 1, the `zExp' input should be 1 less
3080
| than the desired result exponent whenever `zSig' is a complete, normalized
3082
*----------------------------------------------------------------------------*/
3083
static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
3085
return make_float16(
3086
(((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3089
/* Half precision floats come in two formats: standard IEEE and "ARM" format.
3090
The latter gains extra exponent range by omitting the NaN/Inf encodings. */
3092
float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
3098
aSign = extractFloat16Sign(a);
3099
aExp = extractFloat16Exp(a);
3100
aSig = extractFloat16Frac(a);
3102
if (aExp == 0x1f && ieee) {
3104
return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3106
return packFloat32(aSign, 0xff, 0);
3112
return packFloat32(aSign, 0, 0);
3115
shiftCount = countLeadingZeros32( aSig ) - 21;
3116
aSig = aSig << shiftCount;
3119
return packFloat32( aSign, aExp + 0x70, aSig << 13);
3122
float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
3130
int maxexp = ieee ? 15 : 16;
3131
bool rounding_bumps_exp;
3132
bool is_tiny = false;
3134
a = float32_squash_input_denormal(a STATUS_VAR);
3136
aSig = extractFloat32Frac( a );
3137
aExp = extractFloat32Exp( a );
3138
aSign = extractFloat32Sign( a );
3139
if ( aExp == 0xFF ) {
3141
/* Input is a NaN */
3143
float_raise(float_flag_invalid STATUS_VAR);
3144
return packFloat16(aSign, 0, 0);
3146
return commonNaNToFloat16(
3147
float32ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3151
float_raise(float_flag_invalid STATUS_VAR);
3152
return packFloat16(aSign, 0x1f, 0x3ff);
3154
return packFloat16(aSign, 0x1f, 0);
3156
if (aExp == 0 && aSig == 0) {
3157
return packFloat16(aSign, 0, 0);
3159
/* Decimal point between bits 22 and 23. Note that we add the 1 bit
3160
* even if the input is denormal; however this is harmless because
3161
* the largest possible single-precision denormal is still smaller
3162
* than the smallest representable half-precision denormal, and so we
3163
* will end up ignoring aSig and returning via the "always return zero"
3168
/* Calculate the mask of bits of the mantissa which are not
3169
* representable in half-precision and will be lost.
3172
/* Will be denormal in halfprec */
3178
/* Normal number in halfprec */
3182
roundingMode = STATUS(float_rounding_mode);
3183
switch (roundingMode) {
3184
case float_round_nearest_even:
3185
increment = (mask + 1) >> 1;
3186
if ((aSig & mask) == increment) {
3187
increment = aSig & (increment << 1);
3190
case float_round_up:
3191
increment = aSign ? 0 : mask;
3193
case float_round_down:
3194
increment = aSign ? mask : 0;
3196
default: /* round_to_zero */
3201
rounding_bumps_exp = (aSig + increment >= 0x01000000);
3203
if (aExp > maxexp || (aExp == maxexp && rounding_bumps_exp)) {
3205
float_raise(float_flag_overflow | float_flag_inexact STATUS_VAR);
3206
return packFloat16(aSign, 0x1f, 0);
3208
float_raise(float_flag_invalid STATUS_VAR);
3209
return packFloat16(aSign, 0x1f, 0x3ff);
3214
/* Note that flush-to-zero does not affect half-precision results */
3216
(STATUS(float_detect_tininess) == float_tininess_before_rounding)
3218
|| (!rounding_bumps_exp);
3221
float_raise(float_flag_inexact STATUS_VAR);
3223
float_raise(float_flag_underflow STATUS_VAR);
3228
if (rounding_bumps_exp) {
3234
return packFloat16(aSign, 0, 0);
3237
aSig >>= -14 - aExp;
3240
return packFloat16(aSign, aExp + 14, aSig >> 13);
3243
/*----------------------------------------------------------------------------
3244
| Returns the result of converting the double-precision floating-point value
3245
| `a' to the extended double-precision floating-point format. The conversion
3246
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3248
*----------------------------------------------------------------------------*/
3250
floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
3256
a = float64_squash_input_denormal(a STATUS_VAR);
3257
aSig = extractFloat64Frac( a );
3258
aExp = extractFloat64Exp( a );
3259
aSign = extractFloat64Sign( a );
3260
if ( aExp == 0x7FF ) {
3261
if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3262
return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3265
if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3266
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3270
aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3274
/*----------------------------------------------------------------------------
3275
| Returns the result of converting the double-precision floating-point value
3276
| `a' to the quadruple-precision floating-point format. The conversion is
3277
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3279
*----------------------------------------------------------------------------*/
3281
float128 float64_to_float128( float64 a STATUS_PARAM )
3285
uint64_t aSig, zSig0, zSig1;
3287
a = float64_squash_input_denormal(a STATUS_VAR);
3288
aSig = extractFloat64Frac( a );
3289
aExp = extractFloat64Exp( a );
3290
aSign = extractFloat64Sign( a );
3291
if ( aExp == 0x7FF ) {
3292
if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3293
return packFloat128( aSign, 0x7FFF, 0, 0 );
3296
if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3297
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3300
shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3301
return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3305
/*----------------------------------------------------------------------------
3306
| Rounds the double-precision floating-point value `a' to an integer, and
3307
| returns the result as a double-precision floating-point value. The
3308
| operation is performed according to the IEC/IEEE Standard for Binary
3309
| Floating-Point Arithmetic.
3310
*----------------------------------------------------------------------------*/
3312
float64 float64_round_to_int( float64 a STATUS_PARAM )
3316
uint64_t lastBitMask, roundBitsMask;
3319
a = float64_squash_input_denormal(a STATUS_VAR);
3321
aExp = extractFloat64Exp( a );
3322
if ( 0x433 <= aExp ) {
3323
if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3324
return propagateFloat64NaN( a, a STATUS_VAR );
3328
if ( aExp < 0x3FF ) {
3329
if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3330
STATUS(float_exception_flags) |= float_flag_inexact;
3331
aSign = extractFloat64Sign( a );
3332
switch ( STATUS(float_rounding_mode) ) {
3333
case float_round_nearest_even:
3334
if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3335
return packFloat64( aSign, 0x3FF, 0 );
3338
case float_round_down:
3339
return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3340
case float_round_up:
3341
return make_float64(
3342
aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3344
return packFloat64( aSign, 0, 0 );
3347
lastBitMask <<= 0x433 - aExp;
3348
roundBitsMask = lastBitMask - 1;
3350
roundingMode = STATUS(float_rounding_mode);
3351
if ( roundingMode == float_round_nearest_even ) {
3352
z += lastBitMask>>1;
3353
if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3355
else if ( roundingMode != float_round_to_zero ) {
3356
if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
3360
z &= ~ roundBitsMask;
3361
if ( z != float64_val(a) )
3362
STATUS(float_exception_flags) |= float_flag_inexact;
3363
return make_float64(z);
3367
float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3371
oldmode = STATUS(float_rounding_mode);
3372
STATUS(float_rounding_mode) = float_round_to_zero;
3373
res = float64_round_to_int(a STATUS_VAR);
3374
STATUS(float_rounding_mode) = oldmode;
3378
/*----------------------------------------------------------------------------
3379
| Returns the result of adding the absolute values of the double-precision
3380
| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3381
| before being returned. `zSign' is ignored if the result is a NaN.
3382
| The addition is performed according to the IEC/IEEE Standard for Binary
3383
| Floating-Point Arithmetic.
3384
*----------------------------------------------------------------------------*/
3386
static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3388
int_fast16_t aExp, bExp, zExp;
3389
uint64_t aSig, bSig, zSig;
3390
int_fast16_t expDiff;
3392
aSig = extractFloat64Frac( a );
3393
aExp = extractFloat64Exp( a );
3394
bSig = extractFloat64Frac( b );
3395
bExp = extractFloat64Exp( b );
3396
expDiff = aExp - bExp;
3399
if ( 0 < expDiff ) {
3400
if ( aExp == 0x7FF ) {
3401
if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3408
bSig |= LIT64( 0x2000000000000000 );
3410
shift64RightJamming( bSig, expDiff, &bSig );
3413
else if ( expDiff < 0 ) {
3414
if ( bExp == 0x7FF ) {
3415
if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3416
return packFloat64( zSign, 0x7FF, 0 );
3422
aSig |= LIT64( 0x2000000000000000 );
3424
shift64RightJamming( aSig, - expDiff, &aSig );
3428
if ( aExp == 0x7FF ) {
3429
if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3433
if (STATUS(flush_to_zero)) {
3435
float_raise(float_flag_output_denormal STATUS_VAR);
3437
return packFloat64(zSign, 0, 0);
3439
return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3441
zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3445
aSig |= LIT64( 0x2000000000000000 );
3446
zSig = ( aSig + bSig )<<1;
3448
if ( (int64_t) zSig < 0 ) {
3453
return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3457
/*----------------------------------------------------------------------------
3458
| Returns the result of subtracting the absolute values of the double-
3459
| precision floating-point values `a' and `b'. If `zSign' is 1, the
3460
| difference is negated before being returned. `zSign' is ignored if the
3461
| result is a NaN. The subtraction is performed according to the IEC/IEEE
3462
| Standard for Binary Floating-Point Arithmetic.
3463
*----------------------------------------------------------------------------*/
3465
static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3467
int_fast16_t aExp, bExp, zExp;
3468
uint64_t aSig, bSig, zSig;
3469
int_fast16_t expDiff;
3471
aSig = extractFloat64Frac( a );
3472
aExp = extractFloat64Exp( a );
3473
bSig = extractFloat64Frac( b );
3474
bExp = extractFloat64Exp( b );
3475
expDiff = aExp - bExp;
3478
if ( 0 < expDiff ) goto aExpBigger;
3479
if ( expDiff < 0 ) goto bExpBigger;
3480
if ( aExp == 0x7FF ) {
3481
if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3482
float_raise( float_flag_invalid STATUS_VAR);
3483
return float64_default_nan;
3489
if ( bSig < aSig ) goto aBigger;
3490
if ( aSig < bSig ) goto bBigger;
3491
return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3493
if ( bExp == 0x7FF ) {
3494
if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3495
return packFloat64( zSign ^ 1, 0x7FF, 0 );
3501
aSig |= LIT64( 0x4000000000000000 );
3503
shift64RightJamming( aSig, - expDiff, &aSig );
3504
bSig |= LIT64( 0x4000000000000000 );
3509
goto normalizeRoundAndPack;
3511
if ( aExp == 0x7FF ) {
3512
if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3519
bSig |= LIT64( 0x4000000000000000 );
3521
shift64RightJamming( bSig, expDiff, &bSig );
3522
aSig |= LIT64( 0x4000000000000000 );
3526
normalizeRoundAndPack:
3528
return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3532
/*----------------------------------------------------------------------------
3533
| Returns the result of adding the double-precision floating-point values `a'
3534
| and `b'. The operation is performed according to the IEC/IEEE Standard for
3535
| Binary Floating-Point Arithmetic.
3536
*----------------------------------------------------------------------------*/
3538
float64 float64_add( float64 a, float64 b STATUS_PARAM )
3541
a = float64_squash_input_denormal(a STATUS_VAR);
3542
b = float64_squash_input_denormal(b STATUS_VAR);
3544
aSign = extractFloat64Sign( a );
3545
bSign = extractFloat64Sign( b );
3546
if ( aSign == bSign ) {
3547
return addFloat64Sigs( a, b, aSign STATUS_VAR );
3550
return subFloat64Sigs( a, b, aSign STATUS_VAR );
3555
/*----------------------------------------------------------------------------
3556
| Returns the result of subtracting the double-precision floating-point values
3557
| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3558
| for Binary Floating-Point Arithmetic.
3559
*----------------------------------------------------------------------------*/
3561
float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3564
a = float64_squash_input_denormal(a STATUS_VAR);
3565
b = float64_squash_input_denormal(b STATUS_VAR);
3567
aSign = extractFloat64Sign( a );
3568
bSign = extractFloat64Sign( b );
3569
if ( aSign == bSign ) {
3570
return subFloat64Sigs( a, b, aSign STATUS_VAR );
3573
return addFloat64Sigs( a, b, aSign STATUS_VAR );
3578
/*----------------------------------------------------------------------------
3579
| Returns the result of multiplying the double-precision floating-point values
3580
| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3581
| for Binary Floating-Point Arithmetic.
3582
*----------------------------------------------------------------------------*/
3584
float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3586
flag aSign, bSign, zSign;
3587
int_fast16_t aExp, bExp, zExp;
3588
uint64_t aSig, bSig, zSig0, zSig1;
3590
a = float64_squash_input_denormal(a STATUS_VAR);
3591
b = float64_squash_input_denormal(b STATUS_VAR);
3593
aSig = extractFloat64Frac( a );
3594
aExp = extractFloat64Exp( a );
3595
aSign = extractFloat64Sign( a );
3596
bSig = extractFloat64Frac( b );
3597
bExp = extractFloat64Exp( b );
3598
bSign = extractFloat64Sign( b );
3599
zSign = aSign ^ bSign;
3600
if ( aExp == 0x7FF ) {
3601
if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3602
return propagateFloat64NaN( a, b STATUS_VAR );
3604
if ( ( bExp | bSig ) == 0 ) {
3605
float_raise( float_flag_invalid STATUS_VAR);
3606
return float64_default_nan;
3608
return packFloat64( zSign, 0x7FF, 0 );
3610
if ( bExp == 0x7FF ) {
3611
if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3612
if ( ( aExp | aSig ) == 0 ) {
3613
float_raise( float_flag_invalid STATUS_VAR);
3614
return float64_default_nan;
3616
return packFloat64( zSign, 0x7FF, 0 );
3619
if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3620
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3623
if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3624
normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3626
zExp = aExp + bExp - 0x3FF;
3627
aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3628
bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3629
mul64To128( aSig, bSig, &zSig0, &zSig1 );
3630
zSig0 |= ( zSig1 != 0 );
3631
if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
3635
return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3639
/*----------------------------------------------------------------------------
3640
| Returns the result of dividing the double-precision floating-point value `a'
3641
| by the corresponding value `b'. The operation is performed according to
3642
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3643
*----------------------------------------------------------------------------*/
3645
float64 float64_div( float64 a, float64 b STATUS_PARAM )
3647
flag aSign, bSign, zSign;
3648
int_fast16_t aExp, bExp, zExp;
3649
uint64_t aSig, bSig, zSig;
3650
uint64_t rem0, rem1;
3651
uint64_t term0, term1;
3652
a = float64_squash_input_denormal(a STATUS_VAR);
3653
b = float64_squash_input_denormal(b STATUS_VAR);
3655
aSig = extractFloat64Frac( a );
3656
aExp = extractFloat64Exp( a );
3657
aSign = extractFloat64Sign( a );
3658
bSig = extractFloat64Frac( b );
3659
bExp = extractFloat64Exp( b );
3660
bSign = extractFloat64Sign( b );
3661
zSign = aSign ^ bSign;
3662
if ( aExp == 0x7FF ) {
3663
if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3664
if ( bExp == 0x7FF ) {
3665
if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3666
float_raise( float_flag_invalid STATUS_VAR);
3667
return float64_default_nan;
3669
return packFloat64( zSign, 0x7FF, 0 );
3671
if ( bExp == 0x7FF ) {
3672
if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3673
return packFloat64( zSign, 0, 0 );
3677
if ( ( aExp | aSig ) == 0 ) {
3678
float_raise( float_flag_invalid STATUS_VAR);
3679
return float64_default_nan;
3681
float_raise( float_flag_divbyzero STATUS_VAR);
3682
return packFloat64( zSign, 0x7FF, 0 );
3684
normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3687
if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3688
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3690
zExp = aExp - bExp + 0x3FD;
3691
aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3692
bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3693
if ( bSig <= ( aSig + aSig ) ) {
3697
zSig = estimateDiv128To64( aSig, 0, bSig );
3698
if ( ( zSig & 0x1FF ) <= 2 ) {
3699
mul64To128( bSig, zSig, &term0, &term1 );
3700
sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3701
while ( (int64_t) rem0 < 0 ) {
3703
add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3705
zSig |= ( rem1 != 0 );
3707
return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3711
/*----------------------------------------------------------------------------
3712
| Returns the remainder of the double-precision floating-point value `a'
3713
| with respect to the corresponding value `b'. The operation is performed
3714
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3715
*----------------------------------------------------------------------------*/
3717
float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3720
int_fast16_t aExp, bExp, expDiff;
3721
uint64_t aSig, bSig;
3722
uint64_t q, alternateASig;
3725
a = float64_squash_input_denormal(a STATUS_VAR);
3726
b = float64_squash_input_denormal(b STATUS_VAR);
3727
aSig = extractFloat64Frac( a );
3728
aExp = extractFloat64Exp( a );
3729
aSign = extractFloat64Sign( a );
3730
bSig = extractFloat64Frac( b );
3731
bExp = extractFloat64Exp( b );
3732
if ( aExp == 0x7FF ) {
3733
if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3734
return propagateFloat64NaN( a, b STATUS_VAR );
3736
float_raise( float_flag_invalid STATUS_VAR);
3737
return float64_default_nan;
3739
if ( bExp == 0x7FF ) {
3740
if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3745
float_raise( float_flag_invalid STATUS_VAR);
3746
return float64_default_nan;
3748
normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3751
if ( aSig == 0 ) return a;
3752
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3754
expDiff = aExp - bExp;
3755
aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3756
bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3757
if ( expDiff < 0 ) {
3758
if ( expDiff < -1 ) return a;
3761
q = ( bSig <= aSig );
3762
if ( q ) aSig -= bSig;
3764
while ( 0 < expDiff ) {
3765
q = estimateDiv128To64( aSig, 0, bSig );
3766
q = ( 2 < q ) ? q - 2 : 0;
3767
aSig = - ( ( bSig>>2 ) * q );
3771
if ( 0 < expDiff ) {
3772
q = estimateDiv128To64( aSig, 0, bSig );
3773
q = ( 2 < q ) ? q - 2 : 0;
3776
aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3783
alternateASig = aSig;
3786
} while ( 0 <= (int64_t) aSig );
3787
sigMean = aSig + alternateASig;
3788
if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3789
aSig = alternateASig;
3791
zSign = ( (int64_t) aSig < 0 );
3792
if ( zSign ) aSig = - aSig;
3793
return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3797
/*----------------------------------------------------------------------------
3798
| Returns the result of multiplying the double-precision floating-point values
3799
| `a' and `b' then adding 'c', with no intermediate rounding step after the
3800
| multiplication. The operation is performed according to the IEC/IEEE
3801
| Standard for Binary Floating-Point Arithmetic 754-2008.
3802
| The flags argument allows the caller to select negation of the
3803
| addend, the intermediate product, or the final result. (The difference
3804
| between this and having the caller do a separate negation is that negating
3805
| externally will flip the sign bit on NaNs.)
3806
*----------------------------------------------------------------------------*/
3808
float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM)
3810
flag aSign, bSign, cSign, zSign;
3811
int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
3812
uint64_t aSig, bSig, cSig;
3813
flag pInf, pZero, pSign;
3814
uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
3816
flag signflip, infzero;
3818
a = float64_squash_input_denormal(a STATUS_VAR);
3819
b = float64_squash_input_denormal(b STATUS_VAR);
3820
c = float64_squash_input_denormal(c STATUS_VAR);
3821
aSig = extractFloat64Frac(a);
3822
aExp = extractFloat64Exp(a);
3823
aSign = extractFloat64Sign(a);
3824
bSig = extractFloat64Frac(b);
3825
bExp = extractFloat64Exp(b);
3826
bSign = extractFloat64Sign(b);
3827
cSig = extractFloat64Frac(c);
3828
cExp = extractFloat64Exp(c);
3829
cSign = extractFloat64Sign(c);
3831
infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
3832
(aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
3834
/* It is implementation-defined whether the cases of (0,inf,qnan)
3835
* and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
3836
* they return if they do), so we have to hand this information
3837
* off to the target-specific pick-a-NaN routine.
3839
if (((aExp == 0x7ff) && aSig) ||
3840
((bExp == 0x7ff) && bSig) ||
3841
((cExp == 0x7ff) && cSig)) {
3842
return propagateFloat64MulAddNaN(a, b, c, infzero STATUS_VAR);
3846
float_raise(float_flag_invalid STATUS_VAR);
3847
return float64_default_nan;
3850
if (flags & float_muladd_negate_c) {
3854
signflip = (flags & float_muladd_negate_result) ? 1 : 0;
3856
/* Work out the sign and type of the product */
3857
pSign = aSign ^ bSign;
3858
if (flags & float_muladd_negate_product) {
3861
pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
3862
pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
3864
if (cExp == 0x7ff) {
3865
if (pInf && (pSign ^ cSign)) {
3866
/* addition of opposite-signed infinities => InvalidOperation */
3867
float_raise(float_flag_invalid STATUS_VAR);
3868
return float64_default_nan;
3870
/* Otherwise generate an infinity of the same sign */
3871
return packFloat64(cSign ^ signflip, 0x7ff, 0);
3875
return packFloat64(pSign ^ signflip, 0x7ff, 0);
3881
/* Adding two exact zeroes */
3882
if (pSign == cSign) {
3884
} else if (STATUS(float_rounding_mode) == float_round_down) {
3889
return packFloat64(zSign ^ signflip, 0, 0);
3891
/* Exact zero plus a denorm */
3892
if (STATUS(flush_to_zero)) {
3893
float_raise(float_flag_output_denormal STATUS_VAR);
3894
return packFloat64(cSign ^ signflip, 0, 0);
3897
/* Zero plus something non-zero : just return the something */
3898
return packFloat64(cSign ^ signflip, cExp, cSig);
3902
normalizeFloat64Subnormal(aSig, &aExp, &aSig);
3905
normalizeFloat64Subnormal(bSig, &bExp, &bSig);
3908
/* Calculate the actual result a * b + c */
3910
/* Multiply first; this is easy. */
3911
/* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
3912
* because we want the true exponent, not the "one-less-than"
3913
* flavour that roundAndPackFloat64() takes.
3915
pExp = aExp + bExp - 0x3fe;
3916
aSig = (aSig | LIT64(0x0010000000000000))<<10;
3917
bSig = (bSig | LIT64(0x0010000000000000))<<11;
3918
mul64To128(aSig, bSig, &pSig0, &pSig1);
3919
if ((int64_t)(pSig0 << 1) >= 0) {
3920
shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
3924
zSign = pSign ^ signflip;
3926
/* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
3927
* bit in position 126.
3931
/* Throw out the special case of c being an exact zero now */
3932
shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
3933
return roundAndPackFloat64(zSign, pExp - 1,
3936
normalizeFloat64Subnormal(cSig, &cExp, &cSig);
3939
/* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
3940
* significand of the addend, with the explicit bit in position 126.
3942
cSig0 = cSig << (126 - 64 - 52);
3944
cSig0 |= LIT64(0x4000000000000000);
3945
expDiff = pExp - cExp;
3947
if (pSign == cSign) {
3950
/* scale c to match p */
3951
shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
3953
} else if (expDiff < 0) {
3954
/* scale p to match c */
3955
shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
3958
/* no scaling needed */
3961
/* Add significands and make sure explicit bit ends up in posn 126 */
3962
add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3963
if ((int64_t)zSig0 < 0) {
3964
shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
3968
shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
3969
return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR);
3973
shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
3974
sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3976
} else if (expDiff < 0) {
3977
shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
3978
sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
3983
if (lt128(cSig0, cSig1, pSig0, pSig1)) {
3984
sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3985
} else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
3986
sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
3991
if (STATUS(float_rounding_mode) == float_round_down) {
3994
return packFloat64(zSign, 0, 0);
3998
/* Do the equivalent of normalizeRoundAndPackFloat64() but
3999
* starting with the significand in a pair of uint64_t.
4002
shiftcount = countLeadingZeros64(zSig0) - 1;
4003
shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
4009
shiftcount = countLeadingZeros64(zSig1);
4010
if (shiftcount == 0) {
4011
zSig0 = (zSig1 >> 1) | (zSig1 & 1);
4015
zSig0 = zSig1 << shiftcount;
4016
zExp -= (shiftcount + 64);
4019
return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR);
4023
/*----------------------------------------------------------------------------
4024
| Returns the square root of the double-precision floating-point value `a'.
4025
| The operation is performed according to the IEC/IEEE Standard for Binary
4026
| Floating-Point Arithmetic.
4027
*----------------------------------------------------------------------------*/
4029
float64 float64_sqrt( float64 a STATUS_PARAM )
4032
int_fast16_t aExp, zExp;
4033
uint64_t aSig, zSig, doubleZSig;
4034
uint64_t rem0, rem1, term0, term1;
4035
a = float64_squash_input_denormal(a STATUS_VAR);
4037
aSig = extractFloat64Frac( a );
4038
aExp = extractFloat64Exp( a );
4039
aSign = extractFloat64Sign( a );
4040
if ( aExp == 0x7FF ) {
4041
if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
4042
if ( ! aSign ) return a;
4043
float_raise( float_flag_invalid STATUS_VAR);
4044
return float64_default_nan;
4047
if ( ( aExp | aSig ) == 0 ) return a;
4048
float_raise( float_flag_invalid STATUS_VAR);
4049
return float64_default_nan;
4052
if ( aSig == 0 ) return float64_zero;
4053
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4055
zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4056
aSig |= LIT64( 0x0010000000000000 );
4057
zSig = estimateSqrt32( aExp, aSig>>21 );
4058
aSig <<= 9 - ( aExp & 1 );
4059
zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4060
if ( ( zSig & 0x1FF ) <= 5 ) {
4061
doubleZSig = zSig<<1;
4062
mul64To128( zSig, zSig, &term0, &term1 );
4063
sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4064
while ( (int64_t) rem0 < 0 ) {
4067
add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4069
zSig |= ( ( rem0 | rem1 ) != 0 );
4071
return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
4075
/*----------------------------------------------------------------------------
4076
| Returns the binary log of the double-precision floating-point value `a'.
4077
| The operation is performed according to the IEC/IEEE Standard for Binary
4078
| Floating-Point Arithmetic.
4079
*----------------------------------------------------------------------------*/
4080
float64 float64_log2( float64 a STATUS_PARAM )
4084
uint64_t aSig, aSig0, aSig1, zSig, i;
4085
a = float64_squash_input_denormal(a STATUS_VAR);
4087
aSig = extractFloat64Frac( a );
4088
aExp = extractFloat64Exp( a );
4089
aSign = extractFloat64Sign( a );
4092
if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4093
normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4096
float_raise( float_flag_invalid STATUS_VAR);
4097
return float64_default_nan;
4099
if ( aExp == 0x7FF ) {
4100
if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
4105
aSig |= LIT64( 0x0010000000000000 );
4107
zSig = (uint64_t)aExp << 52;
4108
for (i = 1LL << 51; i > 0; i >>= 1) {
4109
mul64To128( aSig, aSig, &aSig0, &aSig1 );
4110
aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4111
if ( aSig & LIT64( 0x0020000000000000 ) ) {
4119
return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
4122
/*----------------------------------------------------------------------------
4123
| Returns 1 if the double-precision floating-point value `a' is equal to the
4124
| corresponding value `b', and 0 otherwise. The invalid exception is raised
4125
| if either operand is a NaN. Otherwise, the comparison is performed
4126
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4127
*----------------------------------------------------------------------------*/
4129
int float64_eq( float64 a, float64 b STATUS_PARAM )
4132
a = float64_squash_input_denormal(a STATUS_VAR);
4133
b = float64_squash_input_denormal(b STATUS_VAR);
4135
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4136
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4138
float_raise( float_flag_invalid STATUS_VAR);
4141
av = float64_val(a);
4142
bv = float64_val(b);
4143
return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4147
/*----------------------------------------------------------------------------
4148
| Returns 1 if the double-precision floating-point value `a' is less than or
4149
| equal to the corresponding value `b', and 0 otherwise. The invalid
4150
| exception is raised if either operand is a NaN. The comparison is performed
4151
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4152
*----------------------------------------------------------------------------*/
4154
int float64_le( float64 a, float64 b STATUS_PARAM )
4158
a = float64_squash_input_denormal(a STATUS_VAR);
4159
b = float64_squash_input_denormal(b STATUS_VAR);
4161
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4162
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4164
float_raise( float_flag_invalid STATUS_VAR);
4167
aSign = extractFloat64Sign( a );
4168
bSign = extractFloat64Sign( b );
4169
av = float64_val(a);
4170
bv = float64_val(b);
4171
if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4172
return ( av == bv ) || ( aSign ^ ( av < bv ) );
4176
/*----------------------------------------------------------------------------
4177
| Returns 1 if the double-precision floating-point value `a' is less than
4178
| the corresponding value `b', and 0 otherwise. The invalid exception is
4179
| raised if either operand is a NaN. The comparison is performed according
4180
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4181
*----------------------------------------------------------------------------*/
4183
int float64_lt( float64 a, float64 b STATUS_PARAM )
4188
a = float64_squash_input_denormal(a STATUS_VAR);
4189
b = float64_squash_input_denormal(b STATUS_VAR);
4190
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4191
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4193
float_raise( float_flag_invalid STATUS_VAR);
4196
aSign = extractFloat64Sign( a );
4197
bSign = extractFloat64Sign( b );
4198
av = float64_val(a);
4199
bv = float64_val(b);
4200
if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4201
return ( av != bv ) && ( aSign ^ ( av < bv ) );
4205
/*----------------------------------------------------------------------------
4206
| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4207
| be compared, and 0 otherwise. The invalid exception is raised if either
4208
| operand is a NaN. The comparison is performed according to the IEC/IEEE
4209
| Standard for Binary Floating-Point Arithmetic.
4210
*----------------------------------------------------------------------------*/
4212
int float64_unordered( float64 a, float64 b STATUS_PARAM )
4214
a = float64_squash_input_denormal(a STATUS_VAR);
4215
b = float64_squash_input_denormal(b STATUS_VAR);
4217
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4218
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4220
float_raise( float_flag_invalid STATUS_VAR);
4226
/*----------------------------------------------------------------------------
4227
| Returns 1 if the double-precision floating-point value `a' is equal to the
4228
| corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4229
| exception.The comparison is performed according to the IEC/IEEE Standard
4230
| for Binary Floating-Point Arithmetic.
4231
*----------------------------------------------------------------------------*/
4233
int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
4236
a = float64_squash_input_denormal(a STATUS_VAR);
4237
b = float64_squash_input_denormal(b STATUS_VAR);
4239
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4240
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4242
if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4243
float_raise( float_flag_invalid STATUS_VAR);
4247
av = float64_val(a);
4248
bv = float64_val(b);
4249
return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4253
/*----------------------------------------------------------------------------
4254
| Returns 1 if the double-precision floating-point value `a' is less than or
4255
| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4256
| cause an exception. Otherwise, the comparison is performed according to the
4257
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4258
*----------------------------------------------------------------------------*/
4260
int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
4264
a = float64_squash_input_denormal(a STATUS_VAR);
4265
b = float64_squash_input_denormal(b STATUS_VAR);
4267
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4268
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4270
if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4271
float_raise( float_flag_invalid STATUS_VAR);
4275
aSign = extractFloat64Sign( a );
4276
bSign = extractFloat64Sign( b );
4277
av = float64_val(a);
4278
bv = float64_val(b);
4279
if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4280
return ( av == bv ) || ( aSign ^ ( av < bv ) );
4284
/*----------------------------------------------------------------------------
4285
| Returns 1 if the double-precision floating-point value `a' is less than
4286
| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4287
| exception. Otherwise, the comparison is performed according to the IEC/IEEE
4288
| Standard for Binary Floating-Point Arithmetic.
4289
*----------------------------------------------------------------------------*/
4291
int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
4295
a = float64_squash_input_denormal(a STATUS_VAR);
4296
b = float64_squash_input_denormal(b STATUS_VAR);
4298
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4299
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4301
if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4302
float_raise( float_flag_invalid STATUS_VAR);
4306
aSign = extractFloat64Sign( a );
4307
bSign = extractFloat64Sign( b );
4308
av = float64_val(a);
4309
bv = float64_val(b);
4310
if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4311
return ( av != bv ) && ( aSign ^ ( av < bv ) );
4315
/*----------------------------------------------------------------------------
4316
| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4317
| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4318
| comparison is performed according to the IEC/IEEE Standard for Binary
4319
| Floating-Point Arithmetic.
4320
*----------------------------------------------------------------------------*/
4322
int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
4324
a = float64_squash_input_denormal(a STATUS_VAR);
4325
b = float64_squash_input_denormal(b STATUS_VAR);
4327
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4328
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4330
if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4331
float_raise( float_flag_invalid STATUS_VAR);
4338
/*----------------------------------------------------------------------------
4339
| Returns the result of converting the extended double-precision floating-
4340
| point value `a' to the 32-bit two's complement integer format. The
4341
| conversion is performed according to the IEC/IEEE Standard for Binary
4342
| Floating-Point Arithmetic---which means in particular that the conversion
4343
| is rounded according to the current rounding mode. If `a' is a NaN, the
4344
| largest positive integer is returned. Otherwise, if the conversion
4345
| overflows, the largest integer with the same sign as `a' is returned.
4346
*----------------------------------------------------------------------------*/
4348
int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
4351
int32 aExp, shiftCount;
4354
aSig = extractFloatx80Frac( a );
4355
aExp = extractFloatx80Exp( a );
4356
aSign = extractFloatx80Sign( a );
4357
if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4358
shiftCount = 0x4037 - aExp;
4359
if ( shiftCount <= 0 ) shiftCount = 1;
4360
shift64RightJamming( aSig, shiftCount, &aSig );
4361
return roundAndPackInt32( aSign, aSig STATUS_VAR );
4365
/*----------------------------------------------------------------------------
4366
| Returns the result of converting the extended double-precision floating-
4367
| point value `a' to the 32-bit two's complement integer format. The
4368
| conversion is performed according to the IEC/IEEE Standard for Binary
4369
| Floating-Point Arithmetic, except that the conversion is always rounded
4370
| toward zero. If `a' is a NaN, the largest positive integer is returned.
4371
| Otherwise, if the conversion overflows, the largest integer with the same
4372
| sign as `a' is returned.
4373
*----------------------------------------------------------------------------*/
4375
int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
4378
int32 aExp, shiftCount;
4379
uint64_t aSig, savedASig;
4382
aSig = extractFloatx80Frac( a );
4383
aExp = extractFloatx80Exp( a );
4384
aSign = extractFloatx80Sign( a );
4385
if ( 0x401E < aExp ) {
4386
if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4389
else if ( aExp < 0x3FFF ) {
4390
if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4393
shiftCount = 0x403E - aExp;
4395
aSig >>= shiftCount;
4397
if ( aSign ) z = - z;
4398
if ( ( z < 0 ) ^ aSign ) {
4400
float_raise( float_flag_invalid STATUS_VAR);
4401
return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4403
if ( ( aSig<<shiftCount ) != savedASig ) {
4404
STATUS(float_exception_flags) |= float_flag_inexact;
4410
/*----------------------------------------------------------------------------
4411
| Returns the result of converting the extended double-precision floating-
4412
| point value `a' to the 64-bit two's complement integer format. The
4413
| conversion is performed according to the IEC/IEEE Standard for Binary
4414
| Floating-Point Arithmetic---which means in particular that the conversion
4415
| is rounded according to the current rounding mode. If `a' is a NaN,
4416
| the largest positive integer is returned. Otherwise, if the conversion
4417
| overflows, the largest integer with the same sign as `a' is returned.
4418
*----------------------------------------------------------------------------*/
4420
int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
4423
int32 aExp, shiftCount;
4424
uint64_t aSig, aSigExtra;
4426
aSig = extractFloatx80Frac( a );
4427
aExp = extractFloatx80Exp( a );
4428
aSign = extractFloatx80Sign( a );
4429
shiftCount = 0x403E - aExp;
4430
if ( shiftCount <= 0 ) {
4432
float_raise( float_flag_invalid STATUS_VAR);
4434
|| ( ( aExp == 0x7FFF )
4435
&& ( aSig != LIT64( 0x8000000000000000 ) ) )
4437
return LIT64( 0x7FFFFFFFFFFFFFFF );
4439
return (int64_t) LIT64( 0x8000000000000000 );
4444
shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4446
return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
4450
/*----------------------------------------------------------------------------
4451
| Returns the result of converting the extended double-precision floating-
4452
| point value `a' to the 64-bit two's complement integer format. The
4453
| conversion is performed according to the IEC/IEEE Standard for Binary
4454
| Floating-Point Arithmetic, except that the conversion is always rounded
4455
| toward zero. If `a' is a NaN, the largest positive integer is returned.
4456
| Otherwise, if the conversion overflows, the largest integer with the same
4457
| sign as `a' is returned.
4458
*----------------------------------------------------------------------------*/
4460
int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
4463
int32 aExp, shiftCount;
4467
aSig = extractFloatx80Frac( a );
4468
aExp = extractFloatx80Exp( a );
4469
aSign = extractFloatx80Sign( a );
4470
shiftCount = aExp - 0x403E;
4471
if ( 0 <= shiftCount ) {
4472
aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4473
if ( ( a.high != 0xC03E ) || aSig ) {
4474
float_raise( float_flag_invalid STATUS_VAR);
4475
if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4476
return LIT64( 0x7FFFFFFFFFFFFFFF );
4479
return (int64_t) LIT64( 0x8000000000000000 );
4481
else if ( aExp < 0x3FFF ) {
4482
if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4485
z = aSig>>( - shiftCount );
4486
if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4487
STATUS(float_exception_flags) |= float_flag_inexact;
4489
if ( aSign ) z = - z;
4494
/*----------------------------------------------------------------------------
4495
| Returns the result of converting the extended double-precision floating-
4496
| point value `a' to the single-precision floating-point format. The
4497
| conversion is performed according to the IEC/IEEE Standard for Binary
4498
| Floating-Point Arithmetic.
4499
*----------------------------------------------------------------------------*/
4501
float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
4507
aSig = extractFloatx80Frac( a );
4508
aExp = extractFloatx80Exp( a );
4509
aSign = extractFloatx80Sign( a );
4510
if ( aExp == 0x7FFF ) {
4511
if ( (uint64_t) ( aSig<<1 ) ) {
4512
return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4514
return packFloat32( aSign, 0xFF, 0 );
4516
shift64RightJamming( aSig, 33, &aSig );
4517
if ( aExp || aSig ) aExp -= 0x3F81;
4518
return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
4522
/*----------------------------------------------------------------------------
4523
| Returns the result of converting the extended double-precision floating-
4524
| point value `a' to the double-precision floating-point format. The
4525
| conversion is performed according to the IEC/IEEE Standard for Binary
4526
| Floating-Point Arithmetic.
4527
*----------------------------------------------------------------------------*/
4529
float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
4533
uint64_t aSig, zSig;
4535
aSig = extractFloatx80Frac( a );
4536
aExp = extractFloatx80Exp( a );
4537
aSign = extractFloatx80Sign( a );
4538
if ( aExp == 0x7FFF ) {
4539
if ( (uint64_t) ( aSig<<1 ) ) {
4540
return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4542
return packFloat64( aSign, 0x7FF, 0 );
4544
shift64RightJamming( aSig, 1, &zSig );
4545
if ( aExp || aSig ) aExp -= 0x3C01;
4546
return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
4550
/*----------------------------------------------------------------------------
4551
| Returns the result of converting the extended double-precision floating-
4552
| point value `a' to the quadruple-precision floating-point format. The
4553
| conversion is performed according to the IEC/IEEE Standard for Binary
4554
| Floating-Point Arithmetic.
4555
*----------------------------------------------------------------------------*/
4557
float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4561
uint64_t aSig, zSig0, zSig1;
4563
aSig = extractFloatx80Frac( a );
4564
aExp = extractFloatx80Exp( a );
4565
aSign = extractFloatx80Sign( a );
4566
if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4567
return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4569
shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4570
return packFloat128( aSign, aExp, zSig0, zSig1 );
4574
/*----------------------------------------------------------------------------
4575
| Rounds the extended double-precision floating-point value `a' to an integer,
4576
| and returns the result as an extended quadruple-precision floating-point
4577
| value. The operation is performed according to the IEC/IEEE Standard for
4578
| Binary Floating-Point Arithmetic.
4579
*----------------------------------------------------------------------------*/
4581
floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4585
uint64_t lastBitMask, roundBitsMask;
4589
aExp = extractFloatx80Exp( a );
4590
if ( 0x403E <= aExp ) {
4591
if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4592
return propagateFloatx80NaN( a, a STATUS_VAR );
4596
if ( aExp < 0x3FFF ) {
4598
&& ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4601
STATUS(float_exception_flags) |= float_flag_inexact;
4602
aSign = extractFloatx80Sign( a );
4603
switch ( STATUS(float_rounding_mode) ) {
4604
case float_round_nearest_even:
4605
if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4608
packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4611
case float_round_down:
4614
packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4615
: packFloatx80( 0, 0, 0 );
4616
case float_round_up:
4618
aSign ? packFloatx80( 1, 0, 0 )
4619
: packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4621
return packFloatx80( aSign, 0, 0 );
4624
lastBitMask <<= 0x403E - aExp;
4625
roundBitsMask = lastBitMask - 1;
4627
roundingMode = STATUS(float_rounding_mode);
4628
if ( roundingMode == float_round_nearest_even ) {
4629
z.low += lastBitMask>>1;
4630
if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4632
else if ( roundingMode != float_round_to_zero ) {
4633
if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4634
z.low += roundBitsMask;
4637
z.low &= ~ roundBitsMask;
4640
z.low = LIT64( 0x8000000000000000 );
4642
if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4647
/*----------------------------------------------------------------------------
4648
| Returns the result of adding the absolute values of the extended double-
4649
| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4650
| negated before being returned. `zSign' is ignored if the result is a NaN.
4651
| The addition is performed according to the IEC/IEEE Standard for Binary
4652
| Floating-Point Arithmetic.
4653
*----------------------------------------------------------------------------*/
4655
static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4657
int32 aExp, bExp, zExp;
4658
uint64_t aSig, bSig, zSig0, zSig1;
4661
aSig = extractFloatx80Frac( a );
4662
aExp = extractFloatx80Exp( a );
4663
bSig = extractFloatx80Frac( b );
4664
bExp = extractFloatx80Exp( b );
4665
expDiff = aExp - bExp;
4666
if ( 0 < expDiff ) {
4667
if ( aExp == 0x7FFF ) {
4668
if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4671
if ( bExp == 0 ) --expDiff;
4672
shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4675
else if ( expDiff < 0 ) {
4676
if ( bExp == 0x7FFF ) {
4677
if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4678
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4680
if ( aExp == 0 ) ++expDiff;
4681
shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4685
if ( aExp == 0x7FFF ) {
4686
if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4687
return propagateFloatx80NaN( a, b STATUS_VAR );
4692
zSig0 = aSig + bSig;
4694
normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4700
zSig0 = aSig + bSig;
4701
if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4703
shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4704
zSig0 |= LIT64( 0x8000000000000000 );
4708
roundAndPackFloatx80(
4709
STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4713
/*----------------------------------------------------------------------------
4714
| Returns the result of subtracting the absolute values of the extended
4715
| double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4716
| difference is negated before being returned. `zSign' is ignored if the
4717
| result is a NaN. The subtraction is performed according to the IEC/IEEE
4718
| Standard for Binary Floating-Point Arithmetic.
4719
*----------------------------------------------------------------------------*/
4721
static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4723
int32 aExp, bExp, zExp;
4724
uint64_t aSig, bSig, zSig0, zSig1;
4728
aSig = extractFloatx80Frac( a );
4729
aExp = extractFloatx80Exp( a );
4730
bSig = extractFloatx80Frac( b );
4731
bExp = extractFloatx80Exp( b );
4732
expDiff = aExp - bExp;
4733
if ( 0 < expDiff ) goto aExpBigger;
4734
if ( expDiff < 0 ) goto bExpBigger;
4735
if ( aExp == 0x7FFF ) {
4736
if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4737
return propagateFloatx80NaN( a, b STATUS_VAR );
4739
float_raise( float_flag_invalid STATUS_VAR);
4740
z.low = floatx80_default_nan_low;
4741
z.high = floatx80_default_nan_high;
4749
if ( bSig < aSig ) goto aBigger;
4750
if ( aSig < bSig ) goto bBigger;
4751
return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4753
if ( bExp == 0x7FFF ) {
4754
if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4755
return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4757
if ( aExp == 0 ) ++expDiff;
4758
shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4760
sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4763
goto normalizeRoundAndPack;
4765
if ( aExp == 0x7FFF ) {
4766
if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4769
if ( bExp == 0 ) --expDiff;
4770
shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4772
sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4774
normalizeRoundAndPack:
4776
normalizeRoundAndPackFloatx80(
4777
STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4781
/*----------------------------------------------------------------------------
4782
| Returns the result of adding the extended double-precision floating-point
4783
| values `a' and `b'. The operation is performed according to the IEC/IEEE
4784
| Standard for Binary Floating-Point Arithmetic.
4785
*----------------------------------------------------------------------------*/
4787
floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4791
aSign = extractFloatx80Sign( a );
4792
bSign = extractFloatx80Sign( b );
4793
if ( aSign == bSign ) {
4794
return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4797
return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4802
/*----------------------------------------------------------------------------
4803
| Returns the result of subtracting the extended double-precision floating-
4804
| point values `a' and `b'. The operation is performed according to the
4805
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4806
*----------------------------------------------------------------------------*/
4808
floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4812
aSign = extractFloatx80Sign( a );
4813
bSign = extractFloatx80Sign( b );
4814
if ( aSign == bSign ) {
4815
return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4818
return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4823
/*----------------------------------------------------------------------------
4824
| Returns the result of multiplying the extended double-precision floating-
4825
| point values `a' and `b'. The operation is performed according to the
4826
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4827
*----------------------------------------------------------------------------*/
4829
floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4831
flag aSign, bSign, zSign;
4832
int32 aExp, bExp, zExp;
4833
uint64_t aSig, bSig, zSig0, zSig1;
4836
aSig = extractFloatx80Frac( a );
4837
aExp = extractFloatx80Exp( a );
4838
aSign = extractFloatx80Sign( a );
4839
bSig = extractFloatx80Frac( b );
4840
bExp = extractFloatx80Exp( b );
4841
bSign = extractFloatx80Sign( b );
4842
zSign = aSign ^ bSign;
4843
if ( aExp == 0x7FFF ) {
4844
if ( (uint64_t) ( aSig<<1 )
4845
|| ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4846
return propagateFloatx80NaN( a, b STATUS_VAR );
4848
if ( ( bExp | bSig ) == 0 ) goto invalid;
4849
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4851
if ( bExp == 0x7FFF ) {
4852
if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4853
if ( ( aExp | aSig ) == 0 ) {
4855
float_raise( float_flag_invalid STATUS_VAR);
4856
z.low = floatx80_default_nan_low;
4857
z.high = floatx80_default_nan_high;
4860
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4863
if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4864
normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4867
if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4868
normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4870
zExp = aExp + bExp - 0x3FFE;
4871
mul64To128( aSig, bSig, &zSig0, &zSig1 );
4872
if ( 0 < (int64_t) zSig0 ) {
4873
shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4877
roundAndPackFloatx80(
4878
STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4882
/*----------------------------------------------------------------------------
4883
| Returns the result of dividing the extended double-precision floating-point
4884
| value `a' by the corresponding value `b'. The operation is performed
4885
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4886
*----------------------------------------------------------------------------*/
4888
floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4890
flag aSign, bSign, zSign;
4891
int32 aExp, bExp, zExp;
4892
uint64_t aSig, bSig, zSig0, zSig1;
4893
uint64_t rem0, rem1, rem2, term0, term1, term2;
4896
aSig = extractFloatx80Frac( a );
4897
aExp = extractFloatx80Exp( a );
4898
aSign = extractFloatx80Sign( a );
4899
bSig = extractFloatx80Frac( b );
4900
bExp = extractFloatx80Exp( b );
4901
bSign = extractFloatx80Sign( b );
4902
zSign = aSign ^ bSign;
4903
if ( aExp == 0x7FFF ) {
4904
if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4905
if ( bExp == 0x7FFF ) {
4906
if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4909
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4911
if ( bExp == 0x7FFF ) {
4912
if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4913
return packFloatx80( zSign, 0, 0 );
4917
if ( ( aExp | aSig ) == 0 ) {
4919
float_raise( float_flag_invalid STATUS_VAR);
4920
z.low = floatx80_default_nan_low;
4921
z.high = floatx80_default_nan_high;
4924
float_raise( float_flag_divbyzero STATUS_VAR);
4925
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4927
normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4930
if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4931
normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4933
zExp = aExp - bExp + 0x3FFE;
4935
if ( bSig <= aSig ) {
4936
shift128Right( aSig, 0, 1, &aSig, &rem1 );
4939
zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4940
mul64To128( bSig, zSig0, &term0, &term1 );
4941
sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4942
while ( (int64_t) rem0 < 0 ) {
4944
add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4946
zSig1 = estimateDiv128To64( rem1, 0, bSig );
4947
if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
4948
mul64To128( bSig, zSig1, &term1, &term2 );
4949
sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4950
while ( (int64_t) rem1 < 0 ) {
4952
add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4954
zSig1 |= ( ( rem1 | rem2 ) != 0 );
4957
roundAndPackFloatx80(
4958
STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4962
/*----------------------------------------------------------------------------
4963
| Returns the remainder of the extended double-precision floating-point value
4964
| `a' with respect to the corresponding value `b'. The operation is performed
4965
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4966
*----------------------------------------------------------------------------*/
4968
floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4971
int32 aExp, bExp, expDiff;
4972
uint64_t aSig0, aSig1, bSig;
4973
uint64_t q, term0, term1, alternateASig0, alternateASig1;
4976
aSig0 = extractFloatx80Frac( a );
4977
aExp = extractFloatx80Exp( a );
4978
aSign = extractFloatx80Sign( a );
4979
bSig = extractFloatx80Frac( b );
4980
bExp = extractFloatx80Exp( b );
4981
if ( aExp == 0x7FFF ) {
4982
if ( (uint64_t) ( aSig0<<1 )
4983
|| ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4984
return propagateFloatx80NaN( a, b STATUS_VAR );
4988
if ( bExp == 0x7FFF ) {
4989
if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4995
float_raise( float_flag_invalid STATUS_VAR);
4996
z.low = floatx80_default_nan_low;
4997
z.high = floatx80_default_nan_high;
5000
normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5003
if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5004
normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5006
bSig |= LIT64( 0x8000000000000000 );
5008
expDiff = aExp - bExp;
5010
if ( expDiff < 0 ) {
5011
if ( expDiff < -1 ) return a;
5012
shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5015
q = ( bSig <= aSig0 );
5016
if ( q ) aSig0 -= bSig;
5018
while ( 0 < expDiff ) {
5019
q = estimateDiv128To64( aSig0, aSig1, bSig );
5020
q = ( 2 < q ) ? q - 2 : 0;
5021
mul64To128( bSig, q, &term0, &term1 );
5022
sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5023
shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5027
if ( 0 < expDiff ) {
5028
q = estimateDiv128To64( aSig0, aSig1, bSig );
5029
q = ( 2 < q ) ? q - 2 : 0;
5031
mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5032
sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5033
shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5034
while ( le128( term0, term1, aSig0, aSig1 ) ) {
5036
sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5043
sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5044
if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5045
|| ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5048
aSig0 = alternateASig0;
5049
aSig1 = alternateASig1;
5053
normalizeRoundAndPackFloatx80(
5054
80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
5058
/*----------------------------------------------------------------------------
5059
| Returns the square root of the extended double-precision floating-point
5060
| value `a'. The operation is performed according to the IEC/IEEE Standard
5061
| for Binary Floating-Point Arithmetic.
5062
*----------------------------------------------------------------------------*/
5064
floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
5068
uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5069
uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5072
aSig0 = extractFloatx80Frac( a );
5073
aExp = extractFloatx80Exp( a );
5074
aSign = extractFloatx80Sign( a );
5075
if ( aExp == 0x7FFF ) {
5076
if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
5077
if ( ! aSign ) return a;
5081
if ( ( aExp | aSig0 ) == 0 ) return a;
5083
float_raise( float_flag_invalid STATUS_VAR);
5084
z.low = floatx80_default_nan_low;
5085
z.high = floatx80_default_nan_high;
5089
if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5090
normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5092
zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5093
zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5094
shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5095
zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5096
doubleZSig0 = zSig0<<1;
5097
mul64To128( zSig0, zSig0, &term0, &term1 );
5098
sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5099
while ( (int64_t) rem0 < 0 ) {
5102
add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5104
zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5105
if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5106
if ( zSig1 == 0 ) zSig1 = 1;
5107
mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5108
sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5109
mul64To128( zSig1, zSig1, &term2, &term3 );
5110
sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5111
while ( (int64_t) rem1 < 0 ) {
5113
shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5115
term2 |= doubleZSig0;
5116
add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5118
zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5120
shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5121
zSig0 |= doubleZSig0;
5123
roundAndPackFloatx80(
5124
STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
5128
/*----------------------------------------------------------------------------
5129
| Returns 1 if the extended double-precision floating-point value `a' is equal
5130
| to the corresponding value `b', and 0 otherwise. The invalid exception is
5131
| raised if either operand is a NaN. Otherwise, the comparison is performed
5132
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5133
*----------------------------------------------------------------------------*/
5135
int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
5138
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5139
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5140
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
5141
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5143
float_raise( float_flag_invalid STATUS_VAR);
5148
&& ( ( a.high == b.high )
5150
&& ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5155
/*----------------------------------------------------------------------------
5156
| Returns 1 if the extended double-precision floating-point value `a' is
5157
| less than or equal to the corresponding value `b', and 0 otherwise. The
5158
| invalid exception is raised if either operand is a NaN. The comparison is
5159
| performed according to the IEC/IEEE Standard for Binary Floating-Point
5161
*----------------------------------------------------------------------------*/
5163
int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
5167
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5168
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5169
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
5170
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5172
float_raise( float_flag_invalid STATUS_VAR);
5175
aSign = extractFloatx80Sign( a );
5176
bSign = extractFloatx80Sign( b );
5177
if ( aSign != bSign ) {
5180
|| ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5184
aSign ? le128( b.high, b.low, a.high, a.low )
5185
: le128( a.high, a.low, b.high, b.low );
5189
/*----------------------------------------------------------------------------
5190
| Returns 1 if the extended double-precision floating-point value `a' is
5191
| less than the corresponding value `b', and 0 otherwise. The invalid
5192
| exception is raised if either operand is a NaN. The comparison is performed
5193
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5194
*----------------------------------------------------------------------------*/
5196
int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
5200
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5201
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5202
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
5203
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5205
float_raise( float_flag_invalid STATUS_VAR);
5208
aSign = extractFloatx80Sign( a );
5209
bSign = extractFloatx80Sign( b );
5210
if ( aSign != bSign ) {
5213
&& ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5217
aSign ? lt128( b.high, b.low, a.high, a.low )
5218
: lt128( a.high, a.low, b.high, b.low );
5222
/*----------------------------------------------------------------------------
5223
| Returns 1 if the extended double-precision floating-point values `a' and `b'
5224
| cannot be compared, and 0 otherwise. The invalid exception is raised if
5225
| either operand is a NaN. The comparison is performed according to the
5226
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5227
*----------------------------------------------------------------------------*/
5228
int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
5230
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5231
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5232
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
5233
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5235
float_raise( float_flag_invalid STATUS_VAR);
5241
/*----------------------------------------------------------------------------
5242
| Returns 1 if the extended double-precision floating-point value `a' is
5243
| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5244
| cause an exception. The comparison is performed according to the IEC/IEEE
5245
| Standard for Binary Floating-Point Arithmetic.
5246
*----------------------------------------------------------------------------*/
5248
int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5251
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5252
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5253
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
5254
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5256
if ( floatx80_is_signaling_nan( a )
5257
|| floatx80_is_signaling_nan( b ) ) {
5258
float_raise( float_flag_invalid STATUS_VAR);
5264
&& ( ( a.high == b.high )
5266
&& ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5271
/*----------------------------------------------------------------------------
5272
| Returns 1 if the extended double-precision floating-point value `a' is less
5273
| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5274
| do not cause an exception. Otherwise, the comparison is performed according
5275
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5276
*----------------------------------------------------------------------------*/
5278
int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5282
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5283
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5284
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
5285
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5287
if ( floatx80_is_signaling_nan( a )
5288
|| floatx80_is_signaling_nan( b ) ) {
5289
float_raise( float_flag_invalid STATUS_VAR);
5293
aSign = extractFloatx80Sign( a );
5294
bSign = extractFloatx80Sign( b );
5295
if ( aSign != bSign ) {
5298
|| ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5302
aSign ? le128( b.high, b.low, a.high, a.low )
5303
: le128( a.high, a.low, b.high, b.low );
5307
/*----------------------------------------------------------------------------
5308
| Returns 1 if the extended double-precision floating-point value `a' is less
5309
| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5310
| an exception. Otherwise, the comparison is performed according to the
5311
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5312
*----------------------------------------------------------------------------*/
5314
int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5318
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5319
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5320
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
5321
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5323
if ( floatx80_is_signaling_nan( a )
5324
|| floatx80_is_signaling_nan( b ) ) {
5325
float_raise( float_flag_invalid STATUS_VAR);
5329
aSign = extractFloatx80Sign( a );
5330
bSign = extractFloatx80Sign( b );
5331
if ( aSign != bSign ) {
5334
&& ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5338
aSign ? lt128( b.high, b.low, a.high, a.low )
5339
: lt128( a.high, a.low, b.high, b.low );
5343
/*----------------------------------------------------------------------------
5344
| Returns 1 if the extended double-precision floating-point values `a' and `b'
5345
| cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5346
| The comparison is performed according to the IEC/IEEE Standard for Binary
5347
| Floating-Point Arithmetic.
5348
*----------------------------------------------------------------------------*/
5349
int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5351
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5352
&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5353
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
5354
&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5356
if ( floatx80_is_signaling_nan( a )
5357
|| floatx80_is_signaling_nan( b ) ) {
5358
float_raise( float_flag_invalid STATUS_VAR);
5365
/*----------------------------------------------------------------------------
5366
| Returns the result of converting the quadruple-precision floating-point
5367
| value `a' to the 32-bit two's complement integer format. The conversion
5368
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5369
| Arithmetic---which means in particular that the conversion is rounded
5370
| according to the current rounding mode. If `a' is a NaN, the largest
5371
| positive integer is returned. Otherwise, if the conversion overflows, the
5372
| largest integer with the same sign as `a' is returned.
5373
*----------------------------------------------------------------------------*/
5375
int32 float128_to_int32( float128 a STATUS_PARAM )
5378
int32 aExp, shiftCount;
5379
uint64_t aSig0, aSig1;
5381
aSig1 = extractFloat128Frac1( a );
5382
aSig0 = extractFloat128Frac0( a );
5383
aExp = extractFloat128Exp( a );
5384
aSign = extractFloat128Sign( a );
5385
if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5386
if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5387
aSig0 |= ( aSig1 != 0 );
5388
shiftCount = 0x4028 - aExp;
5389
if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5390
return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
5394
/*----------------------------------------------------------------------------
5395
| Returns the result of converting the quadruple-precision floating-point
5396
| value `a' to the 32-bit two's complement integer format. The conversion
5397
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5398
| Arithmetic, except that the conversion is always rounded toward zero. If
5399
| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5400
| conversion overflows, the largest integer with the same sign as `a' is
5402
*----------------------------------------------------------------------------*/
5404
int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
5407
int32 aExp, shiftCount;
5408
uint64_t aSig0, aSig1, savedASig;
5411
aSig1 = extractFloat128Frac1( a );
5412
aSig0 = extractFloat128Frac0( a );
5413
aExp = extractFloat128Exp( a );
5414
aSign = extractFloat128Sign( a );
5415
aSig0 |= ( aSig1 != 0 );
5416
if ( 0x401E < aExp ) {
5417
if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5420
else if ( aExp < 0x3FFF ) {
5421
if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
5424
aSig0 |= LIT64( 0x0001000000000000 );
5425
shiftCount = 0x402F - aExp;
5427
aSig0 >>= shiftCount;
5429
if ( aSign ) z = - z;
5430
if ( ( z < 0 ) ^ aSign ) {
5432
float_raise( float_flag_invalid STATUS_VAR);
5433
return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5435
if ( ( aSig0<<shiftCount ) != savedASig ) {
5436
STATUS(float_exception_flags) |= float_flag_inexact;
5442
/*----------------------------------------------------------------------------
5443
| Returns the result of converting the quadruple-precision floating-point
5444
| value `a' to the 64-bit two's complement integer format. The conversion
5445
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5446
| Arithmetic---which means in particular that the conversion is rounded
5447
| according to the current rounding mode. If `a' is a NaN, the largest
5448
| positive integer is returned. Otherwise, if the conversion overflows, the
5449
| largest integer with the same sign as `a' is returned.
5450
*----------------------------------------------------------------------------*/
5452
int64 float128_to_int64( float128 a STATUS_PARAM )
5455
int32 aExp, shiftCount;
5456
uint64_t aSig0, aSig1;
5458
aSig1 = extractFloat128Frac1( a );
5459
aSig0 = extractFloat128Frac0( a );
5460
aExp = extractFloat128Exp( a );
5461
aSign = extractFloat128Sign( a );
5462
if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5463
shiftCount = 0x402F - aExp;
5464
if ( shiftCount <= 0 ) {
5465
if ( 0x403E < aExp ) {
5466
float_raise( float_flag_invalid STATUS_VAR);
5468
|| ( ( aExp == 0x7FFF )
5469
&& ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5472
return LIT64( 0x7FFFFFFFFFFFFFFF );
5474
return (int64_t) LIT64( 0x8000000000000000 );
5476
shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5479
shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5481
return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
5485
/*----------------------------------------------------------------------------
5486
| Returns the result of converting the quadruple-precision floating-point
5487
| value `a' to the 64-bit two's complement integer format. The conversion
5488
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5489
| Arithmetic, except that the conversion is always rounded toward zero.
5490
| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5491
| the conversion overflows, the largest integer with the same sign as `a' is
5493
*----------------------------------------------------------------------------*/
5495
int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
5498
int32 aExp, shiftCount;
5499
uint64_t aSig0, aSig1;
5502
aSig1 = extractFloat128Frac1( a );
5503
aSig0 = extractFloat128Frac0( a );
5504
aExp = extractFloat128Exp( a );
5505
aSign = extractFloat128Sign( a );
5506
if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5507
shiftCount = aExp - 0x402F;
5508
if ( 0 < shiftCount ) {
5509
if ( 0x403E <= aExp ) {
5510
aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5511
if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5512
&& ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5513
if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
5516
float_raise( float_flag_invalid STATUS_VAR);
5517
if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5518
return LIT64( 0x7FFFFFFFFFFFFFFF );
5521
return (int64_t) LIT64( 0x8000000000000000 );
5523
z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5524
if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5525
STATUS(float_exception_flags) |= float_flag_inexact;
5529
if ( aExp < 0x3FFF ) {
5530
if ( aExp | aSig0 | aSig1 ) {
5531
STATUS(float_exception_flags) |= float_flag_inexact;
5535
z = aSig0>>( - shiftCount );
5537
|| ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5538
STATUS(float_exception_flags) |= float_flag_inexact;
5541
if ( aSign ) z = - z;
5546
/*----------------------------------------------------------------------------
5547
| Returns the result of converting the quadruple-precision floating-point
5548
| value `a' to the single-precision floating-point format. The conversion
5549
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5551
*----------------------------------------------------------------------------*/
5553
float32 float128_to_float32( float128 a STATUS_PARAM )
5557
uint64_t aSig0, aSig1;
5560
aSig1 = extractFloat128Frac1( a );
5561
aSig0 = extractFloat128Frac0( a );
5562
aExp = extractFloat128Exp( a );
5563
aSign = extractFloat128Sign( a );
5564
if ( aExp == 0x7FFF ) {
5565
if ( aSig0 | aSig1 ) {
5566
return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5568
return packFloat32( aSign, 0xFF, 0 );
5570
aSig0 |= ( aSig1 != 0 );
5571
shift64RightJamming( aSig0, 18, &aSig0 );
5573
if ( aExp || zSig ) {
5577
return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5581
/*----------------------------------------------------------------------------
5582
| Returns the result of converting the quadruple-precision floating-point
5583
| value `a' to the double-precision floating-point format. The conversion
5584
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5586
*----------------------------------------------------------------------------*/
5588
float64 float128_to_float64( float128 a STATUS_PARAM )
5592
uint64_t aSig0, aSig1;
5594
aSig1 = extractFloat128Frac1( a );
5595
aSig0 = extractFloat128Frac0( a );
5596
aExp = extractFloat128Exp( a );
5597
aSign = extractFloat128Sign( a );
5598
if ( aExp == 0x7FFF ) {
5599
if ( aSig0 | aSig1 ) {
5600
return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5602
return packFloat64( aSign, 0x7FF, 0 );
5604
shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5605
aSig0 |= ( aSig1 != 0 );
5606
if ( aExp || aSig0 ) {
5607
aSig0 |= LIT64( 0x4000000000000000 );
5610
return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5614
/*----------------------------------------------------------------------------
5615
| Returns the result of converting the quadruple-precision floating-point
5616
| value `a' to the extended double-precision floating-point format. The
5617
| conversion is performed according to the IEC/IEEE Standard for Binary
5618
| Floating-Point Arithmetic.
5619
*----------------------------------------------------------------------------*/
5621
floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5625
uint64_t aSig0, aSig1;
5627
aSig1 = extractFloat128Frac1( a );
5628
aSig0 = extractFloat128Frac0( a );
5629
aExp = extractFloat128Exp( a );
5630
aSign = extractFloat128Sign( a );
5631
if ( aExp == 0x7FFF ) {
5632
if ( aSig0 | aSig1 ) {
5633
return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5635
return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5638
if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5639
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5642
aSig0 |= LIT64( 0x0001000000000000 );
5644
shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5645
return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5649
/*----------------------------------------------------------------------------
5650
| Rounds the quadruple-precision floating-point value `a' to an integer, and
5651
| returns the result as a quadruple-precision floating-point value. The
5652
| operation is performed according to the IEC/IEEE Standard for Binary
5653
| Floating-Point Arithmetic.
5654
*----------------------------------------------------------------------------*/
5656
float128 float128_round_to_int( float128 a STATUS_PARAM )
5660
uint64_t lastBitMask, roundBitsMask;
5664
aExp = extractFloat128Exp( a );
5665
if ( 0x402F <= aExp ) {
5666
if ( 0x406F <= aExp ) {
5667
if ( ( aExp == 0x7FFF )
5668
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5670
return propagateFloat128NaN( a, a STATUS_VAR );
5675
lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5676
roundBitsMask = lastBitMask - 1;
5678
roundingMode = STATUS(float_rounding_mode);
5679
if ( roundingMode == float_round_nearest_even ) {
5680
if ( lastBitMask ) {
5681
add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5682
if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5685
if ( (int64_t) z.low < 0 ) {
5687
if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5691
else if ( roundingMode != float_round_to_zero ) {
5692
if ( extractFloat128Sign( z )
5693
^ ( roundingMode == float_round_up ) ) {
5694
add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5697
z.low &= ~ roundBitsMask;
5700
if ( aExp < 0x3FFF ) {
5701
if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5702
STATUS(float_exception_flags) |= float_flag_inexact;
5703
aSign = extractFloat128Sign( a );
5704
switch ( STATUS(float_rounding_mode) ) {
5705
case float_round_nearest_even:
5706
if ( ( aExp == 0x3FFE )
5707
&& ( extractFloat128Frac0( a )
5708
| extractFloat128Frac1( a ) )
5710
return packFloat128( aSign, 0x3FFF, 0, 0 );
5713
case float_round_down:
5715
aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5716
: packFloat128( 0, 0, 0, 0 );
5717
case float_round_up:
5719
aSign ? packFloat128( 1, 0, 0, 0 )
5720
: packFloat128( 0, 0x3FFF, 0, 0 );
5722
return packFloat128( aSign, 0, 0, 0 );
5725
lastBitMask <<= 0x402F - aExp;
5726
roundBitsMask = lastBitMask - 1;
5729
roundingMode = STATUS(float_rounding_mode);
5730
if ( roundingMode == float_round_nearest_even ) {
5731
z.high += lastBitMask>>1;
5732
if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5733
z.high &= ~ lastBitMask;
5736
else if ( roundingMode != float_round_to_zero ) {
5737
if ( extractFloat128Sign( z )
5738
^ ( roundingMode == float_round_up ) ) {
5739
z.high |= ( a.low != 0 );
5740
z.high += roundBitsMask;
5743
z.high &= ~ roundBitsMask;
5745
if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5746
STATUS(float_exception_flags) |= float_flag_inexact;
5752
/*----------------------------------------------------------------------------
5753
| Returns the result of adding the absolute values of the quadruple-precision
5754
| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
5755
| before being returned. `zSign' is ignored if the result is a NaN.
5756
| The addition is performed according to the IEC/IEEE Standard for Binary
5757
| Floating-Point Arithmetic.
5758
*----------------------------------------------------------------------------*/
5760
static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5762
int32 aExp, bExp, zExp;
5763
uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5766
aSig1 = extractFloat128Frac1( a );
5767
aSig0 = extractFloat128Frac0( a );
5768
aExp = extractFloat128Exp( a );
5769
bSig1 = extractFloat128Frac1( b );
5770
bSig0 = extractFloat128Frac0( b );
5771
bExp = extractFloat128Exp( b );
5772
expDiff = aExp - bExp;
5773
if ( 0 < expDiff ) {
5774
if ( aExp == 0x7FFF ) {
5775
if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5782
bSig0 |= LIT64( 0x0001000000000000 );
5784
shift128ExtraRightJamming(
5785
bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5788
else if ( expDiff < 0 ) {
5789
if ( bExp == 0x7FFF ) {
5790
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5791
return packFloat128( zSign, 0x7FFF, 0, 0 );
5797
aSig0 |= LIT64( 0x0001000000000000 );
5799
shift128ExtraRightJamming(
5800
aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5804
if ( aExp == 0x7FFF ) {
5805
if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5806
return propagateFloat128NaN( a, b STATUS_VAR );
5810
add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5812
if (STATUS(flush_to_zero)) {
5813
if (zSig0 | zSig1) {
5814
float_raise(float_flag_output_denormal STATUS_VAR);
5816
return packFloat128(zSign, 0, 0, 0);
5818
return packFloat128( zSign, 0, zSig0, zSig1 );
5821
zSig0 |= LIT64( 0x0002000000000000 );
5825
aSig0 |= LIT64( 0x0001000000000000 );
5826
add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5828
if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5831
shift128ExtraRightJamming(
5832
zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5834
return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5838
/*----------------------------------------------------------------------------
5839
| Returns the result of subtracting the absolute values of the quadruple-
5840
| precision floating-point values `a' and `b'. If `zSign' is 1, the
5841
| difference is negated before being returned. `zSign' is ignored if the
5842
| result is a NaN. The subtraction is performed according to the IEC/IEEE
5843
| Standard for Binary Floating-Point Arithmetic.
5844
*----------------------------------------------------------------------------*/
5846
static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5848
int32 aExp, bExp, zExp;
5849
uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5853
aSig1 = extractFloat128Frac1( a );
5854
aSig0 = extractFloat128Frac0( a );
5855
aExp = extractFloat128Exp( a );
5856
bSig1 = extractFloat128Frac1( b );
5857
bSig0 = extractFloat128Frac0( b );
5858
bExp = extractFloat128Exp( b );
5859
expDiff = aExp - bExp;
5860
shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5861
shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5862
if ( 0 < expDiff ) goto aExpBigger;
5863
if ( expDiff < 0 ) goto bExpBigger;
5864
if ( aExp == 0x7FFF ) {
5865
if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5866
return propagateFloat128NaN( a, b STATUS_VAR );
5868
float_raise( float_flag_invalid STATUS_VAR);
5869
z.low = float128_default_nan_low;
5870
z.high = float128_default_nan_high;
5877
if ( bSig0 < aSig0 ) goto aBigger;
5878
if ( aSig0 < bSig0 ) goto bBigger;
5879
if ( bSig1 < aSig1 ) goto aBigger;
5880
if ( aSig1 < bSig1 ) goto bBigger;
5881
return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5883
if ( bExp == 0x7FFF ) {
5884
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5885
return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5891
aSig0 |= LIT64( 0x4000000000000000 );
5893
shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5894
bSig0 |= LIT64( 0x4000000000000000 );
5896
sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5899
goto normalizeRoundAndPack;
5901
if ( aExp == 0x7FFF ) {
5902
if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5909
bSig0 |= LIT64( 0x4000000000000000 );
5911
shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5912
aSig0 |= LIT64( 0x4000000000000000 );
5914
sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5916
normalizeRoundAndPack:
5918
return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5922
/*----------------------------------------------------------------------------
5923
| Returns the result of adding the quadruple-precision floating-point values
5924
| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
5925
| for Binary Floating-Point Arithmetic.
5926
*----------------------------------------------------------------------------*/
5928
float128 float128_add( float128 a, float128 b STATUS_PARAM )
5932
aSign = extractFloat128Sign( a );
5933
bSign = extractFloat128Sign( b );
5934
if ( aSign == bSign ) {
5935
return addFloat128Sigs( a, b, aSign STATUS_VAR );
5938
return subFloat128Sigs( a, b, aSign STATUS_VAR );
5943
/*----------------------------------------------------------------------------
5944
| Returns the result of subtracting the quadruple-precision floating-point
5945
| values `a' and `b'. The operation is performed according to the IEC/IEEE
5946
| Standard for Binary Floating-Point Arithmetic.
5947
*----------------------------------------------------------------------------*/
5949
float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5953
aSign = extractFloat128Sign( a );
5954
bSign = extractFloat128Sign( b );
5955
if ( aSign == bSign ) {
5956
return subFloat128Sigs( a, b, aSign STATUS_VAR );
5959
return addFloat128Sigs( a, b, aSign STATUS_VAR );
5964
/*----------------------------------------------------------------------------
5965
| Returns the result of multiplying the quadruple-precision floating-point
5966
| values `a' and `b'. The operation is performed according to the IEC/IEEE
5967
| Standard for Binary Floating-Point Arithmetic.
5968
*----------------------------------------------------------------------------*/
5970
float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5972
flag aSign, bSign, zSign;
5973
int32 aExp, bExp, zExp;
5974
uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5977
aSig1 = extractFloat128Frac1( a );
5978
aSig0 = extractFloat128Frac0( a );
5979
aExp = extractFloat128Exp( a );
5980
aSign = extractFloat128Sign( a );
5981
bSig1 = extractFloat128Frac1( b );
5982
bSig0 = extractFloat128Frac0( b );
5983
bExp = extractFloat128Exp( b );
5984
bSign = extractFloat128Sign( b );
5985
zSign = aSign ^ bSign;
5986
if ( aExp == 0x7FFF ) {
5987
if ( ( aSig0 | aSig1 )
5988
|| ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5989
return propagateFloat128NaN( a, b STATUS_VAR );
5991
if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5992
return packFloat128( zSign, 0x7FFF, 0, 0 );
5994
if ( bExp == 0x7FFF ) {
5995
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5996
if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5998
float_raise( float_flag_invalid STATUS_VAR);
5999
z.low = float128_default_nan_low;
6000
z.high = float128_default_nan_high;
6003
return packFloat128( zSign, 0x7FFF, 0, 0 );
6006
if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6007
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6010
if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6011
normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6013
zExp = aExp + bExp - 0x4000;
6014
aSig0 |= LIT64( 0x0001000000000000 );
6015
shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6016
mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6017
add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6018
zSig2 |= ( zSig3 != 0 );
6019
if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6020
shift128ExtraRightJamming(
6021
zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6024
return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6028
/*----------------------------------------------------------------------------
6029
| Returns the result of dividing the quadruple-precision floating-point value
6030
| `a' by the corresponding value `b'. The operation is performed according to
6031
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6032
*----------------------------------------------------------------------------*/
6034
float128 float128_div( float128 a, float128 b STATUS_PARAM )
6036
flag aSign, bSign, zSign;
6037
int32 aExp, bExp, zExp;
6038
uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6039
uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6042
aSig1 = extractFloat128Frac1( a );
6043
aSig0 = extractFloat128Frac0( a );
6044
aExp = extractFloat128Exp( a );
6045
aSign = extractFloat128Sign( a );
6046
bSig1 = extractFloat128Frac1( b );
6047
bSig0 = extractFloat128Frac0( b );
6048
bExp = extractFloat128Exp( b );
6049
bSign = extractFloat128Sign( b );
6050
zSign = aSign ^ bSign;
6051
if ( aExp == 0x7FFF ) {
6052
if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6053
if ( bExp == 0x7FFF ) {
6054
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6057
return packFloat128( zSign, 0x7FFF, 0, 0 );
6059
if ( bExp == 0x7FFF ) {
6060
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6061
return packFloat128( zSign, 0, 0, 0 );
6064
if ( ( bSig0 | bSig1 ) == 0 ) {
6065
if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6067
float_raise( float_flag_invalid STATUS_VAR);
6068
z.low = float128_default_nan_low;
6069
z.high = float128_default_nan_high;
6072
float_raise( float_flag_divbyzero STATUS_VAR);
6073
return packFloat128( zSign, 0x7FFF, 0, 0 );
6075
normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6078
if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6079
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6081
zExp = aExp - bExp + 0x3FFD;
6083
aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6085
bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6086
if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6087
shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6090
zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6091
mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6092
sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6093
while ( (int64_t) rem0 < 0 ) {
6095
add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6097
zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6098
if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6099
mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6100
sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6101
while ( (int64_t) rem1 < 0 ) {
6103
add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6105
zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6107
shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6108
return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6112
/*----------------------------------------------------------------------------
6113
| Returns the remainder of the quadruple-precision floating-point value `a'
6114
| with respect to the corresponding value `b'. The operation is performed
6115
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6116
*----------------------------------------------------------------------------*/
6118
float128 float128_rem( float128 a, float128 b STATUS_PARAM )
6121
int32 aExp, bExp, expDiff;
6122
uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6123
uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6127
aSig1 = extractFloat128Frac1( a );
6128
aSig0 = extractFloat128Frac0( a );
6129
aExp = extractFloat128Exp( a );
6130
aSign = extractFloat128Sign( a );
6131
bSig1 = extractFloat128Frac1( b );
6132
bSig0 = extractFloat128Frac0( b );
6133
bExp = extractFloat128Exp( b );
6134
if ( aExp == 0x7FFF ) {
6135
if ( ( aSig0 | aSig1 )
6136
|| ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6137
return propagateFloat128NaN( a, b STATUS_VAR );
6141
if ( bExp == 0x7FFF ) {
6142
if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6146
if ( ( bSig0 | bSig1 ) == 0 ) {
6148
float_raise( float_flag_invalid STATUS_VAR);
6149
z.low = float128_default_nan_low;
6150
z.high = float128_default_nan_high;
6153
normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6156
if ( ( aSig0 | aSig1 ) == 0 ) return a;
6157
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6159
expDiff = aExp - bExp;
6160
if ( expDiff < -1 ) return a;
6162
aSig0 | LIT64( 0x0001000000000000 ),
6164
15 - ( expDiff < 0 ),
6169
bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6170
q = le128( bSig0, bSig1, aSig0, aSig1 );
6171
if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6173
while ( 0 < expDiff ) {
6174
q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6175
q = ( 4 < q ) ? q - 4 : 0;
6176
mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6177
shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6178
shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6179
sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6182
if ( -64 < expDiff ) {
6183
q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6184
q = ( 4 < q ) ? q - 4 : 0;
6186
shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6188
if ( expDiff < 0 ) {
6189
shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6192
shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6194
mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6195
sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6198
shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6199
shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6202
alternateASig0 = aSig0;
6203
alternateASig1 = aSig1;
6205
sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6206
} while ( 0 <= (int64_t) aSig0 );
6208
aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6209
if ( ( sigMean0 < 0 )
6210
|| ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6211
aSig0 = alternateASig0;
6212
aSig1 = alternateASig1;
6214
zSign = ( (int64_t) aSig0 < 0 );
6215
if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6217
normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
6221
/*----------------------------------------------------------------------------
6222
| Returns the square root of the quadruple-precision floating-point value `a'.
6223
| The operation is performed according to the IEC/IEEE Standard for Binary
6224
| Floating-Point Arithmetic.
6225
*----------------------------------------------------------------------------*/
6227
float128 float128_sqrt( float128 a STATUS_PARAM )
6231
uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6232
uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6235
aSig1 = extractFloat128Frac1( a );
6236
aSig0 = extractFloat128Frac0( a );
6237
aExp = extractFloat128Exp( a );
6238
aSign = extractFloat128Sign( a );
6239
if ( aExp == 0x7FFF ) {
6240
if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
6241
if ( ! aSign ) return a;
6245
if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6247
float_raise( float_flag_invalid STATUS_VAR);
6248
z.low = float128_default_nan_low;
6249
z.high = float128_default_nan_high;
6253
if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6254
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6256
zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6257
aSig0 |= LIT64( 0x0001000000000000 );
6258
zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6259
shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6260
zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6261
doubleZSig0 = zSig0<<1;
6262
mul64To128( zSig0, zSig0, &term0, &term1 );
6263
sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6264
while ( (int64_t) rem0 < 0 ) {
6267
add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6269
zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6270
if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6271
if ( zSig1 == 0 ) zSig1 = 1;
6272
mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6273
sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6274
mul64To128( zSig1, zSig1, &term2, &term3 );
6275
sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6276
while ( (int64_t) rem1 < 0 ) {
6278
shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6280
term2 |= doubleZSig0;
6281
add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6283
zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6285
shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6286
return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6290
/*----------------------------------------------------------------------------
6291
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
6292
| the corresponding value `b', and 0 otherwise. The invalid exception is
6293
| raised if either operand is a NaN. Otherwise, the comparison is performed
6294
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6295
*----------------------------------------------------------------------------*/
6297
int float128_eq( float128 a, float128 b STATUS_PARAM )
6300
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6301
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6302
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
6303
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6305
float_raise( float_flag_invalid STATUS_VAR);
6310
&& ( ( a.high == b.high )
6312
&& ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6317
/*----------------------------------------------------------------------------
6318
| Returns 1 if the quadruple-precision floating-point value `a' is less than
6319
| or equal to the corresponding value `b', and 0 otherwise. The invalid
6320
| exception is raised if either operand is a NaN. The comparison is performed
6321
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6322
*----------------------------------------------------------------------------*/
6324
int float128_le( float128 a, float128 b STATUS_PARAM )
6328
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6329
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6330
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
6331
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6333
float_raise( float_flag_invalid STATUS_VAR);
6336
aSign = extractFloat128Sign( a );
6337
bSign = extractFloat128Sign( b );
6338
if ( aSign != bSign ) {
6341
|| ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6345
aSign ? le128( b.high, b.low, a.high, a.low )
6346
: le128( a.high, a.low, b.high, b.low );
6350
/*----------------------------------------------------------------------------
6351
| Returns 1 if the quadruple-precision floating-point value `a' is less than
6352
| the corresponding value `b', and 0 otherwise. The invalid exception is
6353
| raised if either operand is a NaN. The comparison is performed according
6354
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6355
*----------------------------------------------------------------------------*/
6357
int float128_lt( float128 a, float128 b STATUS_PARAM )
6361
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6362
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6363
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
6364
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6366
float_raise( float_flag_invalid STATUS_VAR);
6369
aSign = extractFloat128Sign( a );
6370
bSign = extractFloat128Sign( b );
6371
if ( aSign != bSign ) {
6374
&& ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6378
aSign ? lt128( b.high, b.low, a.high, a.low )
6379
: lt128( a.high, a.low, b.high, b.low );
6383
/*----------------------------------------------------------------------------
6384
| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6385
| be compared, and 0 otherwise. The invalid exception is raised if either
6386
| operand is a NaN. The comparison is performed according to the IEC/IEEE
6387
| Standard for Binary Floating-Point Arithmetic.
6388
*----------------------------------------------------------------------------*/
6390
int float128_unordered( float128 a, float128 b STATUS_PARAM )
6392
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6393
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6394
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
6395
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6397
float_raise( float_flag_invalid STATUS_VAR);
6403
/*----------------------------------------------------------------------------
6404
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
6405
| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6406
| exception. The comparison is performed according to the IEC/IEEE Standard
6407
| for Binary Floating-Point Arithmetic.
6408
*----------------------------------------------------------------------------*/
6410
int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
6413
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6414
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6415
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
6416
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6418
if ( float128_is_signaling_nan( a )
6419
|| float128_is_signaling_nan( b ) ) {
6420
float_raise( float_flag_invalid STATUS_VAR);
6426
&& ( ( a.high == b.high )
6428
&& ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6433
/*----------------------------------------------------------------------------
6434
| Returns 1 if the quadruple-precision floating-point value `a' is less than
6435
| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6436
| cause an exception. Otherwise, the comparison is performed according to the
6437
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6438
*----------------------------------------------------------------------------*/
6440
int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
6444
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6445
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6446
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
6447
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6449
if ( float128_is_signaling_nan( a )
6450
|| float128_is_signaling_nan( b ) ) {
6451
float_raise( float_flag_invalid STATUS_VAR);
6455
aSign = extractFloat128Sign( a );
6456
bSign = extractFloat128Sign( b );
6457
if ( aSign != bSign ) {
6460
|| ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6464
aSign ? le128( b.high, b.low, a.high, a.low )
6465
: le128( a.high, a.low, b.high, b.low );
6469
/*----------------------------------------------------------------------------
6470
| Returns 1 if the quadruple-precision floating-point value `a' is less than
6471
| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6472
| exception. Otherwise, the comparison is performed according to the IEC/IEEE
6473
| Standard for Binary Floating-Point Arithmetic.
6474
*----------------------------------------------------------------------------*/
6476
int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
6480
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6481
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6482
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
6483
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6485
if ( float128_is_signaling_nan( a )
6486
|| float128_is_signaling_nan( b ) ) {
6487
float_raise( float_flag_invalid STATUS_VAR);
6491
aSign = extractFloat128Sign( a );
6492
bSign = extractFloat128Sign( b );
6493
if ( aSign != bSign ) {
6496
&& ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6500
aSign ? lt128( b.high, b.low, a.high, a.low )
6501
: lt128( a.high, a.low, b.high, b.low );
6505
/*----------------------------------------------------------------------------
6506
| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6507
| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6508
| comparison is performed according to the IEC/IEEE Standard for Binary
6509
| Floating-Point Arithmetic.
6510
*----------------------------------------------------------------------------*/
6512
int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
6514
if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6515
&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6516
|| ( ( extractFloat128Exp( b ) == 0x7FFF )
6517
&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6519
if ( float128_is_signaling_nan( a )
6520
|| float128_is_signaling_nan( b ) ) {
6521
float_raise( float_flag_invalid STATUS_VAR);
6528
/* misc functions */
6529
float32 uint32_to_float32(uint32_t a STATUS_PARAM)
6531
return int64_to_float32(a STATUS_VAR);
6534
float64 uint32_to_float64(uint32_t a STATUS_PARAM)
6536
return int64_to_float64(a STATUS_VAR);
6539
uint32 float32_to_uint32( float32 a STATUS_PARAM )
6543
int old_exc_flags = get_float_exception_flags(status);
6545
v = float32_to_int64(a STATUS_VAR);
6548
} else if (v > 0xffffffff) {
6553
set_float_exception_flags(old_exc_flags, status);
6554
float_raise(float_flag_invalid STATUS_VAR);
6558
uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
6562
int old_exc_flags = get_float_exception_flags(status);
6564
v = float32_to_int64_round_to_zero(a STATUS_VAR);
6567
} else if (v > 0xffffffff) {
6572
set_float_exception_flags(old_exc_flags, status);
6573
float_raise(float_flag_invalid STATUS_VAR);
6577
int_fast16_t float32_to_int16(float32 a STATUS_PARAM)
6581
int old_exc_flags = get_float_exception_flags(status);
6583
v = float32_to_int32(a STATUS_VAR);
6586
} else if (v > 0x7fff) {
6592
set_float_exception_flags(old_exc_flags, status);
6593
float_raise(float_flag_invalid STATUS_VAR);
6597
uint_fast16_t float32_to_uint16(float32 a STATUS_PARAM)
6601
int old_exc_flags = get_float_exception_flags(status);
6603
v = float32_to_int32(a STATUS_VAR);
6606
} else if (v > 0xffff) {
6612
set_float_exception_flags(old_exc_flags, status);
6613
float_raise(float_flag_invalid STATUS_VAR);
6617
uint_fast16_t float32_to_uint16_round_to_zero(float32 a STATUS_PARAM)
6621
int old_exc_flags = get_float_exception_flags(status);
6623
v = float32_to_int64_round_to_zero(a STATUS_VAR);
6626
} else if (v > 0xffff) {
6631
set_float_exception_flags(old_exc_flags, status);
6632
float_raise(float_flag_invalid STATUS_VAR);
6636
uint32 float64_to_uint32( float64 a STATUS_PARAM )
6640
int old_exc_flags = get_float_exception_flags(status);
6642
v = float64_to_uint64(a STATUS_VAR);
6643
if (v > 0xffffffff) {
6648
set_float_exception_flags(old_exc_flags, status);
6649
float_raise(float_flag_invalid STATUS_VAR);
6653
uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
6657
int old_exc_flags = get_float_exception_flags(status);
6659
v = float64_to_uint64_round_to_zero(a STATUS_VAR);
6660
if (v > 0xffffffff) {
6665
set_float_exception_flags(old_exc_flags, status);
6666
float_raise(float_flag_invalid STATUS_VAR);
6670
int_fast16_t float64_to_int16(float64 a STATUS_PARAM)
6674
int old_exc_flags = get_float_exception_flags(status);
6676
v = float64_to_int32(a STATUS_VAR);
6679
} else if (v > 0x7fff) {
6685
set_float_exception_flags(old_exc_flags, status);
6686
float_raise(float_flag_invalid STATUS_VAR);
6690
uint_fast16_t float64_to_uint16(float64 a STATUS_PARAM)
6694
int old_exc_flags = get_float_exception_flags(status);
6696
v = float64_to_int32(a STATUS_VAR);
6699
} else if (v > 0xffff) {
6705
set_float_exception_flags(old_exc_flags, status);
6706
float_raise(float_flag_invalid STATUS_VAR);
6710
uint_fast16_t float64_to_uint16_round_to_zero(float64 a STATUS_PARAM)
6714
int old_exc_flags = get_float_exception_flags(status);
6716
v = float64_to_int64_round_to_zero(a STATUS_VAR);
6719
} else if (v > 0xffff) {
6724
set_float_exception_flags(old_exc_flags, status);
6725
float_raise(float_flag_invalid STATUS_VAR);
6729
/*----------------------------------------------------------------------------
6730
| Returns the result of converting the double-precision floating-point value
6731
| `a' to the 64-bit unsigned integer format. The conversion is
6732
| performed according to the IEC/IEEE Standard for Binary Floating-Point
6733
| Arithmetic---which means in particular that the conversion is rounded
6734
| according to the current rounding mode. If `a' is a NaN, the largest
6735
| positive integer is returned. If the conversion overflows, the
6736
| largest unsigned integer is returned. If 'a' is negative, the value is
6737
| rounded and zero is returned; negative values that do not round to zero
6738
| will raise the inexact exception.
6739
*----------------------------------------------------------------------------*/
6741
uint64_t float64_to_uint64(float64 a STATUS_PARAM)
6744
int_fast16_t aExp, shiftCount;
6745
uint64_t aSig, aSigExtra;
6746
a = float64_squash_input_denormal(a STATUS_VAR);
6748
aSig = extractFloat64Frac(a);
6749
aExp = extractFloat64Exp(a);
6750
aSign = extractFloat64Sign(a);
6751
if (aSign && (aExp > 1022)) {
6752
float_raise(float_flag_invalid STATUS_VAR);
6753
if (float64_is_any_nan(a)) {
6754
return LIT64(0xFFFFFFFFFFFFFFFF);
6760
aSig |= LIT64(0x0010000000000000);
6762
shiftCount = 0x433 - aExp;
6763
if (shiftCount <= 0) {
6765
float_raise(float_flag_invalid STATUS_VAR);
6766
return LIT64(0xFFFFFFFFFFFFFFFF);
6769
aSig <<= -shiftCount;
6771
shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
6773
return roundAndPackUint64(aSign, aSig, aSigExtra STATUS_VAR);
6776
uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
6778
signed char current_rounding_mode = STATUS(float_rounding_mode);
6779
set_float_rounding_mode(float_round_to_zero STATUS_VAR);
6780
int64_t v = float64_to_uint64(a STATUS_VAR);
6781
set_float_rounding_mode(current_rounding_mode STATUS_VAR);
6785
#define COMPARE(s, nan_exp) \
6786
INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
6787
int is_quiet STATUS_PARAM ) \
6789
flag aSign, bSign; \
6790
uint ## s ## _t av, bv; \
6791
a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6792
b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
6794
if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
6795
extractFloat ## s ## Frac( a ) ) || \
6796
( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
6797
extractFloat ## s ## Frac( b ) )) { \
6799
float ## s ## _is_signaling_nan( a ) || \
6800
float ## s ## _is_signaling_nan( b ) ) { \
6801
float_raise( float_flag_invalid STATUS_VAR); \
6803
return float_relation_unordered; \
6805
aSign = extractFloat ## s ## Sign( a ); \
6806
bSign = extractFloat ## s ## Sign( b ); \
6807
av = float ## s ## _val(a); \
6808
bv = float ## s ## _val(b); \
6809
if ( aSign != bSign ) { \
6810
if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
6812
return float_relation_equal; \
6814
return 1 - (2 * aSign); \
6818
return float_relation_equal; \
6820
return 1 - 2 * (aSign ^ ( av < bv )); \
6825
int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
6827
return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
6830
int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
6832
return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
6838
INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,
6839
int is_quiet STATUS_PARAM )
6843
if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6844
( extractFloatx80Frac( a )<<1 ) ) ||
6845
( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6846
( extractFloatx80Frac( b )<<1 ) )) {
6848
floatx80_is_signaling_nan( a ) ||
6849
floatx80_is_signaling_nan( b ) ) {
6850
float_raise( float_flag_invalid STATUS_VAR);
6852
return float_relation_unordered;
6854
aSign = extractFloatx80Sign( a );
6855
bSign = extractFloatx80Sign( b );
6856
if ( aSign != bSign ) {
6858
if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6859
( ( a.low | b.low ) == 0 ) ) {
6861
return float_relation_equal;
6863
return 1 - (2 * aSign);
6866
if (a.low == b.low && a.high == b.high) {
6867
return float_relation_equal;
6869
return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6874
int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
6876
return floatx80_compare_internal(a, b, 0 STATUS_VAR);
6879
int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
6881
return floatx80_compare_internal(a, b, 1 STATUS_VAR);
6884
INLINE int float128_compare_internal( float128 a, float128 b,
6885
int is_quiet STATUS_PARAM )
6889
if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6890
( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6891
( ( extractFloat128Exp( b ) == 0x7fff ) &&
6892
( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6894
float128_is_signaling_nan( a ) ||
6895
float128_is_signaling_nan( b ) ) {
6896
float_raise( float_flag_invalid STATUS_VAR);
6898
return float_relation_unordered;
6900
aSign = extractFloat128Sign( a );
6901
bSign = extractFloat128Sign( b );
6902
if ( aSign != bSign ) {
6903
if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6905
return float_relation_equal;
6907
return 1 - (2 * aSign);
6910
if (a.low == b.low && a.high == b.high) {
6911
return float_relation_equal;
6913
return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6918
int float128_compare( float128 a, float128 b STATUS_PARAM )
6920
return float128_compare_internal(a, b, 0 STATUS_VAR);
6923
int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6925
return float128_compare_internal(a, b, 1 STATUS_VAR);
6928
/* min() and max() functions. These can't be implemented as
6929
* 'compare and pick one input' because that would mishandle
6930
* NaNs and +0 vs -0.
6932
* minnum() and maxnum() functions. These are similar to the min()
6933
* and max() functions but if one of the arguments is a QNaN and
6934
* the other is numerical then the numerical argument is returned.
6935
* minnum() and maxnum correspond to the IEEE 754-2008 minNum()
6936
* and maxNum() operations. min() and max() are the typical min/max
6937
* semantics provided by many CPUs which predate that specification.
6939
#define MINMAX(s, nan_exp) \
6940
INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b, \
6941
int ismin, int isieee STATUS_PARAM) \
6943
flag aSign, bSign; \
6944
uint ## s ## _t av, bv; \
6945
a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6946
b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
6947
if (float ## s ## _is_any_nan(a) || \
6948
float ## s ## _is_any_nan(b)) { \
6950
if (float ## s ## _is_quiet_nan(a) && \
6951
!float ## s ##_is_any_nan(b)) { \
6953
} else if (float ## s ## _is_quiet_nan(b) && \
6954
!float ## s ## _is_any_nan(a)) { \
6958
return propagateFloat ## s ## NaN(a, b STATUS_VAR); \
6960
aSign = extractFloat ## s ## Sign(a); \
6961
bSign = extractFloat ## s ## Sign(b); \
6962
av = float ## s ## _val(a); \
6963
bv = float ## s ## _val(b); \
6964
if (aSign != bSign) { \
6966
return aSign ? a : b; \
6968
return aSign ? b : a; \
6972
return (aSign ^ (av < bv)) ? a : b; \
6974
return (aSign ^ (av < bv)) ? b : a; \
6979
float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM) \
6981
return float ## s ## _minmax(a, b, 1, 0 STATUS_VAR); \
6984
float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM) \
6986
return float ## s ## _minmax(a, b, 0, 0 STATUS_VAR); \
6989
float ## s float ## s ## _minnum(float ## s a, float ## s b STATUS_PARAM) \
6991
return float ## s ## _minmax(a, b, 1, 1 STATUS_VAR); \
6994
float ## s float ## s ## _maxnum(float ## s a, float ## s b STATUS_PARAM) \
6996
return float ## s ## _minmax(a, b, 0, 1 STATUS_VAR); \
7003
/* Multiply A by 2 raised to the power N. */
7004
float32 float32_scalbn( float32 a, int n STATUS_PARAM )
7010
a = float32_squash_input_denormal(a STATUS_VAR);
7011
aSig = extractFloat32Frac( a );
7012
aExp = extractFloat32Exp( a );
7013
aSign = extractFloat32Sign( a );
7015
if ( aExp == 0xFF ) {
7017
return propagateFloat32NaN( a, a STATUS_VAR );
7023
} else if (aSig == 0) {
7031
} else if (n < -0x200) {
7037
return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
7040
float64 float64_scalbn( float64 a, int n STATUS_PARAM )
7046
a = float64_squash_input_denormal(a STATUS_VAR);
7047
aSig = extractFloat64Frac( a );
7048
aExp = extractFloat64Exp( a );
7049
aSign = extractFloat64Sign( a );
7051
if ( aExp == 0x7FF ) {
7053
return propagateFloat64NaN( a, a STATUS_VAR );
7058
aSig |= LIT64( 0x0010000000000000 );
7059
} else if (aSig == 0) {
7067
} else if (n < -0x1000) {
7073
return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
7076
floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
7082
aSig = extractFloatx80Frac( a );
7083
aExp = extractFloatx80Exp( a );
7084
aSign = extractFloatx80Sign( a );
7086
if ( aExp == 0x7FFF ) {
7088
return propagateFloatx80NaN( a, a STATUS_VAR );
7102
} else if (n < -0x10000) {
7107
return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
7108
aSign, aExp, aSig, 0 STATUS_VAR );
7111
float128 float128_scalbn( float128 a, int n STATUS_PARAM )
7115
uint64_t aSig0, aSig1;
7117
aSig1 = extractFloat128Frac1( a );
7118
aSig0 = extractFloat128Frac0( a );
7119
aExp = extractFloat128Exp( a );
7120
aSign = extractFloat128Sign( a );
7121
if ( aExp == 0x7FFF ) {
7122
if ( aSig0 | aSig1 ) {
7123
return propagateFloat128NaN( a, a STATUS_VAR );
7128
aSig0 |= LIT64( 0x0001000000000000 );
7129
} else if (aSig0 == 0 && aSig1 == 0) {
7137
} else if (n < -0x10000) {
7142
return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1