69
69
/*----------------------------------------------------------------------------
70
| Returns the fraction bits of the half-precision floating-point value `a'.
71
*----------------------------------------------------------------------------*/
73
INLINE uint32_t extractFloat16Frac(float16 a)
75
return float16_val(a) & 0x3ff;
78
/*----------------------------------------------------------------------------
79
| Returns the exponent bits of the half-precision floating-point value `a'.
80
*----------------------------------------------------------------------------*/
82
INLINE int16 extractFloat16Exp(float16 a)
84
return (float16_val(a) >> 10) & 0x1f;
87
/*----------------------------------------------------------------------------
88
| Returns the sign bit of the half-precision floating-point value `a'.
89
*----------------------------------------------------------------------------*/
91
INLINE flag extractFloat16Sign(float16 a)
93
return float16_val(a)>>15;
96
/*----------------------------------------------------------------------------
70
97
| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
71
98
| and 7, and returns the properly rounded 32-bit integer corresponding to the
72
99
| input. If `zSign' is 1, the input is negated before being converted to an
1534
1561
aExp = extractFloat32Exp( a );
1535
1562
aSign = extractFloat32Sign( a );
1536
1563
if ( aExp == 0xFF ) {
1537
if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
1564
if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1538
1565
return packFloat64( aSign, 0x7FF, 0 );
1540
1567
if ( aExp == 0 ) {
1566
1593
aExp = extractFloat32Exp( a );
1567
1594
aSign = extractFloat32Sign( a );
1568
1595
if ( aExp == 0xFF ) {
1569
if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
1596
if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1570
1597
return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1572
1599
if ( aExp == 0 ) {
1600
1627
aExp = extractFloat32Exp( a );
1601
1628
aSign = extractFloat32Sign( a );
1602
1629
if ( aExp == 0xFF ) {
1603
if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
1630
if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1604
1631
return packFloat128( aSign, 0x7FFF, 0, 0 );
1606
1633
if ( aExp == 0 ) {
2173
2200
static const float64 float32_exp2_coefficients[15] =
2175
make_float64( 0x3ff0000000000000ll ), /* 1 */
2176
make_float64( 0x3fe0000000000000ll ), /* 2 */
2177
make_float64( 0x3fc5555555555555ll ), /* 3 */
2178
make_float64( 0x3fa5555555555555ll ), /* 4 */
2179
make_float64( 0x3f81111111111111ll ), /* 5 */
2180
make_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2181
make_float64( 0x3f2a01a01a01a01all ), /* 7 */
2182
make_float64( 0x3efa01a01a01a01all ), /* 8 */
2183
make_float64( 0x3ec71de3a556c734ll ), /* 9 */
2184
make_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2185
make_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2186
make_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2187
make_float64( 0x3de6124613a86d09ll ), /* 13 */
2188
make_float64( 0x3da93974a8c07c9dll ), /* 14 */
2189
make_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2202
const_float64( 0x3ff0000000000000ll ), /* 1 */
2203
const_float64( 0x3fe0000000000000ll ), /* 2 */
2204
const_float64( 0x3fc5555555555555ll ), /* 3 */
2205
const_float64( 0x3fa5555555555555ll ), /* 4 */
2206
const_float64( 0x3f81111111111111ll ), /* 5 */
2207
const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2208
const_float64( 0x3f2a01a01a01a01all ), /* 7 */
2209
const_float64( 0x3efa01a01a01a01all ), /* 8 */
2210
const_float64( 0x3ec71de3a556c734ll ), /* 9 */
2211
const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2212
const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2213
const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2214
const_float64( 0x3de6124613a86d09ll ), /* 13 */
2215
const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2216
const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2192
2219
float32 float32_exp2( float32 a STATUS_PARAM )
2689
2716
aExp = extractFloat64Exp( a );
2690
2717
aSign = extractFloat64Sign( a );
2691
2718
if ( aExp == 0x7FF ) {
2692
if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
2719
if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2693
2720
return packFloat32( aSign, 0xFF, 0 );
2695
2722
shift64RightJamming( aSig, 22, &aSig );
2713
2740
| than the desired result exponent whenever `zSig' is a complete, normalized
2715
2742
*----------------------------------------------------------------------------*/
2716
static bits16 packFloat16(flag zSign, int16 zExp, bits16 zSig)
2743
static float16 packFloat16(flag zSign, int16 zExp, bits16 zSig)
2718
return (((bits32)zSign) << 15) + (((bits32)zExp) << 10) + zSig;
2745
return make_float16(
2746
(((bits32)zSign) << 15) + (((bits32)zExp) << 10) + zSig);
2721
2749
/* Half precision floats come in two formats: standard IEEE and "ARM" format.
2722
2750
The latter gains extra exponent range by omitting the NaN/Inf encodings. */
2724
float32 float16_to_float32( bits16 a, flag ieee STATUS_PARAM )
2752
float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
2731
aExp = (a >> 10) & 0x1f;
2758
aSign = extractFloat16Sign(a);
2759
aExp = extractFloat16Exp(a);
2760
aSig = extractFloat16Frac(a);
2734
2762
if (aExp == 0x1f && ieee) {
2736
/* Make sure correct exceptions are raised. */
2737
float32ToCommonNaN(a STATUS_VAR);
2764
return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
2740
2766
return packFloat32(aSign, 0xff, aSig << 13);
2768
2794
aSign = extractFloat32Sign( a );
2769
2795
if ( aExp == 0xFF ) {
2771
/* Make sure correct exceptions are raised. */
2772
float32ToCommonNaN(a STATUS_VAR);
2775
return packFloat16(aSign, 0x1f, aSig >> 13);
2797
/* Input is a NaN */
2798
float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2800
return packFloat16(aSign, 0, 0);
2806
float_raise(float_flag_invalid STATUS_VAR);
2807
return packFloat16(aSign, 0x1f, 0x3ff);
2809
return packFloat16(aSign, 0x1f, 0);
2777
if (aExp == 0 && aSign == 0) {
2811
if (aExp == 0 && aSig == 0) {
2778
2812
return packFloat16(aSign, 0, 0);
2780
2814
/* Binary point between bits 22 and 23. */
2781
2815
aSig |= 0x00800000;
2783
2817
if (aExp < -14) {
2791
2823
mask = 0x00001fff;
2861
2893
aExp = extractFloat64Exp( a );
2862
2894
aSign = extractFloat64Sign( a );
2863
2895
if ( aExp == 0x7FF ) {
2864
if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
2896
if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2865
2897
return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2867
2899
if ( aExp == 0 ) {
2896
2928
aExp = extractFloat64Exp( a );
2897
2929
aSign = extractFloat64Sign( a );
2898
2930
if ( aExp == 0x7FF ) {
2899
if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
2931
if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2900
2932
return packFloat128( aSign, 0x7FFF, 0, 0 );
2902
2934
if ( aExp == 0 ) {
3843
3875
aSign = extractFloatx80Sign( a );
3844
3876
if ( aExp == 0x7FFF ) {
3845
3877
if ( (bits64) ( aSig<<1 ) ) {
3846
return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
3878
return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3848
3880
return packFloat32( aSign, 0xFF, 0 );
3871
3903
aSign = extractFloatx80Sign( a );
3872
3904
if ( aExp == 0x7FFF ) {
3873
3905
if ( (bits64) ( aSig<<1 ) ) {
3874
return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
3906
return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3876
3908
return packFloat64( aSign, 0x7FF, 0 );
3900
3932
aExp = extractFloatx80Exp( a );
3901
3933
aSign = extractFloatx80Sign( a );
3902
3934
if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3903
return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
3935
return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3905
3937
shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3906
3938
return packFloat128( aSign, aExp, zSig0, zSig1 );