2
===============================================================================
3
The original notice of the softfloat package is shown below. The conversion
4
to pascal was done by Carl Eric Codere in 2002 (ccodere@ieee.org).
5
===============================================================================
7
This C source file is part of the SoftFloat IEC/IEEE Floating-Point
8
Arithmetic Package, Release 2a.
10
Written by John R. Hauser. This work was made possible in part by the
11
International Computer Science Institute, located at Suite 600, 1947 Center
12
Street, Berkeley, California 94704. Funding was partially provided by the
13
National Science Foundation under grant MIP-9311980. The original version
14
of this code was written as part of a project to build a fixed-point vector
15
processor in collaboration with the University of California at Berkeley,
16
overseen by Profs. Nelson Morgan and John Wawrzynek. More information
17
is available through the Web page
18
`http://HTTP.CS.Berkeley.EDU/~jhauser/arithmetic/SoftFloat.html'.
20
THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
21
has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
22
TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
23
PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
24
AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
26
Derivative works are acceptable, even for commercial purposes, so long as
27
(1) they include prominent notice that the work is derivative, and (2) they
28
include prominent notice akin to these four paragraphs for those parts of
29
this code that are retained.
31
===============================================================================
33
The float80 and float128 part is translated from the softfloat package
34
by Florian Klaempfl and contained the following copyright notice
36
The code might contain some duplicate stuff because the floatx80/float128 port was
37
done based on the 64 bit enabled softfloat code.
39
===============================================================================
41
This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
44
Written by John R. Hauser. This work was made possible in part by the
45
International Computer Science Institute, located at Suite 600, 1947 Center
46
Street, Berkeley, California 94704. Funding was partially provided by the
47
National Science Foundation under grant MIP-9311980. The original version
48
of this code was written as part of a project to build a fixed-point vector
49
processor in collaboration with the University of California at Berkeley,
50
overseen by Profs. Nelson Morgan and John Wawrzynek. More information
51
is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
52
arithmetic/SoftFloat.html'.
54
THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
55
been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
56
RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
57
AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
58
COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
59
EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
60
INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
61
OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
63
Derivative works are acceptable, even for commercial purposes, so long as
64
(1) the source code for the derivative work includes prominent notice that
65
the work is derivative, and (2) the source code includes prominent notice with
66
these four paragraphs for those parts of this code that are retained.
69
===============================================================================
72
{ $define FPC_SOFTFLOAT_FLOATX80}
73
{ $define FPC_SOFTFLOAT_FLOAT128}
75
{ the softfpu unit can be also embedded directly into the system unit }
77
{$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
82
{ Overflow checking must be disabled,
83
since some operations expect overflow!
89
{$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
91
{$if not(defined(fpc_softfpu_implementation))}
93
-------------------------------------------------------------------------------
94
Software IEC/IEEE floating-point types.
95
-------------------------------------------------------------------------------
99
{ we use here a record in the function header because
100
the record allows bitwise conversion to single }
122
{ now part of the system unit
128
{$ifdef ENDIAN_LITTLE}
129
float64 = packed record
135
int64rec = packed record
140
floatx80 = packed record
145
float128 = packed record
150
float64 = packed record
154
int64rec = packed record
158
floatx80 = packed record
163
float128 = packed record
170
-------------------------------------------------------------------------------
171
Returns 1 if the double-precision floating-point value `a' is less than
172
the corresponding value `b', and 0 otherwise. The comparison is performed
173
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
174
-------------------------------------------------------------------------------
176
Function float64_lt(a: float64;b: float64): flag; compilerproc;
178
-------------------------------------------------------------------------------
179
Returns 1 if the double-precision floating-point value `a' is less than
180
or equal to the corresponding value `b', and 0 otherwise. The comparison
181
is performed according to the IEC/IEEE Standard for Binary Floating-Point
183
-------------------------------------------------------------------------------
185
Function float64_le(a: float64;b: float64): flag; compilerproc;
187
-------------------------------------------------------------------------------
188
Returns 1 if the double-precision floating-point value `a' is equal to
189
the corresponding value `b', and 0 otherwise. The comparison is performed
190
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
191
-------------------------------------------------------------------------------
193
Function float64_eq(a: float64;b: float64): flag; compilerproc;
195
-------------------------------------------------------------------------------
196
Returns the square root of the double-precision floating-point value `a'.
197
The operation is performed according to the IEC/IEEE Standard for Binary
198
Floating-Point Arithmetic.
199
-------------------------------------------------------------------------------
201
Procedure float64_sqrt( a: float64; var out: float64 ); compilerproc;
203
-------------------------------------------------------------------------------
204
Returns the remainder of the double-precision floating-point value `a'
205
with respect to the corresponding value `b'. The operation is performed
206
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
207
-------------------------------------------------------------------------------
209
Function float64_rem(a: float64; b : float64) : float64; compilerproc;
211
-------------------------------------------------------------------------------
212
Returns the result of dividing the double-precision floating-point value `a'
213
by the corresponding value `b'. The operation is performed according to the
214
IEC/IEEE Standard for Binary Floating-Point Arithmetic.
215
-------------------------------------------------------------------------------
217
Function float64_div(a: float64; b : float64) : float64; compilerproc;
219
-------------------------------------------------------------------------------
220
Returns the result of multiplying the double-precision floating-point values
221
`a' and `b'. The operation is performed according to the IEC/IEEE Standard
222
for Binary Floating-Point Arithmetic.
223
-------------------------------------------------------------------------------
225
Function float64_mul( a: float64; b:float64) : float64; compilerproc;
227
-------------------------------------------------------------------------------
228
Returns the result of subtracting the double-precision floating-point values
229
`a' and `b'. The operation is performed according to the IEC/IEEE Standard
230
for Binary Floating-Point Arithmetic.
231
-------------------------------------------------------------------------------
233
Function float64_sub(a: float64; b : float64) : float64; compilerproc;
235
-------------------------------------------------------------------------------
236
Returns the result of adding the double-precision floating-point values `a'
237
and `b'. The operation is performed according to the IEC/IEEE Standard for
238
Binary Floating-Point Arithmetic.
239
-------------------------------------------------------------------------------
241
Function float64_add( a: float64; b : float64) : float64; compilerproc;
243
-------------------------------------------------------------------------------
244
Rounds the double-precision floating-point value `a' to an integer,
245
and returns the result as a double-precision floating-point value. The
246
operation is performed according to the IEC/IEEE Standard for Binary
247
Floating-Point Arithmetic.
248
-------------------------------------------------------------------------------
250
Function float64_round_to_int(a: float64) : float64; compilerproc;
252
-------------------------------------------------------------------------------
253
Returns the result of converting the double-precision floating-point value
254
`a' to the single-precision floating-point format. The conversion is
255
performed according to the IEC/IEEE Standard for Binary Floating-Point
257
-------------------------------------------------------------------------------
259
Function float64_to_float32(a: float64) : float32rec; compilerproc;
261
-------------------------------------------------------------------------------
262
Returns the result of converting the double-precision floating-point value
263
`a' to the 32-bit two's complement integer format. The conversion is
264
performed according to the IEC/IEEE Standard for Binary Floating-Point
265
Arithmetic, except that the conversion is always rounded toward zero.
266
If `a' is a NaN, the largest positive integer is returned. Otherwise, if
267
the conversion overflows, the largest integer with the same sign as `a' is
269
-------------------------------------------------------------------------------
271
Function float64_to_int32_round_to_zero(a: float64 ): int32; compilerproc;
273
-------------------------------------------------------------------------------
274
Returns the result of converting the double-precision floating-point value
275
`a' to the 32-bit two's complement integer format. The conversion is
276
performed according to the IEC/IEEE Standard for Binary Floating-Point
277
Arithmetic---which means in particular that the conversion is rounded
278
according to the current rounding mode. If `a' is a NaN, the largest
279
positive integer is returned. Otherwise, if the conversion overflows, the
280
largest integer with the same sign as `a' is returned.
281
-------------------------------------------------------------------------------
283
Function float64_to_int32(a: float64): int32; compilerproc;
285
-------------------------------------------------------------------------------
286
Returns 1 if the single-precision floating-point value `a' is less than
287
the corresponding value `b', and 0 otherwise. The comparison is performed
288
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
289
-------------------------------------------------------------------------------
291
Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
293
-------------------------------------------------------------------------------
294
Returns 1 if the single-precision floating-point value `a' is less than
295
or equal to the corresponding value `b', and 0 otherwise. The comparison
296
is performed according to the IEC/IEEE Standard for Binary Floating-Point
298
-------------------------------------------------------------------------------
300
Function float32_le( a: float32rec; b : float32rec ):flag; compilerproc;
302
-------------------------------------------------------------------------------
303
Returns 1 if the single-precision floating-point value `a' is equal to
304
the corresponding value `b', and 0 otherwise. The comparison is performed
305
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
306
-------------------------------------------------------------------------------
308
Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
310
-------------------------------------------------------------------------------
311
Returns the square root of the single-precision floating-point value `a'.
312
The operation is performed according to the IEC/IEEE Standard for Binary
313
Floating-Point Arithmetic.
314
-------------------------------------------------------------------------------
316
Function float32_sqrt(a: float32rec ): float32rec; compilerproc;
318
-------------------------------------------------------------------------------
319
Returns the remainder of the single-precision floating-point value `a'
320
with respect to the corresponding value `b'. The operation is performed
321
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
322
-------------------------------------------------------------------------------
324
Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
326
-------------------------------------------------------------------------------
327
Returns the result of dividing the single-precision floating-point value `a'
328
by the corresponding value `b'. The operation is performed according to the
329
IEC/IEEE Standard for Binary Floating-Point Arithmetic.
330
-------------------------------------------------------------------------------
332
Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
334
-------------------------------------------------------------------------------
335
Returns the result of multiplying the single-precision floating-point values
336
`a' and `b'. The operation is performed according to the IEC/IEEE Standard
337
for Binary Floating-Point Arithmetic.
338
-------------------------------------------------------------------------------
340
Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
342
-------------------------------------------------------------------------------
343
Returns the result of subtracting the single-precision floating-point values
344
`a' and `b'. The operation is performed according to the IEC/IEEE Standard
345
for Binary Floating-Point Arithmetic.
346
-------------------------------------------------------------------------------
348
Function float32_sub( a: float32rec ; b:float32rec ): float32rec; compilerproc;
350
-------------------------------------------------------------------------------
351
Returns the result of adding the single-precision floating-point values `a'
352
and `b'. The operation is performed according to the IEC/IEEE Standard for
353
Binary Floating-Point Arithmetic.
354
-------------------------------------------------------------------------------
356
Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
358
-------------------------------------------------------------------------------
359
Rounds the single-precision floating-point value `a' to an integer,
360
and returns the result as a single-precision floating-point value. The
361
operation is performed according to the IEC/IEEE Standard for Binary
362
Floating-Point Arithmetic.
363
-------------------------------------------------------------------------------
365
Function float32_round_to_int( a: float32rec): float32rec; compilerproc;
367
-------------------------------------------------------------------------------
368
Returns the result of converting the single-precision floating-point value
369
`a' to the double-precision floating-point format. The conversion is
370
performed according to the IEC/IEEE Standard for Binary Floating-Point
372
-------------------------------------------------------------------------------
374
Function float32_to_float64( a : float32rec) : Float64; compilerproc;
376
-------------------------------------------------------------------------------
377
Returns the result of converting the single-precision floating-point value
378
`a' to the 32-bit two's complement integer format. The conversion is
379
performed according to the IEC/IEEE Standard for Binary Floating-Point
380
Arithmetic, except that the conversion is always rounded toward zero.
381
If `a' is a NaN, the largest positive integer is returned. Otherwise, if
382
the conversion overflows, the largest integer with the same sign as `a' is
384
-------------------------------------------------------------------------------
386
Function float32_to_int32_round_to_zero( a: Float32rec ): int32; compilerproc;
388
-------------------------------------------------------------------------------
389
Returns the result of converting the single-precision floating-point value
390
`a' to the 32-bit two's complement integer format. The conversion is
391
performed according to the IEC/IEEE Standard for Binary Floating-Point
392
Arithmetic---which means in particular that the conversion is rounded
393
according to the current rounding mode. If `a' is a NaN, the largest
394
positive integer is returned. Otherwise, if the conversion overflows, the
395
largest integer with the same sign as `a' is returned.
396
-------------------------------------------------------------------------------
398
Function float32_to_int32( a : float32rec) : int32; compilerproc;
400
-------------------------------------------------------------------------------
401
Returns the result of converting the 32-bit two's complement integer `a' to
402
the double-precision floating-point format. The conversion is performed
403
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
404
-------------------------------------------------------------------------------
406
Function int32_to_float64( a: int32) : float64; compilerproc;
408
-------------------------------------------------------------------------------
409
Returns the result of converting the 32-bit two's complement integer `a' to
410
the single-precision floating-point format. The conversion is performed
411
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
412
-------------------------------------------------------------------------------
414
Function int32_to_float32( a: int32): float32rec; compilerproc;
416
{*----------------------------------------------------------------------------
417
| Returns the result of converting the 64-bit two's complement integer `a'
418
| to the double-precision floating-point format. The conversion is performed
419
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
420
*----------------------------------------------------------------------------*}
421
Function int64_to_float64( a: int64 ): float64; compilerproc;
423
{*----------------------------------------------------------------------------
424
| Returns the result of converting the 64-bit two's complement integer `a'
425
| to the single-precision floating-point format. The conversion is performed
426
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
427
*----------------------------------------------------------------------------*}
428
Function int64_to_float32( a: int64 ): float32rec; compilerproc;
432
{-------------------------------------------------------------------------------
433
Software IEC/IEEE floating-point underflow tininess-detection mode.
434
-------------------------------------------------------------------------------
436
float_tininess_after_rounding = 0;
437
float_tininess_before_rounding = 1;
440
-------------------------------------------------------------------------------
441
Software IEC/IEEE floating-point rounding mode.
442
-------------------------------------------------------------------------------
446
This is the default mode. It should be used unless there is a specific
447
need for one of the others. In this mode results are rounded to the
448
nearest representable value. If the result is midway between two
449
representable values, the even representable is chosen. Even here
450
means the lowest-order bit is zero. This rounding mode prevents
451
statistical bias and guarantees numeric stability: round-off errors
452
in a lengthy calculation will remain smaller than half of FLT_EPSILON.
454
Round toward plus Infinity.
455
All results are rounded to the smallest representable value which is
456
greater than the result.
458
Round toward minus Infinity.
459
All results are rounded to the largest representable value which is
460
less than the result.
463
All results are rounded to the largest representable value whose
464
magnitude is less than that of the result. In other words, if the
465
result is negative it is rounded up; if it is positive, it is
468
float_round_nearest_even = 0;
469
float_round_down = 1;
471
float_round_to_zero = 3;
474
-------------------------------------------------------------------------------
475
Floating-point rounding mode and exception flags.
476
-------------------------------------------------------------------------------
479
float_rounding_mode : Byte = float_round_nearest_even;
482
-------------------------------------------------------------------------------
483
Underflow tininess-detection mode, statically initialized to default value.
484
(The declaration in `softfloat.h' must match the `int8' type here.)
485
-------------------------------------------------------------------------------
488
const float_detect_tininess: int8 = float_tininess_after_rounding;
490
{$endif not(defined(fpc_softfpu_implementation))}
492
{$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
494
{$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
497
{$if not(defined(fpc_softfpu_interface))}
498
(*****************************************************************************)
499
(*----------------------------------------------------------------------------*)
500
(* Primitive arithmetic functions, including multi-word arithmetic, and *)
501
(* division and square root approximations. (Can be specialized to target if *)
503
(* ---------------------------------------------------------------------------*)
504
(*****************************************************************************)
507
{*----------------------------------------------------------------------------
508
| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
509
| and 7, and returns the properly rounded 32-bit integer corresponding to the
510
| input. If `zSign' is 1, the input is negated before being converted to an
511
| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
512
| is simply rounded to an integer, with the inexact exception raised if the
513
| input cannot be represented exactly as an integer. However, if the fixed-
514
| point input is too large, the invalid exception is raised and the largest
515
| positive or negative integer is returned.
516
*----------------------------------------------------------------------------*}
518
function roundAndPackInt32( zSign: flag; absZ : bits64): int32;
521
roundNearestEven: flag;
522
roundIncrement, roundBits: int8;
525
roundingMode := float_rounding_mode;
526
roundNearestEven := ord( roundingMode = float_round_nearest_even );
527
roundIncrement := $40;
528
if ( roundNearestEven=0 ) then
530
if ( roundingMode = float_round_to_zero ) then
535
roundIncrement := $7F;
538
if ( roundingMode = float_round_up ) then
542
if ( roundingMode = float_round_down ) then
547
roundBits := absZ and $7F;
548
absZ := ( absZ + roundIncrement ) shr 7;
549
absZ := absZ and not( ord( ( roundBits xor $40 ) = 0 ) and roundNearestEven );
553
if ( ( absZ shr 32 ) or ( z and ( ord( z < 0 ) xor zSign ) ) )<>0 then
555
float_raise( float_flag_invalid );
557
result:=sbits32($80000000)
562
if ( roundBits<>0 ) then
563
softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
567
{*----------------------------------------------------------------------------
568
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
569
| `absZ1', with binary point between bits 63 and 64 (between the input words),
570
| and returns the properly rounded 64-bit integer corresponding to the input.
571
| If `zSign' is 1, the input is negated before being converted to an integer.
572
| Ordinarily, the fixed-point input is simply rounded to an integer, with
573
| the inexact exception raised if the input cannot be represented exactly as
574
| an integer. However, if the fixed-point input is too large, the invalid
575
| exception is raised and the largest positive or negative integer is
577
*----------------------------------------------------------------------------*}
579
function roundAndPackInt64( zSign: flag; absZ0: bits64; absZ1 : bits64): int64;
582
roundNearestEven, increment: flag;
587
roundingMode := float_rounding_mode;
588
roundNearestEven := ord( roundingMode = float_round_nearest_even );
589
increment := ord( sbits64(absZ1) < 0 );
590
if ( roundNearestEven=0 ) then
592
if ( roundingMode = float_round_to_zero ) then
599
increment := ord(( roundingMode = float_round_down ) and (absZ1<>0));
602
increment := ord(( roundingMode = float_round_up ) and (absZ1<>0));
606
if ( increment<>0 ) then
609
if ( absZ0 = 0 ) then
611
absZ0 := absZ0 and not( ord( bits64( absZ1 shl 1 ) = 0 ) and roundNearestEven );
616
if ( (z<>0) and (( ord( z < 0 ) xor zSign )<>0) ) then
619
float_raise( float_flag_invalid );
621
result:=int64($8000000000000000)
623
result:=int64($7FFFFFFFFFFFFFFF);
626
softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
631
-------------------------------------------------------------------------------
632
Shifts `a' right by the number of bits given in `count'. If any nonzero
633
bits are shifted off, they are ``jammed'' into the least significant bit of
634
the result by setting the least significant bit to 1. The value of `count'
635
can be arbitrarily large; in particular, if `count' is greater than 32, the
636
result will be either 0 or 1, depending on whether `a' is zero or nonzero.
637
The result is stored in the location pointed to by `zPtr'.
638
-------------------------------------------------------------------------------
640
Procedure shift32RightJamming( a: bits32 ; count: int16 ; VAR zPtr :bits32);
644
if ( count = 0 ) then
647
if ( count < 32 ) then
649
z := ( a shr count ) or bits32( (( a shl ( ( - count ) AND 31 )) ) <> 0);
653
z := bits32( a <> 0 );
658
{*----------------------------------------------------------------------------
659
| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
660
| number of bits given in `count'. Any bits shifted off are lost. The value
661
| of `count' can be arbitrarily large; in particular, if `count' is greater
662
| than 128, the result will be 0. The result is broken into two 64-bit pieces
663
| which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
664
*----------------------------------------------------------------------------*}
666
procedure shift128Right(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; z1Ptr : bits64);
671
negCount := ( - count ) and 63;
673
if ( count = 0 ) then
678
else if ( count < 64 ) then
680
z1 := ( a0 shl negCount ) or ( a1 shr count );
685
if ( count shl 64 )<>0 then
686
z1 := a0 shr ( count and 63 )
697
-------------------------------------------------------------------------------
698
Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
699
number of bits given in `count'. Any bits shifted off are lost. The value
700
of `count' can be arbitrarily large; in particular, if `count' is greater
701
than 64, the result will be 0. The result is broken into two 32-bit pieces
702
which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
703
-------------------------------------------------------------------------------
707
a0 :bits32; a1: bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32);
712
negCount := ( - count ) AND 31;
714
if ( count = 0 ) then
719
else if ( count < 32 ) then
721
z1 := ( a0 shl negCount ) OR ( a1 shr count );
727
z1 := ( a0 shr ( count AND 31 ) )
737
-------------------------------------------------------------------------------
738
Shifts the 64-bit value formed by concatenating `a0' and `a1' right by the
739
number of bits given in `count'. If any nonzero bits are shifted off, they
740
are ``jammed'' into the least significant bit of the result by setting the
741
least significant bit to 1. The value of `count' can be arbitrarily large;
742
in particular, if `count' is greater than 64, the result will be either 0
743
or 1, depending on whether the concatenation of `a0' and `a1' is zero or
744
nonzero. The result is broken into two 32-bit pieces which are stored at
745
the locations pointed to by `z0Ptr' and `z1Ptr'.
746
-------------------------------------------------------------------------------
750
a0:bits32; a1: bits32; count:int16; VAR Z0Ptr :bits32;VAR z1Ptr: bits32 );
755
negCount := ( - count ) AND 31;
757
if ( count = 0 ) then
763
if ( count < 32 ) then
765
z1 := ( a0 shl negCount ) OR ( a1 shr count ) OR bits32( ( a1 shl negCount ) <> 0 );
770
if ( count = 32 ) then
772
z1 := a0 OR bits32( a1 <> 0 );
775
if ( count < 64 ) Then
777
z1 := ( a0 shr ( count AND 31 ) ) OR bits32( ( ( a0 shl negCount ) OR a1 ) <> 0 );
781
z1 := bits32( ( a0 OR a1 ) <> 0 );
790
{*----------------------------------------------------------------------------
791
| Shifts `a' right by the number of bits given in `count'. If any nonzero
792
| bits are shifted off, they are ``jammed'' into the least significant bit of
793
| the result by setting the least significant bit to 1. The value of `count'
794
| can be arbitrarily large; in particular, if `count' is greater than 64, the
795
| result will be either 0 or 1, depending on whether `a' is zero or nonzero.
796
| The result is stored in the location pointed to by `zPtr'.
797
*----------------------------------------------------------------------------*}
799
procedure shift64RightJamming(a: bits64; count: int16; var zPtr : bits64);
803
if ( count = 0 ) then
807
else if ( count < 64 ) then
809
z := ( a shr count ) or ord( ( a shl ( ( - count ) and 63 ) ) <> 0 );
821
-------------------------------------------------------------------------------
822
Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' right
823
by 32 _plus_ the number of bits given in `count'. The shifted result is
824
at most 64 nonzero bits; these are broken into two 32-bit pieces which are
825
stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
826
off form a third 32-bit result as follows: The _last_ bit shifted off is
827
the most-significant bit of the extra result, and the other 31 bits of the
828
extra result are all zero if and only if _all_but_the_last_ bits shifted off
829
were all zero. This extra result is stored in the location pointed to by
830
`z2Ptr'. The value of `count' can be arbitrarily large.
831
(This routine makes more sense if `a0', `a1', and `a2' are considered
832
to form a fixed-point value with binary point between `a1' and `a2'. This
833
fixed-point value is shifted right by the number of bits given in `count',
834
and the integer part of the result is returned at the locations pointed to
835
by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
836
corrupted as described above, and is returned at the location pointed to by
838
-------------------------------------------------------------------------------
841
shift64ExtraRightJamming(
854
negCount := ( - count ) AND 31;
856
if ( count = 0 ) then
864
if ( count < 32 ) Then
866
z2 := a1 shl negCount;
867
z1 := ( a0 shl negCount ) OR ( a1 shr count );
872
if ( count = 32 ) then
880
if ( count < 64 ) then
882
z2 := a0 shl negCount;
883
z1 := a0 shr ( count AND 31 );
890
z2 := bits32(a0 <> 0);
896
z2 := z2 or bits32( a2 <> 0 );
904
-------------------------------------------------------------------------------
905
Shifts the 64-bit value formed by concatenating `a0' and `a1' left by the
906
number of bits given in `count'. Any bits shifted off are lost. The value
907
of `count' must be less than 32. The result is broken into two 32-bit
908
pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
909
-------------------------------------------------------------------------------
913
a0:bits32; a1:bits32; count:int16; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
916
z1Ptr := a1 shl count;
920
z0Ptr := ( a0 shl count ) OR ( a1 shr ( ( - count ) AND 31 ) );
924
-------------------------------------------------------------------------------
925
Shifts the 96-bit value formed by concatenating `a0', `a1', and `a2' left
926
by the number of bits given in `count'. Any bits shifted off are lost.
927
The value of `count' must be less than 32. The result is broken into three
928
32-bit pieces which are stored at the locations pointed to by `z0Ptr',
929
`z1Ptr', and `z2Ptr'.
930
-------------------------------------------------------------------------------
949
if ( 0 < count ) then
951
negCount := ( ( - count ) AND 31 );
952
z1 := z1 or (a2 shr negCount);
953
z0 := z0 or (a1 shr negCount);
960
{*----------------------------------------------------------------------------
961
| Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
962
| number of bits given in `count'. Any bits shifted off are lost. The value
963
| of `count' must be less than 64. The result is broken into two 64-bit
964
| pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
965
*----------------------------------------------------------------------------*}
967
procedure shortShift128Left(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; z1Ptr : bits64);inline;
969
z1Ptr := a1 shl count;
973
z0Ptr:=( a0 shl count ) or ( a1 shr ( ( - count ) and 63 ) );
977
-------------------------------------------------------------------------------
978
Adds the 64-bit value formed by concatenating `a0' and `a1' to the 64-bit
979
value formed by concatenating `b0' and `b1'. Addition is modulo 2^64, so
980
any carry out is lost. The result is broken into two 32-bit pieces which
981
are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
982
-------------------------------------------------------------------------------
986
a0:bits32; a1:bits32; b0:bits32; b1:bits32; VAR z0Ptr:bits32; VAR z1Ptr:bits32 );
992
z0Ptr := a0 + b0 + bits32( z1 < a1 );
996
-------------------------------------------------------------------------------
997
Adds the 96-bit value formed by concatenating `a0', `a1', and `a2' to the
998
96-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
999
modulo 2^96, so any carry out is lost. The result is broken into three
1000
32-bit pieces which are stored at the locations pointed to by `z0Ptr',
1001
`z1Ptr', and `z2Ptr'.
1002
-------------------------------------------------------------------------------
1018
carry0, carry1: int8;
1021
carry1 := int8( z2 < a2 );
1023
carry0 := int8( z1 < a1 );
1026
z0 := z0 + bits32( z1 < carry1 );
1034
-------------------------------------------------------------------------------
1035
Subtracts the 64-bit value formed by concatenating `b0' and `b1' from the
1036
64-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
1037
2^64, so any borrow out (carry out) is lost. The result is broken into two
1038
32-bit pieces which are stored at the locations pointed to by `z0Ptr' and
1040
-------------------------------------------------------------------------------
1044
a0: bits32; a1 : bits32; b0 :bits32; b1: bits32; VAR z0Ptr:bits32; VAR z1Ptr: bits32 );
1047
z0Ptr := a0 - b0 - bits32( a1 < b1 );
1051
-------------------------------------------------------------------------------
1052
Subtracts the 96-bit value formed by concatenating `b0', `b1', and `b2' from
1053
the 96-bit value formed by concatenating `a0', `a1', and `a2'. Subtraction
1054
is modulo 2^96, so any borrow out (carry out) is lost. The result is broken
1055
into three 32-bit pieces which are stored at the locations pointed to by
1056
`z0Ptr', `z1Ptr', and `z2Ptr'.
1057
-------------------------------------------------------------------------------
1073
borrow0, borrow1: int8;
1076
borrow1 := int8( a2 < b2 );
1078
borrow0 := int8( a1 < b1 );
1080
z0 := z0 - bits32( z1 < borrow1 );
1089
-------------------------------------------------------------------------------
1090
Multiplies `a' by `b' to obtain a 64-bit product. The product is broken
1091
into two 32-bit pieces which are stored at the locations pointed to by
1092
`z0Ptr' and `z1Ptr'.
1093
-------------------------------------------------------------------------------
1095
Procedure mul32To64( a:bits32; b:bits32; VAR z0Ptr: bits32; VAR z1Ptr
1098
aHigh, aLow, bHigh, bLow: bits16;
1099
z0, zMiddleA, zMiddleB, z1: bits32;
1101
aLow := a and $ffff;
1103
bLow := b and $ffff;
1105
z1 := ( bits32( aLow) ) * bLow;
1106
zMiddleA := ( bits32 (aLow) ) * bHigh;
1107
zMiddleB := ( bits32 (aHigh) ) * bLow;
1108
z0 := ( bits32 (aHigh) ) * bHigh;
1109
zMiddleA := zMiddleA + zMiddleB;
1110
z0 := z0 + ( ( bits32 ( zMiddleA < zMiddleB ) ) shl 16 ) + ( zMiddleA shr 16 );
1111
zMiddleA := zmiddleA shl 16;
1112
z1 := z1 + zMiddleA;
1113
z0 := z0 + bits32( z1 < zMiddleA );
1119
-------------------------------------------------------------------------------
1120
Multiplies the 64-bit value formed by concatenating `a0' and `a1' by `b'
1121
to obtain a 96-bit product. The product is broken into three 32-bit pieces
1122
which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
1124
-------------------------------------------------------------------------------
1136
z0, z1, z2, more1: bits32;
1138
mul32To64( a1, b, z1, z2 );
1139
mul32To64( a0, b, z0, more1 );
1140
add64( z0, more1, 0, z1, z0, z1 );
1147
-------------------------------------------------------------------------------
1148
Multiplies the 64-bit value formed by concatenating `a0' and `a1' to the
1149
64-bit value formed by concatenating `b0' and `b1' to obtain a 128-bit
1150
product. The product is broken into four 32-bit pieces which are stored at
1151
the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
1152
-------------------------------------------------------------------------------
1166
z0, z1, z2, z3: bits32;
1167
more1, more2: bits32;
1170
mul32To64( a1, b1, z2, z3 );
1171
mul32To64( a1, b0, z1, more2 );
1172
add64( z1, more2, 0, z2, z1, z2 );
1173
mul32To64( a0, b0, z0, more1 );
1174
add64( z0, more1, 0, z1, z0, z1 );
1175
mul32To64( a0, b1, more1, more2 );
1176
add64( more1, more2, 0, z2, more1, z2 );
1177
add64( z0, z1, 0, more1, z0, z1 );
1186
-------------------------------------------------------------------------------
1187
Returns an approximation to the 32-bit integer quotient obtained by dividing
1188
`b' into the 64-bit value formed by concatenating `a0' and `a1'. The
1189
divisor `b' must be at least 2^31. If q is the exact quotient truncated
1190
toward zero, the approximation returned lies between q and q + 2 inclusive.
1191
If the exact quotient q is larger than 32 bits, the maximum positive 32-bit
1192
unsigned integer is returned.
1193
-------------------------------------------------------------------------------
1195
Function estimateDiv64To32( a0:bits32; a1: bits32; b:bits32): bits32;
1198
rem0, rem1, term0, term1: bits32;
1203
estimateDiv64To32 := $FFFFFFFF;
1207
if ( b0 shl 16 <= a0 ) then
1210
z:= ( a0 div b0 ) shl 16;
1211
mul32To64( b, z, term0, term1 );
1212
sub64( a0, a1, term0, term1, rem0, rem1 );
1213
while ( ( sbits32 (rem0) ) < 0 ) do
1217
add64( rem0, rem1, b0, b1, rem0, rem1 );
1219
rem0 := ( rem0 shl 16 ) OR ( rem1 shr 16 );
1220
if ( b0 shl 16 <= rem0 ) then
1223
z := z or (rem0 div b0);
1224
estimateDiv64To32 := z;
1229
-------------------------------------------------------------------------------
1230
Returns an approximation to the square root of the 32-bit significand given
1231
by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of
1232
`aExp' (the least significant bit) is 1, the integer returned approximates
1233
2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp'
1234
is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either
1235
case, the approximation returned lies strictly within +/-2 of the exact
1237
-------------------------------------------------------------------------------
1239
Function estimateSqrt32( aExp: int16; a: bits32 ): bits32;
1240
const sqrtOddAdjustments: array[0..15] of bits16 = (
1241
$0004, $0022, $005D, $00B1, $011D, $019F, $0236, $02E0,
1242
$039C, $0468, $0545, $0631, $072B, $0832, $0946, $0A67
1244
const sqrtEvenAdjustments: array[0..15] of bits16 = (
1245
$0A2D, $08AF, $075A, $0629, $051A, $0429, $0356, $029E,
1246
$0200, $0179, $0109, $00AF, $0068, $0034, $0012, $0002
1253
index := ( a shr 27 ) AND 15;
1254
if ( aExp AND 1 ) <> 0 then
1256
z := $4000 + ( a shr 17 ) - sqrtOddAdjustments[ index ];
1257
z := ( ( a div z ) shl 14 ) + ( z shl 15 );
1262
z := $8000 + ( a shr 17 ) - sqrtEvenAdjustments[ index ];
1264
if ( $20000 <= z ) then
1270
estimateSqrt32 := bits32 ( ( sbits32 (a )) shr 1 );
1274
estimateSqrt32 := ( ( estimateDiv64To32( a, 0, z ) ) shr 1 ) + ( z shr 1 );
1278
-------------------------------------------------------------------------------
1279
Returns the number of leading 0 bits before the most-significant 1 bit of
1280
`a'. If `a' is zero, 32 is returned.
1281
-------------------------------------------------------------------------------
1283
Function countLeadingZeros32( a:bits32 ): int8;
1285
const countLeadingZerosHigh:array[0..255] of int8 = (
1286
8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
1287
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
1288
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1289
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1290
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1291
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1292
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1293
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1294
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1295
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1296
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1297
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1298
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1299
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1300
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1301
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
1308
if ( a < $10000 ) then
1310
shiftCount := shiftcount + 16;
1313
if ( a < $1000000 ) then
1315
shiftCount := shiftcount + 8;
1318
shiftCount := shiftcount + countLeadingZerosHigh[ a shr 24 ];
1319
countLeadingZeros32:= shiftCount;
1322
{*----------------------------------------------------------------------------
1323
| Returns the number of leading 0 bits before the most-significant 1 bit of
1324
| `a'. If `a' is zero, 64 is returned.
1325
*----------------------------------------------------------------------------*}
1327
function countLeadingZeros64( a : bits64): int8;
1332
if ( a < (bits64(1) shl 32 )) then
1333
shiftCount := shiftcount + 32
1336
shiftCount := shiftCount + countLeadingZeros32( a );
1337
countLeadingZeros64:= shiftCount;
1343
-------------------------------------------------------------------------------
1344
Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is
1345
equal to the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
1347
-------------------------------------------------------------------------------
1349
Function eq64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
1351
eq64 := flag( a0 = b0 ) and flag( a1 = b1 );
1355
-------------------------------------------------------------------------------
1356
Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
1357
than or equal to the 64-bit value formed by concatenating `b0' and `b1'.
1358
Otherwise, returns 0.
1359
-------------------------------------------------------------------------------
1361
Function le64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
1364
le64:= flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 <= b1 ) );
1369
-------------------------------------------------------------------------------
1370
Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is less
1371
than the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
1373
-------------------------------------------------------------------------------
1375
Function lt64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
1377
lt64 := flag( a0 < b0 ) or flag( ( a0 = b0 ) and ( a1 < b1 ) );
1381
-------------------------------------------------------------------------------
1382
Returns 1 if the 64-bit value formed by concatenating `a0' and `a1' is not
1383
equal to the 64-bit value formed by concatenating `b0' and `b1'. Otherwise,
1385
-------------------------------------------------------------------------------
1387
Function ne64( a0: bits32; a1:bits32 ;b0:bits32; b1:bits32 ): flag;
1389
ne64:= flag( a0 <> b0 ) or flag( a1 <> b1 );
1392
(*****************************************************************************)
1393
(* End Low-Level arithmetic *)
1394
(*****************************************************************************)
1399
-------------------------------------------------------------------------------
1400
Functions and definitions to determine: (1) whether tininess for underflow
1401
is detected before or after rounding by default, (2) what (if anything)
1402
happens when exceptions are raised, (3) how signaling NaNs are distinguished
1403
from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs
1404
are propagated from function inputs to output. These details are ENDIAN
1406
-------------------------------------------------------------------------------
1408
{$IFDEF ENDIAN_LITTLE}
1410
-------------------------------------------------------------------------------
1411
Internal canonical NaN format.
1412
-------------------------------------------------------------------------------
1415
commonNaNT = packed record
1420
-------------------------------------------------------------------------------
1421
The pattern for a default generated single-precision NaN.
1422
-------------------------------------------------------------------------------
1424
const float32_default_nan = $FFC00000;
1427
-------------------------------------------------------------------------------
1428
Returns 1 if the single-precision floating-point value `a' is a NaN;
1429
otherwise returns 0.
1430
-------------------------------------------------------------------------------
1432
Function float32_is_nan( a : float32 ): flag;
1435
float32_is_nan:= flag( $FF000000 < bits32 ( a shl 1 ) );
1440
-------------------------------------------------------------------------------
1441
Returns 1 if the single-precision floating-point value `a' is a signaling
1442
NaN; otherwise returns 0.
1443
-------------------------------------------------------------------------------
1445
Function float32_is_signaling_nan( a : float32 ): flag;
1448
float32_is_signaling_nan := flag
1449
( ( ( a shr 22 ) and $1FF ) = $1FE ) and( a and $003FFFFF );
1454
-------------------------------------------------------------------------------
1455
Returns the result of converting the single-precision floating-point NaN
1456
`a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
1457
exception is raised.
1458
-------------------------------------------------------------------------------
1460
Procedure float32ToCommonNaN( a: float32; VAR c:commonNaNT );
1464
if ( float32_is_signaling_nan( a ) <> 0) then
1465
float_raise( float_flag_invalid );
1474
-------------------------------------------------------------------------------
1475
Returns the result of converting the canonical NaN `a' to the single-
1476
precision floating-point format.
1477
-------------------------------------------------------------------------------
1479
Function commonNaNToFloat32( a : commonNaNT ): float32;
1481
commonNaNToFloat32 := ( ( bits32 (a.sign) ) shl 31 ) or $7FC00000 or ( a.high shr 9 );
1485
-------------------------------------------------------------------------------
1486
Takes two single-precision floating-point values `a' and `b', one of which
1487
is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
1488
signaling NaN, the invalid exception is raised.
1489
-------------------------------------------------------------------------------
1491
Function propagateFloat32NaN( a : float32 ; b: float32 ): float32;
1493
aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
1494
label returnLargerSignificand;
1496
aIsNaN := float32_is_nan( a );
1497
aIsSignalingNaN := float32_is_signaling_nan( a );
1498
bIsNaN := float32_is_nan( b );
1499
bIsSignalingNaN := float32_is_signaling_nan( b );
1500
a := a or $00400000;
1501
b := b or $00400000;
1502
if ( aIsSignalingNaN or bIsSignalingNaN ) <> 0 then
1503
float_raise( float_flag_invalid );
1504
if ( aIsSignalingNaN )<> 0 then
1506
if ( bIsSignalingNaN ) <> 0 then
1507
goto returnLargerSignificand;
1509
propagateFloat32NaN := b
1511
propagateFloat32NaN := a;
1514
else if ( aIsNaN <> 0) then
1516
if ( bIsSignalingNaN or not bIsNaN )<> 0 then
1518
propagateFloat32NaN := a;
1521
returnLargerSignificand:
1522
if ( bits32 ( a shl 1 ) < bits32 ( b shl 1 ) ) then
1524
propagateFloat32NaN := b;
1527
if ( bits32 ( b shl 1 ) < bits32 ( a shl 1 ) ) then
1529
propagateFloat32NaN := a;
1532
propagateFloat32NaN := a
1534
propagateFloat32NaN := b;
1539
propagateFloat32NaN := b;
1546
-------------------------------------------------------------------------------
1547
The pattern for a default generated double-precision NaN. The `high' and
1548
`low' values hold the most- and least-significant bits, respectively.
1549
-------------------------------------------------------------------------------
1552
float64_default_nan_high = $FFF80000;
1553
float64_default_nan_low = $00000000;
1556
-------------------------------------------------------------------------------
1557
Returns 1 if the double-precision floating-point value `a' is a NaN;
1558
otherwise returns 0.
1559
-------------------------------------------------------------------------------
1561
Function float64_is_nan( a : float64 ) : flag;
1565
flag( $FFE00000 <= bits32 ( a.high shl 1 ) )
1566
and ( a.low or ( a.high and $000FFFFF ) );
1571
-------------------------------------------------------------------------------
1572
Returns 1 if the double-precision floating-point value `a' is a signaling
1573
NaN; otherwise returns 0.
1574
-------------------------------------------------------------------------------
1576
Function float64_is_signaling_nan( a : float64 ): flag;
1579
float64_is_signaling_nan :=
1580
flag( ( ( a.high shr 19 ) and $FFF ) = $FFE )
1581
and ( a.low or ( a.high and $0007FFFF ) );
1585
-------------------------------------------------------------------------------
1586
Returns the result of converting the double-precision floating-point NaN
1587
`a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
1588
exception is raised.
1589
-------------------------------------------------------------------------------
1591
Procedure float64ToCommonNaN( a : float64; VAR c:commonNaNT );
1595
if ( float64_is_signaling_nan( a )<>0 ) then
1596
float_raise( float_flag_invalid );
1597
z.sign := a.high shr 31;
1598
shortShift64Left( a.high, a.low, 12, z.high, z.low );
1604
-------------------------------------------------------------------------------
1605
Returns the result of converting the canonical NaN `a' to the double-
1606
precision floating-point format.
1607
-------------------------------------------------------------------------------
1609
Procedure commonNaNToFloat64( a : commonNaNT; VAR c: float64 );
1613
shift64Right( a.high, a.low, 12, z.high, z.low );
1614
z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
1619
-------------------------------------------------------------------------------
1620
Takes two double-precision floating-point values `a' and `b', one of which
1621
is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
1622
signaling NaN, the invalid exception is raised.
1623
-------------------------------------------------------------------------------
1625
Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
1627
aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
1628
label returnLargerSignificand;
1630
aIsNaN := float64_is_nan( a );
1631
aIsSignalingNaN := float64_is_signaling_nan( a );
1632
bIsNaN := float64_is_nan( b );
1633
bIsSignalingNaN := float64_is_signaling_nan( b );
1634
a.high := a.high or $00080000;
1635
b.high := b.high or $00080000;
1636
if ( aIsSignalingNaN or bIsSignalingNaN )<> 0 then
1637
float_raise( float_flag_invalid );
1638
if ( aIsSignalingNaN )<>0 then
1640
if ( bIsSignalingNaN )<>0 then
1641
goto returnLargerSignificand;
1648
else if ( aIsNaN )<> 0 then
1650
if ( bIsSignalingNaN or not bIsNaN ) <> 0 then
1655
returnLargerSignificand:
1656
if ( lt64( a.high shl 1, a.low, b.high shl 1, b.low ) ) <> 0 then
1661
if ( lt64( b.high shl 1, b.low, a.high shl 1, a.low ) ) <> 0 then
1666
if a.high < b.high then
1679
{*----------------------------------------------------------------------------
1680
| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
1681
| than the 128-bit value formed by concatenating `b0' and `b1'. Otherwise,
1683
*----------------------------------------------------------------------------*}
1685
function lt128(a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
1687
result := ord(( a0 < b0 ) or ( ( a0 = b0 ) and ( a1 < b1 ) ));
1690
{*----------------------------------------------------------------------------
1691
| Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
1692
| otherwise returns 0.
1693
*----------------------------------------------------------------------------*}
1695
function float128_is_nan( a : float128): flag;
1697
result:= ord(( int64( $FFFE000000000000 ) <= bits64( a.high shl 1 ) )
1698
and ( (a.low<>0) or (( a.high and int64( $0000FFFFFFFFFFFF ) )<>0 ) ));
1701
{*----------------------------------------------------------------------------
1702
| Returns 1 if the quadruple-precision floating-point value `a' is a
1703
| signaling NaN; otherwise returns 0.
1704
*----------------------------------------------------------------------------*}
1706
function float128_is_signaling_nan( a : float128): flag;
1708
result:=ord(( ( ( a.high shr 47 ) and $FFFF ) = $FFFE ) and
1709
( (a.low<>0) or (( a.high and int64( $00007FFFFFFFFFFF ) )<>0) ));
1712
{*----------------------------------------------------------------------------
1713
| Returns the result of converting the quadruple-precision floating-point NaN
1714
| `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
1715
| exception is raised.
1716
*----------------------------------------------------------------------------*}
1718
function float128ToCommonNaN( a : float128): commonNaNT;
1723
if ( float128_is_signaling_nan( a )<>0) then
1724
float_raise( float_flag_invalid );
1725
z.sign := a.high shr 63;
1726
shortShift128Left( a.high, a.low, 16, qhigh, qlow );
1727
z.high:=qhigh shr 32;
1728
z.low:=qhigh and $ffffffff;
1732
{*----------------------------------------------------------------------------
1733
| Returns the result of converting the canonical NaN `a' to the quadruple-
1734
| precision floating-point format.
1735
*----------------------------------------------------------------------------*}
1737
function commonNaNToFloat128( a : commonNaNT): float128;
1741
shift128Right( a.high, a.low, 16, z.high, z.low );
1742
z.high := z.high or ( ( bits64(a.sign) ) shl 63 ) or int64( $7FFF800000000000 );
1746
{*----------------------------------------------------------------------------
1747
| Takes two quadruple-precision floating-point values `a' and `b', one of
1748
| which is a NaN, and returns the appropriate NaN result. If either `a' or
1749
| `b' is a signaling NaN, the invalid exception is raised.
1750
*----------------------------------------------------------------------------*}
1752
function propagateFloat128NaN( a: float128; b : float128): float128;
1754
aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
1756
returnLargerSignificand;
1758
aIsNaN := float128_is_nan( a );
1759
aIsSignalingNaN := float128_is_signaling_nan( a );
1760
bIsNaN := float128_is_nan( b );
1761
bIsSignalingNaN := float128_is_signaling_nan( b );
1762
a.high := a.high or int64( $0000800000000000 );
1763
b.high := b.high or int64( $0000800000000000 );
1764
if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
1765
float_raise( float_flag_invalid );
1766
if ( aIsSignalingNaN )<>0 then
1768
if ( bIsSignalingNaN )<>0 then
1769
goto returnLargerSignificand;
1776
else if ( aIsNaN )<>0 then
1778
if ( bIsSignalingNaN or not( bIsNaN) )<>0 then
1783
returnLargerSignificand:
1784
if ( lt128( a.high shl 1, a.low, b.high shl 1, b.low ) )<>0 then
1789
if ( lt128( b.high shl 1, b.low, a.high shl 1, a.low ) )<>0 then
1794
if ( a.high < b.high ) then
1808
(*----------------------------------------------------------------------------
1809
| Internal canonical NaN format.
1810
*----------------------------------------------------------------------------*)
1812
commonNANT = packed record
1817
(*----------------------------------------------------------------------------
1818
| The pattern for a default generated single-precision NaN.
1819
*----------------------------------------------------------------------------*)
1820
const float32_default_nan = $7FFFFFFF;
1822
(*----------------------------------------------------------------------------
1823
| Returns 1 if the single-precision floating-point value `a' is a NaN;
1824
| otherwise returns 0.
1825
*----------------------------------------------------------------------------*)
1826
function float32_is_nan(a: float32): flag;
1828
float32_is_nan := flag( $FF000000 < bits32( a shl 1 ) );
1831
(*----------------------------------------------------------------------------
1832
| Returns 1 if the single-precision floating-point value `a' is a signaling
1833
| NaN; otherwise returns 0.
1834
*----------------------------------------------------------------------------*)
1835
function float32_is_signaling_nan(a: float32):flag;
1837
float32_is_signaling_nan := flag( ( ( a shr 22 ) and $1FF ) = $1FE ) and flag( boolean((a and $003FFFFF)<>0) );
1840
(*----------------------------------------------------------------------------
1841
| Returns the result of converting the single-precision floating-point NaN
1842
| `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
1843
| exception is raised.
1844
*----------------------------------------------------------------------------*)
1845
Procedure float32ToCommonNaN( a: float32; VAR c:commonNaNT );
1849
if float32_is_signaling_nan(a)<>0 then
1850
float_raise(float_flag_invalid);
1857
(*----------------------------------------------------------------------------
1858
| Returns the result of converting the canonical NaN `a' to the single-
1859
| precision floating-point format.
1860
*----------------------------------------------------------------------------*)
1861
function CommonNanToFloat32(a : CommonNaNT): float32;
1863
CommonNanToFloat32:= ( ( bits32( a.sign )) shl 31 ) OR $7FC00000 OR ( a.high shr 9 );
1866
(*----------------------------------------------------------------------------
1867
| Takes two single-precision floating-point values `a' and `b', one of which
1868
| is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
1869
| signaling NaN, the invalid exception is raised.
1870
*----------------------------------------------------------------------------*)
1871
function propagateFloat32NaN( a: float32 ; b: float32): float32;
1873
aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN: flag;
1875
aIsNaN := float32_is_nan( a );
1876
aIsSignalingNaN := float32_is_signaling_nan( a );
1877
bIsNaN := float32_is_nan( b );
1878
bIsSignalingNaN := float32_is_signaling_nan( b );
1879
a := a or $00400000;
1880
b := b or $00400000;
1881
if ( aIsSignalingNaN or bIsSignalingNaN )<>0 then
1882
float_raise( float_flag_invalid );
1883
if bIsSignalingNaN<>0 then
1884
propagateFloat32Nan := b
1885
else if aIsSignalingNan<>0 then
1886
propagateFloat32Nan := a
1887
else if bIsNan<>0 then
1888
propagateFloat32Nan := b
1890
propagateFloat32Nan := a;
1894
(*----------------------------------------------------------------------------
1895
| The pattern for a default generated double-precision NaN. The `high' and
1896
| `low' values hold the most- and least-significant bits, respectively.
1897
*----------------------------------------------------------------------------*)
1899
float64_default_nan_high = $7FFFFFFF;
1900
float64_default_nan_low = $FFFFFFFF;
1902
(*----------------------------------------------------------------------------
1903
| Returns 1 if the double-precision floating-point value `a' is a NaN;
1904
| otherwise returns 0.
1905
*----------------------------------------------------------------------------*)
1907
function float64_is_nan(a: float64): flag;
1909
float64_is_nan := flag (
1910
( $FFE00000 <= bits32 ( a.high shl 1 ) )
1911
and ( (a.low<>0) or (( a.high and $000FFFFF )<>0) ));
1914
(*----------------------------------------------------------------------------
1915
| Returns 1 if the double-precision floating-point value `a' is a signaling
1916
| NaN; otherwise returns 0.
1917
*----------------------------------------------------------------------------*)
1918
function float64_is_signaling_nan( a:float64): flag;
1920
float64_is_signaling_nan := flag(
1921
( ( ( a.high shr 19 ) and $FFF ) = $FFE )
1922
and ( (a.low<>0) or ( ( a.high and $0007FFFF )<>0) ));
1926
(*----------------------------------------------------------------------------
1927
| Returns the result of converting the double-precision floating-point NaN
1928
| `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
1929
| exception is raised.
1930
*----------------------------------------------------------------------------*)
1931
Procedure float64ToCommonNaN( a : float64; VAR c:commonNaNT );
1935
if ( float64_is_signaling_nan( a )<>0 ) then
1936
float_raise( float_flag_invalid );
1937
z.sign := a.high shr 31;
1938
shortShift64Left( a.high, a.low, 12, z.high, z.low );
1942
(*----------------------------------------------------------------------------
1943
| Returns the result of converting the canonical NaN `a' to the double-
1944
| precision floating-point format.
1945
*----------------------------------------------------------------------------*)
1946
Procedure commonNaNToFloat64( a : commonNaNT; VAR c: float64 );
1950
shift64Right( a.high, a.low, 12, z.high, z.low );
1951
z.high := z.high or ( ( bits32 (a.sign) ) shl 31 ) or $7FF80000;
1955
(*----------------------------------------------------------------------------
1956
| Takes two double-precision floating-point values `a' and `b', one of which
1957
| is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
1958
| signaling NaN, the invalid exception is raised.
1959
*----------------------------------------------------------------------------*)
1960
Procedure propagateFloat64NaN( a: float64; b: float64 ; VAR c: float64 );
1962
aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN : flag;
1964
aIsNaN := float64_is_nan( a );
1965
aIsSignalingNaN := float64_is_signaling_nan( a );
1966
bIsNaN := float64_is_nan( b );
1967
bIsSignalingNaN := float64_is_signaling_nan( b );
1968
a.high := a.high or $00080000;
1969
b.high := b.high or $00080000;
1970
if ( (aIsSignalingNaN<>0) or (bIsSignalingNaN<>0) ) then
1971
float_raise( float_flag_invalid );
1972
if bIsSignalingNaN<>0 then
1974
else if aIsSignalingNan<>0 then
1976
else if bIsNan<>0 then
1984
(****************************************************************************)
1985
(* END ENDIAN SPECIFIC CODE *)
1986
(****************************************************************************)
1990
-------------------------------------------------------------------------------
1991
Returns the fraction bits of the single-precision floating-point value `a'.
1992
-------------------------------------------------------------------------------
1994
Function ExtractFloat32Frac(a : Float32) : Bits32;
1996
ExtractFloat32Frac := A AND $007FFFFF;
2001
-------------------------------------------------------------------------------
2002
Returns the exponent bits of the single-precision floating-point value `a'.
2003
-------------------------------------------------------------------------------
2005
Function extractFloat32Exp( a: float32 ): Int16;
2007
extractFloat32Exp := (a shr 23) AND $FF;
2011
-------------------------------------------------------------------------------
2012
Returns the sign bit of the single-precision floating-point value `a'.
2013
-------------------------------------------------------------------------------
2015
Function extractFloat32Sign( a: float32 ): Flag;
2017
extractFloat32Sign := a shr 31;
2021
-------------------------------------------------------------------------------
2022
Normalizes the subnormal single-precision floating-point value represented
2023
by the denormalized significand `aSig'. The normalized exponent and
2024
significand are stored at the locations pointed to by `zExpPtr' and
2025
`zSigPtr', respectively.
2026
-------------------------------------------------------------------------------
2028
Procedure normalizeFloat32Subnormal( aSig : bits32; VAR zExpPtr: Int16; VAR zSigPtr :bits32);
2033
shiftCount := countLeadingZeros32( aSig ) - 8;
2034
zSigPtr := aSig shl shiftCount;
2035
zExpPtr := 1 - shiftCount;
2039
-------------------------------------------------------------------------------
2040
Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2041
single-precision floating-point value, returning the result. After being
2042
shifted into the proper positions, the three fields are simply added
2043
together to form the result. This means that any integer portion of `zSig'
2044
will be added into the exponent. Since a properly normalized significand
2045
will have an integer portion equal to 1, the `zExp' input should be 1 less
2046
than the desired result exponent whenever `zSig' is a complete, normalized
2048
-------------------------------------------------------------------------------
2050
Function packFloat32( zSign: Flag; zExp : Int16; zSig: Bits32 ): Float32;
2053
packFloat32 := ( ( bits32( zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 23 )
2058
-------------------------------------------------------------------------------
2059
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2060
and significand `zSig', and returns the proper single-precision floating-
2061
point value corresponding to the abstract input. Ordinarily, the abstract
2062
value is simply rounded and packed into the single-precision format, with
2063
the inexact exception raised if the abstract input cannot be represented
2064
exactly. However, if the abstract value is too large, the overflow and
2065
inexact exceptions are raised and an infinity or maximal finite value is
2066
returned. If the abstract value is too small, the input value is rounded to
2067
a subnormal number, and the underflow and inexact exceptions are raised if
2068
the abstract input cannot be represented exactly as a subnormal single-
2069
precision floating-point number.
2070
The input significand `zSig' has its binary point between bits 30
2071
and 29, which is 7 bits to the left of the usual location. This shifted
2072
significand must be normalized or smaller. If `zSig' is not normalized,
2073
`zExp' must be 0; in that case, the result returned is a subnormal number,
2074
and it must not require rounding. In the usual case that `zSig' is
2075
normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2076
The handling of underflow and overflow follows the IEC/IEEE Standard for
2077
Binary Floating-Point Arithmetic.
2078
-------------------------------------------------------------------------------
2080
Function roundAndPackFloat32( zSign : Flag; zExp : Int16; zSig : Bits32 ) : float32;
2082
roundingMode : BYTE;
2083
roundNearestEven : Flag;
2084
roundIncrement, roundBits : BYTE;
2087
roundingMode := float_rounding_mode;
2088
if (roundingMode = float_round_nearest_even) then
2090
roundNearestEven := Flag(TRUE);
2093
roundNearestEven := Flag(FALSE);
2094
roundIncrement := $40;
2095
if ( Boolean(roundNearestEven) = FALSE) then
2097
if ( roundingMode = float_round_to_zero ) Then
2099
roundIncrement := 0;
2103
roundIncrement := $7F;
2104
if ( zSign <> 0 ) then
2106
if roundingMode = float_round_up then roundIncrement := 0;
2110
if roundingMode = float_round_down then roundIncrement := 0;
2114
roundBits := zSig AND $7F;
2115
if ($FD <= bits16 (zExp) ) then
2117
if (( $FD < zExp ) OR ( zExp = $FD ) AND ( sbits32 ( zSig + roundIncrement ) < 0 ) ) then
2119
float_raise( float_flag_overflow OR float_flag_inexact );
2120
roundAndPackFloat32:=packFloat32( zSign, $FF, 0 ) - Flag( roundIncrement = 0 );
2123
if ( zExp < 0 ) then
2126
flag(( float_detect_tininess = float_tininess_before_rounding )
2128
OR ( (zSig + roundIncrement) < $80000000 ));
2129
shift32RightJamming( zSig, - zExp, zSig );
2131
roundBits := zSig AND $7F;
2132
if ( (isTiny = flag(TRUE)) and (roundBits<>0) ) then
2133
float_raise( float_flag_underflow );
2136
if ( roundBits )<> 0 then
2137
softfloat_exception_flags := float_flag_inexact OR softfloat_exception_flags;
2138
zSig := ( zSig + roundIncrement ) shr 7;
2139
zSig := zSig AND not bits32( bits32( ( roundBits XOR $40 ) = 0 ) and roundNearestEven );
2140
if ( zSig = 0 ) then zExp := 0;
2141
roundAndPackFloat32 := packFloat32( zSign, zExp, zSig );
2146
-------------------------------------------------------------------------------
2147
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2148
and significand `zSig', and returns the proper single-precision floating-
2149
point value corresponding to the abstract input. This routine is just like
2150
`roundAndPackFloat32' except that `zSig' does not have to be normalized.
2151
Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2152
floating-point exponent.
2153
-------------------------------------------------------------------------------
2155
Function normalizeRoundAndPackFloat32( zSign: flag; zExp: int16; zSig:bits32 ): float32;
2159
shiftCount := countLeadingZeros32( zSig ) - 1;
2160
normalizeRoundAndPackFloat32 := roundAndPackFloat32( zSign, zExp - shiftCount, zSig shl shiftCount );
2164
-------------------------------------------------------------------------------
2165
Returns the most-significant 20 fraction bits of the double-precision
2166
floating-point value `a'.
2167
-------------------------------------------------------------------------------
2169
Function extractFloat64Frac0(a: float64): bits32;
2171
extractFloat64Frac0 := a.high and $000FFFFF;
2175
-------------------------------------------------------------------------------
2176
Returns the least-significant 32 fraction bits of the double-precision
2177
floating-point value `a'.
2178
-------------------------------------------------------------------------------
2180
Function extractFloat64Frac1(a: float64): bits32;
2182
extractFloat64Frac1 := a.low;
2186
-------------------------------------------------------------------------------
2187
Returns the exponent bits of the double-precision floating-point value `a'.
2188
-------------------------------------------------------------------------------
2190
Function extractFloat64Exp(a: float64): int16;
2192
extractFloat64Exp:= ( a.high shr 20 ) AND $7FF;
2196
-------------------------------------------------------------------------------
2197
Returns the sign bit of the double-precision floating-point value `a'.
2198
-------------------------------------------------------------------------------
2200
Function extractFloat64Sign(a: float64) : flag;
2202
extractFloat64Sign := a.high shr 31;
2206
-------------------------------------------------------------------------------
2207
Normalizes the subnormal double-precision floating-point value represented
2208
by the denormalized significand formed by the concatenation of `aSig0' and
2209
`aSig1'. The normalized exponent is stored at the location pointed to by
2210
`zExpPtr'. The most significant 21 bits of the normalized significand are
2211
stored at the location pointed to by `zSig0Ptr', and the least significant
2212
32 bits of the normalized significand are stored at the location pointed to
2214
-------------------------------------------------------------------------------
2216
Procedure normalizeFloat64Subnormal(
2219
VAR zExpPtr : Int16;
2220
VAR zSig0Ptr : Bits32;
2221
VAR zSig1Ptr : Bits32
2226
if ( aSig0 = 0 ) then
2228
shiftCount := countLeadingZeros32( aSig1 ) - 11;
2229
if ( shiftCount < 0 ) then
2231
zSig0Ptr := aSig1 shr ( - shiftCount );
2232
zSig1Ptr := aSig1 shl ( shiftCount AND 31 );
2236
zSig0Ptr := aSig1 shl shiftCount;
2239
zExpPtr := - shiftCount - 31;
2243
shiftCount := countLeadingZeros32( aSig0 ) - 11;
2244
shortShift64Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
2245
zExpPtr := 1 - shiftCount;
2250
-------------------------------------------------------------------------------
2251
Packs the sign `zSign', the exponent `zExp', and the significand formed by
2252
the concatenation of `zSig0' and `zSig1' into a double-precision floating-
2253
point value, returning the result. After being shifted into the proper
2254
positions, the three fields `zSign', `zExp', and `zSig0' are simply added
2255
together to form the most significant 32 bits of the result. This means
2256
that any integer portion of `zSig0' will be added into the exponent. Since
2257
a properly normalized significand will have an integer portion equal to 1,
2258
the `zExp' input should be 1 less than the desired result exponent whenever
2259
`zSig0' and `zSig1' concatenated form a complete, normalized significand.
2260
-------------------------------------------------------------------------------
2263
packFloat64( zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1 : Bits32; VAR c : float64);
2269
z.high := ( ( bits32 (zSign) ) shl 31 ) + ( ( bits32 (zExp) ) shl 20 ) + zSig0;
2274
{*----------------------------------------------------------------------------
2275
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2276
| double-precision floating-point value, returning the result. After being
2277
| shifted into the proper positions, the three fields are simply added
2278
| together to form the result. This means that any integer portion of `zSig'
2279
| will be added into the exponent. Since a properly normalized significand
2280
| will have an integer portion equal to 1, the `zExp' input should be 1 less
2281
| than the desired result exponent whenever `zSig' is a complete, normalized
2283
*----------------------------------------------------------------------------*}
2285
function packFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;inline;
2287
result := float64(( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 52 ) + zSig);
2291
-------------------------------------------------------------------------------
2292
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2293
and extended significand formed by the concatenation of `zSig0', `zSig1',
2294
and `zSig2', and returns the proper double-precision floating-point value
2295
corresponding to the abstract input. Ordinarily, the abstract value is
2296
simply rounded and packed into the double-precision format, with the inexact
2297
exception raised if the abstract input cannot be represented exactly.
2298
However, if the abstract value is too large, the overflow and inexact
2299
exceptions are raised and an infinity or maximal finite value is returned.
2300
If the abstract value is too small, the input value is rounded to a
2301
subnormal number, and the underflow and inexact exceptions are raised if the
2302
abstract input cannot be represented exactly as a subnormal double-precision
2303
floating-point number.
2304
The input significand must be normalized or smaller. If the input
2305
significand is not normalized, `zExp' must be 0; in that case, the result
2306
returned is a subnormal number, and it must not require rounding. In the
2307
usual case that the input significand is normalized, `zExp' must be 1 less
2308
than the ``true'' floating-point exponent. The handling of underflow and
2309
overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2310
-------------------------------------------------------------------------------
2313
roundAndPackFloat64(
2314
zSign: Flag; zExp: Int16; zSig0: Bits32; zSig1: Bits32; zSig2: Bits32; Var c: Float64 );
2316
roundingMode : Int8;
2317
roundNearestEven, increment, isTiny : Flag;
2320
roundingMode := float_rounding_mode;
2321
roundNearestEven := flag( roundingMode = float_round_nearest_even );
2322
increment := flag( sbits32 (zSig2) < 0 );
2323
if ( roundNearestEven = flag(FALSE) ) then
2325
if ( roundingMode = float_round_to_zero ) then
2329
if ( zSign )<> 0 then
2331
increment := flag( roundingMode = float_round_down ) and zSig2;
2335
increment := flag( roundingMode = float_round_up ) and zSig2;
2339
if ( $7FD <= bits16 (zExp) ) then
2343
and (eq64( $001FFFFF, $FFFFFFFF, zSig0, zSig1 )<>0)
2348
float_raise( float_flag_overflow OR float_flag_inexact );
2349
if (( roundingMode = float_round_to_zero )
2350
or ( (zSign<>0) and ( roundingMode = float_round_up ) )
2351
or ( (zSign = 0) and ( roundingMode = float_round_down ) )
2354
packFloat64( zSign, $7FE, $000FFFFF, $FFFFFFFF, c );
2357
packFloat64( zSign, $7FF, 0, 0, c );
2360
if ( zExp < 0 ) then
2363
flag( float_detect_tininess = float_tininess_before_rounding )
2364
or flag( zExp < -1 )
2365
or flag(increment = 0)
2366
or flag(lt64( zSig0, zSig1, $001FFFFF, $FFFFFFFF)<>0);
2367
shift64ExtraRightJamming(
2368
zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
2370
if ( isTiny<>0) and (zSig2<>0 ) then float_raise( float_flag_underflow );
2371
if ( roundNearestEven )<>0 then
2373
increment := flag( sbits32 (zSig2) < 0 );
2377
if ( zSign )<>0 then
2379
increment := flag( roundingMode = float_round_down ) and zSig2;
2383
increment := flag( roundingMode = float_round_up ) and zSig2;
2388
if ( zSig2 )<>0 then
2389
softfloat_exception_flags := softfloat_exception_flags OR float_flag_inexact;
2390
if ( increment )<>0 then
2392
add64( zSig0, zSig1, 0, 1, zSig0, zSig1 );
2393
zSig1 := zSig1 and not ( bits32(flag( zSig2 + zSig2 = 0 )) and roundNearestEven );
2397
if ( ( zSig0 or zSig1 ) = 0 ) then zExp := 0;
2399
packFloat64( zSign, zExp, zSig0, zSig1, c );
2402
{*----------------------------------------------------------------------------
2403
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2404
| and significand `zSig', and returns the proper double-precision floating-
2405
| point value corresponding to the abstract input. Ordinarily, the abstract
2406
| value is simply rounded and packed into the double-precision format, with
2407
| the inexact exception raised if the abstract input cannot be represented
2408
| exactly. However, if the abstract value is too large, the overflow and
2409
| inexact exceptions are raised and an infinity or maximal finite value is
2410
| returned. If the abstract value is too small, the input value is rounded
2411
| to a subnormal number, and the underflow and inexact exceptions are raised
2412
| if the abstract input cannot be represented exactly as a subnormal double-
2413
| precision floating-point number.
2414
| The input significand `zSig' has its binary point between bits 62
2415
| and 61, which is 10 bits to the left of the usual location. This shifted
2416
| significand must be normalized or smaller. If `zSig' is not normalized,
2417
| `zExp' must be 0; in that case, the result returned is a subnormal number,
2418
| and it must not require rounding. In the usual case that `zSig' is
2419
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2420
| The handling of underflow and overflow follows the IEC/IEEE Standard for
2421
| Binary Floating-Point Arithmetic.
2422
*----------------------------------------------------------------------------*}
2424
function roundAndPackFloat64( zSign: flag; zExp: int16; zSig : bits64): float64;
2427
roundNearestEven: flag;
2428
roundIncrement, roundBits: int16;
2431
roundingMode := float_rounding_mode;
2432
roundNearestEven := ord( roundingMode = float_round_nearest_even );
2433
roundIncrement := $200;
2434
if ( roundNearestEven=0 ) then
2436
if ( roundingMode = float_round_to_zero ) then
2438
roundIncrement := 0;
2441
roundIncrement := $3FF;
2442
if ( zSign<>0 ) then
2444
if ( roundingMode = float_round_up ) then
2445
roundIncrement := 0;
2448
if ( roundingMode = float_round_down ) then
2449
roundIncrement := 0;
2453
roundBits := zSig and $3FF;
2454
if ( $7FD <= bits16(zExp) ) then
2456
if ( ( $7FD < zExp )
2457
or ( ( zExp = $7FD )
2458
and ( sbits64( zSig + roundIncrement ) < 0 ) )
2461
float_raise( float_flag_overflow or float_flag_inexact );
2462
result := float64(qword(packFloat64( zSign, $7FF, 0 )) - ord( roundIncrement = 0 ));
2465
if ( zExp < 0 ) then
2468
( float_detect_tininess = float_tininess_before_rounding )
2470
or ( (zSig + roundIncrement) < int64( $8000000000000000 ) ) );
2471
shift64RightJamming( zSig, - zExp, zSig );
2473
roundBits := zSig and $3FF;
2474
if ( isTiny and roundBits )<>0 then
2475
float_raise( float_flag_underflow );
2478
if ( roundBits<>0 ) then
2479
softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
2480
zSig := ( zSig + roundIncrement ) shr 10;
2481
zSig := zSig and not( ord( ( roundBits xor $200 ) = 0 ) and roundNearestEven );
2482
if ( zSig = 0 ) then
2484
result:=packFloat64( zSign, zExp, zSig );
2488
-------------------------------------------------------------------------------
2489
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2490
and significand formed by the concatenation of `zSig0' and `zSig1', and
2491
returns the proper double-precision floating-point value corresponding
2492
to the abstract input. This routine is just like `roundAndPackFloat64'
2493
except that the input significand has fewer bits and does not have to be
2494
normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
2496
-------------------------------------------------------------------------------
2499
normalizeRoundAndPackFloat64(
2500
zSign:flag; zExp:int16; zSig0:bits32; zSig1:bits32; VAR c: float64 );
2506
if ( zSig0 = 0 ) then
2512
shiftCount := countLeadingZeros32( zSig0 ) - 11;
2513
if ( 0 <= shiftCount ) then
2516
shortShift64Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
2520
shift64ExtraRightJamming
2521
(zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
2523
zExp := zExp - shiftCount;
2524
roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, c );
2528
-------------------------------------------------------------------------------
2529
Returns the result of converting the 32-bit two's complement integer `a' to
2530
the single-precision floating-point format. The conversion is performed
2531
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2532
-------------------------------------------------------------------------------
2534
Function int32_to_float32( a: int32): float32rec; compilerproc;
2541
int32_to_float32.float32 := 0;
2544
if ( a = sbits32 ($80000000) ) then
2546
int32_to_float32.float32 := packFloat32( 1, $9E, 0 );
2549
zSign := flag( a < 0 );
2552
int32_to_float32.float32:=
2553
normalizeRoundAndPackFloat32( zSign, $9C, a );
2558
-------------------------------------------------------------------------------
2559
Returns the result of converting the 32-bit two's complement integer `a' to
2560
the double-precision floating-point format. The conversion is performed
2561
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2562
-------------------------------------------------------------------------------
2564
Function int32_to_float64( a: int32) : float64;{$ifdef fpc} [public,Alias:'INT32_TO_FLOAT64'];compilerproc;{$endif}
2569
zSig0, zSig1 : bits32;
2574
packFloat64( 0, 0, 0, 0, result );
2577
zSign := flag( a < 0 );
2582
shiftCount := countLeadingZeros32( absA ) - 11;
2583
if ( 0 <= shiftCount ) then
2585
zSig0 := absA shl shiftCount;
2590
shift64Right( absA, 0, - shiftCount, zSig0, zSig1 );
2592
packFloat64( zSign, $412 - shiftCount, zSig0, zSig1, result );
2596
-------------------------------------------------------------------------------
2597
Returns the result of converting the single-precision floating-point value
2598
`a' to the 32-bit two's complement integer format. The conversion is
2599
performed according to the IEC/IEEE Standard for Binary Floating-Point
2600
Arithmetic---which means in particular that the conversion is rounded
2601
according to the current rounding mode. If `a' is a NaN, the largest
2602
positive integer is returned. Otherwise, if the conversion overflows, the
2603
largest integer with the same sign as `a' is returned.
2604
-------------------------------------------------------------------------------
2606
Function float32_to_int32( a : float32rec) : int32;compilerproc;
2609
aExp, shiftCount: int16;
2610
aSig, aSigExtra: bits32;
2615
aSig := extractFloat32Frac( a.float32 );
2616
aExp := extractFloat32Exp( a.float32 );
2617
aSign := extractFloat32Sign( a.float32 );
2618
shiftCount := aExp - $96;
2619
if ( 0 <= shiftCount ) then
2621
if ( $9E <= aExp ) then
2623
if ( a.float32 <> $CF000000 ) then
2625
float_raise( float_flag_invalid );
2626
if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
2628
float32_to_int32 := $7FFFFFFF;
2632
float32_to_int32 := sbits32 ($80000000);
2635
z := ( aSig or $00800000 ) shl shiftCount;
2636
if ( aSign<>0 ) then z := - z;
2640
if ( aExp < $7E ) then
2642
aSigExtra := aExp OR aSig;
2647
aSig := aSig OR $00800000;
2648
aSigExtra := aSig shl ( shiftCount and 31 );
2649
z := aSig shr ( - shiftCount );
2651
if ( aSigExtra<>0 ) then
2652
softfloat_exception_flags := softfloat_exception_flags
2653
or float_flag_inexact;
2654
roundingMode := float_rounding_mode;
2655
if ( roundingMode = float_round_nearest_even ) then
2657
if ( sbits32 (aSigExtra) < 0 ) then
2660
if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
2663
if ( aSign<>0 ) then
2668
aSigExtra := flag( aSigExtra <> 0 );
2669
if ( aSign<>0 ) then
2671
z := z + (flag( roundingMode = float_round_down ) and aSigExtra);
2676
z := z + (flag( roundingMode = float_round_up ) and aSigExtra);
2680
float32_to_int32 := z;
2684
-------------------------------------------------------------------------------
2685
Returns the result of converting the single-precision floating-point value
2686
`a' to the 32-bit two's complement integer format. The conversion is
2687
performed according to the IEC/IEEE Standard for Binary Floating-Point
2688
Arithmetic, except that the conversion is always rounded toward zero.
2689
If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2690
the conversion overflows, the largest integer with the same sign as `a' is
2692
-------------------------------------------------------------------------------
2694
Function float32_to_int32_round_to_zero( a: Float32rec ): int32;compilerproc;
2697
aExp, shiftCount : int16;
2701
aSig := extractFloat32Frac( a.float32 );
2702
aExp := extractFloat32Exp( a.float32 );
2703
aSign := extractFloat32Sign( a.float32 );
2704
shiftCount := aExp - $9E;
2705
if ( 0 <= shiftCount ) then
2707
if ( a.float32 <> $CF000000 ) then
2709
float_raise( float_flag_invalid );
2710
if ( (aSign=0) or ( ( aExp = $FF ) and (aSig<>0) ) ) then
2712
float32_to_int32_round_to_zero := $7FFFFFFF;
2716
float32_to_int32_round_to_zero:= sbits32 ($80000000);
2720
if ( aExp <= $7E ) then
2722
if ( aExp or aSig )<>0 then
2723
softfloat_exception_flags :=
2724
softfloat_exception_flags or float_flag_inexact;
2725
float32_to_int32_round_to_zero := 0;
2728
aSig := ( aSig or $00800000 ) shl 8;
2729
z := aSig shr ( - shiftCount );
2730
if ( bits32 ( aSig shl ( shiftCount and 31 ) )<> 0 ) then
2732
softfloat_exception_flags :=
2733
softfloat_exception_flags or float_flag_inexact;
2735
if ( aSign<>0 ) then z := - z;
2736
float32_to_int32_round_to_zero := z;
2740
-------------------------------------------------------------------------------
2741
Returns the result of converting the single-precision floating-point value
2742
`a' to the double-precision floating-point format. The conversion is
2743
performed according to the IEC/IEEE Standard for Binary Floating-Point
2745
-------------------------------------------------------------------------------
2747
Function float32_to_float64( a : float32rec) : Float64;compilerproc;
2751
aSig, zSig0, zSig1: bits32;
2754
aSig := extractFloat32Frac( a.float32 );
2755
aExp := extractFloat32Exp( a.float32 );
2756
aSign := extractFloat32Sign( a.float32 );
2757
if ( aExp = $FF ) then
2761
float32ToCommonNaN(a.float32, tmp);
2762
commonNaNToFloat64(tmp , result);
2765
packFloat64( aSign, $7FF, 0, 0, result);
2768
if ( aExp = 0 ) then
2770
if ( aSig = 0 ) then
2772
packFloat64( aSign, 0, 0, 0, result );
2775
normalizeFloat32Subnormal( aSig, aExp, aSig );
2778
shift64Right( aSig, 0, 3, zSig0, zSig1 );
2779
packFloat64( aSign, aExp + $380, zSig0, zSig1, result );
2783
-------------------------------------------------------------------------------
2784
Rounds the single-precision floating-point value `a' to an integer,
2785
and returns the result as a single-precision floating-point value. The
2786
operation is performed according to the IEC/IEEE Standard for Binary
2787
Floating-Point Arithmetic.
2788
-------------------------------------------------------------------------------
2790
Function float32_round_to_int( a: float32rec): float32rec;compilerproc;
2794
lastBitMask, roundBitsMask: bits32;
2798
aExp := extractFloat32Exp( a.float32 );
2799
if ( $96 <= aExp ) then
2801
if ( ( aExp = $FF ) and (extractFloat32Frac( a.float32 )<>0) ) then
2803
float32_round_to_int.float32 := propagateFloat32NaN( a.float32, a.float32 );
2806
float32_round_to_int:=a;
2809
if ( aExp <= $7E ) then
2811
if ( bits32 ( a.float32 shl 1 ) = 0 ) then
2813
float32_round_to_int:=a;
2816
softfloat_exception_flags
2817
:= softfloat_exception_flags OR float_flag_inexact;
2818
aSign := extractFloat32Sign( a.float32 );
2820
case ( float_rounding_mode ) of
2821
float_round_nearest_even:
2823
if ( ( aExp = $7E ) and (extractFloat32Frac( a.float32 )<>0) ) then
2825
float32_round_to_int.float32 := packFloat32( aSign, $7F, 0 );
2832
float32_round_to_int.float32 := $BF800000
2834
float32_round_to_int.float32 := 0;
2840
float32_round_to_int.float32 := $80000000
2842
float32_round_to_int.float32 := $3F800000;
2846
float32_round_to_int.float32 := packFloat32( aSign, 0, 0 );
2849
{_____________________________!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!}
2850
lastBitMask := lastBitMask shl ($96 - aExp);
2851
roundBitsMask := lastBitMask - 1;
2853
roundingMode := float_rounding_mode;
2854
if ( roundingMode = float_round_nearest_even ) then
2856
z := z + (lastBitMask shr 1);
2857
if ( ( z and roundBitsMask ) = 0 ) then
2858
z := z and not lastBitMask;
2860
else if ( roundingMode <> float_round_to_zero ) then
2862
if ( (extractFloat32Sign( z ) xor flag(roundingMode = float_round_up ))<>0 ) then
2864
z := z + roundBitsMask;
2867
z := z and not roundBitsMask;
2868
if ( z <> a.float32 ) then
2869
softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
2870
float32_round_to_int.float32 := z;
2874
-------------------------------------------------------------------------------
2875
Returns the result of adding the absolute values of the single-precision
2876
floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2877
before being returned. `zSign' is ignored if the result is a NaN.
2878
The addition is performed according to the IEC/IEEE Standard for Binary
2879
Floating-Point Arithmetic.
2880
-------------------------------------------------------------------------------
2882
Function addFloat32Sigs( a:float32; b: float32; zSign:flag ): float32;
2884
aExp, bExp, zExp: int16;
2885
aSig, bSig, zSig: bits32;
2889
aSig:=extractFloat32Frac( a );
2890
aExp:=extractFloat32Exp( a );
2891
bSig:=extractFloat32Frac( b );
2892
bExp := extractFloat32Exp( b );
2893
expDiff := aExp - bExp;
2896
if ( 0 < expDiff ) then
2898
if ( aExp = $FF ) then
2900
if ( aSig <> 0) then
2902
addFloat32Sigs := propagateFloat32NaN( a, b );
2905
addFloat32Sigs := a;
2908
if ( bExp = 0 ) then
2914
bSig := bSig or $20000000;
2916
shift32RightJamming( bSig, expDiff, bSig );
2920
If ( expDiff < 0 ) then
2922
if ( bExp = $FF ) then
2926
addFloat32Sigs := propagateFloat32NaN( a, b );
2930
addFloat32Sigs := packFloat32( zSign, $FF, 0 );
2933
if ( aExp = 0 ) then
2939
aSig := aSig OR $20000000;
2941
shift32RightJamming( aSig, - expDiff, aSig );
2946
if ( aExp = $FF ) then
2948
if ( aSig OR bSig )<> 0 then
2950
addFloat32Sigs := propagateFloat32NaN( a, b );
2953
addFloat32Sigs := a;
2956
if ( aExp = 0 ) then
2958
addFloat32Sigs := packFloat32( zSign, 0, ( aSig + bSig ) shr 6 );
2961
zSig := $40000000 + aSig + bSig;
2965
aSig := aSig OR $20000000;
2966
zSig := ( aSig + bSig ) shl 1;
2968
if ( sbits32 (zSig) < 0 ) then
2970
zSig := aSig + bSig;
2974
addFloat32Sigs := roundAndPackFloat32( zSign, zExp, zSig );
2978
-------------------------------------------------------------------------------
2979
Returns the result of subtracting the absolute values of the single-
2980
precision floating-point values `a' and `b'. If `zSign' is 1, the
2981
difference is negated before being returned. `zSign' is ignored if the
2982
result is a NaN. The subtraction is performed according to the IEC/IEEE
2983
Standard for Binary Floating-Point Arithmetic.
2984
-------------------------------------------------------------------------------
2986
Function subFloat32Sigs( a:float32; b:float32; zSign:flag ): float32;
2988
aExp, bExp, zExp: int16;
2989
aSig, bSig, zSig: bits32;
2995
label normalizeRoundAndPack;
2997
aSig := extractFloat32Frac( a );
2998
aExp := extractFloat32Exp( a );
2999
bSig := extractFloat32Frac( b );
3000
bExp := extractFloat32Exp( b );
3001
expDiff := aExp - bExp;
3004
if ( 0 < expDiff ) then goto aExpBigger;
3005
if ( expDiff < 0 ) then goto bExpBigger;
3006
if ( aExp = $FF ) then
3008
if ( aSig OR bSig )<> 0 then
3010
subFloat32Sigs := propagateFloat32NaN( a, b );
3013
float_raise( float_flag_invalid );
3014
subFloat32Sigs := float32_default_nan;
3017
if ( aExp = 0 ) then
3022
if ( bSig < aSig ) Then goto aBigger;
3023
if ( aSig < bSig ) Then goto bBigger;
3024
subFloat32Sigs := packFloat32( flag(float_rounding_mode = float_round_down), 0, 0 );
3027
if ( bExp = $FF ) then
3031
subFloat32Sigs := propagateFloat32NaN( a, b );
3034
subFloat32Sigs := packFloat32( zSign XOR 1, $FF, 0 );
3037
if ( aExp = 0 ) then
3043
aSig := aSig OR $40000000;
3045
shift32RightJamming( aSig, - expDiff, aSig );
3046
bSig := bSig OR $40000000;
3048
zSig := bSig - aSig;
3050
zSign := zSign xor 1;
3051
goto normalizeRoundAndPack;
3053
if ( aExp = $FF ) then
3055
if ( aSig <> 0) then
3057
subFloat32Sigs := propagateFloat32NaN( a, b );
3060
subFloat32Sigs := a;
3063
if ( bExp = 0 ) then
3069
bSig := bSig OR $40000000;
3071
shift32RightJamming( bSig, expDiff, bSig );
3072
aSig := aSig OR $40000000;
3074
zSig := aSig - bSig;
3076
normalizeRoundAndPack:
3078
subFloat32Sigs := normalizeRoundAndPackFloat32( zSign, zExp, zSig );
3082
-------------------------------------------------------------------------------
3083
Returns the result of adding the single-precision floating-point values `a'
3084
and `b'. The operation is performed according to the IEC/IEEE Standard for
3085
Binary Floating-Point Arithmetic.
3086
-------------------------------------------------------------------------------
3088
Function float32_add( a: float32rec; b:float32rec ): float32rec; compilerproc;
3092
aSign := extractFloat32Sign( a.float32 );
3093
bSign := extractFloat32Sign( b.float32 );
3094
if ( aSign = bSign ) then
3096
float32_add.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
3100
float32_add.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
3105
-------------------------------------------------------------------------------
3106
Returns the result of subtracting the single-precision floating-point values
3107
`a' and `b'. The operation is performed according to the IEC/IEEE Standard
3108
for Binary Floating-Point Arithmetic.
3109
-------------------------------------------------------------------------------
3111
Function float32_sub( a: float32rec ; b:float32rec ): float32rec;compilerproc;
3115
aSign := extractFloat32Sign( a.float32 );
3116
bSign := extractFloat32Sign( b.float32 );
3117
if ( aSign = bSign ) then
3119
float32_sub.float32 := subFloat32Sigs( a.float32, b.float32, aSign );
3123
float32_sub.float32 := addFloat32Sigs( a.float32, b.float32, aSign );
3128
-------------------------------------------------------------------------------
3129
Returns the result of multiplying the single-precision floating-point values
3130
`a' and `b'. The operation is performed according to the IEC/IEEE Standard
3131
for Binary Floating-Point Arithmetic.
3132
-------------------------------------------------------------------------------
3134
Function float32_mul(a: float32rec; b: float32rec ) : float32rec; compilerproc;
3137
aSign, bSign, zSign: flag;
3138
aExp, bExp, zExp : int16;
3139
aSig, bSig, zSig0, zSig1: bits32;
3141
aSig := extractFloat32Frac( a.float32 );
3142
aExp := extractFloat32Exp( a.float32 );
3143
aSign := extractFloat32Sign( a.float32 );
3144
bSig := extractFloat32Frac( b.float32 );
3145
bExp := extractFloat32Exp( b.float32 );
3146
bSign := extractFloat32Sign( b.float32 );
3147
zSign := aSign xor bSign;
3148
if ( aExp = $FF ) then
3150
if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig<>0) ) ) then
3152
float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
3154
if ( ( bExp OR bSig ) = 0 ) then
3156
float_raise( float_flag_invalid );
3157
float32_mul.float32 := float32_default_nan;
3160
float32_mul.float32 := packFloat32( zSign, $FF, 0 );
3163
if ( bExp = $FF ) then
3165
if ( bSig <> 0 ) then
3167
float32_mul.float32 := propagateFloat32NaN( a.float32, b.float32 );
3170
if ( ( aExp OR aSig ) = 0 ) then
3172
float_raise( float_flag_invalid );
3173
float32_mul.float32 := float32_default_nan;
3176
float32_mul.float32 := packFloat32( zSign, $FF, 0 );
3179
if ( aExp = 0 ) then
3181
if ( aSig = 0 ) then
3183
float32_mul.float32 := packFloat32( zSign, 0, 0 );
3186
normalizeFloat32Subnormal( aSig, aExp, aSig );
3188
if ( bExp = 0 ) then
3190
if ( bSig = 0 ) then
3192
float32_mul.float32 := packFloat32( zSign, 0, 0 );
3195
normalizeFloat32Subnormal( bSig, bExp, bSig );
3197
zExp := aExp + bExp - $7F;
3198
aSig := ( aSig OR $00800000 ) shl 7;
3199
bSig := ( bSig OR $00800000 ) shl 8;
3200
mul32To64( aSig, bSig, zSig0, zSig1 );
3201
zSig0 := zSig0 OR bits32( zSig1 <> 0 );
3202
if ( 0 <= sbits32 ( zSig0 shl 1 ) ) then
3204
zSig0 := zSig0 shl 1;
3207
float32_mul.float32 := roundAndPackFloat32( zSign, zExp, zSig0 );
3211
-------------------------------------------------------------------------------
3212
Returns the result of dividing the single-precision floating-point value `a'
3213
by the corresponding value `b'. The operation is performed according to the
3214
IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3215
-------------------------------------------------------------------------------
3217
Function float32_div(a: float32rec;b: float32rec ): float32rec; compilerproc;
3219
aSign, bSign, zSign: flag;
3220
aExp, bExp, zExp: int16;
3221
aSig, bSig, zSig, rem0, rem1, term0, term1: bits32;
3223
aSig := extractFloat32Frac( a.float32 );
3224
aExp := extractFloat32Exp( a.float32 );
3225
aSign := extractFloat32Sign( a.float32 );
3226
bSig := extractFloat32Frac( b.float32 );
3227
bExp := extractFloat32Exp( b.float32 );
3228
bSign := extractFloat32Sign( b.float32 );
3229
zSign := aSign xor bSign;
3230
if ( aExp = $FF ) then
3232
if ( aSig <> 0 ) then
3234
float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
3237
if ( bExp = $FF ) then
3239
if ( bSig <> 0) then
3241
float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
3243
float_raise( float_flag_invalid );
3244
float32_div.float32 := float32_default_nan;
3247
float32_div.float32 := packFloat32( zSign, $FF, 0 );
3250
if ( bExp = $FF ) then
3252
if ( bSig <> 0) then
3254
float32_div.float32 := propagateFloat32NaN( a.float32, b.float32 );
3257
float32_div.float32 := packFloat32( zSign, 0, 0 );
3260
if ( bExp = 0 ) Then
3262
if ( bSig = 0 ) Then
3264
if ( ( aExp OR aSig ) = 0 ) then
3266
float_raise( float_flag_invalid );
3267
float32_div.float32 := float32_default_nan;
3270
float_raise( float_flag_divbyzero );
3271
float32_div.float32 := packFloat32( zSign, $FF, 0 );
3274
normalizeFloat32Subnormal( bSig, bExp, bSig );
3276
if ( aExp = 0 ) Then
3278
if ( aSig = 0 ) Then
3280
float32_div.float32 := packFloat32( zSign, 0, 0 );
3283
normalizeFloat32Subnormal( aSig, aExp, aSig );
3285
zExp := aExp - bExp + $7D;
3286
aSig := ( aSig OR $00800000 ) shl 7;
3287
bSig := ( bSig OR $00800000 ) shl 8;
3288
if ( bSig <= ( aSig + aSig ) ) then
3293
zSig := estimateDiv64To32( aSig, 0, bSig );
3294
if ( ( zSig and $3F ) <= 2 ) then
3296
mul32To64( bSig, zSig, term0, term1 );
3297
sub64( aSig, 0, term0, term1, rem0, rem1 );
3298
while ( sbits32 (rem0) < 0 ) do
3301
add64( rem0, rem1, 0, bSig, rem0, rem1 );
3303
zSig := zSig or bits32( rem1 <> 0 );
3305
float32_div.float32 := roundAndPackFloat32( zSign, zExp, zSig );
3310
-------------------------------------------------------------------------------
3311
Returns the remainder of the single-precision floating-point value `a'
3312
with respect to the corresponding value `b'. The operation is performed
3313
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3314
-------------------------------------------------------------------------------
3316
Function float32_rem(a: float32rec; b: float32rec ):float32rec; compilerproc;
3318
aSign, bSign, zSign: flag;
3319
aExp, bExp, expDiff: int16;
3320
aSig, bSig, q, allZero, alternateASig: bits32;
3323
aSig := extractFloat32Frac( a.float32 );
3324
aExp := extractFloat32Exp( a.float32 );
3325
aSign := extractFloat32Sign( a.float32 );
3326
bSig := extractFloat32Frac( b.float32 );
3327
bExp := extractFloat32Exp( b.float32 );
3328
bSign := extractFloat32Sign( b.float32 );
3329
if ( aExp = $FF ) then
3331
if ( (aSig<>0) OR ( ( bExp = $FF ) AND (bSig <>0)) ) then
3333
float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
3336
float_raise( float_flag_invalid );
3337
float32_rem.float32 := float32_default_nan;
3340
if ( bExp = $FF ) then
3342
if ( bSig <> 0 ) then
3344
float32_rem.float32 := propagateFloat32NaN( a.float32, b.float32 );
3350
if ( bExp = 0 ) then
3352
if ( bSig = 0 ) then
3354
float_raise( float_flag_invalid );
3355
float32_rem.float32 := float32_default_nan;
3358
normalizeFloat32Subnormal( bSig, bExp, bSig );
3360
if ( aExp = 0 ) then
3362
if ( aSig = 0 ) then
3367
normalizeFloat32Subnormal( aSig, aExp, aSig );
3369
expDiff := aExp - bExp;
3370
aSig := ( aSig OR $00800000 ) shl 8;
3371
bSig := ( bSig OR $00800000 ) shl 8;
3372
if ( expDiff < 0 ) then
3374
if ( expDiff < -1 ) then
3381
q := bits32( bSig <= aSig );
3383
aSig := aSig - bSig;
3384
expDiff := expDiff - 32;
3385
while ( 0 < expDiff ) do
3387
q := estimateDiv64To32( aSig, 0, bSig );
3392
aSig := - ( ( bSig shr 2 ) * q );
3393
expDiff := expDiff - 30;
3395
expDiff := expDiff + 32;
3396
if ( 0 < expDiff ) then
3398
q := estimateDiv64To32( aSig, 0, bSig );
3403
q := q shr (32 - expDiff);
3405
aSig := ( ( aSig shr 1 ) shl ( expDiff - 1 ) ) - bSig * q;
3413
alternateASig := aSig;
3415
aSig := aSig - bSig;
3416
Until not ( 0 <= sbits32 (aSig) );
3417
sigMean := aSig + alternateASig;
3418
if ( ( sigMean < 0 ) OR ( ( sigMean = 0 ) AND (( q and 1 )<>0) ) ) then
3420
aSig := alternateASig;
3422
zSign := flag( sbits32 (aSig) < 0 );
3423
if ( zSign<>0 ) then
3425
float32_rem.float32 := normalizeRoundAndPackFloat32( aSign xor zSign, bExp, aSig );
3429
-------------------------------------------------------------------------------
3430
Returns the square root of the single-precision floating-point value `a'.
3431
The operation is performed according to the IEC/IEEE Standard for Binary
3432
Floating-Point Arithmetic.
3433
-------------------------------------------------------------------------------
3435
Function float32_sqrt(a: float32rec ): float32rec;compilerproc;
3439
aSig, zSig, rem0, rem1, term0, term1: bits32;
3442
aSig := extractFloat32Frac( a.float32 );
3443
aExp := extractFloat32Exp( a.float32 );
3444
aSign := extractFloat32Sign( a.float32 );
3445
if ( aExp = $FF ) then
3447
if ( aSig <> 0) then
3449
float32_sqrt.float32 := propagateFloat32NaN( a.float32, 0 );
3452
if ( aSign = 0) then
3457
float_raise( float_flag_invalid );
3458
float32_sqrt.float32 := float32_default_nan;
3461
if ( aSign <> 0) then
3463
if ( ( aExp OR aSig ) = 0 ) then
3468
float_raise( float_flag_invalid );
3469
float32_sqrt.float32 := float32_default_nan;
3472
if ( aExp = 0 ) then
3474
if ( aSig = 0 ) then
3476
float32_sqrt.float32 := 0;
3479
normalizeFloat32Subnormal( aSig, aExp, aSig );
3481
zExp := ( ( aExp - $7F ) shr 1 ) + $7E;
3482
aSig := ( aSig OR $00800000 ) shl 8;
3483
zSig := estimateSqrt32( aExp, aSig ) + 2;
3484
if ( ( zSig and $7F ) <= 5 ) then
3486
if ( zSig < 2 ) then
3493
aSig := aSig shr (aExp and 1);
3494
mul32To64( zSig, zSig, term0, term1 );
3495
sub64( aSig, 0, term0, term1, rem0, rem1 );
3496
while ( sbits32 (rem0) < 0 ) do
3499
shortShift64Left( 0, zSig, 1, term0, term1 );
3500
term1 := term1 or 1;
3501
add64( rem0, rem1, term0, term1, rem0, rem1 );
3503
zSig := zSig OR bits32( ( rem0 OR rem1 ) <> 0 );
3506
shift32RightJamming( zSig, 1, zSig );
3508
float32_sqrt.float32 := roundAndPackFloat32( 0, zExp, zSig );
3512
-------------------------------------------------------------------------------
3513
Returns 1 if the single-precision floating-point value `a' is equal to
3514
the corresponding value `b', and 0 otherwise. The comparison is performed
3515
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3516
-------------------------------------------------------------------------------
3518
Function float32_eq( a:float32rec; b:float32rec): flag; compilerproc;
3520
if ((( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0))
3521
OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
3524
if ( (float32_is_signaling_nan( a.float32 )<>0) OR (float32_is_signaling_nan( b.float32 )<>0) ) then
3526
float_raise( float_flag_invalid );
3531
float32_eq := flag( a.float32 = b.float32 ) OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
3535
-------------------------------------------------------------------------------
3536
Returns 1 if the single-precision floating-point value `a' is less than
3537
or equal to the corresponding value `b', and 0 otherwise. The comparison
3538
is performed according to the IEC/IEEE Standard for Binary Floating-Point
3540
-------------------------------------------------------------------------------
3542
Function float32_le( a: float32rec; b : float32rec ):flag;compilerproc;
3547
if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 )<>0) )
3548
OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 )<>0) )
3551
float_raise( float_flag_invalid );
3555
aSign := extractFloat32Sign( a.float32 );
3556
bSign := extractFloat32Sign( b.float32 );
3557
if ( aSign <> bSign ) then
3559
float32_le := aSign OR flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) = 0 );
3562
float32_le := flag(flag( a.float32 = b.float32 ) OR flag( aSign xor flag( a.float32 < b.float32 ) ));
3567
-------------------------------------------------------------------------------
3568
Returns 1 if the single-precision floating-point value `a' is less than
3569
the corresponding value `b', and 0 otherwise. The comparison is performed
3570
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3571
-------------------------------------------------------------------------------
3573
Function float32_lt( a:float32rec ; b : float32rec): flag; compilerproc;
3578
if ( ( ( extractFloat32Exp( a.float32 ) = $FF ) AND (extractFloat32Frac( a.float32 ) <>0))
3579
OR ( ( extractFloat32Exp( b.float32 ) = $FF ) AND (extractFloat32Frac( b.float32 ) <>0) )
3582
float_raise( float_flag_invalid );
3586
aSign := extractFloat32Sign( a.float32 );
3587
bSign := extractFloat32Sign( b.float32 );
3588
if ( aSign <> bSign ) then
3590
float32_lt := aSign AND flag( bits32 ( ( a.float32 OR b.float32 ) shl 1 ) <> 0 );
3593
float32_lt := flag(flag( a.float32 <> b.float32 ) AND flag( aSign xor flag( a.float32 < b.float32 ) ));
3598
-------------------------------------------------------------------------------
3599
Returns 1 if the single-precision floating-point value `a' is equal to
3600
the corresponding value `b', and 0 otherwise. The invalid exception is
3601
raised if either operand is a NaN. Otherwise, the comparison is performed
3602
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3603
-------------------------------------------------------------------------------
3605
Function float32_eq_signaling( a: float32; b: float32) : flag;
3608
if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a ) <> 0))
3609
OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b ) <> 0))
3612
float_raise( float_flag_invalid );
3613
float32_eq_signaling := 0;
3616
float32_eq_signaling := (flag( a = b ) OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 ));
3620
-------------------------------------------------------------------------------
3621
Returns 1 if the single-precision floating-point value `a' is less than or
3622
equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3623
cause an exception. Otherwise, the comparison is performed according to the
3624
IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3625
-------------------------------------------------------------------------------
3627
Function float32_le_quiet( a: float32 ; b : float32 ): flag;
3632
if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
3633
OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
3636
if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
3638
float_raise( float_flag_invalid );
3640
float32_le_quiet := 0;
3643
aSign := extractFloat32Sign( a );
3644
bSign := extractFloat32Sign( b );
3645
if ( aSign <> bSign ) then
3647
float32_le_quiet := aSign OR flag( bits32 ( ( a OR b ) shl 1 ) = 0 );
3650
float32_le_quiet := flag(flag( a = b ) OR flag( aSign xor flag( a < b ) ));
3654
-------------------------------------------------------------------------------
3655
Returns 1 if the single-precision floating-point value `a' is less than
3656
the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3657
exception. Otherwise, the comparison is performed according to the IEC/IEEE
3658
Standard for Binary Floating-Point Arithmetic.
3659
-------------------------------------------------------------------------------
3661
Function float32_lt_quiet( a: float32 ; b: float32 ): flag;
3665
if ( ( ( extractFloat32Exp( a ) = $FF ) AND (extractFloat32Frac( a )<>0) )
3666
OR ( ( extractFloat32Exp( b ) = $FF ) AND (extractFloat32Frac( b )<>0) )
3669
if ( (float32_is_signaling_nan( a )<>0) OR (float32_is_signaling_nan( b )<>0) ) then
3671
float_raise( float_flag_invalid );
3673
float32_lt_quiet := 0;
3676
aSign := extractFloat32Sign( a );
3677
bSign := extractFloat32Sign( b );
3678
if ( aSign <> bSign ) then
3680
float32_lt_quiet := aSign AND flag( bits32 ( ( a OR b ) shl 1 ) <> 0 );
3683
float32_lt_quiet := flag(flag( a <> b ) AND ( aSign xor flag( a < b ) ));
3687
-------------------------------------------------------------------------------
3688
Returns the result of converting the double-precision floating-point value
3689
`a' to the 32-bit two's complement integer format. The conversion is
3690
performed according to the IEC/IEEE Standard for Binary Floating-Point
3691
Arithmetic---which means in particular that the conversion is rounded
3692
according to the current rounding mode. If `a' is a NaN, the largest
3693
positive integer is returned. Otherwise, if the conversion overflows, the
3694
largest integer with the same sign as `a' is returned.
3695
-------------------------------------------------------------------------------
3697
Function float64_to_int32(a: float64): int32;{$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32'];compilerproc;{$endif}
3700
aExp, shiftCount: int16;
3701
aSig0, aSig1, absZ, aSigExtra: bits32;
3706
aSig1 := extractFloat64Frac1( a );
3707
aSig0 := extractFloat64Frac0( a );
3708
aExp := extractFloat64Exp( a );
3709
aSign := extractFloat64Sign( a );
3710
shiftCount := aExp - $413;
3711
if ( 0 <= shiftCount ) then
3713
if ( $41E < aExp ) then
3715
if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
3720
aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
3721
if ( $80000000 < absZ ) then
3726
aSig1 := flag( aSig1 <> 0 );
3727
if ( aExp < $3FE ) then
3729
aSigExtra := aExp OR aSig0 OR aSig1;
3734
aSig0 := aSig0 OR $00100000;
3735
aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
3736
absZ := aSig0 shr ( - shiftCount );
3739
roundingMode := float_rounding_mode;
3740
if ( roundingMode = float_round_nearest_even ) then
3742
if ( sbits32(aSigExtra) < 0 ) then
3745
if ( bits32 ( aSigExtra shl 1 ) = 0 ) then
3746
absZ := absZ and not 1;
3755
aSigExtra := bits32( aSigExtra <> 0 );
3756
if ( aSign <> 0) then
3759
+ ( int32( roundingMode = float_round_down ) and aSigExtra ) );
3763
z := absZ + ( int32( roundingMode = float_round_up ) and aSigExtra );
3766
if ( (( aSign xor flag( z < 0 ) )<>0) AND (z<>0) ) then
3769
float_raise( float_flag_invalid );
3770
if (aSign <> 0 ) then
3771
float64_to_int32 := sbits32 ($80000000)
3773
float64_to_int32 := $7FFFFFFF;
3776
if ( aSigExtra <> 0) then
3777
softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
3778
float64_to_int32 := z;
3783
-------------------------------------------------------------------------------
3784
Returns the result of converting the double-precision floating-point value
3785
`a' to the 32-bit two's complement integer format. The conversion is
3786
performed according to the IEC/IEEE Standard for Binary Floating-Point
3787
Arithmetic, except that the conversion is always rounded toward zero.
3788
If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3789
the conversion overflows, the largest integer with the same sign as `a' is
3791
-------------------------------------------------------------------------------
3793
Function float64_to_int32_round_to_zero(a: float64 ): int32;
3794
{$ifdef fpc} [public,Alias:'FLOAT64_TO_INT32_ROUND_TO_ZERO'];compilerproc;{$endif}
3797
aExp, shiftCount: int16;
3798
aSig0, aSig1, absZ, aSigExtra: bits32;
3802
aSig1 := extractFloat64Frac1( a );
3803
aSig0 := extractFloat64Frac0( a );
3804
aExp := extractFloat64Exp( a );
3805
aSign := extractFloat64Sign( a );
3806
shiftCount := aExp - $413;
3807
if ( 0 <= shiftCount ) then
3809
if ( $41E < aExp ) then
3811
if ( ( aExp = $7FF ) AND (( aSig0 OR aSig1 )<>0) ) then
3816
aSig0 OR $00100000, aSig1, shiftCount, absZ, aSigExtra );
3820
if ( aExp < $3FF ) then
3822
if ( aExp OR aSig0 OR aSig1 )<>0 then
3824
softfloat_exception_flags :=
3825
softfloat_exception_flags or float_flag_inexact;
3827
float64_to_int32_round_to_zero := 0;
3830
aSig0 := aSig0 or $00100000;
3831
aSigExtra := ( aSig0 shl ( shiftCount and 31 ) ) OR aSig1;
3832
absZ := aSig0 shr ( - shiftCount );
3838
if ( (( aSign xor flag( z < 0 )) <> 0) AND (z<>0) ) then
3841
float_raise( float_flag_invalid );
3842
if (aSign <> 0) then
3843
float64_to_int32_round_to_zero := sbits32 ($80000000)
3845
float64_to_int32_round_to_zero := $7FFFFFFF;
3848
if ( aSigExtra <> 0) then
3849
softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
3850
float64_to_int32_round_to_zero := z;
3854
-------------------------------------------------------------------------------
3855
Returns the result of converting the double-precision floating-point value
3856
`a' to the single-precision floating-point format. The conversion is
3857
performed according to the IEC/IEEE Standard for Binary Floating-Point
3859
-------------------------------------------------------------------------------
3861
Function float64_to_float32(a: float64 ): float32rec;compilerproc;
3865
aSig0, aSig1, zSig: bits32;
3869
aSig1 := extractFloat64Frac1( a );
3870
aSig0 := extractFloat64Frac0( a );
3871
aExp := extractFloat64Exp( a );
3872
aSign := extractFloat64Sign( a );
3873
if ( aExp = $7FF ) then
3875
if ( aSig0 OR aSig1 ) <> 0 then
3877
float64ToCommonNaN( a, tmp );
3878
float64_to_float32.float32 := commonNaNToFloat32( tmp );
3881
float64_to_float32.float32 := packFloat32( aSign, $FF, 0 );
3884
shift64RightJamming( aSig0, aSig1, 22, allZero, zSig );
3885
if ( aExp <> 0) then
3886
zSig := zSig OR $40000000;
3887
float64_to_float32.float32 := roundAndPackFloat32( aSign, aExp - $381, zSig );
3891
-------------------------------------------------------------------------------
3892
Rounds the double-precision floating-point value `a' to an integer,
3893
and returns the result as a double-precision floating-point value. The
3894
operation is performed according to the IEC/IEEE Standard for Binary
3895
Floating-Point Arithmetic.
3896
-------------------------------------------------------------------------------
3898
function float64_round_to_int(a: float64) : Float64;{$ifdef fpc} [public,Alias:'FLOAT64_ROUND_TO_INT'];compilerproc;{$endif}
3903
lastBitMask, roundBitsMask: bits32;
3907
aExp := extractFloat64Exp( a );
3908
if ( $413 <= aExp ) then
3910
if ( $433 <= aExp ) then
3912
if ( ( aExp = $7FF )
3915
( extractFloat64Frac0( a ) OR extractFloat64Frac1( a )
3919
propagateFloat64NaN( a, a, result );
3926
lastBitMask := ( lastBitMask shl ( $432 - aExp ) ) shl 1;
3927
roundBitsMask := lastBitMask - 1;
3929
roundingMode := float_rounding_mode;
3930
if ( roundingMode = float_round_nearest_even ) then
3932
if ( lastBitMask <> 0) then
3934
add64( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
3935
if ( ( z.low and roundBitsMask ) = 0 ) then
3936
z.low := z.low and not lastBitMask;
3940
if ( sbits32 (z.low) < 0 ) then
3943
if ( bits32 ( z.low shl 1 ) = 0 ) then
3944
z.high := z.high and not 1;
3948
else if ( roundingMode <> float_round_to_zero ) then
3950
if ( extractFloat64Sign( z )
3951
xor flag( roundingMode = float_round_up ) )<> 0 then
3953
add64( z.high, z.low, 0, roundBitsMask, z.high, z.low );
3956
z.low := z.low and not roundBitsMask;
3960
if ( aExp <= $3FE ) then
3962
if ( ( ( bits32 ( a.high shl 1 ) ) OR a.low ) = 0 ) then
3967
softfloat_exception_flags := softfloat_exception_flags or
3969
aSign := extractFloat64Sign( a );
3970
case ( float_rounding_mode ) of
3971
float_round_nearest_even:
3973
if ( ( aExp = $3FE )
3974
AND ( (extractFloat64Frac0( a ) OR extractFloat64Frac1( a ) )<>0)
3977
packFloat64( aSign, $3FF, 0, 0, result );
3985
packFloat64( 1, $3FF, 0, 0, result )
3987
packFloat64( 0, 0, 0, 0, result );
3993
packFloat64( 1, 0, 0, 0, result )
3995
packFloat64( 0, $3FF, 0, 0, result );
3999
packFloat64( aSign, 0, 0, 0, result );
4003
lastBitMask := lastBitMask shl ($413 - aExp);
4004
roundBitsMask := lastBitMask - 1;
4007
roundingMode := float_rounding_mode;
4008
if ( roundingMode = float_round_nearest_even ) then
4010
z.high := z.high + lastBitMask shr 1;
4011
if ( ( ( z.high and roundBitsMask ) OR a.low ) = 0 ) then
4013
z.high := z.high and not lastBitMask;
4016
else if ( roundingMode <> float_round_to_zero ) then
4018
if ( extractFloat64Sign( z )
4019
xor flag( roundingMode = float_round_up ) )<> 0 then
4021
z.high := z.high or bits32( a.low <> 0 );
4022
z.high := z.high + roundBitsMask;
4025
z.high := z.high and not roundBitsMask;
4027
if ( ( z.low <> a.low ) OR ( z.high <> a.high ) ) then
4029
softfloat_exception_flags :=
4030
softfloat_exception_flags or float_flag_inexact;
4037
-------------------------------------------------------------------------------
4038
Returns the result of adding the absolute values of the double-precision
4039
floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4040
before being returned. `zSign' is ignored if the result is a NaN.
4041
The addition is performed according to the IEC/IEEE Standard for Binary
4042
Floating-Point Arithmetic.
4043
-------------------------------------------------------------------------------
4045
Procedure addFloat64Sigs( a:float64 ; b: float64 ; zSign:flag; Var out: float64 );
4047
aExp, bExp, zExp: int16;
4048
aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
4053
aSig1 := extractFloat64Frac1( a );
4054
aSig0 := extractFloat64Frac0( a );
4055
aExp := extractFloat64Exp( a );
4056
bSig1 := extractFloat64Frac1( b );
4057
bSig0 := extractFloat64Frac0( b );
4058
bExp := extractFloat64Exp( b );
4059
expDiff := aExp - bExp;
4060
if ( 0 < expDiff ) then
4062
if ( aExp = $7FF ) then
4064
if ( aSig0 OR aSig1 ) <> 0 then
4066
propagateFloat64NaN( a, b, out );
4072
if ( bExp = 0 ) then
4078
bSig0 := bSig0 or $00100000;
4080
shift64ExtraRightJamming(
4081
bSig0, bSig1, 0, expDiff, bSig0, bSig1, zSig2 );
4084
else if ( expDiff < 0 ) then
4086
if ( bExp = $7FF ) then
4088
if ( bSig0 OR bSig1 ) <> 0 then
4090
propagateFloat64NaN( a, b, out );
4093
packFloat64( zSign, $7FF, 0, 0, out );
4095
if ( aExp = 0 ) then
4101
aSig0 := aSig0 or $00100000;
4103
shift64ExtraRightJamming(
4104
aSig0, aSig1, 0, - expDiff, aSig0, aSig1, zSig2 );
4109
if ( aExp = $7FF ) then
4111
if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
4113
propagateFloat64NaN( a, b, out );
4119
add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
4120
if ( aExp = 0 ) then
4122
packFloat64( zSign, 0, zSig0, zSig1, out );
4126
zSig0 := zSig0 or $00200000;
4130
aSig0 := aSig0 or $00100000;
4131
add64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
4133
if ( zSig0 < $00200000 ) then
4137
shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
4139
roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, out );
4144
-------------------------------------------------------------------------------
4145
Returns the result of subtracting the absolute values of the double-
4146
precision floating-point values `a' and `b'. If `zSign' is 1, the
4147
difference is negated before being returned. `zSign' is ignored if the
4148
result is a NaN. The subtraction is performed according to the IEC/IEEE
4149
Standard for Binary Floating-Point Arithmetic.
4150
-------------------------------------------------------------------------------
4152
Procedure subFloat64Sigs( a:float64; b: float64 ; zSign:flag; Var out: float64 );
4154
aExp, bExp, zExp: int16;
4155
aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits32;
4162
label normalizeRoundAndPack;
4164
aSig1 := extractFloat64Frac1( a );
4165
aSig0 := extractFloat64Frac0( a );
4166
aExp := extractFloat64Exp( a );
4167
bSig1 := extractFloat64Frac1( b );
4168
bSig0 := extractFloat64Frac0( b );
4169
bExp := extractFloat64Exp( b );
4170
expDiff := aExp - bExp;
4171
shortShift64Left( aSig0, aSig1, 10, aSig0, aSig1 );
4172
shortShift64Left( bSig0, bSig1, 10, bSig0, bSig1 );
4173
if ( 0 < expDiff ) then goto aExpBigger;
4174
if ( expDiff < 0 ) then goto bExpBigger;
4175
if ( aExp = $7FF ) then
4177
if ( aSig0 OR aSig1 OR bSig0 OR bSig1 ) <> 0 then
4179
propagateFloat64NaN( a, b, out );
4182
float_raise( float_flag_invalid );
4183
z.low := float64_default_nan_low;
4184
z.high := float64_default_nan_high;
4188
if ( aExp = 0 ) then
4193
if ( bSig0 < aSig0 ) then goto aBigger;
4194
if ( aSig0 < bSig0 ) then goto bBigger;
4195
if ( bSig1 < aSig1 ) then goto aBigger;
4196
if ( aSig1 < bSig1 ) then goto bBigger;
4197
packFloat64( flag(float_rounding_mode = float_round_down), 0, 0, 0 , out);
4200
if ( bExp = $7FF ) then
4202
if ( bSig0 OR bSig1 ) <> 0 then
4204
propagateFloat64NaN( a, b, out );
4207
packFloat64( zSign xor 1, $7FF, 0, 0, out );
4210
if ( aExp = 0 ) then
4216
aSig0 := aSig0 or $40000000;
4218
shift64RightJamming( aSig0, aSig1, - expDiff, aSig0, aSig1 );
4219
bSig0 := bSig0 or $40000000;
4221
sub64( bSig0, bSig1, aSig0, aSig1, zSig0, zSig1 );
4223
zSign := zSign xor 1;
4224
goto normalizeRoundAndPack;
4226
if ( aExp = $7FF ) then
4228
if ( aSig0 OR aSig1 ) <> 0 then
4230
propagateFloat64NaN( a, b, out );
4236
if ( bExp = 0 ) then
4242
bSig0 := bSig0 or $40000000;
4244
shift64RightJamming( bSig0, bSig1, expDiff, bSig0, bSig1 );
4245
aSig0 := aSig0 or $40000000;
4247
sub64( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1 );
4249
normalizeRoundAndPack:
4251
normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1, out );
4256
-------------------------------------------------------------------------------
4257
Returns the result of adding the double-precision floating-point values `a'
4258
and `b'. The operation is performed according to the IEC/IEEE Standard for
4259
Binary Floating-Point Arithmetic.
4260
-------------------------------------------------------------------------------
4262
Function float64_add( a: float64; b : float64) : Float64;
4263
{$ifdef fpc}[public,Alias:'FLOAT64_ADD'];compilerproc;{$endif}
4267
aSign := extractFloat64Sign( a );
4268
bSign := extractFloat64Sign( b );
4269
if ( aSign = bSign ) then
4271
addFloat64Sigs( a, b, aSign, result );
4275
subFloat64Sigs( a, b, aSign, result );
4280
-------------------------------------------------------------------------------
4281
Returns the result of subtracting the double-precision floating-point values
4282
`a' and `b'. The operation is performed according to the IEC/IEEE Standard
4283
for Binary Floating-Point Arithmetic.
4284
-------------------------------------------------------------------------------
4286
Function float64_sub(a: float64; b : float64) : Float64;
4287
{$ifdef fpc}[public,Alias:'FLOAT64_SUB'];compilerproc;{$endif}
4291
aSign := extractFloat64Sign( a );
4292
bSign := extractFloat64Sign( b );
4293
if ( aSign = bSign ) then
4295
subFloat64Sigs( a, b, aSign, result );
4299
addFloat64Sigs( a, b, aSign, result );
4304
-------------------------------------------------------------------------------
4305
Returns the result of multiplying the double-precision floating-point values
4306
`a' and `b'. The operation is performed according to the IEC/IEEE Standard
4307
for Binary Floating-Point Arithmetic.
4308
-------------------------------------------------------------------------------
4310
Function float64_mul( a: float64; b:float64) : Float64;
4311
{$ifdef fpc}[public,Alias:'FLOAT64_MUL'];compilerproc;{$endif}
4313
aSign, bSign, zSign: flag;
4314
aExp, bExp, zExp: int16;
4315
aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits32;
4319
aSig1 := extractFloat64Frac1( a );
4320
aSig0 := extractFloat64Frac0( a );
4321
aExp := extractFloat64Exp( a );
4322
aSign := extractFloat64Sign( a );
4323
bSig1 := extractFloat64Frac1( b );
4324
bSig0 := extractFloat64Frac0( b );
4325
bExp := extractFloat64Exp( b );
4326
bSign := extractFloat64Sign( b );
4327
zSign := aSign xor bSign;
4328
if ( aExp = $7FF ) then
4330
if ( (( aSig0 OR aSig1 ) <>0)
4331
OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
4333
propagateFloat64NaN( a, b, result );
4336
if ( ( bExp OR bSig0 OR bSig1 ) = 0 ) then goto invalid;
4337
packFloat64( zSign, $7FF, 0, 0, result );
4340
if ( bExp = $7FF ) then
4342
if ( bSig0 OR bSig1 )<> 0 then
4344
propagateFloat64NaN( a, b, result );
4347
if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
4350
float_raise( float_flag_invalid );
4351
z.low := float64_default_nan_low;
4352
z.high := float64_default_nan_high;
4356
packFloat64( zSign, $7FF, 0, 0, result );
4359
if ( aExp = 0 ) then
4361
if ( ( aSig0 OR aSig1 ) = 0 ) then
4363
packFloat64( zSign, 0, 0, 0, result );
4366
normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
4368
if ( bExp = 0 ) then
4370
if ( ( bSig0 OR bSig1 ) = 0 ) then
4372
packFloat64( zSign, 0, 0, 0, result );
4375
normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
4377
zExp := aExp + bExp - $400;
4378
aSig0 := aSig0 or $00100000;
4379
shortShift64Left( bSig0, bSig1, 12, bSig0, bSig1 );
4380
mul64To128( aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3 );
4381
add64( zSig0, zSig1, aSig0, aSig1, zSig0, zSig1 );
4382
zSig2 := zSig2 or flag( zSig3 <> 0 );
4383
if ( $00200000 <= zSig0 ) then
4385
shift64ExtraRightJamming(
4386
zSig0, zSig1, zSig2, 1, zSig0, zSig1, zSig2 );
4389
roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
4393
-------------------------------------------------------------------------------
4394
Returns the result of dividing the double-precision floating-point value `a'
4395
by the corresponding value `b'. The operation is performed according to the
4396
IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4397
-------------------------------------------------------------------------------
4399
Function float64_div(a: float64; b : float64) : Float64;
4400
{$ifdef fpc}[public,Alias:'FLOAT64_DIV'];compilerproc;{$endif}
4402
aSign, bSign, zSign: flag;
4403
aExp, bExp, zExp: int16;
4404
aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits32;
4405
rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
4409
aSig1 := extractFloat64Frac1( a );
4410
aSig0 := extractFloat64Frac0( a );
4411
aExp := extractFloat64Exp( a );
4412
aSign := extractFloat64Sign( a );
4413
bSig1 := extractFloat64Frac1( b );
4414
bSig0 := extractFloat64Frac0( b );
4415
bExp := extractFloat64Exp( b );
4416
bSign := extractFloat64Sign( b );
4417
zSign := aSign xor bSign;
4418
if ( aExp = $7FF ) then
4420
if ( aSig0 OR aSig1 )<> 0 then
4422
propagateFloat64NaN( a, b, result );
4425
if ( bExp = $7FF ) then
4427
if ( bSig0 OR bSig1 )<>0 then
4429
propagateFloat64NaN( a, b, result );
4434
packFloat64( zSign, $7FF, 0, 0, result );
4437
if ( bExp = $7FF ) then
4439
if ( bSig0 OR bSig1 )<> 0 then
4441
propagateFloat64NaN( a, b, result );
4444
packFloat64( zSign, 0, 0, 0, result );
4447
if ( bExp = 0 ) then
4449
if ( ( bSig0 OR bSig1 ) = 0 ) then
4451
if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
4454
float_raise( float_flag_invalid );
4455
z.low := float64_default_nan_low;
4456
z.high := float64_default_nan_high;
4460
float_raise( float_flag_divbyzero );
4461
packFloat64( zSign, $7FF, 0, 0, result );
4464
normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
4466
if ( aExp = 0 ) then
4468
if ( ( aSig0 OR aSig1 ) = 0 ) then
4470
packFloat64( zSign, 0, 0, 0, result );
4473
normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
4475
zExp := aExp - bExp + $3FD;
4476
shortShift64Left( aSig0 OR $00100000, aSig1, 11, aSig0, aSig1 );
4477
shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
4478
if ( le64( bSig0, bSig1, aSig0, aSig1 )<>0 ) then
4480
shift64Right( aSig0, aSig1, 1, aSig0, aSig1 );
4483
zSig0 := estimateDiv64To32( aSig0, aSig1, bSig0 );
4484
mul64By32To96( bSig0, bSig1, zSig0, term0, term1, term2 );
4485
sub96( aSig0, aSig1, 0, term0, term1, term2, rem0, rem1, rem2 );
4486
while ( sbits32 (rem0) < 0 ) do
4489
add96( rem0, rem1, rem2, 0, bSig0, bSig1, rem0, rem1, rem2 );
4491
zSig1 := estimateDiv64To32( rem1, rem2, bSig0 );
4492
if ( ( zSig1 and $3FF ) <= 4 ) then
4494
mul64By32To96( bSig0, bSig1, zSig1, term1, term2, term3 );
4495
sub96( rem1, rem2, 0, term1, term2, term3, rem1, rem2, rem3 );
4496
while ( sbits32 (rem1) < 0 ) do
4499
add96( rem1, rem2, rem3, 0, bSig0, bSig1, rem1, rem2, rem3 );
4501
zSig1 := zSig1 or flag( ( rem1 OR rem2 OR rem3 ) <> 0 );
4503
shift64ExtraRightJamming( zSig0, zSig1, 0, 11, zSig0, zSig1, zSig2 );
4504
roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2, result );
4509
-------------------------------------------------------------------------------
4510
Returns the remainder of the double-precision floating-point value `a'
4511
with respect to the corresponding value `b'. The operation is performed
4512
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4513
-------------------------------------------------------------------------------
4515
Function float64_rem(a: float64; b : float64) : float64;
4516
{$ifdef fpc}[public,Alias:'FLOAT64_REM'];compilerproc;{$endif}
4518
aSign, bSign, zSign: flag;
4519
aExp, bExp, expDiff: int16;
4520
aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits32;
4521
allZero, alternateASig0, alternateASig1, sigMean1: bits32;
4526
aSig1 := extractFloat64Frac1( a );
4527
aSig0 := extractFloat64Frac0( a );
4528
aExp := extractFloat64Exp( a );
4529
aSign := extractFloat64Sign( a );
4530
bSig1 := extractFloat64Frac1( b );
4531
bSig0 := extractFloat64Frac0( b );
4532
bExp := extractFloat64Exp( b );
4533
bSign := extractFloat64Sign( b );
4534
if ( aExp = $7FF ) then
4536
if ((( aSig0 OR aSig1 )<>0)
4537
OR ( ( bExp = $7FF ) AND (( bSig0 OR bSig1 )<>0) ) ) then
4539
propagateFloat64NaN( a, b, result );
4544
if ( bExp = $7FF ) then
4546
if ( bSig0 OR bSig1 ) <> 0 then
4548
propagateFloat64NaN( a, b, result );
4554
if ( bExp = 0 ) then
4556
if ( ( bSig0 OR bSig1 ) = 0 ) then
4559
float_raise( float_flag_invalid );
4560
z.low := float64_default_nan_low;
4561
z.high := float64_default_nan_high;
4565
normalizeFloat64Subnormal( bSig0, bSig1, bExp, bSig0, bSig1 );
4567
if ( aExp = 0 ) then
4569
if ( ( aSig0 OR aSig1 ) = 0 ) then
4574
normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
4576
expDiff := aExp - bExp;
4577
if ( expDiff < -1 ) then
4583
aSig0 OR $00100000, aSig1, 11 - flag( expDiff < 0 ), aSig0, aSig1 );
4584
shortShift64Left( bSig0 OR $00100000, bSig1, 11, bSig0, bSig1 );
4585
q := le64( bSig0, bSig1, aSig0, aSig1 );
4587
sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
4588
expDiff := expDiff - 32;
4589
while ( 0 < expDiff ) do
4591
q := estimateDiv64To32( aSig0, aSig1, bSig0 );
4596
mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
4597
shortShift96Left( term0, term1, term2, 29, term1, term2, allZero );
4598
shortShift64Left( aSig0, aSig1, 29, aSig0, allZero );
4599
sub64( aSig0, 0, term1, term2, aSig0, aSig1 );
4600
expDiff := expDiff - 29;
4602
if ( -32 < expDiff ) then
4604
q := estimateDiv64To32( aSig0, aSig1, bSig0 );
4609
q := q shr (- expDiff);
4610
shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
4611
expDiff := expDiff + 24;
4612
if ( expDiff < 0 ) then
4614
shift64Right( aSig0, aSig1, - expDiff, aSig0, aSig1 );
4618
shortShift64Left( aSig0, aSig1, expDiff, aSig0, aSig1 );
4620
mul64By32To96( bSig0, bSig1, q, term0, term1, term2 );
4621
sub64( aSig0, aSig1, term1, term2, aSig0, aSig1 );
4625
shift64Right( aSig0, aSig1, 8, aSig0, aSig1 );
4626
shift64Right( bSig0, bSig1, 8, bSig0, bSig1 );
4629
alternateASig0 := aSig0;
4630
alternateASig1 := aSig1;
4632
sub64( aSig0, aSig1, bSig0, bSig1, aSig0, aSig1 );
4633
Until not ( 0 <= sbits32 (aSig0) );
4635
aSig0, aSig1, alternateASig0, alternateASig1, bits32(sigMean0), sigMean1 );
4636
if ( ( sigMean0 < 0 )
4637
OR ( ( ( sigMean0 OR sigMean1 ) = 0 ) AND (( q AND 1 )<>0) ) ) then
4639
aSig0 := alternateASig0;
4640
aSig1 := alternateASig1;
4642
zSign := flag( sbits32 (aSig0) < 0 );
4643
if ( zSign <> 0 ) then
4644
sub64( 0, 0, aSig0, aSig1, aSig0, aSig1 );
4645
normalizeRoundAndPackFloat64( aSign xor zSign, bExp - 4, aSig0, aSig1, result );
4649
-------------------------------------------------------------------------------
4650
Returns the square root of the double-precision floating-point value `a'.
4651
The operation is performed according to the IEC/IEEE Standard for Binary
4652
Floating-Point Arithmetic.
4653
-------------------------------------------------------------------------------
4655
Procedure float64_sqrt( a: float64; var out: float64 );
4656
{$ifdef fpc}[public,Alias:'FLOAT64_SQRT'];compilerproc;{$endif}
4660
aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits32;
4661
rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits32;
4665
aSig1 := extractFloat64Frac1( a );
4666
aSig0 := extractFloat64Frac0( a );
4667
aExp := extractFloat64Exp( a );
4668
aSign := extractFloat64Sign( a );
4669
if ( aExp = $7FF ) then
4671
if ( aSig0 OR aSig1 ) <> 0 then
4673
propagateFloat64NaN( a, a, out );
4676
if ( aSign = 0) then
4683
if ( aSign <> 0 ) then
4685
if ( ( aExp OR aSig0 OR aSig1 ) = 0 ) then
4691
float_raise( float_flag_invalid );
4692
z.low := float64_default_nan_low;
4693
z.high := float64_default_nan_high;
4697
if ( aExp = 0 ) then
4699
if ( ( aSig0 OR aSig1 ) = 0 ) then
4701
packFloat64( 0, 0, 0, 0, out );
4704
normalizeFloat64Subnormal( aSig0, aSig1, aExp, aSig0, aSig1 );
4706
zExp := ( ( aExp - $3FF ) shr 1 ) + $3FE;
4707
aSig0 := aSig0 or $00100000;
4708
shortShift64Left( aSig0, aSig1, 11, term0, term1 );
4709
zSig0 := ( estimateSqrt32( aExp, term0 ) shr 1 ) + 1;
4710
if ( zSig0 = 0 ) then
4712
doubleZSig0 := zSig0 + zSig0;
4713
shortShift64Left( aSig0, aSig1, 9 - ( aExp and 1 ), aSig0, aSig1 );
4714
mul32To64( zSig0, zSig0, term0, term1 );
4715
sub64( aSig0, aSig1, term0, term1, rem0, rem1 );
4716
while ( sbits32 (rem0) < 0 ) do
4719
doubleZSig0 := doubleZSig0 - 2;
4720
add64( rem0, rem1, 0, doubleZSig0 OR 1, rem0, rem1 );
4722
zSig1 := estimateDiv64To32( rem1, 0, doubleZSig0 );
4723
if ( ( zSig1 and $1FF ) <= 5 ) then
4725
if ( zSig1 = 0 ) then
4727
mul32To64( doubleZSig0, zSig1, term1, term2 );
4728
sub64( rem1, 0, term1, term2, rem1, rem2 );
4729
mul32To64( zSig1, zSig1, term2, term3 );
4730
sub96( rem1, rem2, 0, 0, term2, term3, rem1, rem2, rem3 );
4731
while ( sbits32 (rem1) < 0 ) do
4734
shortShift64Left( 0, zSig1, 1, term2, term3 );
4735
term3 := term3 or 1;
4736
term2 := term2 or doubleZSig0;
4737
add96( rem1, rem2, rem3, 0, term2, term3, rem1, rem2, rem3 );
4739
zSig1 := zSig1 or bits32( ( rem1 OR rem2 OR rem3 ) <> 0 );
4741
shift64ExtraRightJamming( zSig0, zSig1, 0, 10, zSig0, zSig1, zSig2 );
4742
roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2, out );
4746
-------------------------------------------------------------------------------
4747
Returns 1 if the double-precision floating-point value `a' is equal to
4748
the corresponding value `b', and 0 otherwise. The comparison is performed
4749
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4750
-------------------------------------------------------------------------------
4752
Function float64_eq(a: float64; b: float64): flag;
4753
{$ifdef fpc}[public,Alias:'FLOAT64_EQ'];compilerproc;{$endif}
4757
( extractFloat64Exp( a ) = $7FF )
4760
(extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
4764
( extractFloat64Exp( b ) = $7FF )
4766
(extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
4771
if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
4772
float_raise( float_flag_invalid );
4778
AND ( ( a.high = b.high )
4780
AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
4785
-------------------------------------------------------------------------------
4786
Returns 1 if the double-precision floating-point value `a' is less than
4787
or equal to the corresponding value `b', and 0 otherwise. The comparison
4788
is performed according to the IEC/IEEE Standard for Binary Floating-Point
4790
-------------------------------------------------------------------------------
4792
Function float64_le(a: float64;b: float64): flag;
4793
{$ifdef fpc}[public,Alias:'FLOAT64_LE'];compilerproc;{$endif}
4799
( extractFloat64Exp( a ) = $7FF )
4802
(extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
4806
( extractFloat64Exp( b ) = $7FF )
4808
(extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
4813
float_raise( float_flag_invalid );
4817
aSign := extractFloat64Sign( a );
4818
bSign := extractFloat64Sign( b );
4819
if ( aSign <> bSign ) then
4823
OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
4828
float64_le := le64( b.high, b.low, a.high, a.low )
4830
float64_le := le64( a.high, a.low, b.high, b.low );
4834
-------------------------------------------------------------------------------
4835
Returns 1 if the double-precision floating-point value `a' is less than
4836
the corresponding value `b', and 0 otherwise. The comparison is performed
4837
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4838
-------------------------------------------------------------------------------
4840
Function float64_lt(a: float64;b: float64): flag;
4841
{$ifdef fpc}[public,Alias:'FLOAT64_LT'];compilerproc;{$endif}
4847
( extractFloat64Exp( a ) = $7FF )
4850
(extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
4854
( extractFloat64Exp( b ) = $7FF )
4856
(extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
4861
float_raise( float_flag_invalid );
4865
aSign := extractFloat64Sign( a );
4866
bSign := extractFloat64Sign( b );
4867
if ( aSign <> bSign ) then
4871
AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
4876
float64_lt := lt64( b.high, b.low, a.high, a.low )
4878
float64_lt := lt64( a.high, a.low, b.high, b.low );
4882
-------------------------------------------------------------------------------
4883
Returns 1 if the double-precision floating-point value `a' is equal to
4884
the corresponding value `b', and 0 otherwise. The invalid exception is
4885
raised if either operand is a NaN. Otherwise, the comparison is performed
4886
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4887
-------------------------------------------------------------------------------
4889
Function float64_eq_signaling( a: float64; b: float64): flag;
4894
( extractFloat64Exp( a ) = $7FF )
4897
(extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
4901
( extractFloat64Exp( b ) = $7FF )
4903
(extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
4908
float_raise( float_flag_invalid );
4909
float64_eq_signaling := 0;
4912
float64_eq_signaling := flag(
4914
AND ( ( a.high = b.high )
4916
AND ( bits32 ( ( a.high OR b.high ) shl 1 ) = 0 ) )
4921
-------------------------------------------------------------------------------
4922
Returns 1 if the double-precision floating-point value `a' is less than or
4923
equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4924
cause an exception. Otherwise, the comparison is performed according to the
4925
IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4926
-------------------------------------------------------------------------------
4928
Function float64_le_quiet(a: float64 ; b: float64 ): flag;
4930
aSign, bSign : flag;
4934
( extractFloat64Exp( a ) = $7FF )
4937
(extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
4941
( extractFloat64Exp( b ) = $7FF )
4943
(extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
4948
if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
4949
float_raise( float_flag_invalid );
4950
float64_le_quiet := 0;
4953
aSign := extractFloat64Sign( a );
4954
bSign := extractFloat64Sign( b );
4955
if ( aSign <> bSign ) then
4957
float64_le_quiet := flag
4959
OR ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
4964
float64_le_quiet := le64( b.high, b.low, a.high, a.low )
4966
float64_le_quiet := le64( a.high, a.low, b.high, b.low );
4970
-------------------------------------------------------------------------------
4971
Returns 1 if the double-precision floating-point value `a' is less than
4972
the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4973
exception. Otherwise, the comparison is performed according to the IEC/IEEE
4974
Standard for Binary Floating-Point Arithmetic.
4975
-------------------------------------------------------------------------------
4977
Function float64_lt_quiet(a: float64; b: float64 ): Flag;
4983
( extractFloat64Exp( a ) = $7FF )
4986
(extractFloat64Frac0( a ) OR extractFloat64Frac1( a )) <>0
4990
( extractFloat64Exp( b ) = $7FF )
4992
(extractFloat64Frac0( b ) OR (extractFloat64Frac1( b )) <> 0
4997
if ( (float64_is_signaling_nan( a )<>0) OR (float64_is_signaling_nan( b )<>0) ) then
4998
float_raise( float_flag_invalid );
4999
float64_lt_quiet := 0;
5002
aSign := extractFloat64Sign( a );
5003
bSign := extractFloat64Sign( b );
5004
if ( aSign <> bSign ) then
5006
float64_lt_quiet := flag(
5008
AND ( ( ( bits32 ( ( a.high OR b.high ) shl 1 ) ) OR a.low OR b.low )
5013
float64_lt_quiet := lt64( b.high, b.low, a.high, a.low )
5015
float64_lt_quiet := lt64( a.high, a.low, b.high, b.low );
5019
{*----------------------------------------------------------------------------
5020
| Returns the result of converting the 64-bit two's complement integer `a'
5021
| to the single-precision floating-point format. The conversion is performed
5022
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5023
*----------------------------------------------------------------------------*}
5024
function int64_to_float32( a: int64 ): float32rec; compilerproc;
5034
int64_to_float32.float32 := 0;
5040
zSign := flag(FALSE);
5045
shiftCount := countLeadingZeros64( absA ) - 40;
5046
if ( 0 <= shiftCount ) then
5048
int64_to_float32.float32:= packFloat32( zSign, $95 - shiftCount, absA shl shiftCount );
5052
shiftCount := shiftCount + 7;
5053
if ( shiftCount < 0 ) then
5055
intval.low := int64rec(AbsA).low;
5056
intval.high := int64rec(AbsA).high;
5057
shift64RightJamming( intval.low, intval.high, - shiftCount,
5058
intval.low, intval.high);
5059
int64rec(absA).low := intval.low;
5060
int64rec(absA).high := intval.high;
5063
absA := absA shl shiftCount;
5064
int64_to_float32.float32:=roundAndPackFloat32( zSign, $9C - shiftCount, absA );
5069
{*----------------------------------------------------------------------------
5070
| Returns the result of converting the 64-bit two's complement integer `a'
5071
| to the double-precision floating-point format. The conversion is performed
5072
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5073
*----------------------------------------------------------------------------*}
5074
function int64_to_float64( a: int64 ): float64;
5075
{$ifdef fpc}[public,Alias:'INT64_TO_FLOAT64'];compilerproc;{$endif}
5078
float_result : float64;
5082
zSig0, zSig1 : bits32;
5086
packFloat64( 0, 0, 0, 0, result );
5089
zSign := flag( a < 0 );
5094
shiftCount := countLeadingZeros64( absA ) - 11;
5095
if ( 0 <= shiftCount ) then
5097
absA := absA shl shiftcount;
5098
zSig0:=int64rec(absA).high;
5099
zSig1:=int64rec(absA).low;
5103
shift64Right( absA, 0, - shiftCount, zSig0, zSig1 );
5105
packFloat64( zSign, $432 - shiftCount, zSig0, zSig1, float_result );
5106
int64_to_float64:= float_result;
5110
{*----------------------------------------------------------------------------
5111
| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
5112
| is equal to the 128-bit value formed by concatenating `b0' and `b1'.
5113
| Otherwise, returns 0.
5114
*----------------------------------------------------------------------------*}
5116
function eq128( a0: bits64; a1: bits64; b0: bits64; b1 : bits64): flag;inline;
5118
result := ord(( a0 = b0 ) and ( a1 = b1 ));
5121
{*----------------------------------------------------------------------------
5122
| Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
5123
| value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so
5124
| any carry out is lost. The result is broken into two 64-bit pieces which
5125
| are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
5126
*----------------------------------------------------------------------------*}
5128
procedure add128(a0: bits64; a1: bits64; b0: bits64; b1: bits64; var z0Ptr: bits64; var z1Ptr : bits64);inline;
5134
z0Ptr := a0 + b0 + ord( z1 < a1 );
5138
{*----------------------------------------------------------------------------
5139
| Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
5140
| by 64 _plus_ the number of bits given in `count'. The shifted result is
5141
| at most 128 nonzero bits; these are broken into two 64-bit pieces which are
5142
| stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
5143
| off form a third 64-bit result as follows: The _last_ bit shifted off is
5144
| the most-significant bit of the extra result, and the other 63 bits of the
5145
| extra result are all zero if and only if _all_but_the_last_ bits shifted off
5146
| were all zero. This extra result is stored in the location pointed to by
5147
| `z2Ptr'. The value of `count' can be arbitrarily large.
5148
| (This routine makes more sense if `a0', `a1', and `a2' are considered
5149
| to form a fixed-point value with binary point between `a1' and `a2'. This
5150
| fixed-point value is shifted right by the number of bits given in `count',
5151
| and the integer part of the result is returned at the locations pointed to
5152
| by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
5153
| corrupted as described above, and is returned at the location pointed to by
5155
*----------------------------------------------------------------------------*}
5157
procedure shift128ExtraRightJamming(
5169
negCount := ( - count ) and 63;
5171
if ( count = 0 ) then
5178
if ( count < 64 ) then
5180
z2 := a1 shr negCount;
5181
z1 := ( a0 shl negCount ) or ( a1 shr count );
5185
if ( count = 64 ) then
5192
if ( count < 128 ) then
5194
z2 := a0 shl negCount;
5195
z1 := a0 shr ( count and 63 );
5198
if ( count = 128 ) then
5201
z2 := ord( a0 <> 0 );
5207
z2 := z2 or ord( a2 <> 0 );
5215
{*----------------------------------------------------------------------------
5216
| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
5217
| _plus_ the number of bits given in `count'. The shifted result is at most
5218
| 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'. The
5219
| bits shifted off form a second 64-bit result as follows: The _last_ bit
5220
| shifted off is the most-significant bit of the extra result, and the other
5221
| 63 bits of the extra result are all zero if and only if _all_but_the_last_
5222
| bits shifted off were all zero. This extra result is stored in the location
5223
| pointed to by `z1Ptr'. The value of `count' can be arbitrarily large.
5224
| (This routine makes more sense if `a0' and `a1' are considered to form
5225
| a fixed-point value with binary point between `a0' and `a1'. This fixed-
5226
| point value is shifted right by the number of bits given in `count', and
5227
| the integer part of the result is returned at the location pointed to by
5228
| `z0Ptr'. The fractional part of the result may be slightly corrupted as
5229
| described above, and is returned at the location pointed to by `z1Ptr'.)
5230
*----------------------------------------------------------------------------*}
5232
procedure shift64ExtraRightJamming(a0: bits64; a1: bits64; count: int16; var z0Ptr: bits64; var z1Ptr : bits64);
5237
negCount := ( - count ) and 63;
5239
if ( count = 0 ) then
5244
else if ( count < 64 ) then
5246
z1 := ( a0 shl negCount ) or ord( a1 <> 0 );
5250
if ( count = 64 ) then
5252
z1 := a0 or ord( a1 <> 0 );
5255
z1 := ord( ( a0 or a1 ) <> 0 );
5263
{$ifdef FPC_SOFTFLOAT_FLOATX80}
5265
{*----------------------------------------------------------------------------
5266
| Returns the fraction bits of the extended double-precision floating-point
5268
*----------------------------------------------------------------------------*}
5270
function extractFloatx80Frac(a : floatx80): bits64;inline;
5275
{*----------------------------------------------------------------------------
5276
| Returns the exponent bits of the extended double-precision floating-point
5278
*----------------------------------------------------------------------------*}
5280
function extractFloatx80Exp(a : floatx80): int32;inline;
5282
result:=a.high and $7FFF;
5285
{*----------------------------------------------------------------------------
5286
| Returns the sign bit of the extended double-precision floating-point value
5288
*----------------------------------------------------------------------------*}
5290
function extractFloatx80Sign(a : floatx80): flag;inline;
5292
result:=a.high shr 15;
5295
{*----------------------------------------------------------------------------
5296
| Normalizes the subnormal extended double-precision floating-point value
5297
| represented by the denormalized significand `aSig'. The normalized exponent
5298
| and significand are stored at the locations pointed to by `zExpPtr' and
5299
| `zSigPtr', respectively.
5300
*----------------------------------------------------------------------------*}
5302
procedure normalizeFloatx80Subnormal( aSig: bits64; var zExpPtr: int32; var zSigPtr : bits64);
5306
shiftCount := countLeadingZeros64( aSig );
5307
zSigPtr := aSig shl shiftCount;
5308
zExpPtr := 1 - shiftCount;
5311
{*----------------------------------------------------------------------------
5312
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
5313
| extended double-precision floating-point value, returning the result.
5314
*----------------------------------------------------------------------------*}
5316
function packFloatx80( zSign: flag; zExp: int32; zSig : bits64): floatx80;
5321
z.high := ( bits16(zSign) shl 15 ) + zExp;
5325
{*----------------------------------------------------------------------------
5326
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
5327
| and extended significand formed by the concatenation of `zSig0' and `zSig1',
5328
| and returns the proper extended double-precision floating-point value
5329
| corresponding to the abstract input. Ordinarily, the abstract value is
5330
| rounded and packed into the extended double-precision format, with the
5331
| inexact exception raised if the abstract input cannot be represented
5332
| exactly. However, if the abstract value is too large, the overflow and
5333
| inexact exceptions are raised and an infinity or maximal finite value is
5334
| returned. If the abstract value is too small, the input value is rounded to
5335
| a subnormal number, and the underflow and inexact exceptions are raised if
5336
| the abstract input cannot be represented exactly as a subnormal extended
5337
| double-precision floating-point number.
5338
| If `roundingPrecision' is 32 or 64, the result is rounded to the same
5339
| number of bits as single or double precision, respectively. Otherwise, the
5340
| result is rounded to the full precision of the extended double-precision
5342
| The input significand must be normalized or smaller. If the input
5343
| significand is not normalized, `zExp' must be 0; in that case, the result
5344
| returned is a subnormal number, and it must not require rounding. The
5345
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
5346
| Floating-Point Arithmetic.
5347
*----------------------------------------------------------------------------*}
5349
function roundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
5352
roundNearestEven, increment, isTiny: flag;
5353
roundIncrement, roundMask, roundBits: int64;
5357
roundingMode := float_rounding_mode;
5358
roundNearestEven := flag( roundingMode = float_round_nearest_even );
5359
if ( roundingPrecision = 80 ) then
5361
if ( roundingPrecision = 64 ) then
5363
roundIncrement := int64( $0000000000000400 );
5364
roundMask := int64( $00000000000007FF );
5366
else if ( roundingPrecision = 32 ) then
5368
roundIncrement := int64( $0000008000000000 );
5369
roundMask := int64( $000000FFFFFFFFFF );
5374
zSig0 := zSig0 or ord( zSig1 <> 0 );
5375
if ( not (roundNearestEven<>0) ) then
5377
if ( roundingMode = float_round_to_zero ) then
5379
roundIncrement := 0;
5382
roundIncrement := roundMask;
5383
if ( zSign<>0 ) then
5385
if ( roundingMode = float_round_up ) then
5386
roundIncrement := 0;
5389
if ( roundingMode = float_round_down ) then
5390
roundIncrement := 0;
5394
roundBits := zSig0 and roundMask;
5395
if ( $7FFD <= (bits32) ( zExp - 1 ) ) begin
5396
if ( ( $7FFE < zExp )
5397
or ( ( zExp = $7FFE ) and ( zSig0 + roundIncrement < zSig0 ) )
5401
if ( zExp <= 0 ) begin
5403
( float_detect_tininess = float_tininess_before_rounding )
5405
or ( zSig0 <= zSig0 + roundIncrement );
5406
shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
5408
roundBits := zSig0 and roundMask;
5409
if ( isTiny and roundBits ) float_raise( float_flag_underflow );
5410
if ( roundBits ) softfloat_exception_flags |= float_flag_inexact;
5411
zSig0 += roundIncrement;
5412
if ( (sbits64) zSig0 < 0 ) zExp := 1;
5413
roundIncrement := roundMask + 1;
5414
if ( roundNearestEven and ( roundBits shl 1 = roundIncrement ) ) begin
5415
roundMask |= roundIncrement;
5417
zSig0 &= ~ roundMask;
5418
result:=packFloatx80( zSign, zExp, zSig0 );
5421
if ( roundBits ) softfloat_exception_flags |= float_flag_inexact;
5422
zSig0 += roundIncrement;
5423
if ( zSig0 < roundIncrement ) begin
5425
zSig0 := LIT64( $8000000000000000 );
5427
roundIncrement := roundMask + 1;
5428
if ( roundNearestEven and ( roundBits shl 1 = roundIncrement ) ) begin
5429
roundMask |= roundIncrement;
5431
zSig0 &= ~ roundMask;
5432
if ( zSig0 = 0 ) zExp := 0;
5433
result:=packFloatx80( zSign, zExp, zSig0 );
5435
increment := ( (sbits64) zSig1 < 0 );
5436
if ( ! roundNearestEven ) begin
5437
if ( roundingMode = float_round_to_zero ) begin
5442
increment := ( roundingMode = float_round_down ) and zSig1;
5445
increment := ( roundingMode = float_round_up ) and zSig1;
5449
if ( $7FFD <= (bits32) ( zExp - 1 ) ) begin
5450
if ( ( $7FFE < zExp )
5451
or ( ( zExp = $7FFE )
5452
and ( zSig0 = LIT64( $FFFFFFFFFFFFFFFF ) )
5458
float_raise( float_flag_overflow or float_flag_inexact );
5459
if ( ( roundingMode = float_round_to_zero )
5460
or ( zSign and ( roundingMode = float_round_up ) )
5461
or ( ! zSign and ( roundingMode = float_round_down ) )
5463
result:=packFloatx80( zSign, $7FFE, ~ roundMask );
5465
result:=packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
5467
if ( zExp <= 0 ) begin
5469
( float_detect_tininess = float_tininess_before_rounding )
5472
or ( zSig0 < LIT64( $FFFFFFFFFFFFFFFF ) );
5473
shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
5475
if ( isTiny and zSig1 ) float_raise( float_flag_underflow );
5476
if ( zSig1 ) softfloat_exception_flags |= float_flag_inexact;
5477
if ( roundNearestEven ) begin
5478
increment := ( (sbits64) zSig1 < 0 );
5482
increment := ( roundingMode = float_round_down ) and zSig1;
5485
increment := ( roundingMode = float_round_up ) and zSig1;
5488
if ( increment ) begin
5491
~ ( ( (bits64) ( zSig1 shl 1 ) = 0 ) and roundNearestEven );
5492
if ( (sbits64) zSig0 < 0 ) zExp := 1;
5494
result:=packFloatx80( zSign, zExp, zSig0 );
5497
if ( zSig1 ) softfloat_exception_flags |= float_flag_inexact;
5498
if ( increment ) begin
5500
if ( zSig0 = 0 ) begin
5502
zSig0 := LIT64( $8000000000000000 );
5505
zSig0 &= ~ ( ( (bits64) ( zSig1 shl 1 ) = 0 ) and roundNearestEven );
5509
if ( zSig0 = 0 ) zExp := 0;
5511
result:=packFloatx80( zSign, zExp, zSig0 );
5515
{*----------------------------------------------------------------------------
5516
| Takes an abstract floating-point value having sign `zSign', exponent
5517
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
5518
| and returns the proper extended double-precision floating-point value
5519
| corresponding to the abstract input. This routine is just like
5520
| `roundAndPackFloatx80' except that the input significand does not have to be
5522
*----------------------------------------------------------------------------*}
5524
function normalizeRoundAndPackFloatx80(roundingPrecision: int8; zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): floatx80;
5528
if ( zSig0 = 0 ) begin
5533
shiftCount := countLeadingZeros64( zSig0 );
5534
shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
5535
zExp := eExp - shiftCount;
5537
roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
5541
{*----------------------------------------------------------------------------
5542
| Returns the result of converting the extended double-precision floating-
5543
| point value `a' to the 32-bit two's complement integer format. The
5544
| conversion is performed according to the IEC/IEEE Standard for Binary
5545
| Floating-Point Arithmetic---which means in particular that the conversion
5546
| is rounded according to the current rounding mode. If `a' is a NaN, the
5547
| largest positive integer is returned. Otherwise, if the conversion
5548
| overflows, the largest integer with the same sign as `a' is returned.
5549
*----------------------------------------------------------------------------*}
5551
function floatx80_to_int32(a: floatx80): int32;
5554
aExp, shiftCount: int32;
5557
aSig := extractFloatx80Frac( a );
5558
aExp := extractFloatx80Exp( a );
5559
aSign := extractFloatx80Sign( a );
5560
if ( ( aExp = $7FFF ) and (bits64) ( aSig shl 1 ) ) aSign := 0;
5561
shiftCount := $4037 - aExp;
5562
if ( shiftCount <= 0 ) shiftCount := 1;
5563
shift64RightJamming( aSig, shiftCount, &aSig );
5564
result := roundAndPackInt32( aSign, aSig );
5568
{*----------------------------------------------------------------------------
5569
| Returns the result of converting the extended double-precision floating-
5570
| point value `a' to the 32-bit two's complement integer format. The
5571
| conversion is performed according to the IEC/IEEE Standard for Binary
5572
| Floating-Point Arithmetic, except that the conversion is always rounded
5573
| toward zero. If `a' is a NaN, the largest positive integer is returned.
5574
| Otherwise, if the conversion overflows, the largest integer with the same
5575
| sign as `a' is returned.
5576
*----------------------------------------------------------------------------*}
5578
function floatx80_to_int32_round_to_zero(a: floatx80): int32;
5581
aExp, shiftCount: int32;
5582
aSig, savedASig: bits64;
5585
aSig := extractFloatx80Frac( a );
5586
aExp := extractFloatx80Exp( a );
5587
aSign := extractFloatx80Sign( a );
5588
if ( $401E < aExp ) begin
5589
if ( ( aExp = $7FFF ) and (bits64) ( aSig shl 1 ) ) aSign := 0;
5592
else if ( aExp < $3FFF ) begin
5593
if ( aExp or aSig ) softfloat_exception_flags or= float_flag_inexact;
5596
shiftCount := $403E - aExp;
5598
aSig >>= shiftCount;
5600
if ( aSign ) z := - z;
5601
if ( ( z < 0 ) xor aSign ) begin
5603
float_raise( float_flag_invalid );
5604
result := aSign ? (sbits32) $80000000 : $7FFFFFFF;
5606
if ( ( aSig shl shiftCount ) <> savedASig ) begin
5607
softfloat_exception_flags or= float_flag_inexact;
5613
{*----------------------------------------------------------------------------
5614
| Returns the result of converting the extended double-precision floating-
5615
| point value `a' to the 64-bit two's complement integer format. The
5616
| conversion is performed according to the IEC/IEEE Standard for Binary
5617
| Floating-Point Arithmetic---which means in particular that the conversion
5618
| is rounded according to the current rounding mode. If `a' is a NaN,
5619
| the largest positive integer is returned. Otherwise, if the conversion
5620
| overflows, the largest integer with the same sign as `a' is returned.
5621
*----------------------------------------------------------------------------*}
5623
function floatx80_to_int64(a: floatx80): int64;
5626
aExp, shiftCount: int32;
5627
aSig, aSigExtra: bits64;
5630
aSig := extractFloatx80Frac( a );
5631
aExp := extractFloatx80Exp( a );
5632
aSign := extractFloatx80Sign( a );
5633
shiftCount := $403E - aExp;
5634
if ( shiftCount <= 0 ) begin
5635
if ( shiftCount ) begin
5636
float_raise( float_flag_invalid );
5638
or ( ( aExp = $7FFF )
5639
and ( aSig <> LIT64( $8000000000000000 ) ) )
5641
result := LIT64( $7FFFFFFFFFFFFFFF );
5643
result := (sbits64) LIT64( $8000000000000000 );
5648
shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5650
result := roundAndPackInt64( aSign, aSig, aSigExtra );
5654
{*----------------------------------------------------------------------------
5655
| Returns the result of converting the extended double-precision floating-
5656
| point value `a' to the 64-bit two's complement integer format. The
5657
| conversion is performed according to the IEC/IEEE Standard for Binary
5658
| Floating-Point Arithmetic, except that the conversion is always rounded
5659
| toward zero. If `a' is a NaN, the largest positive integer is returned.
5660
| Otherwise, if the conversion overflows, the largest integer with the same
5661
| sign as `a' is returned.
5662
*----------------------------------------------------------------------------*}
5664
function floatx80_to_int64_round_to_zero(a: floatx80): int64;
5667
aExp, shiftCount: int32;
5671
aSig := extractFloatx80Frac( a );
5672
aExp := extractFloatx80Exp( a );
5673
aSign := extractFloatx80Sign( a );
5674
shiftCount := aExp - $403E;
5675
if ( 0 <= shiftCount ) begin
5676
aSig &= LIT64( $7FFFFFFFFFFFFFFF );
5677
if ( ( a.high <> $C03E ) or aSig ) begin
5678
float_raise( float_flag_invalid );
5679
if ( ! aSign or ( ( aExp = $7FFF ) and aSig ) ) begin
5680
result := LIT64( $7FFFFFFFFFFFFFFF );
5683
result := (sbits64) LIT64( $8000000000000000 );
5685
else if ( aExp < $3FFF ) begin
5686
if ( aExp or aSig ) softfloat_exception_flags or= float_flag_inexact;
5689
z := aSig>>( - shiftCount );
5690
if ( (bits64) ( aSig shl ( shiftCount and 63 ) ) ) begin
5691
softfloat_exception_flags or= float_flag_inexact;
5693
if ( aSign ) z := - z;
5698
{*----------------------------------------------------------------------------
5699
| Returns the result of converting the extended double-precision floating-
5700
| point value `a' to the single-precision floating-point format. The
5701
| conversion is performed according to the IEC/IEEE Standard for Binary
5702
| Floating-Point Arithmetic.
5703
*----------------------------------------------------------------------------*}
5705
function floatx80_to_float32(a: floatx80): float32;
5711
aSig := extractFloatx80Frac( a );
5712
aExp := extractFloatx80Exp( a );
5713
aSign := extractFloatx80Sign( a );
5714
if ( aExp = $7FFF ) begin
5715
if ( (bits64) ( aSig shl 1 ) ) begin
5716
result := commonNaNToFloat32( floatx80ToCommonNaN( a ) );
5718
result := packFloat32( aSign, $FF, 0 );
5720
shift64RightJamming( aSig, 33, &aSig );
5721
if ( aExp or aSig ) aExp -= $3F81;
5722
result := roundAndPackFloat32( aSign, aExp, aSig );
5726
{*----------------------------------------------------------------------------
5727
| Returns the result of converting the extended double-precision floating-
5728
| point value `a' to the double-precision floating-point format. The
5729
| conversion is performed according to the IEC/IEEE Standard for Binary
5730
| Floating-Point Arithmetic.
5731
*----------------------------------------------------------------------------*}
5733
function floatx80_to_float64(a: floatx80): float64;
5739
aSig := extractFloatx80Frac( a );
5740
aExp := extractFloatx80Exp( a );
5741
aSign := extractFloatx80Sign( a );
5742
if ( aExp = $7FFF ) begin
5743
if ( (bits64) ( aSig shl 1 ) ) begin
5744
result := commonNaNToFloat64( floatx80ToCommonNaN( a ) );
5746
result := packFloat64( aSign, $7FF, 0 );
5748
shift64RightJamming( aSig, 1, &zSig );
5749
if ( aExp or aSig ) aExp -= $3C01;
5750
result := roundAndPackFloat64( aSign, aExp, zSig );
5754
{$ifdef FPC_SOFTFLOAT_FLOAT128}
5755
{*----------------------------------------------------------------------------
5756
| Returns the result of converting the extended double-precision floating-
5757
| point value `a' to the quadruple-precision floating-point format. The
5758
| conversion is performed according to the IEC/IEEE Standard for Binary
5759
| Floating-Point Arithmetic.
5760
*----------------------------------------------------------------------------*}
5762
function floatx80_to_float128(a: floatx80): float128;
5766
aSig, zSig0, zSig1: bits64;
5768
aSig := extractFloatx80Frac( a );
5769
aExp := extractFloatx80Exp( a );
5770
aSign := extractFloatx80Sign( a );
5771
if ( ( aExp = $7FFF ) and (bits64) ( aSig shl 1 ) ) begin
5772
result := commonNaNToFloat128( floatx80ToCommonNaN( a ) );
5774
shift128Right( aSig shl 1, 0, 16, &zSig0, &zSig1 );
5775
result := packFloat128( aSign, aExp, zSig0, zSig1 );
5779
{$endif FPC_SOFTFLOAT_FLOAT128}
5781
{*----------------------------------------------------------------------------
5782
| Rounds the extended double-precision floating-point value `a' to an integer,
5783
| and Returns the result as an extended quadruple-precision floating-point
5784
| value. The operation is performed according to the IEC/IEEE Standard for
5785
| Binary Floating-Point Arithmetic.
5786
*----------------------------------------------------------------------------*}
5788
function floatx80_round_to_int(a: floatx80): floatx80;
5792
lastBitMask, roundBitsMask: bits64;
5796
aExp := extractFloatx80Exp( a );
5797
if ( $403E <= aExp ) begin
5798
if ( ( aExp = $7FFF ) and (bits64) ( extractFloatx80Frac( a ) shl 1 ) ) begin
5799
result := propagateFloatx80NaN( a, a );
5803
if ( aExp < $3FFF ) begin
5805
and ( (bits64) ( extractFloatx80Frac( a ) shl 1 ) = 0 ) ) begin
5808
softfloat_exception_flags or= float_flag_inexact;
5809
aSign := extractFloatx80Sign( a );
5810
switch ( float_rounding_mode ) begin
5811
case float_round_nearest_even:
5812
if ( ( aExp = $3FFE ) and (bits64) ( extractFloatx80Frac( a ) shl 1 )
5815
packFloatx80( aSign, $3FFF, LIT64( $8000000000000000 ) );
5818
case float_round_down:
5821
packFloatx80( 1, $3FFF, LIT64( $8000000000000000 ) )
5822
: packFloatx80( 0, 0, 0 );
5823
case float_round_up:
5825
aSign ? packFloatx80( 1, 0, 0 )
5826
: packFloatx80( 0, $3FFF, LIT64( $8000000000000000 ) );
5828
result := packFloatx80( aSign, 0, 0 );
5831
lastBitMask shl = $403E - aExp;
5832
roundBitsMask := lastBitMask - 1;
5834
roundingMode := float_rounding_mode;
5835
if ( roundingMode = float_round_nearest_even ) begin
5836
z.low += lastBitMask>>1;
5837
if ( ( z.low and roundBitsMask ) = 0 ) z.low &= ~ lastBitMask;
5839
else if ( roundingMode <> float_round_to_zero ) begin
5840
if ( extractFloatx80Sign( z ) xor ( roundingMode = float_round_up ) ) begin
5841
z.low += roundBitsMask;
5844
z.low &= ~ roundBitsMask;
5845
if ( z.low = 0 ) begin
5847
z.low := LIT64( $8000000000000000 );
5849
if ( z.low <> a.low ) softfloat_exception_flags or= float_flag_inexact;
5854
{*----------------------------------------------------------------------------
5855
| Returns the result of adding the absolute values of the extended double-
5856
| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5857
| negated before being returned. `zSign' is ignored if the result is a NaN.
5858
| The addition is performed according to the IEC/IEEE Standard for Binary
5859
| Floating-Point Arithmetic.
5860
*----------------------------------------------------------------------------*}
5862
function addFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
5864
aExp, bExp, zExp: int32;
5865
aSig, bSig, zSig0, zSig1: bits64;
5868
aSig := extractFloatx80Frac( a );
5869
aExp := extractFloatx80Exp( a );
5870
bSig := extractFloatx80Frac( b );
5871
bExp := extractFloatx80Exp( b );
5872
expDiff := aExp - bExp;
5873
if ( 0 < expDiff ) begin
5874
if ( aExp = $7FFF ) begin
5875
if ( (bits64) ( aSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
5878
if ( bExp = 0 ) --expDiff;
5879
shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5882
else if ( expDiff < 0 ) begin
5883
if ( bExp = $7FFF ) begin
5884
if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
5885
result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
5887
if ( aExp = 0 ) ++expDiff;
5888
shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5892
if ( aExp = $7FFF ) begin
5893
if ( (bits64) ( ( aSig or bSig ) shl 1 ) ) begin
5894
result := propagateFloatx80NaN( a, b );
5899
zSig0 := aSig + bSig;
5900
if ( aExp = 0 ) begin
5901
normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5907
zSig0 := aSig + bSig;
5908
if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
5910
shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5911
zSig0 or= LIT64( $8000000000000000 );
5915
roundAndPackFloatx80(
5916
floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
5920
{*----------------------------------------------------------------------------
5921
| Returns the result of subtracting the absolute values of the extended
5922
| double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5923
| difference is negated before being returned. `zSign' is ignored if the
5924
| result is a NaN. The subtraction is performed according to the IEC/IEEE
5925
| Standard for Binary Floating-Point Arithmetic.
5926
*----------------------------------------------------------------------------*}
5928
function subFloatx80Sigs(a: floatx80; b: floatx80; zSign : flag): floatx80;
5930
aExp, bExp, zExp: int32;
5931
aSig, bSig, zSig0, zSig1: bits64;
5935
aSig := extractFloatx80Frac( a );
5936
aExp := extractFloatx80Exp( a );
5937
bSig := extractFloatx80Frac( b );
5938
bExp := extractFloatx80Exp( b );
5939
expDiff := aExp - bExp;
5940
if ( 0 < expDiff ) goto aExpBigger;
5941
if ( expDiff < 0 ) goto bExpBigger;
5942
if ( aExp = $7FFF ) begin
5943
if ( (bits64) ( ( aSig or bSig ) shl 1 ) ) begin
5944
result := propagateFloatx80NaN( a, b );
5946
float_raise( float_flag_invalid );
5947
z.low := floatx80_default_nan_low;
5948
z.high := floatx80_default_nan_high;
5951
if ( aExp = 0 ) begin
5956
if ( bSig < aSig ) goto aBigger;
5957
if ( aSig < bSig ) goto bBigger;
5958
result := packFloatx80( float_rounding_mode = float_round_down, 0, 0 );
5960
if ( bExp = $7FFF ) begin
5961
if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
5962
result := packFloatx80( zSign xor 1, $7FFF, LIT64( $8000000000000000 ) );
5964
if ( aExp = 0 ) ++expDiff;
5965
shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5967
sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5970
goto normalizeRoundAndPack;
5972
if ( aExp = $7FFF ) begin
5973
if ( (bits64) ( aSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
5976
if ( bExp = 0 ) --expDiff;
5977
shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5979
sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5981
normalizeRoundAndPack:
5983
normalizeRoundAndPackFloatx80(
5984
floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
5988
{*----------------------------------------------------------------------------
5989
| Returns the result of adding the extended double-precision floating-point
5990
| values `a' and `b'. The operation is performed according to the IEC/IEEE
5991
| Standard for Binary Floating-Point Arithmetic.
5992
*----------------------------------------------------------------------------*}
5994
function floatx80_add(a: floatx80; b: floatx80): floatx80;
5998
aSign := extractFloatx80Sign( a );
5999
bSign := extractFloatx80Sign( b );
6000
if ( aSign = bSign ) begin
6001
result := addFloatx80Sigs( a, b, aSign );
6004
result := subFloatx80Sigs( a, b, aSign );
6009
{*----------------------------------------------------------------------------
6010
| Returns the result of subtracting the extended double-precision floating-
6011
| point values `a' and `b'. The operation is performed according to the
6012
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6013
*----------------------------------------------------------------------------*}
6015
function floatx80_sub(a: floatx80; b: floatx80 ): floatx80;
6019
aSign := extractFloatx80Sign( a );
6020
bSign := extractFloatx80Sign( b );
6021
if ( aSign = bSign ) begin
6022
result := subFloatx80Sigs( a, b, aSign );
6025
result := addFloatx80Sigs( a, b, aSign );
6030
{*----------------------------------------------------------------------------
6031
| Returns the result of multiplying the extended double-precision floating-
6032
| point values `a' and `b'. The operation is performed according to the
6033
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6034
*----------------------------------------------------------------------------*}
6036
function floatx80_mul(a: floatx80; b: floatx80): floatx80;
6038
aSign, bSign, zSign: flag;
6039
aExp, bExp, zExp: int32;
6040
aSig, bSig, zSig0, zSig1: bits64;
6043
aSig := extractFloatx80Frac( a );
6044
aExp := extractFloatx80Exp( a );
6045
aSign := extractFloatx80Sign( a );
6046
bSig := extractFloatx80Frac( b );
6047
bExp := extractFloatx80Exp( b );
6048
bSign := extractFloatx80Sign( b );
6049
zSign := aSign xor bSign;
6050
if ( aExp = $7FFF ) begin
6051
if ( (bits64) ( aSig shl 1 )
6052
or ( ( bExp = $7FFF ) and (bits64) ( bSig shl 1 ) ) ) begin
6053
result := propagateFloatx80NaN( a, b );
6055
if ( ( bExp or bSig ) = 0 ) goto invalid;
6056
result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
6058
if ( bExp = $7FFF ) begin
6059
if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
6060
if ( ( aExp or aSig ) = 0 ) begin
6062
float_raise( float_flag_invalid );
6063
z.low := floatx80_default_nan_low;
6064
z.high := floatx80_default_nan_high;
6067
result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
6069
if ( aExp = 0 ) begin
6070
if ( aSig = 0 ) result := packFloatx80( zSign, 0, 0 );
6071
normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6073
if ( bExp = 0 ) begin
6074
if ( bSig = 0 ) result := packFloatx80( zSign, 0, 0 );
6075
normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6077
zExp := aExp + bExp - $3FFE;
6078
mul64To128( aSig, bSig, &zSig0, &zSig1 );
6079
if ( 0 < (sbits64) zSig0 ) begin
6080
shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
6084
roundAndPackFloatx80(
6085
floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
6089
{*----------------------------------------------------------------------------
6090
| Returns the result of dividing the extended double-precision floating-point
6091
| value `a' by the corresponding value `b'. The operation is performed
6092
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6093
*----------------------------------------------------------------------------*}
6095
function floatx80_div(a: floatx80; b: floatx80 ): floatx80;
6097
aSign, bSign, zSign: flag;
6098
aExp, bExp, zExp: int32;
6099
aSig, bSig, zSig0, zSig1: bits64;
6100
rem0, rem1, rem2, term0, term1, term2: bits64;
6103
aSig := extractFloatx80Frac( a );
6104
aExp := extractFloatx80Exp( a );
6105
aSign := extractFloatx80Sign( a );
6106
bSig := extractFloatx80Frac( b );
6107
bExp := extractFloatx80Exp( b );
6108
bSign := extractFloatx80Sign( b );
6109
zSign := aSign xor bSign;
6110
if ( aExp = $7FFF ) begin
6111
if ( (bits64) ( aSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
6112
if ( bExp = $7FFF ) begin
6113
if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
6116
result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
6118
if ( bExp = $7FFF ) begin
6119
if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
6120
result := packFloatx80( zSign, 0, 0 );
6122
if ( bExp = 0 ) begin
6123
if ( bSig = 0 ) begin
6124
if ( ( aExp or aSig ) = 0 ) begin
6126
float_raise( float_flag_invalid );
6127
z.low := floatx80_default_nan_low;
6128
z.high := floatx80_default_nan_high;
6131
float_raise( float_flag_divbyzero );
6132
result := packFloatx80( zSign, $7FFF, LIT64( $8000000000000000 ) );
6134
normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6136
if ( aExp = 0 ) begin
6137
if ( aSig = 0 ) result := packFloatx80( zSign, 0, 0 );
6138
normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6140
zExp := aExp - bExp + $3FFE;
6142
if ( bSig <= aSig ) begin
6143
shift128Right( aSig, 0, 1, &aSig, &rem1 );
6146
zSig0 := estimateDiv128To64( aSig, rem1, bSig );
6147
mul64To128( bSig, zSig0, &term0, &term1 );
6148
sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
6149
while ( (sbits64) rem0 < 0 ) begin
6151
add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
6153
zSig1 := estimateDiv128To64( rem1, 0, bSig );
6154
if ( (bits64) ( zSig1 shl 1 ) <= 8 ) begin
6155
mul64To128( bSig, zSig1, &term1, &term2 );
6156
sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6157
while ( (sbits64) rem1 < 0 ) begin
6159
add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
6161
zSig1 or= ( ( rem1 or rem2 ) <> 0 );
6164
roundAndPackFloatx80(
6165
floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
6169
{*----------------------------------------------------------------------------
6170
| Returns the remainder of the extended double-precision floating-point value
6171
| `a' with respect to the corresponding value `b'. The operation is performed
6172
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6173
*----------------------------------------------------------------------------*}
6175
function floatx80_rem(a: floatx80; b: floatx80 ): floatx80;
6177
aSign, bSign, zSign: flag;
6178
aExp, bExp, expDiff: int32;
6179
aSig0, aSig1, bSig: bits64;
6180
q, term0, term1, alternateASig0, alternateASig1: bits64;
6183
aSig0 := extractFloatx80Frac( a );
6184
aExp := extractFloatx80Exp( a );
6185
aSign := extractFloatx80Sign( a );
6186
bSig := extractFloatx80Frac( b );
6187
bExp := extractFloatx80Exp( b );
6188
bSign := extractFloatx80Sign( b );
6189
if ( aExp = $7FFF ) begin
6190
if ( (bits64) ( aSig0 shl 1 )
6191
or ( ( bExp = $7FFF ) and (bits64) ( bSig shl 1 ) ) ) begin
6192
result := propagateFloatx80NaN( a, b );
6196
if ( bExp = $7FFF ) begin
6197
if ( (bits64) ( bSig shl 1 ) ) result := propagateFloatx80NaN( a, b );
6200
if ( bExp = 0 ) begin
6201
if ( bSig = 0 ) begin
6203
float_raise( float_flag_invalid );
6204
z.low := floatx80_default_nan_low;
6205
z.high := floatx80_default_nan_high;
6208
normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6210
if ( aExp = 0 ) begin
6211
if ( (bits64) ( aSig0 shl 1 ) = 0 ) result := a;
6212
normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6214
bSig or= LIT64( $8000000000000000 );
6216
expDiff := aExp - bExp;
6218
if ( expDiff < 0 ) begin
6219
if ( expDiff < -1 ) result := a;
6220
shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
6223
q := ( bSig <= aSig0 );
6224
if ( q ) aSig0 -= bSig;
6226
while ( 0 < expDiff ) begin
6227
q := estimateDiv128To64( aSig0, aSig1, bSig );
6228
q := ( 2 < q ) ? q - 2 : 0;
6229
mul64To128( bSig, q, &term0, &term1 );
6230
sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6231
shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
6235
if ( 0 < expDiff ) begin
6236
q := estimateDiv128To64( aSig0, aSig1, bSig );
6237
q := ( 2 < q ) ? q - 2 : 0;
6239
mul64To128( bSig, q shl ( 64 - expDiff ), &term0, &term1 );
6240
sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6241
shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
6242
while ( le128( term0, term1, aSig0, aSig1 ) ) begin
6244
sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6251
sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
6252
if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
6253
or ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
6256
aSig0 := alternateASig0;
6257
aSig1 := alternateASig1;
6261
normalizeRoundAndPackFloatx80(
6262
80, zSign, bExp + expDiff, aSig0, aSig1 );
6266
{*----------------------------------------------------------------------------
6267
| Returns the square root of the extended double-precision floating-point
6268
| value `a'. The operation is performed according to the IEC/IEEE Standard
6269
| for Binary Floating-Point Arithmetic.
6270
*----------------------------------------------------------------------------*}
6272
function floatx80_sqrt(a: floatx80): floatx80;
6276
aSig0, aSig1, zSig0, zSig1, doubleZSig0: bits64;
6277
rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
6282
aSig0 := extractFloatx80Frac( a );
6283
aExp := extractFloatx80Exp( a );
6284
aSign := extractFloatx80Sign( a );
6285
if ( aExp = $7FFF ) begin
6286
if ( (bits64) ( aSig0 shl 1 ) ) result := propagateFloatx80NaN( a, a );
6287
if ( ! aSign ) result := a;
6291
if ( ( aExp or aSig0 ) = 0 ) result := a;
6293
float_raise( float_flag_invalid );
6294
z.low := floatx80_default_nan_low;
6295
z.high := floatx80_default_nan_high;
6298
if ( aExp = 0 ) begin
6299
if ( aSig0 = 0 ) result := packFloatx80( 0, 0, 0 );
6300
normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6302
zExp := ( ( aExp - $3FFF )>>1 ) + $3FFF;
6303
zSig0 := estimateSqrt32( aExp, aSig0>>32 );
6304
shift128Right( aSig0, 0, 2 + ( aExp and 1 ), &aSig0, &aSig1 );
6305
zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
6306
doubleZSig0 := zSig0 shl 1;
6307
mul64To128( zSig0, zSig0, &term0, &term1 );
6308
sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6309
while ( (sbits64) rem0 < 0 ) begin
6312
add128( rem0, rem1, zSig0>>63, doubleZSig0 or 1, &rem0, &rem1 );
6314
zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
6315
if ( ( zSig1 and LIT64( $3FFFFFFFFFFFFFFF ) ) <= 5 ) begin
6316
if ( zSig1 = 0 ) zSig1 := 1;
6317
mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6318
sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6319
mul64To128( zSig1, zSig1, &term2, &term3 );
6320
sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6321
while ( (sbits64) rem1 < 0 ) begin
6323
shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6325
term2 or= doubleZSig0;
6326
add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6328
zSig1 or= ( ( rem1 or rem2 or rem3 ) <> 0 );
6330
shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
6331
zSig0 or= doubleZSig0;
6333
roundAndPackFloatx80(
6334
floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
6338
{*----------------------------------------------------------------------------
6339
| Returns 1 if the extended double-precision floating-point value `a' is
6340
| equal to the corresponding value `b', and 0 otherwise. The comparison is
6341
| performed according to the IEC/IEEE Standard for Binary Floating-Point
6343
*----------------------------------------------------------------------------*}
6345
function floatx80_eq(a: floatx80; b: floatx80 ): flag;
6347
if ( ( ( extractFloatx80Exp( a ) = $7FFF )
6348
and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
6349
or ( ( extractFloatx80Exp( b ) = $7FFF )
6350
and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
6352
if ( floatx80_is_signaling_nan( a )
6353
or floatx80_is_signaling_nan( b ) ) begin
6354
float_raise( float_flag_invalid );
6360
and ( ( a.high = b.high )
6362
and ( (bits16) ( ( a.high or b.high ) shl 1 ) = 0 ) )
6367
{*----------------------------------------------------------------------------
6368
| Returns 1 if the extended double-precision floating-point value `a' is
6369
| less than or equal to the corresponding value `b', and 0 otherwise. The
6370
| comparison is performed according to the IEC/IEEE Standard for Binary
6371
| Floating-Point Arithmetic.
6372
*----------------------------------------------------------------------------*}
6374
function floatx80_le(a: floatx80; b: floatx80 ): flag;
6378
if ( ( ( extractFloatx80Exp( a ) = $7FFF )
6379
and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
6380
or ( ( extractFloatx80Exp( b ) = $7FFF )
6381
and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
6383
float_raise( float_flag_invalid );
6386
aSign := extractFloatx80Sign( a );
6387
bSign := extractFloatx80Sign( b );
6388
if ( aSign <> bSign ) begin
6391
or ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
6395
aSign ? le128( b.high, b.low, a.high, a.low )
6396
: le128( a.high, a.low, b.high, b.low );
6400
{*----------------------------------------------------------------------------
6401
| Returns 1 if the extended double-precision floating-point value `a' is
6402
| less than the corresponding value `b', and 0 otherwise. The comparison
6403
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6405
*----------------------------------------------------------------------------*}
6407
function floatx80_lt(a: floatx80; b: floatx80 ): flag;
6411
if ( ( ( extractFloatx80Exp( a ) = $7FFF )
6412
and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
6413
or ( ( extractFloatx80Exp( b ) = $7FFF )
6414
and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
6416
float_raise( float_flag_invalid );
6419
aSign := extractFloatx80Sign( a );
6420
bSign := extractFloatx80Sign( b );
6421
if ( aSign <> bSign ) begin
6424
and ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
6428
aSign ? lt128( b.high, b.low, a.high, a.low )
6429
: lt128( a.high, a.low, b.high, b.low );
6433
{*----------------------------------------------------------------------------
6434
| Returns 1 if the extended double-precision floating-point value `a' is equal
6435
| to the corresponding value `b', and 0 otherwise. The invalid exception is
6436
| raised if either operand is a NaN. Otherwise, the comparison is performed
6437
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6438
*----------------------------------------------------------------------------*}
6440
function floatx80_eq_signaling(a: floatx80; b: floatx80 ): flag;
6442
if ( ( ( extractFloatx80Exp( a ) = $7FFF )
6443
and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
6444
or ( ( extractFloatx80Exp( b ) = $7FFF )
6445
and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
6447
float_raise( float_flag_invalid );
6452
and ( ( a.high = b.high )
6454
and ( (bits16) ( ( a.high or b.high ) shl 1 ) = 0 ) )
6459
{*----------------------------------------------------------------------------
6460
| Returns 1 if the extended double-precision floating-point value `a' is less
6461
| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
6462
| do not cause an exception. Otherwise, the comparison is performed according
6463
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6464
*----------------------------------------------------------------------------*}
6466
function floatx80_le_quiet(a: floatx80; b: floatx80 ): flag;
6470
if ( ( ( extractFloatx80Exp( a ) = $7FFF )
6471
and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
6472
or ( ( extractFloatx80Exp( b ) = $7FFF )
6473
and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
6475
if ( floatx80_is_signaling_nan( a )
6476
or floatx80_is_signaling_nan( b ) ) begin
6477
float_raise( float_flag_invalid );
6481
aSign := extractFloatx80Sign( a );
6482
bSign := extractFloatx80Sign( b );
6483
if ( aSign <> bSign ) begin
6486
or ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
6490
aSign ? le128( b.high, b.low, a.high, a.low )
6491
: le128( a.high, a.low, b.high, b.low );
6495
{*----------------------------------------------------------------------------
6496
| Returns 1 if the extended double-precision floating-point value `a' is less
6497
| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
6498
| an exception. Otherwise, the comparison is performed according to the
6499
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6500
*----------------------------------------------------------------------------*}
6502
function floatx80_lt_quiet(a: floatx80; b: floatx80 ): flag;
6506
if ( ( ( extractFloatx80Exp( a ) = $7FFF )
6507
and (bits64) ( extractFloatx80Frac( a ) shl 1 ) )
6508
or ( ( extractFloatx80Exp( b ) = $7FFF )
6509
and (bits64) ( extractFloatx80Frac( b ) shl 1 ) )
6511
if ( floatx80_is_signaling_nan( a )
6512
or floatx80_is_signaling_nan( b ) ) begin
6513
float_raise( float_flag_invalid );
6517
aSign := extractFloatx80Sign( a );
6518
bSign := extractFloatx80Sign( b );
6519
if ( aSign <> bSign ) begin
6522
and ( ( ( (bits16) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
6526
aSign ? lt128( b.high, b.low, a.high, a.low )
6527
: lt128( a.high, a.low, b.high, b.low );
6531
{$endif FPC_SOFTFLOAT_FLOATX80}
6534
{$ifdef FPC_SOFTFLOAT_FLOAT128}
6536
{*----------------------------------------------------------------------------
6537
| Returns the least-significant 64 fraction bits of the quadruple-precision
6538
| floating-point value `a'.
6539
*----------------------------------------------------------------------------*}
6541
function extractFloat128Frac1(a : float128): bits64;
6546
{*----------------------------------------------------------------------------
6547
| Returns the most-significant 48 fraction bits of the quadruple-precision
6548
| floating-point value `a'.
6549
*----------------------------------------------------------------------------*}
6551
function extractFloat128Frac0(a : float128): bits64;
6553
result:=a.high and int64($0000FFFFFFFFFFFF);
6556
{*----------------------------------------------------------------------------
6557
| Returns the exponent bits of the quadruple-precision floating-point value
6559
*----------------------------------------------------------------------------*}
6561
function extractFloat128Exp(a : float128): int32;
6563
result:=( a.high shr 48 ) and $7FFF;
6566
{*----------------------------------------------------------------------------
6567
| Returns the sign bit of the quadruple-precision floating-point value `a'.
6568
*----------------------------------------------------------------------------*}
6570
function extractFloat128Sign(a : float128): flag;
6572
result:=a.high shr 63;
6575
{*----------------------------------------------------------------------------
6576
| Normalizes the subnormal quadruple-precision floating-point value
6577
| represented by the denormalized significand formed by the concatenation of
6578
| `aSig0' and `aSig1'. The normalized exponent is stored at the location
6579
| pointed to by `zExpPtr'. The most significant 49 bits of the normalized
6580
| significand are stored at the location pointed to by `zSig0Ptr', and the
6581
| least significant 64 bits of the normalized significand are stored at the
6582
| location pointed to by `zSig1Ptr'.
6583
*----------------------------------------------------------------------------*}
6585
procedure normalizeFloat128Subnormal(
6589
var zSig0Ptr: bits64;
6590
var zSig1Ptr: bits64);
6594
if ( aSig0 = 0 ) then
6596
shiftCount := countLeadingZeros64( aSig1 ) - 15;
6597
if ( shiftCount < 0 ) then
6599
zSig0Ptr := aSig1 shr ( - shiftCount );
6600
zSig1Ptr := aSig1 shl ( shiftCount and 63 );
6603
zSig0Ptr := aSig1 shl shiftCount;
6606
zExpPtr := - shiftCount - 63;
6609
shiftCount := countLeadingZeros64( aSig0 ) - 15;
6610
shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
6611
zExpPtr := 1 - shiftCount;
6616
{*----------------------------------------------------------------------------
6617
| Packs the sign `zSign', the exponent `zExp', and the significand formed
6618
| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
6619
| floating-point value, returning the result. After being shifted into the
6620
| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
6621
| added together to form the most significant 32 bits of the result. This
6622
| means that any integer portion of `zSig0' will be added into the exponent.
6623
| Since a properly normalized significand will have an integer portion equal
6624
| to 1, the `zExp' input should be 1 less than the desired result exponent
6625
| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
6627
*----------------------------------------------------------------------------*}
6629
function packFloat128( zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64) : float128;
6634
z.high := ( ( bits64(zSign) ) shl 63 ) + ( ( bits64(zExp) ) shl 48 ) + zSig0;
6638
{*----------------------------------------------------------------------------
6639
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
6640
| and extended significand formed by the concatenation of `zSig0', `zSig1',
6641
| and `zSig2', and returns the proper quadruple-precision floating-point value
6642
| corresponding to the abstract input. Ordinarily, the abstract value is
6643
| simply rounded and packed into the quadruple-precision format, with the
6644
| inexact exception raised if the abstract input cannot be represented
6645
| exactly. However, if the abstract value is too large, the overflow and
6646
| inexact exceptions are raised and an infinity or maximal finite value is
6647
| returned. If the abstract value is too small, the input value is rounded to
6648
| a subnormal number, and the underflow and inexact exceptions are raised if
6649
| the abstract input cannot be represented exactly as a subnormal quadruple-
6650
| precision floating-point number.
6651
| The input significand must be normalized or smaller. If the input
6652
| significand is not normalized, `zExp' must be 0; in that case, the result
6653
| returned is a subnormal number, and it must not require rounding. In the
6654
| usual case that the input significand is normalized, `zExp' must be 1 less
6655
| than the ``true'' floating-point exponent. The handling of underflow and
6656
| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6657
*----------------------------------------------------------------------------*}
6659
function roundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64; zSig2: bits64): float128;
6662
roundNearestEven, increment, isTiny: flag;
6664
roundingMode := float_rounding_mode;
6665
roundNearestEven := ord( roundingMode = float_round_nearest_even );
6666
increment := ord( sbits64(zSig2) < 0 );
6667
if ( roundNearestEven=0 ) then
6669
if ( roundingMode = float_round_to_zero ) then
6674
if ( zSign<>0 ) then
6676
increment := ord( roundingMode = float_round_down ) and zSig2;
6679
increment := ord( roundingMode = float_round_up ) and zSig2;
6683
if ( $7FFD <= bits32(zExp) ) then
6685
if ( ord( $7FFD < zExp )
6686
or ( ord( zExp = $7FFD )
6688
int64( $0001FFFFFFFFFFFF ),
6689
int64( $FFFFFFFFFFFFFFFF ),
6697
float_raise( float_flag_overflow or float_flag_inexact );
6698
if ( ord( roundingMode = float_round_to_zero )
6699
or ( zSign and ord( roundingMode = float_round_up ) )
6700
or ( not(zSign) and ord( roundingMode = float_round_down ) )
6707
int64( $0000FFFFFFFFFFFF ),
6708
int64( $FFFFFFFFFFFFFFFF )
6711
result:=packFloat128( zSign, $7FFF, 0, 0 );
6713
if ( zExp < 0 ) then
6716
ord(( float_detect_tininess = float_tininess_before_rounding )
6718
or not( increment<>0 )
6722
int64( $0001FFFFFFFFFFFF ),
6723
int64( $FFFFFFFFFFFFFFFF )
6725
shift128ExtraRightJamming(
6726
zSig0, zSig1, zSig2, - zExp, zSig0, zSig1, zSig2 );
6728
if ( isTiny and zSig2 )<>0 then
6729
float_raise( float_flag_underflow );
6730
if ( roundNearestEven<>0 ) then
6732
increment := ord( sbits64(zSig2) < 0 );
6735
if ( zSign<>0 ) then
6737
increment := ord( roundingMode = float_round_down ) and zSig2;
6740
increment := ord( roundingMode = float_round_up ) and zSig2;
6745
if ( zSig2<>0 ) then
6746
softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
6747
if ( increment<>0 ) then
6749
add128( zSig0, zSig1, 0, 1, zSig0, zSig1 );
6750
zSig1 := zSig1 and not( ord( zSig2 + zSig2 = 0 ) and roundNearestEven );
6753
if ( ( zSig0 or zSig1 ) = 0 ) then
6756
result:=packFloat128( zSign, zExp, zSig0, zSig1 );
6759
{*----------------------------------------------------------------------------
6760
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
6761
| and significand formed by the concatenation of `zSig0' and `zSig1', and
6762
| returns the proper quadruple-precision floating-point value corresponding
6763
| to the abstract input. This routine is just like `roundAndPackFloat128'
6764
| except that the input significand has fewer bits and does not have to be
6765
| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
6767
*----------------------------------------------------------------------------*}
6769
function normalizeRoundAndPackFloat128(zSign: flag; zExp: int32; zSig0: bits64; zSig1: bits64): float128;
6774
if ( zSig0 = 0 ) then
6780
shiftCount := countLeadingZeros64( zSig0 ) - 15;
6781
if ( 0 <= shiftCount ) then
6784
shortShift128Left( zSig0, zSig1, shiftCount, zSig0, zSig1 );
6787
shift128ExtraRightJamming(
6788
zSig0, zSig1, 0, - shiftCount, zSig0, zSig1, zSig2 );
6790
dec(zExp, shiftCount);
6791
result:=roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
6795
{*----------------------------------------------------------------------------
6796
| Returns the result of converting the quadruple-precision floating-point
6797
| value `a' to the 32-bit two's complement integer format. The conversion
6798
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6799
| Arithmetic---which means in particular that the conversion is rounded
6800
| according to the current rounding mode. If `a' is a NaN, the largest
6801
| positive integer is returned. Otherwise, if the conversion overflows, the
6802
| largest integer with the same sign as `a' is returned.
6803
*----------------------------------------------------------------------------*}
6805
function float128_to_int32(a: float128): int32;
6808
aExp, shiftCount: int32;
6809
aSig0, aSig1: bits64;
6811
aSig1 := extractFloat128Frac1( a );
6812
aSig0 := extractFloat128Frac0( a );
6813
aExp := extractFloat128Exp( a );
6814
aSign := extractFloat128Sign( a );
6815
if ( ord( aExp = $7FFF ) and ( aSig0 or aSig1 ) )<>0 then
6818
aSig0 := aSig0 or int64( $0001000000000000 );
6819
aSig0 := aSig0 or ord( aSig1 <> 0 );
6820
shiftCount := $4028 - aExp;
6821
if ( 0 < shiftCount ) then
6822
shift64RightJamming( aSig0, shiftCount, aSig0 );
6823
result := roundAndPackInt32( aSign, aSig0 );
6827
{*----------------------------------------------------------------------------
6828
| Returns the result of converting the quadruple-precision floating-point
6829
| value `a' to the 32-bit two's complement integer format. The conversion
6830
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6831
| Arithmetic, except that the conversion is always rounded toward zero. If
6832
| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
6833
| conversion overflows, the largest integer with the same sign as `a' is
6835
*----------------------------------------------------------------------------*}
6837
function float128_to_int32_round_to_zero(a: float128): int32;
6840
aExp, shiftCount: int32;
6841
aSig0, aSig1, savedASig: bits64;
6846
aSig1 := extractFloat128Frac1( a );
6847
aSig0 := extractFloat128Frac0( a );
6848
aExp := extractFloat128Exp( a );
6849
aSign := extractFloat128Sign( a );
6850
aSig0 := aSig0 or ord( aSig1 <> 0 );
6851
if ( $401E < aExp ) then
6853
if ( ord( aExp = $7FFF ) and aSig0 )<>0 then
6857
else if ( aExp < $3FFF ) then
6859
if ( aExp or aSig0 )<>0 then
6860
softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
6864
aSig0 := aSig0 or int64( $0001000000000000 );
6865
shiftCount := $402F - aExp;
6867
aSig0 := aSig0 shr shiftCount;
6869
if ( aSign )<>0 then
6871
if ( ord( z < 0 ) xor aSign )<>0 then
6874
float_raise( float_flag_invalid );
6881
if ( ( aSig0 shl shiftCount ) <> savedASig ) then
6883
softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
6888
{*----------------------------------------------------------------------------
6889
| Returns the result of converting the quadruple-precision floating-point
6890
| value `a' to the 64-bit two's complement integer format. The conversion
6891
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6892
| Arithmetic---which means in particular that the conversion is rounded
6893
| according to the current rounding mode. If `a' is a NaN, the largest
6894
| positive integer is returned. Otherwise, if the conversion overflows, the
6895
| largest integer with the same sign as `a' is returned.
6896
*----------------------------------------------------------------------------*}
6898
function float128_to_int64(a: float128): int64;
6901
aExp, shiftCount: int32;
6902
aSig0, aSig1: bits64;
6904
aSig1 := extractFloat128Frac1( a );
6905
aSig0 := extractFloat128Frac0( a );
6906
aExp := extractFloat128Exp( a );
6907
aSign := extractFloat128Sign( a );
6909
aSig0 := aSig0 or int64( $0001000000000000 );
6910
shiftCount := $402F - aExp;
6911
if ( shiftCount <= 0 ) then
6913
if ( $403E < aExp ) then
6915
float_raise( float_flag_invalid );
6917
or ( ( aExp = $7FFF )
6918
and ( (aSig1<>0) or ( aSig0 <> int64( $0001000000000000 ) ) )
6922
result := int64( $7FFFFFFFFFFFFFFF );
6924
result := int64( $8000000000000000 );
6926
shortShift128Left( aSig0, aSig1, - shiftCount, aSig0, aSig1 );
6929
shift64ExtraRightJamming( aSig0, aSig1, shiftCount, aSig0, aSig1 );
6931
result := roundAndPackInt64( aSign, aSig0, aSig1 );
6935
{*----------------------------------------------------------------------------
6936
| Returns the result of converting the quadruple-precision floating-point
6937
| value `a' to the 64-bit two's complement integer format. The conversion
6938
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6939
| Arithmetic, except that the conversion is always rounded toward zero.
6940
| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
6941
| the conversion overflows, the largest integer with the same sign as `a' is
6943
*----------------------------------------------------------------------------*}
6945
function float128_to_int64_round_to_zero(a: float128): int64;
6948
aExp, shiftCount: int32;
6949
aSig0, aSig1: bits64;
6952
aSig1 := extractFloat128Frac1( a );
6953
aSig0 := extractFloat128Frac0( a );
6954
aExp := extractFloat128Exp( a );
6955
aSign := extractFloat128Sign( a );
6957
aSig0 := aSig0 or int64( $0001000000000000 );
6958
shiftCount := aExp - $402F;
6959
if ( 0 < shiftCount ) then
6961
if ( $403E <= aExp ) then
6963
aSig0 := aSig0 and int64( $0000FFFFFFFFFFFF );
6964
if ( ( a.high = int64( $C03E000000000000 ) )
6965
and ( aSig1 < int64( $0002000000000000 ) ) ) then
6967
if ( aSig1<>0 ) then
6968
softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
6971
float_raise( float_flag_invalid );
6972
if ( (aSign=0) or ( ( aExp = $7FFF ) and (( aSig0 or aSig1 )<>0) ) ) then
6974
result := int64( $7FFFFFFFFFFFFFFF );
6978
result := int64( $8000000000000000 );
6981
z := ( aSig0 shl shiftCount ) or ( aSig1>>( ( - shiftCount ) and 63 ) );
6982
if ( int64( aSig1 shl shiftCount )<>0 ) then
6984
softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
6988
if ( aExp < $3FFF ) then
6990
if ( aExp or aSig0 or aSig1 )<>0 then
6992
softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
6997
z := aSig0 shr ( - shiftCount );
6999
or ( (shiftCount<>0) and (int64( aSig0 shl ( shiftCount and 63 ) )<>0) ) ) then
7001
softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
7004
if ( aSign<>0 ) then
7010
{*----------------------------------------------------------------------------
7011
| Returns the result of converting the quadruple-precision floating-point
7012
| value `a' to the single-precision floating-point format. The conversion
7013
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
7015
*----------------------------------------------------------------------------*}
7017
function float128_to_float32(a: float128): float32;
7021
aSig0, aSig1: bits64;
7024
aSig1 := extractFloat128Frac1( a );
7025
aSig0 := extractFloat128Frac0( a );
7026
aExp := extractFloat128Exp( a );
7027
aSign := extractFloat128Sign( a );
7028
if ( aExp = $7FFF ) then
7030
if ( aSig0 or aSig1 )<>0 then
7032
result := commonNaNToFloat32( float128ToCommonNaN( a ) );
7035
result := packFloat32( aSign, $FF, 0 );
7038
aSig0 := aSig0 or ord( aSig1 <> 0 );
7039
shift64RightJamming( aSig0, 18, aSig0 );
7041
if ( aExp or zSig )<>0 then
7043
zSig := zSig or $40000000;
7046
result := roundAndPackFloat32( aSign, aExp, zSig );
7050
{*----------------------------------------------------------------------------
7051
| Returns the result of converting the quadruple-precision floating-point
7052
| value `a' to the double-precision floating-point format. The conversion
7053
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
7055
*----------------------------------------------------------------------------*}
7057
function float128_to_float64(a: float128): float64;
7061
aSig0, aSig1: bits64;
7063
aSig1 := extractFloat128Frac1( a );
7064
aSig0 := extractFloat128Frac0( a );
7065
aExp := extractFloat128Exp( a );
7066
aSign := extractFloat128Sign( a );
7067
if ( aExp = $7FFF ) then
7069
if ( aSig0 or aSig1 )<>0 then
7071
commonNaNToFloat64( float128ToCommonNaN( a ),result);
7074
result:=packFloat64( aSign, $7FF, 0);
7077
shortShift128Left( aSig0, aSig1, 14, aSig0, aSig1 );
7078
aSig0 := aSig0 or ord( aSig1 <> 0 );
7079
if ( aExp or aSig0 )<>0 then
7081
aSig0 := aSig0 or int64( $4000000000000000 );
7084
result := roundAndPackFloat64( aSign, aExp, aSig0 );
7087
{$ifdef FPC_SOFTFLOAT_FLOATX80}
7089
{*----------------------------------------------------------------------------
7090
| Returns the result of converting the quadruple-precision floating-point
7091
| value `a' to the extended double-precision floating-point format. The
7092
| conversion is performed according to the IEC/IEEE Standard for Binary
7093
| Floating-Point Arithmetic.
7094
*----------------------------------------------------------------------------*}
7096
function float128_to_floatx80(a: float128): floatx80;
7100
aSig0, aSig1: bits64;
7102
aSig1 := extractFloat128Frac1( a );
7103
aSig0 := extractFloat128Frac0( a );
7104
aExp := extractFloat128Exp( a );
7105
aSign := extractFloat128Sign( a );
7106
if ( aExp = $7FFF ) begin
7107
if ( aSig0 or aSig1 ) begin
7108
result := commonNaNToFloatx80( float128ToCommonNaN( a ) );
7110
result := packFloatx80( aSign, $7FFF, int64( $8000000000000000 ) );
7112
if ( aExp = 0 ) begin
7113
if ( ( aSig0 or aSig1 ) = 0 ) result := packFloatx80( aSign, 0, 0 );
7114
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7117
aSig0 or= int64( $0001000000000000 );
7119
shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
7120
result := roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
7124
{$endif FPC_SOFTFLOAT_FLOATX80}
7126
{*----------------------------------------------------------------------------
7127
| Rounds the quadruple-precision floating-point value `a' to an integer, and
7128
| Returns the result as a quadruple-precision floating-point value. The
7129
| operation is performed according to the IEC/IEEE Standard for Binary
7130
| Floating-Point Arithmetic.
7131
*----------------------------------------------------------------------------*}
7133
function float128_round_to_int(a: float128): float128;
7137
lastBitMask, roundBitsMask: bits64;
7141
aExp := extractFloat128Exp( a );
7142
if ( $402F <= aExp ) then
7144
if ( $406F <= aExp ) then
7146
if ( ( aExp = $7FFF )
7147
and (( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) )<>0)
7150
result := propagateFloat128NaN( a, a );
7157
lastBitMask := ( lastBitMask shl ( $406E - aExp ) ) shl 1;
7158
roundBitsMask := lastBitMask - 1;
7160
roundingMode := float_rounding_mode;
7161
if ( roundingMode = float_round_nearest_even ) then
7163
if ( lastBitMask )<>0 then
7165
add128( z.high, z.low, 0, lastBitMask shr 1, z.high, z.low );
7166
if ( ( z.low and roundBitsMask ) = 0 ) then
7167
z.low := z.low and not(lastBitMask);
7170
if ( sbits64(z.low) < 0 ) then
7173
if ( bits64( z.low shl 1 ) = 0 ) then
7174
z.high := z.high and not(1);
7178
else if ( roundingMode <> float_round_to_zero ) then
7180
if ( extractFloat128Sign( z )
7181
xor ord( roundingMode = float_round_up ) )<>0 then
7183
add128( z.high, z.low, 0, roundBitsMask, z.high, z.low );
7186
z.low := z.low and not(roundBitsMask);
7189
if ( aExp < $3FFF ) then
7191
if ( ( ( bits64( a.high shl 1 ) ) or a.low ) = 0 ) then
7196
softfloat_exception_flags := softfloat_exception_flags or float_flag_inexact;
7197
aSign := extractFloat128Sign( a );
7198
case float_rounding_mode of
7199
float_round_nearest_even:
7200
if ( ( aExp = $3FFE )
7201
and ( extractFloat128Frac0( a )
7202
or extractFloat128Frac1( a ) )
7205
result := packFloat128( aSign, $3FFF, 0, 0 );
7212
aSign ? packFloat128( 1, $3FFF, 0, 0 )
7213
: packFloat128( 0, 0, 0, 0 );
7218
aSign ? packFloat128( 1, 0, 0, 0 )
7219
: packFloat128( 0, $3FFF, 0, 0 );
7223
result := packFloat128( aSign, 0, 0, 0 );
7227
lastBitMask shl = $402F - aExp;
7228
roundBitsMask := lastBitMask - 1;
7231
roundingMode := float_rounding_mode;
7232
if ( roundingMode = float_round_nearest_even ) begin
7233
z.high += lastBitMask>>1;
7234
if ( ( ( z.high and roundBitsMask ) or a.low ) = 0 ) begin
7235
z.high &= ~ lastBitMask;
7238
else if ( roundingMode <> float_round_to_zero ) begin
7239
if ( extractFloat128Sign( z )
7240
xor ( roundingMode = float_round_up ) ) begin
7241
z.high or= ( a.low <> 0 );
7242
z.high += roundBitsMask;
7245
z.high &= ~ roundBitsMask;
7247
if ( ( z.low <> a.low ) or ( z.high <> a.high ) ) begin
7248
softfloat_exception_flags or= float_flag_inexact;
7254
{*----------------------------------------------------------------------------
7255
| Returns the result of adding the absolute values of the quadruple-precision
7256
| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
7257
| before being returned. `zSign' is ignored if the result is a NaN.
7258
| The addition is performed according to the IEC/IEEE Standard for Binary
7259
| Floating-Point Arithmetic.
7260
*----------------------------------------------------------------------------*}
7262
function addFloat128Sigs( float128 a, float128 b, flag zSign ): float128;
7264
aExp, bExp, zExp: int32;
7265
aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
7268
aSig1 := extractFloat128Frac1( a );
7269
aSig0 := extractFloat128Frac0( a );
7270
aExp := extractFloat128Exp( a );
7271
bSig1 := extractFloat128Frac1( b );
7272
bSig0 := extractFloat128Frac0( b );
7273
bExp := extractFloat128Exp( b );
7274
expDiff := aExp - bExp;
7275
if ( 0 < expDiff ) begin
7276
if ( aExp = $7FFF ) begin
7277
if ( aSig0 or aSig1 ) result := propagateFloat128NaN( a, b );
7280
if ( bExp = 0 ) begin
7284
bSig0 or= int64( $0001000000000000 );
7286
shift128ExtraRightJamming(
7287
bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
7290
else if ( expDiff < 0 ) begin
7291
if ( bExp = $7FFF ) begin
7292
if ( bSig0 or bSig1 ) result := propagateFloat128NaN( a, b );
7293
result := packFloat128( zSign, $7FFF, 0, 0 );
7295
if ( aExp = 0 ) begin
7299
aSig0 or= int64( $0001000000000000 );
7301
shift128ExtraRightJamming(
7302
aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
7306
if ( aExp = $7FFF ) begin
7307
if ( aSig0 or aSig1 or bSig0 or bSig1 ) begin
7308
result := propagateFloat128NaN( a, b );
7312
add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7313
if ( aExp = 0 ) result := packFloat128( zSign, 0, zSig0, zSig1 );
7315
zSig0 or= int64( $0002000000000000 );
7319
aSig0 or= int64( $0001000000000000 );
7320
add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7322
if ( zSig0 < int64( $0002000000000000 ) ) goto roundAndPack;
7325
shift128ExtraRightJamming(
7326
zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7328
result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
7332
{*----------------------------------------------------------------------------
7333
| Returns the result of subtracting the absolute values of the quadruple-
7334
| precision floating-point values `a' and `b'. If `zSign' is 1, the
7335
| difference is negated before being returned. `zSign' is ignored if the
7336
| result is a NaN. The subtraction is performed according to the IEC/IEEE
7337
| Standard for Binary Floating-Point Arithmetic.
7338
*----------------------------------------------------------------------------*}
7340
function subFloat128Sigs( float128 a, float128 b, flag zSign ): float128;
7342
aExp, bExp, zExp: int32;
7343
aSig0, aSig1, bSig0, bSig1, zSig0, zSig1: bits64;
7347
aSig1 := extractFloat128Frac1( a );
7348
aSig0 := extractFloat128Frac0( a );
7349
aExp := extractFloat128Exp( a );
7350
bSig1 := extractFloat128Frac1( b );
7351
bSig0 := extractFloat128Frac0( b );
7352
bExp := extractFloat128Exp( b );
7353
expDiff := aExp - bExp;
7354
shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
7355
shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
7356
if ( 0 < expDiff ) goto aExpBigger;
7357
if ( expDiff < 0 ) goto bExpBigger;
7358
if ( aExp = $7FFF ) begin
7359
if ( aSig0 or aSig1 or bSig0 or bSig1 ) begin
7360
result := propagateFloat128NaN( a, b );
7362
float_raise( float_flag_invalid );
7363
z.low := float128_default_nan_low;
7364
z.high := float128_default_nan_high;
7367
if ( aExp = 0 ) begin
7371
if ( bSig0 < aSig0 ) goto aBigger;
7372
if ( aSig0 < bSig0 ) goto bBigger;
7373
if ( bSig1 < aSig1 ) goto aBigger;
7374
if ( aSig1 < bSig1 ) goto bBigger;
7375
result := packFloat128( float_rounding_mode = float_round_down, 0, 0, 0 );
7377
if ( bExp = $7FFF ) begin
7378
if ( bSig0 or bSig1 ) result := propagateFloat128NaN( a, b );
7379
result := packFloat128( zSign xor 1, $7FFF, 0, 0 );
7381
if ( aExp = 0 ) begin
7385
aSig0 or= int64( $4000000000000000 );
7387
shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7388
bSig0 or= int64( $4000000000000000 );
7390
sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
7393
goto normalizeRoundAndPack;
7395
if ( aExp = $7FFF ) begin
7396
if ( aSig0 or aSig1 ) result := propagateFloat128NaN( a, b );
7399
if ( bExp = 0 ) begin
7403
bSig0 or= int64( $4000000000000000 );
7405
shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
7406
aSig0 or= int64( $4000000000000000 );
7408
sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7410
normalizeRoundAndPack:
7412
result := normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
7416
{*----------------------------------------------------------------------------
7417
| Returns the result of adding the quadruple-precision floating-point values
7418
| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
7419
| for Binary Floating-Point Arithmetic.
7420
*----------------------------------------------------------------------------*}
7422
function float128_add(a: float128; b: float128): float128;
7426
aSign := extractFloat128Sign( a );
7427
bSign := extractFloat128Sign( b );
7428
if ( aSign = bSign ) begin
7429
result := addFloat128Sigs( a, b, aSign );
7432
result := subFloat128Sigs( a, b, aSign );
7437
{*----------------------------------------------------------------------------
7438
| Returns the result of subtracting the quadruple-precision floating-point
7439
| values `a' and `b'. The operation is performed according to the IEC/IEEE
7440
| Standard for Binary Floating-Point Arithmetic.
7441
*----------------------------------------------------------------------------*}
7443
function float128_sub(a: float128; b: float128): float128;
7447
aSign := extractFloat128Sign( a );
7448
bSign := extractFloat128Sign( b );
7449
if ( aSign = bSign ) begin
7450
result := subFloat128Sigs( a, b, aSign );
7453
result := addFloat128Sigs( a, b, aSign );
7458
{*----------------------------------------------------------------------------
7459
| Returns the result of multiplying the quadruple-precision floating-point
7460
| values `a' and `b'. The operation is performed according to the IEC/IEEE
7461
| Standard for Binary Floating-Point Arithmetic.
7462
*----------------------------------------------------------------------------*}
7464
function float128_mul(a: float128; b: float128): float128;
7466
aSign, bSign, zSign: flag;
7467
aExp, bExp, zExp: int32;
7468
aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3: bits64;
7471
aSig1 := extractFloat128Frac1( a );
7472
aSig0 := extractFloat128Frac0( a );
7473
aExp := extractFloat128Exp( a );
7474
aSign := extractFloat128Sign( a );
7475
bSig1 := extractFloat128Frac1( b );
7476
bSig0 := extractFloat128Frac0( b );
7477
bExp := extractFloat128Exp( b );
7478
bSign := extractFloat128Sign( b );
7479
zSign := aSign xor bSign;
7480
if ( aExp = $7FFF ) begin
7481
if ( ( aSig0 or aSig1 )
7482
or ( ( bExp = $7FFF ) and ( bSig0 or bSig1 ) ) ) begin
7483
result := propagateFloat128NaN( a, b );
7485
if ( ( bExp or bSig0 or bSig1 ) = 0 ) goto invalid;
7486
result := packFloat128( zSign, $7FFF, 0, 0 );
7488
if ( bExp = $7FFF ) begin
7489
if ( bSig0 or bSig1 ) result := propagateFloat128NaN( a, b );
7490
if ( ( aExp or aSig0 or aSig1 ) = 0 ) begin
7492
float_raise( float_flag_invalid );
7493
z.low := float128_default_nan_low;
7494
z.high := float128_default_nan_high;
7497
result := packFloat128( zSign, $7FFF, 0, 0 );
7499
if ( aExp = 0 ) begin
7500
if ( ( aSig0 or aSig1 ) = 0 ) result := packFloat128( zSign, 0, 0, 0 );
7501
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7503
if ( bExp = 0 ) begin
7504
if ( ( bSig0 or bSig1 ) = 0 ) result := packFloat128( zSign, 0, 0, 0 );
7505
normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7507
zExp := aExp + bExp - $4000;
7508
aSig0 or= int64( $0001000000000000 );
7509
shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
7510
mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
7511
add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
7512
zSig2 or= ( zSig3 <> 0 );
7513
if ( int64( $0002000000000000 ) <= zSig0 ) begin
7514
shift128ExtraRightJamming(
7515
zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7518
result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
7522
{*----------------------------------------------------------------------------
7523
| Returns the result of dividing the quadruple-precision floating-point value
7524
| `a' by the corresponding value `b'. The operation is performed according to
7525
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7526
*----------------------------------------------------------------------------*}
7528
function float128_div(a: float128; b: float128): float128;
7530
aSign, bSign, zSign: flag;
7531
aExp, bExp, zExp: int32;
7532
aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2: bits64;
7533
rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
7536
aSig1 := extractFloat128Frac1( a );
7537
aSig0 := extractFloat128Frac0( a );
7538
aExp := extractFloat128Exp( a );
7539
aSign := extractFloat128Sign( a );
7540
bSig1 := extractFloat128Frac1( b );
7541
bSig0 := extractFloat128Frac0( b );
7542
bExp := extractFloat128Exp( b );
7543
bSign := extractFloat128Sign( b );
7544
zSign := aSign xor bSign;
7545
if ( aExp = $7FFF ) begin
7546
if ( aSig0 or aSig1 ) result := propagateFloat128NaN( a, b );
7547
if ( bExp = $7FFF ) begin
7548
if ( bSig0 or bSig1 ) result := propagateFloat128NaN( a, b );
7551
result := packFloat128( zSign, $7FFF, 0, 0 );
7553
if ( bExp = $7FFF ) begin
7554
if ( bSig0 or bSig1 ) result := propagateFloat128NaN( a, b );
7555
result := packFloat128( zSign, 0, 0, 0 );
7557
if ( bExp = 0 ) begin
7558
if ( ( bSig0 or bSig1 ) = 0 ) begin
7559
if ( ( aExp or aSig0 or aSig1 ) = 0 ) begin
7561
float_raise( float_flag_invalid );
7562
z.low := float128_default_nan_low;
7563
z.high := float128_default_nan_high;
7566
float_raise( float_flag_divbyzero );
7567
result := packFloat128( zSign, $7FFF, 0, 0 );
7569
normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7571
if ( aExp = 0 ) begin
7572
if ( ( aSig0 or aSig1 ) = 0 ) result := packFloat128( zSign, 0, 0, 0 );
7573
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7575
zExp := aExp - bExp + $3FFD;
7577
aSig0 or int64( $0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
7579
bSig0 or int64( $0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
7580
if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) begin
7581
shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
7584
zSig0 := estimateDiv128To64( aSig0, aSig1, bSig0 );
7585
mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
7586
sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
7587
while ( (sbits64) rem0 < 0 ) begin
7589
add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
7591
zSig1 := estimateDiv128To64( rem1, rem2, bSig0 );
7592
if ( ( zSig1 and $3FFF ) <= 4 ) begin
7593
mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
7594
sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
7595
while ( (sbits64) rem1 < 0 ) begin
7597
add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
7599
zSig1 or= ( ( rem1 or rem2 or rem3 ) <> 0 );
7601
shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
7602
result := roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
7606
{*----------------------------------------------------------------------------
7607
| Returns the remainder of the quadruple-precision floating-point value `a'
7608
| with respect to the corresponding value `b'. The operation is performed
7609
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7610
*----------------------------------------------------------------------------*}
7612
function float128_rem(a: float128; b: float128): float128;
7614
aSign, bSign, zSign: flag;
7615
aExp, bExp, expDiff: int32;
7616
aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2: bits64;
7617
allZero, alternateASig0, alternateASig1, sigMean1: bits64;
7621
aSig1 := extractFloat128Frac1( a );
7622
aSig0 := extractFloat128Frac0( a );
7623
aExp := extractFloat128Exp( a );
7624
aSign := extractFloat128Sign( a );
7625
bSig1 := extractFloat128Frac1( b );
7626
bSig0 := extractFloat128Frac0( b );
7627
bExp := extractFloat128Exp( b );
7628
bSign := extractFloat128Sign( b );
7629
if ( aExp = $7FFF ) begin
7630
if ( ( aSig0 or aSig1 )
7631
or ( ( bExp = $7FFF ) and ( bSig0 or bSig1 ) ) ) begin
7632
result := propagateFloat128NaN( a, b );
7636
if ( bExp = $7FFF ) begin
7637
if ( bSig0 or bSig1 ) result := propagateFloat128NaN( a, b );
7640
if ( bExp = 0 ) begin
7641
if ( ( bSig0 or bSig1 ) = 0 ) begin
7643
float_raise( float_flag_invalid );
7644
z.low := float128_default_nan_low;
7645
z.high := float128_default_nan_high;
7648
normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7650
if ( aExp = 0 ) begin
7651
if ( ( aSig0 or aSig1 ) = 0 ) result := a;
7652
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7654
expDiff := aExp - bExp;
7655
if ( expDiff < -1 ) result := a;
7657
aSig0 or int64( $0001000000000000 ),
7659
15 - ( expDiff < 0 ),
7664
bSig0 or int64( $0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
7665
q := le128( bSig0, bSig1, aSig0, aSig1 );
7666
if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7668
while ( 0 < expDiff ) begin
7669
q := estimateDiv128To64( aSig0, aSig1, bSig0 );
7670
q := ( 4 < q ) ? q - 4 : 0;
7671
mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7672
shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
7673
shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
7674
sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
7677
if ( -64 < expDiff ) begin
7678
q := estimateDiv128To64( aSig0, aSig1, bSig0 );
7679
q := ( 4 < q ) ? q - 4 : 0;
7681
shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7683
if ( expDiff < 0 ) begin
7684
shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7687
shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
7689
mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7690
sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
7693
shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
7694
shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7697
alternateASig0 := aSig0;
7698
alternateASig1 := aSig1;
7700
sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7701
end; while ( 0 <= (sbits64) aSig0 );
7703
aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
7704
if ( ( sigMean0 < 0 )
7705
or ( ( ( sigMean0 or sigMean1 ) = 0 ) and ( q and 1 ) ) ) begin
7706
aSig0 := alternateASig0;
7707
aSig1 := alternateASig1;
7709
zSign := ( (sbits64) aSig0 < 0 );
7710
if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
7712
normalizeRoundAndPackFloat128( aSign xor zSign, bExp - 4, aSig0, aSig1 );
7716
{*----------------------------------------------------------------------------
7717
| Returns the square root of the quadruple-precision floating-point value `a'.
7718
| The operation is performed according to the IEC/IEEE Standard for Binary
7719
| Floating-Point Arithmetic.
7720
*----------------------------------------------------------------------------*}
7722
function float128_sqrt(a: float128): float128;
7726
aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0: bits64;
7727
rem0, rem1, rem2, rem3, term0, term1, term2, term3: bits64;
7732
aSig1 := extractFloat128Frac1( a );
7733
aSig0 := extractFloat128Frac0( a );
7734
aExp := extractFloat128Exp( a );
7735
aSign := extractFloat128Sign( a );
7736
if ( aExp = $7FFF ) begin
7737
if ( aSig0 or aSig1 ) result := propagateFloat128NaN( a, a );
7738
if ( ! aSign ) result := a;
7742
if ( ( aExp or aSig0 or aSig1 ) = 0 ) result := a;
7744
float_raise( float_flag_invalid );
7745
z.low := float128_default_nan_low;
7746
z.high := float128_default_nan_high;
7749
if ( aExp = 0 ) begin
7750
if ( ( aSig0 or aSig1 ) = 0 ) result := packFloat128( 0, 0, 0, 0 );
7751
normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7753
zExp := ( ( aExp - $3FFF )>>1 ) + $3FFE;
7754
aSig0 := aSig0 or int64( $0001000000000000 );
7755
zSig0 := estimateSqrt32( aExp, aSig0>>17 );
7756
shortShift128Left( aSig0, aSig1, 13 - ( aExp and 1 ), &aSig0, &aSig1 );
7757
zSig0 := estimateDiv128To64( aSig0, aSig1, zSig0 shl 32 ) + ( zSig0 shl 30 );
7758
doubleZSig0 := zSig0 shl 1;
7759
mul64To128( zSig0, zSig0, &term0, &term1 );
7760
sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
7761
while ( (sbits64) rem0 < 0 ) begin
7764
add128( rem0, rem1, zSig0>>63, doubleZSig0 or 1, &rem0, &rem1 );
7766
zSig1 := estimateDiv128To64( rem1, 0, doubleZSig0 );
7767
if ( ( zSig1 and $1FFF ) <= 5 ) begin
7768
if ( zSig1 = 0 ) zSig1 := 1;
7769
mul64To128( doubleZSig0, zSig1, &term1, &term2 );
7770
sub128( rem1, 0, term1, term2, &rem1, &rem2 );
7771
mul64To128( zSig1, zSig1, &term2, &term3 );
7772
sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
7773
while ( (sbits64) rem1 < 0 ) begin
7775
shortShift128Left( 0, zSig1, 1, &term2, &term3 );
7777
term2 or= doubleZSig0;
7778
add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
7780
zSig1 or= ( ( rem1 or rem2 or rem3 ) <> 0 );
7782
shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
7783
result := roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
7787
{*----------------------------------------------------------------------------
7788
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
7789
| the corresponding value `b', and 0 otherwise. The comparison is performed
7790
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7791
*----------------------------------------------------------------------------*}
7793
function float128_eq(a: float128; b: float128): flag;
7795
if ( ( ( extractFloat128Exp( a ) = $7FFF )
7796
and ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) ) )
7797
or ( ( extractFloat128Exp( b ) = $7FFF )
7798
and ( extractFloat128Frac0( b ) or extractFloat128Frac1( b ) ) )
7800
if ( float128_is_signaling_nan( a )
7801
or float128_is_signaling_nan( b ) ) begin
7802
float_raise( float_flag_invalid );
7808
and ( ( a.high = b.high )
7810
and ( (bits64) ( ( a.high or b.high ) shl 1 ) = 0 ) )
7815
{*----------------------------------------------------------------------------
7816
| Returns 1 if the quadruple-precision floating-point value `a' is less than
7817
| or equal to the corresponding value `b', and 0 otherwise. The comparison
7818
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
7820
*----------------------------------------------------------------------------*}
7822
function float128_le(a: float128; b: float128): flag;
7826
if ( ( ( extractFloat128Exp( a ) = $7FFF )
7827
and ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) ) )
7828
or ( ( extractFloat128Exp( b ) = $7FFF )
7829
and ( extractFloat128Frac0( b ) or extractFloat128Frac1( b ) ) )
7831
float_raise( float_flag_invalid );
7834
aSign := extractFloat128Sign( a );
7835
bSign := extractFloat128Sign( b );
7836
if ( aSign <> bSign ) begin
7839
or ( ( ( (bits64) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
7843
aSign ? le128( b.high, b.low, a.high, a.low )
7844
: le128( a.high, a.low, b.high, b.low );
7848
{*----------------------------------------------------------------------------
7849
| Returns 1 if the quadruple-precision floating-point value `a' is less than
7850
| the corresponding value `b', and 0 otherwise. The comparison is performed
7851
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7852
*----------------------------------------------------------------------------*}
7854
function float128_lt(a: float128; b: float128): flag;
7858
if ( ( ( extractFloat128Exp( a ) = $7FFF )
7859
and ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) ) )
7860
or ( ( extractFloat128Exp( b ) = $7FFF )
7861
and ( extractFloat128Frac0( b ) or extractFloat128Frac1( b ) ) )
7863
float_raise( float_flag_invalid );
7866
aSign := extractFloat128Sign( a );
7867
bSign := extractFloat128Sign( b );
7868
if ( aSign <> bSign ) begin
7871
and ( ( ( (bits64) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
7875
aSign ? lt128( b.high, b.low, a.high, a.low )
7876
: lt128( a.high, a.low, b.high, b.low );
7880
{*----------------------------------------------------------------------------
7881
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
7882
| the corresponding value `b', and 0 otherwise. The invalid exception is
7883
| raised if either operand is a NaN. Otherwise, the comparison is performed
7884
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7885
*----------------------------------------------------------------------------*}
7887
function float128_eq_signaling(a: float128; b: float128): flag;
7889
if ( ( ( extractFloat128Exp( a ) = $7FFF )
7890
and ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) ) )
7891
or ( ( extractFloat128Exp( b ) = $7FFF )
7892
and ( extractFloat128Frac0( b ) or extractFloat128Frac1( b ) ) )
7894
float_raise( float_flag_invalid );
7899
and ( ( a.high = b.high )
7901
and ( (bits64) ( ( a.high or b.high ) shl 1 ) = 0 ) )
7906
{*----------------------------------------------------------------------------
7907
| Returns 1 if the quadruple-precision floating-point value `a' is less than
7908
| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
7909
| cause an exception. Otherwise, the comparison is performed according to the
7910
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7911
*----------------------------------------------------------------------------*}
7913
function float128_le_quiet(a: float128; b: float128): flag;
7917
if ( ( ( extractFloat128Exp( a ) = $7FFF )
7918
and ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) ) )
7919
or ( ( extractFloat128Exp( b ) = $7FFF )
7920
and ( extractFloat128Frac0( b ) or extractFloat128Frac1( b ) ) )
7922
if ( float128_is_signaling_nan( a )
7923
or float128_is_signaling_nan( b ) ) begin
7924
float_raise( float_flag_invalid );
7928
aSign := extractFloat128Sign( a );
7929
bSign := extractFloat128Sign( b );
7930
if ( aSign <> bSign ) begin
7933
or ( ( ( (bits64) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
7937
aSign ? le128( b.high, b.low, a.high, a.low )
7938
: le128( a.high, a.low, b.high, b.low );
7942
{*----------------------------------------------------------------------------
7943
| Returns 1 if the quadruple-precision floating-point value `a' is less than
7944
| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
7945
| exception. Otherwise, the comparison is performed according to the IEC/IEEE
7946
| Standard for Binary Floating-Point Arithmetic.
7947
*----------------------------------------------------------------------------*}
7949
function float128_lt_quiet(a: float128; b: float128): flag;
7953
if ( ( ( extractFloat128Exp( a ) = $7FFF )
7954
and ( extractFloat128Frac0( a ) or extractFloat128Frac1( a ) ) )
7955
or ( ( extractFloat128Exp( b ) = $7FFF )
7956
and ( extractFloat128Frac0( b ) or extractFloat128Frac1( b ) ) )
7958
if ( float128_is_signaling_nan( a )
7959
or float128_is_signaling_nan( b ) ) begin
7960
float_raise( float_flag_invalid );
7964
aSign := extractFloat128Sign( a );
7965
bSign := extractFloat128Sign( b );
7966
if ( aSign <> bSign ) begin
7969
and ( ( ( (bits64) ( ( a.high or b.high ) shl 1 ) ) or a.low or b.low )
7973
aSign ? lt128( b.high, b.low, a.high, a.low )
7974
: lt128( a.high, a.low, b.high, b.low );
7978
{$endif FPC_SOFTFLOAT_FLOAT128}
7980
{$endif not(defined(fpc_softfpu_interface))}
7982
{$if not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}
7986
{$endif not(defined(fpc_softfpu_interface)) and not(defined(fpc_softfpu_implementation))}