~ubuntu-branches/ubuntu/vivid/qemu/vivid

Viewing changes to .pc/ubuntu/arm64/0094-softfloat-Fix-float64_to_uint64_round_to_zero.patch/fpu/softfloat.c

Committer: Package Import Robot
Author(s): dann frazier
Date: 2014-02-11 15:41:53 UTC
Revision ID: package-import@ubuntu.com-20140211154153-2d001tf0ium08u81

Tags: 1.7.0+dfsg-3ubuntu2

* Backport changes to enable qemu-user-static support for aarch64
* debian/control: add ppc64el to Architectures
* debian/rules: only install qemu-system-aarch64 on arm64.
Fixes an FTBFS (failure to build from source) when built twice in a row on non-arm64 due to a stale
debian/qemu-system-aarch64 directory.

files added:
.pc/ubuntu/arm64/0050-target-arm-fix-build-with-gcc-4.8.2.patch

.pc/ubuntu/arm64/0050-target-arm-fix-build-with-gcc-4.8.2.patch/target-arm

.pc/ubuntu/arm64/0050-target-arm-fix-build-with-gcc-4.8.2.patch/target-arm/cpu64.c

.pc/ubuntu/arm64/0051-target-arm-A64-add-support-for-ld-st-pair.patch

.pc/ubuntu/arm64/0051-target-arm-A64-add-support-for-ld-st-pair.patch/target-arm

.pc/ubuntu/arm64/0051-target-arm-A64-add-support-for-ld-st-pair.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0052-target-arm-A64-add-support-for-ld-st-unsigned-imm.patch

.pc/ubuntu/arm64/0052-target-arm-A64-add-support-for-ld-st-unsigned-imm.patch/target-arm

.pc/ubuntu/arm64/0052-target-arm-A64-add-support-for-ld-st-unsigned-imm.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0053-target-arm-A64-add-support-for-ld-st-with-reg-offset.patch

.pc/ubuntu/arm64/0053-target-arm-A64-add-support-for-ld-st-with-reg-offset.patch/target-arm

.pc/ubuntu/arm64/0053-target-arm-A64-add-support-for-ld-st-with-reg-offset.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0054-target-arm-A64-add-support-for-ld-st-with-index.patch

.pc/ubuntu/arm64/0054-target-arm-A64-add-support-for-ld-st-with-index.patch/target-arm

.pc/ubuntu/arm64/0054-target-arm-A64-add-support-for-ld-st-with-index.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0055-target-arm-A64-add-support-for-add-addi-sub-subi.patch

.pc/ubuntu/arm64/0055-target-arm-A64-add-support-for-add-addi-sub-subi.patch/target-arm

.pc/ubuntu/arm64/0055-target-arm-A64-add-support-for-add-addi-sub-subi.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0056-target-arm-A64-add-support-for-move-wide-instruction.patch

.pc/ubuntu/arm64/0056-target-arm-A64-add-support-for-move-wide-instruction.patch/target-arm

.pc/ubuntu/arm64/0056-target-arm-A64-add-support-for-move-wide-instruction.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0057-target-arm-A64-add-support-for-3-src-data-proc-insns.patch

.pc/ubuntu/arm64/0057-target-arm-A64-add-support-for-3-src-data-proc-insns.patch/target-arm

.pc/ubuntu/arm64/0057-target-arm-A64-add-support-for-3-src-data-proc-insns.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0058-target-arm-A64-implement-SVC-BRK.patch

.pc/ubuntu/arm64/0058-target-arm-A64-implement-SVC-BRK.patch/target-arm

.pc/ubuntu/arm64/0058-target-arm-A64-implement-SVC-BRK.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0059-target-arm-A64-Add-decoder-skeleton-for-FP-instructi.patch

.pc/ubuntu/arm64/0059-target-arm-A64-Add-decoder-skeleton-for-FP-instructi.patch/target-arm

.pc/ubuntu/arm64/0059-target-arm-A64-Add-decoder-skeleton-for-FP-instructi.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0060-target-arm-A64-implement-FMOV.patch

.pc/ubuntu/arm64/0060-target-arm-A64-implement-FMOV.patch/target-arm

.pc/ubuntu/arm64/0060-target-arm-A64-implement-FMOV.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0061-target-arm-Pull-add-one-cpreg-to-hashtable-into-its-.patch

.pc/ubuntu/arm64/0061-target-arm-Pull-add-one-cpreg-to-hashtable-into-its-.patch/target-arm

.pc/ubuntu/arm64/0061-target-arm-Pull-add-one-cpreg-to-hashtable-into-its-.patch/target-arm/helper.c

.pc/ubuntu/arm64/0062-target-arm-Update-generic-cpreg-code-for-AArch64.patch

.pc/ubuntu/arm64/0062-target-arm-Update-generic-cpreg-code-for-AArch64.patch/target-arm

.pc/ubuntu/arm64/0062-target-arm-Update-generic-cpreg-code-for-AArch64.patch/target-arm/cpu.h

.pc/ubuntu/arm64/0062-target-arm-Update-generic-cpreg-code-for-AArch64.patch/target-arm/helper.c

.pc/ubuntu/arm64/0062-target-arm-Update-generic-cpreg-code-for-AArch64.patch/target-arm/kvm-consts.h

.pc/ubuntu/arm64/0063-target-arm-Remove-ARMCPU-CPUARMState-from-cpregs-API.patch

.pc/ubuntu/arm64/0063-target-arm-Remove-ARMCPU-CPUARMState-from-cpregs-API.patch/target-arm

.pc/ubuntu/arm64/0063-target-arm-Remove-ARMCPU-CPUARMState-from-cpregs-API.patch/target-arm/cpu.h

.pc/ubuntu/arm64/0063-target-arm-Remove-ARMCPU-CPUARMState-from-cpregs-API.patch/target-arm/helper.c

.pc/ubuntu/arm64/0063-target-arm-Remove-ARMCPU-CPUARMState-from-cpregs-API.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0063-target-arm-Remove-ARMCPU-CPUARMState-from-cpregs-API.patch/target-arm/translate.c

.pc/ubuntu/arm64/0063-target-arm-Remove-ARMCPU-CPUARMState-from-cpregs-API.patch/target-arm/translate.h

.pc/ubuntu/arm64/0064-target-arm-A64-Implement-MRS-MSR-SYS-SYSL.patch

.pc/ubuntu/arm64/0064-target-arm-A64-Implement-MRS-MSR-SYS-SYSL.patch/target-arm

.pc/ubuntu/arm64/0064-target-arm-A64-Implement-MRS-MSR-SYS-SYSL.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0065-target-arm-A64-Implement-minimal-set-of-EL0-visible-.patch

.pc/ubuntu/arm64/0065-target-arm-A64-Implement-minimal-set-of-EL0-visible-.patch/target-arm

.pc/ubuntu/arm64/0065-target-arm-A64-Implement-minimal-set-of-EL0-visible-.patch/target-arm/cpu.h

.pc/ubuntu/arm64/0065-target-arm-A64-Implement-minimal-set-of-EL0-visible-.patch/target-arm/helper.c

.pc/ubuntu/arm64/0065-target-arm-A64-Implement-minimal-set-of-EL0-visible-.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0066-target-arm-Widen-thread-local-register-state-fields-.patch

.pc/ubuntu/arm64/0066-target-arm-Widen-thread-local-register-state-fields-.patch/linux-user

.pc/ubuntu/arm64/0066-target-arm-Widen-thread-local-register-state-fields-.patch/linux-user/aarch64

.pc/ubuntu/arm64/0066-target-arm-Widen-thread-local-register-state-fields-.patch/linux-user/aarch64/target_cpu.h

.pc/ubuntu/arm64/0066-target-arm-Widen-thread-local-register-state-fields-.patch/linux-user/arm

.pc/ubuntu/arm64/0066-target-arm-Widen-thread-local-register-state-fields-.patch/linux-user/arm/target_cpu.h

.pc/ubuntu/arm64/0066-target-arm-Widen-thread-local-register-state-fields-.patch/linux-user/main.c

.pc/ubuntu/arm64/0066-target-arm-Widen-thread-local-register-state-fields-.patch/target-arm

.pc/ubuntu/arm64/0066-target-arm-Widen-thread-local-register-state-fields-.patch/target-arm/cpu.h

.pc/ubuntu/arm64/0066-target-arm-Widen-thread-local-register-state-fields-.patch/target-arm/helper.c

.pc/ubuntu/arm64/0067-target-arm-A64-add-support-for-add-sub-with-carry.patch

.pc/ubuntu/arm64/0067-target-arm-A64-add-support-for-add-sub-with-carry.patch/target-arm

.pc/ubuntu/arm64/0067-target-arm-A64-add-support-for-add-sub-with-carry.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0068-target-arm-A64-add-support-for-conditional-compare-i.patch

.pc/ubuntu/arm64/0068-target-arm-A64-add-support-for-conditional-compare-i.patch/target-arm

.pc/ubuntu/arm64/0068-target-arm-A64-add-support-for-conditional-compare-i.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0069-target-arm-aarch64-add-support-for-ld-lit.patch

.pc/ubuntu/arm64/0069-target-arm-aarch64-add-support-for-ld-lit.patch/target-arm

.pc/ubuntu/arm64/0069-target-arm-aarch64-add-support-for-ld-lit.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0070-target-arm-Widen-exclusive-access-support-struct-fie.patch

.pc/ubuntu/arm64/0070-target-arm-Widen-exclusive-access-support-struct-fie.patch/linux-user

.pc/ubuntu/arm64/0070-target-arm-Widen-exclusive-access-support-struct-fie.patch/linux-user/main.c

.pc/ubuntu/arm64/0070-target-arm-Widen-exclusive-access-support-struct-fie.patch/target-arm

.pc/ubuntu/arm64/0070-target-arm-Widen-exclusive-access-support-struct-fie.patch/target-arm/cpu.h

.pc/ubuntu/arm64/0070-target-arm-Widen-exclusive-access-support-struct-fie.patch/target-arm/machine.c

.pc/ubuntu/arm64/0070-target-arm-Widen-exclusive-access-support-struct-fie.patch/target-arm/translate.c

.pc/ubuntu/arm64/0071-target-arm-A64-support-for-ld-st-cl-exclusive.patch

.pc/ubuntu/arm64/0071-target-arm-A64-support-for-ld-st-cl-exclusive.patch/linux-user

.pc/ubuntu/arm64/0071-target-arm-A64-support-for-ld-st-cl-exclusive.patch/linux-user/main.c

.pc/ubuntu/arm64/0071-target-arm-A64-support-for-ld-st-cl-exclusive.patch/target-arm

.pc/ubuntu/arm64/0071-target-arm-A64-support-for-ld-st-cl-exclusive.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0072-linux-user-AArch64-define-TARGET_CLONE_BACKWARDS.patch

.pc/ubuntu/arm64/0072-linux-user-AArch64-define-TARGET_CLONE_BACKWARDS.patch/linux-user

.pc/ubuntu/arm64/0072-linux-user-AArch64-define-TARGET_CLONE_BACKWARDS.patch/linux-user/aarch64

.pc/ubuntu/arm64/0072-linux-user-AArch64-define-TARGET_CLONE_BACKWARDS.patch/linux-user/aarch64/syscall.h

.pc/ubuntu/arm64/0073-linux-user-AArch64-Use-correct-values-for-FPSR-FPCR-.patch

.pc/ubuntu/arm64/0073-linux-user-AArch64-Use-correct-values-for-FPSR-FPCR-.patch/linux-user

.pc/ubuntu/arm64/0073-linux-user-AArch64-Use-correct-values-for-FPSR-FPCR-.patch/linux-user/signal.c

.pc/ubuntu/arm64/0074-.travis.yml-Add-aarch64-targets.patch

.pc/ubuntu/arm64/0074-.travis.yml-Add-aarch64-targets.patch/.travis.yml

.pc/ubuntu/arm64/0075-default-configs-Add-config-for-aarch64-linux-user.patch

.pc/ubuntu/arm64/0075-default-configs-Add-config-for-aarch64-linux-user.patch/default-configs

.pc/ubuntu/arm64/0075-default-configs-Add-config-for-aarch64-linux-user.patch/default-configs/aarch64-linux-user.mak

.pc/ubuntu/arm64/0076-target-arm-A64-Add-support-for-dumping-AArch64-VFP-r.patch

.pc/ubuntu/arm64/0076-target-arm-A64-Add-support-for-dumping-AArch64-VFP-r.patch/target-arm

.pc/ubuntu/arm64/0076-target-arm-A64-Add-support-for-dumping-AArch64-VFP-r.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0077-target-arm-A64-Fix-vector-register-access-on-bigendi.patch

.pc/ubuntu/arm64/0077-target-arm-A64-Fix-vector-register-access-on-bigendi.patch/target-arm

.pc/ubuntu/arm64/0077-target-arm-A64-Fix-vector-register-access-on-bigendi.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0078-target-arm-Use-VFP_BINOP-macro-for-min-max-minnum-ma.patch

.pc/ubuntu/arm64/0078-target-arm-Use-VFP_BINOP-macro-for-min-max-minnum-ma.patch/target-arm

.pc/ubuntu/arm64/0078-target-arm-Use-VFP_BINOP-macro-for-min-max-minnum-ma.patch/target-arm/helper.c

.pc/ubuntu/arm64/0078-target-arm-Use-VFP_BINOP-macro-for-min-max-minnum-ma.patch/target-arm/helper.h

.pc/ubuntu/arm64/0078-target-arm-Use-VFP_BINOP-macro-for-min-max-minnum-ma.patch/target-arm/neon_helper.c

.pc/ubuntu/arm64/0078-target-arm-Use-VFP_BINOP-macro-for-min-max-minnum-ma.patch/target-arm/translate.c

.pc/ubuntu/arm64/0079-target-arm-A64-Add-Floating-point-data-processing-2-.patch

.pc/ubuntu/arm64/0079-target-arm-A64-Add-Floating-point-data-processing-2-.patch/target-arm

.pc/ubuntu/arm64/0079-target-arm-A64-Add-Floating-point-data-processing-2-.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0080-target-arm-A64-Add-Floating-point-data-processing-3-.patch

.pc/ubuntu/arm64/0080-target-arm-A64-Add-Floating-point-data-processing-3-.patch/target-arm

.pc/ubuntu/arm64/0080-target-arm-A64-Add-Floating-point-data-processing-3-.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0081-target-arm-A64-Add-fmov-scalar-immediate-instruction.patch

.pc/ubuntu/arm64/0081-target-arm-A64-Add-fmov-scalar-immediate-instruction.patch/target-arm

.pc/ubuntu/arm64/0081-target-arm-A64-Add-fmov-scalar-immediate-instruction.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0082-target-arm-A64-Add-support-for-floating-point-compar.patch

.pc/ubuntu/arm64/0082-target-arm-A64-Add-support-for-floating-point-compar.patch/target-arm

.pc/ubuntu/arm64/0082-target-arm-A64-Add-support-for-floating-point-compar.patch/target-arm/helper-a64.c

.pc/ubuntu/arm64/0082-target-arm-A64-Add-support-for-floating-point-compar.patch/target-arm/helper-a64.h

.pc/ubuntu/arm64/0082-target-arm-A64-Add-support-for-floating-point-compar.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0083-target-arm-A64-Add-support-for-floating-point-condit.patch

.pc/ubuntu/arm64/0083-target-arm-A64-Add-support-for-floating-point-condit.patch/target-arm

.pc/ubuntu/arm64/0083-target-arm-A64-Add-support-for-floating-point-condit.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0084-target-arm-A64-Add-support-for-floating-point-cond-s.patch

.pc/ubuntu/arm64/0084-target-arm-A64-Add-support-for-floating-point-cond-s.patch/target-arm

.pc/ubuntu/arm64/0084-target-arm-A64-Add-support-for-floating-point-cond-s.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0085-target-arm-Give-the-FPSCR-rounding-modes-names.patch

.pc/ubuntu/arm64/0085-target-arm-Give-the-FPSCR-rounding-modes-names.patch/target-arm

.pc/ubuntu/arm64/0085-target-arm-Give-the-FPSCR-rounding-modes-names.patch/target-arm/cpu.h

.pc/ubuntu/arm64/0085-target-arm-Give-the-FPSCR-rounding-modes-names.patch/target-arm/helper.c

.pc/ubuntu/arm64/0086-softfloat-Fix-exception-flag-handling-for-float32_to.patch

.pc/ubuntu/arm64/0086-softfloat-Fix-exception-flag-handling-for-float32_to.patch/fpu

.pc/ubuntu/arm64/0086-softfloat-Fix-exception-flag-handling-for-float32_to.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0087-softfloat-Add-float-to-16bit-integer-conversions.patch

.pc/ubuntu/arm64/0087-softfloat-Add-float-to-16bit-integer-conversions.patch/fpu

.pc/ubuntu/arm64/0087-softfloat-Add-float-to-16bit-integer-conversions.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0087-softfloat-Add-float-to-16bit-integer-conversions.patch/include

.pc/ubuntu/arm64/0087-softfloat-Add-float-to-16bit-integer-conversions.patch/include/fpu

.pc/ubuntu/arm64/0087-softfloat-Add-float-to-16bit-integer-conversions.patch/include/fpu/softfloat.h

.pc/ubuntu/arm64/0088-softfloat-Add-16-bit-integer-to-float-conversions.patch

.pc/ubuntu/arm64/0088-softfloat-Add-16-bit-integer-to-float-conversions.patch/include

.pc/ubuntu/arm64/0088-softfloat-Add-16-bit-integer-to-float-conversions.patch/include/fpu

.pc/ubuntu/arm64/0088-softfloat-Add-16-bit-integer-to-float-conversions.patch/include/fpu/softfloat.h

.pc/ubuntu/arm64/0089-softfloat-Make-the-int-to-float-functions-take-exact.patch

.pc/ubuntu/arm64/0089-softfloat-Make-the-int-to-float-functions-take-exact.patch/fpu

.pc/ubuntu/arm64/0089-softfloat-Make-the-int-to-float-functions-take-exact.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0089-softfloat-Make-the-int-to-float-functions-take-exact.patch/include

.pc/ubuntu/arm64/0089-softfloat-Make-the-int-to-float-functions-take-exact.patch/include/fpu

.pc/ubuntu/arm64/0089-softfloat-Make-the-int-to-float-functions-take-exact.patch/include/fpu/softfloat.h

.pc/ubuntu/arm64/0090-softfloat-Fix-float64_to_uint64.patch

.pc/ubuntu/arm64/0090-softfloat-Fix-float64_to_uint64.patch/fpu

.pc/ubuntu/arm64/0090-softfloat-Fix-float64_to_uint64.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0091-softfloat-Only-raise-Invalid-when-conversions-to-int.patch

.pc/ubuntu/arm64/0091-softfloat-Only-raise-Invalid-when-conversions-to-int.patch/fpu

.pc/ubuntu/arm64/0091-softfloat-Only-raise-Invalid-when-conversions-to-int.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0092-softfloat-Fix-factor-2-error-for-scalbn-on-denormal-.patch

.pc/ubuntu/arm64/0092-softfloat-Fix-factor-2-error-for-scalbn-on-denormal-.patch/fpu

.pc/ubuntu/arm64/0092-softfloat-Fix-factor-2-error-for-scalbn-on-denormal-.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0093-softfloat-Add-float32_to_uint64.patch

.pc/ubuntu/arm64/0093-softfloat-Add-float32_to_uint64.patch/fpu

.pc/ubuntu/arm64/0093-softfloat-Add-float32_to_uint64.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0093-softfloat-Add-float32_to_uint64.patch/include

.pc/ubuntu/arm64/0093-softfloat-Add-float32_to_uint64.patch/include/fpu

.pc/ubuntu/arm64/0093-softfloat-Add-float32_to_uint64.patch/include/fpu/softfloat.h

.pc/ubuntu/arm64/0094-softfloat-Fix-float64_to_uint64_round_to_zero.patch

.pc/ubuntu/arm64/0094-softfloat-Fix-float64_to_uint64_round_to_zero.patch/fpu

.pc/ubuntu/arm64/0094-softfloat-Fix-float64_to_uint64_round_to_zero.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0095-softfloat-Fix-float64_to_uint32.patch

.pc/ubuntu/arm64/0095-softfloat-Fix-float64_to_uint32.patch/fpu

.pc/ubuntu/arm64/0095-softfloat-Fix-float64_to_uint32.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0096-softfloat-Fix-float64_to_uint32_round_to_zero.patch

.pc/ubuntu/arm64/0096-softfloat-Fix-float64_to_uint32_round_to_zero.patch/fpu

.pc/ubuntu/arm64/0096-softfloat-Fix-float64_to_uint32_round_to_zero.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0097-softfloat-Provide-complete-set-of-accessors-for-fp-s.patch

.pc/ubuntu/arm64/0097-softfloat-Provide-complete-set-of-accessors-for-fp-s.patch/fpu

.pc/ubuntu/arm64/0097-softfloat-Provide-complete-set-of-accessors-for-fp-s.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0097-softfloat-Provide-complete-set-of-accessors-for-fp-s.patch/include

.pc/ubuntu/arm64/0097-softfloat-Provide-complete-set-of-accessors-for-fp-s.patch/include/fpu

.pc/ubuntu/arm64/0097-softfloat-Provide-complete-set-of-accessors-for-fp-s.patch/include/fpu/softfloat.h

.pc/ubuntu/arm64/0098-softfloat-Factor-out-RoundAndPackFloat16-and-Normali.patch

.pc/ubuntu/arm64/0098-softfloat-Factor-out-RoundAndPackFloat16-and-Normali.patch/fpu

.pc/ubuntu/arm64/0098-softfloat-Factor-out-RoundAndPackFloat16-and-Normali.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0099-softfloat-Add-float16-float64-conversion-functions.patch

.pc/ubuntu/arm64/0099-softfloat-Add-float16-float64-conversion-functions.patch/fpu

.pc/ubuntu/arm64/0099-softfloat-Add-float16-float64-conversion-functions.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0099-softfloat-Add-float16-float64-conversion-functions.patch/include

.pc/ubuntu/arm64/0099-softfloat-Add-float16-float64-conversion-functions.patch/include/fpu

.pc/ubuntu/arm64/0099-softfloat-Add-float16-float64-conversion-functions.patch/include/fpu/softfloat.h

.pc/ubuntu/arm64/0100-softfloat-Refactor-code-handling-various-rounding-mo.patch

.pc/ubuntu/arm64/0100-softfloat-Refactor-code-handling-various-rounding-mo.patch/fpu

.pc/ubuntu/arm64/0100-softfloat-Refactor-code-handling-various-rounding-mo.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0101-softfloat-Add-support-for-ties-away-rounding.patch

.pc/ubuntu/arm64/0101-softfloat-Add-support-for-ties-away-rounding.patch/fpu

.pc/ubuntu/arm64/0101-softfloat-Add-support-for-ties-away-rounding.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0101-softfloat-Add-support-for-ties-away-rounding.patch/include

.pc/ubuntu/arm64/0101-softfloat-Add-support-for-ties-away-rounding.patch/include/fpu

.pc/ubuntu/arm64/0101-softfloat-Add-support-for-ties-away-rounding.patch/include/fpu/softfloat.h

.pc/ubuntu/arm64/0102-target-arm-Prepare-VFP_CONV_FIX-helpers-for-A64-uses.patch

.pc/ubuntu/arm64/0102-target-arm-Prepare-VFP_CONV_FIX-helpers-for-A64-uses.patch/target-arm

.pc/ubuntu/arm64/0102-target-arm-Prepare-VFP_CONV_FIX-helpers-for-A64-uses.patch/target-arm/helper.c

.pc/ubuntu/arm64/0103-target-arm-Rename-A32-VFP-conversion-helpers.patch

.pc/ubuntu/arm64/0103-target-arm-Rename-A32-VFP-conversion-helpers.patch/target-arm

.pc/ubuntu/arm64/0103-target-arm-Rename-A32-VFP-conversion-helpers.patch/target-arm/helper.c

.pc/ubuntu/arm64/0103-target-arm-Rename-A32-VFP-conversion-helpers.patch/target-arm/helper.h

.pc/ubuntu/arm64/0103-target-arm-Rename-A32-VFP-conversion-helpers.patch/target-arm/translate.c

.pc/ubuntu/arm64/0104-target-arm-Ignore-most-exceptions-from-scalbn-when-d.patch

.pc/ubuntu/arm64/0104-target-arm-Ignore-most-exceptions-from-scalbn-when-d.patch/target-arm

.pc/ubuntu/arm64/0104-target-arm-Ignore-most-exceptions-from-scalbn-when-d.patch/target-arm/helper.c

.pc/ubuntu/arm64/0105-target-arm-A64-Add-extra-VFP-fixed-point-conversion-.patch

.pc/ubuntu/arm64/0105-target-arm-A64-Add-extra-VFP-fixed-point-conversion-.patch/target-arm

.pc/ubuntu/arm64/0105-target-arm-A64-Add-extra-VFP-fixed-point-conversion-.patch/target-arm/helper.c

.pc/ubuntu/arm64/0105-target-arm-A64-Add-extra-VFP-fixed-point-conversion-.patch/target-arm/helper.h

.pc/ubuntu/arm64/0106-target-arm-A64-Add-floating-point-fixed-point-instru.patch

.pc/ubuntu/arm64/0106-target-arm-A64-Add-floating-point-fixed-point-instru.patch/target-arm

.pc/ubuntu/arm64/0106-target-arm-A64-Add-floating-point-fixed-point-instru.patch/target-arm/helper.c

.pc/ubuntu/arm64/0106-target-arm-A64-Add-floating-point-fixed-point-instru.patch/target-arm/helper.h

.pc/ubuntu/arm64/0106-target-arm-A64-Add-floating-point-fixed-point-instru.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0107-target-arm-A64-Add-floating-point-integer-conversion.patch

.pc/ubuntu/arm64/0107-target-arm-A64-Add-floating-point-integer-conversion.patch/target-arm

.pc/ubuntu/arm64/0107-target-arm-A64-Add-floating-point-integer-conversion.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0108-target-arm-A64-Add-1-source-32-to-32-and-64-to-64-FP.patch

.pc/ubuntu/arm64/0108-target-arm-A64-Add-1-source-32-to-32-and-64-to-64-FP.patch/target-arm

.pc/ubuntu/arm64/0108-target-arm-A64-Add-1-source-32-to-32-and-64-to-64-FP.patch/target-arm/helper.c

.pc/ubuntu/arm64/0108-target-arm-A64-Add-1-source-32-to-32-and-64-to-64-FP.patch/target-arm/helper.h

.pc/ubuntu/arm64/0108-target-arm-A64-Add-1-source-32-to-32-and-64-to-64-FP.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0109-target-arm-A64-Add-support-for-FCVT-between-half-sin.patch

.pc/ubuntu/arm64/0109-target-arm-A64-Add-support-for-FCVT-between-half-sin.patch/target-arm

.pc/ubuntu/arm64/0109-target-arm-A64-Add-support-for-FCVT-between-half-sin.patch/target-arm/helper.c

.pc/ubuntu/arm64/0109-target-arm-A64-Add-support-for-FCVT-between-half-sin.patch/target-arm/helper.h

.pc/ubuntu/arm64/0109-target-arm-A64-Add-support-for-FCVT-between-half-sin.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0110-target-arm-remove-raw_read-write-duplication.patch

.pc/ubuntu/arm64/0110-target-arm-remove-raw_read-write-duplication.patch/target-arm

.pc/ubuntu/arm64/0110-target-arm-remove-raw_read-write-duplication.patch/target-arm/helper.c

.pc/ubuntu/arm64/0111-arm-fix-compile-on-bigendian-host.patch

.pc/ubuntu/arm64/0111-arm-fix-compile-on-bigendian-host.patch/target-arm

.pc/ubuntu/arm64/0111-arm-fix-compile-on-bigendian-host.patch/target-arm/cpu.h

.pc/ubuntu/arm64/0112-target-arm-A64-Add-SIMD-ld-st-multiple.patch

.pc/ubuntu/arm64/0112-target-arm-A64-Add-SIMD-ld-st-multiple.patch/target-arm

.pc/ubuntu/arm64/0112-target-arm-A64-Add-SIMD-ld-st-multiple.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0113-target-arm-A64-Add-SIMD-ld-st-single.patch

.pc/ubuntu/arm64/0113-target-arm-A64-Add-SIMD-ld-st-single.patch/target-arm

.pc/ubuntu/arm64/0113-target-arm-A64-Add-SIMD-ld-st-single.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0114-target-arm-A64-Add-decode-skeleton-for-SIMD-data-pro.patch

.pc/ubuntu/arm64/0114-target-arm-A64-Add-decode-skeleton-for-SIMD-data-pro.patch/target-arm

.pc/ubuntu/arm64/0114-target-arm-A64-Add-decode-skeleton-for-SIMD-data-pro.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0115-target-arm-A64-Add-SIMD-EXT.patch

.pc/ubuntu/arm64/0115-target-arm-A64-Add-SIMD-EXT.patch/target-arm

.pc/ubuntu/arm64/0115-target-arm-A64-Add-SIMD-EXT.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0116-target-arm-A64-Add-SIMD-TBL-TBLX.patch

.pc/ubuntu/arm64/0116-target-arm-A64-Add-SIMD-TBL-TBLX.patch/target-arm

.pc/ubuntu/arm64/0116-target-arm-A64-Add-SIMD-TBL-TBLX.patch/target-arm/helper-a64.c

.pc/ubuntu/arm64/0116-target-arm-A64-Add-SIMD-TBL-TBLX.patch/target-arm/helper-a64.h

.pc/ubuntu/arm64/0116-target-arm-A64-Add-SIMD-TBL-TBLX.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0117-target-arm-A64-Add-SIMD-ZIP-UZP-TRN.patch

.pc/ubuntu/arm64/0117-target-arm-A64-Add-SIMD-ZIP-UZP-TRN.patch/target-arm

.pc/ubuntu/arm64/0117-target-arm-A64-Add-SIMD-ZIP-UZP-TRN.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0118-target-arm-A64-Add-SIMD-across-lanes-instructions.patch

.pc/ubuntu/arm64/0118-target-arm-A64-Add-SIMD-across-lanes-instructions.patch/target-arm

.pc/ubuntu/arm64/0118-target-arm-A64-Add-SIMD-across-lanes-instructions.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0119-target-arm-A64-Add-SIMD-copy-operations.patch

.pc/ubuntu/arm64/0119-target-arm-A64-Add-SIMD-copy-operations.patch/target-arm

.pc/ubuntu/arm64/0119-target-arm-A64-Add-SIMD-copy-operations.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0120-target-arm-A64-Add-SIMD-modified-immediate-group.patch

.pc/ubuntu/arm64/0120-target-arm-A64-Add-SIMD-modified-immediate-group.patch/target-arm

.pc/ubuntu/arm64/0120-target-arm-A64-Add-SIMD-modified-immediate-group.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0121-target-arm-A64-Add-SIMD-scalar-copy-instructions.patch

.pc/ubuntu/arm64/0121-target-arm-A64-Add-SIMD-scalar-copy-instructions.patch/target-arm

.pc/ubuntu/arm64/0121-target-arm-A64-Add-SIMD-scalar-copy-instructions.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0122-target-arm-Move-arm_rmode_to_sf-to-a-shared-location.patch

.pc/ubuntu/arm64/0122-target-arm-Move-arm_rmode_to_sf-to-a-shared-location.patch/target-arm

.pc/ubuntu/arm64/0122-target-arm-Move-arm_rmode_to_sf-to-a-shared-location.patch/target-arm/cpu.h

.pc/ubuntu/arm64/0122-target-arm-Move-arm_rmode_to_sf-to-a-shared-location.patch/target-arm/helper.c

.pc/ubuntu/arm64/0122-target-arm-Move-arm_rmode_to_sf-to-a-shared-location.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0123-target-arm-Add-AArch32-FP-VRINTA-VRINTN-VRINTP-and-V.patch

.pc/ubuntu/arm64/0123-target-arm-Add-AArch32-FP-VRINTA-VRINTN-VRINTP-and-V.patch/target-arm

.pc/ubuntu/arm64/0123-target-arm-Add-AArch32-FP-VRINTA-VRINTN-VRINTP-and-V.patch/target-arm/translate.c

.pc/ubuntu/arm64/0124-target-arm-Add-support-for-AArch32-FP-VRINTR.patch

.pc/ubuntu/arm64/0124-target-arm-Add-support-for-AArch32-FP-VRINTR.patch/target-arm

.pc/ubuntu/arm64/0124-target-arm-Add-support-for-AArch32-FP-VRINTR.patch/target-arm/translate.c

.pc/ubuntu/arm64/0125-target-arm-Add-support-for-AArch32-FP-VRINTZ.patch

.pc/ubuntu/arm64/0125-target-arm-Add-support-for-AArch32-FP-VRINTZ.patch/target-arm

.pc/ubuntu/arm64/0125-target-arm-Add-support-for-AArch32-FP-VRINTZ.patch/target-arm/translate.c

.pc/ubuntu/arm64/0126-target-arm-Add-support-for-AArch32-FP-VRINTX.patch

.pc/ubuntu/arm64/0126-target-arm-Add-support-for-AArch32-FP-VRINTX.patch/target-arm

.pc/ubuntu/arm64/0126-target-arm-Add-support-for-AArch32-FP-VRINTX.patch/target-arm/translate.c

.pc/ubuntu/arm64/0127-target-arm-Add-support-for-AArch32-SIMD-VRINTX.patch

.pc/ubuntu/arm64/0127-target-arm-Add-support-for-AArch32-SIMD-VRINTX.patch/target-arm

.pc/ubuntu/arm64/0127-target-arm-Add-support-for-AArch32-SIMD-VRINTX.patch/target-arm/translate.c

.pc/ubuntu/arm64/0128-target-arm-Add-set_neon_rmode-helper.patch

.pc/ubuntu/arm64/0128-target-arm-Add-set_neon_rmode-helper.patch/target-arm

.pc/ubuntu/arm64/0128-target-arm-Add-set_neon_rmode-helper.patch/target-arm/helper.c

.pc/ubuntu/arm64/0128-target-arm-Add-set_neon_rmode-helper.patch/target-arm/helper.h

.pc/ubuntu/arm64/0129-target-arm-Add-AArch32-SIMD-VRINTA-VRINTN-VRINTP-VRI.patch

.pc/ubuntu/arm64/0129-target-arm-Add-AArch32-SIMD-VRINTA-VRINTN-VRINTP-VRI.patch/target-arm

.pc/ubuntu/arm64/0129-target-arm-Add-AArch32-SIMD-VRINTA-VRINTN-VRINTP-VRI.patch/target-arm/translate.c

.pc/ubuntu/arm64/0130-target-arm-Add-AArch32-FP-VCVTA-VCVTN-VCVTP-and-VCVT.patch

.pc/ubuntu/arm64/0130-target-arm-Add-AArch32-FP-VCVTA-VCVTN-VCVTP-and-VCVT.patch/target-arm

.pc/ubuntu/arm64/0130-target-arm-Add-AArch32-FP-VCVTA-VCVTN-VCVTP-and-VCVT.patch/target-arm/translate.c

.pc/ubuntu/arm64/0131-target-arm-Add-AArch32-SIMD-VCVTA-VCVTN-VCVTP-and-VC.patch

.pc/ubuntu/arm64/0131-target-arm-Add-AArch32-SIMD-VCVTA-VCVTN-VCVTP-and-VC.patch/target-arm

.pc/ubuntu/arm64/0131-target-arm-Add-AArch32-SIMD-VCVTA-VCVTN-VCVTP-and-VC.patch/target-arm/translate.c

.pc/ubuntu/arm64/0132-target-arm-A64-Add-SIMD-three-different-multiply-acc.patch

.pc/ubuntu/arm64/0132-target-arm-A64-Add-SIMD-three-different-multiply-acc.patch/target-arm

.pc/ubuntu/arm64/0132-target-arm-A64-Add-SIMD-three-different-multiply-acc.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0133-target-arm-A64-Add-SIMD-three-different-ABDL-instruc.patch

.pc/ubuntu/arm64/0133-target-arm-A64-Add-SIMD-three-different-ABDL-instruc.patch/target-arm

.pc/ubuntu/arm64/0133-target-arm-A64-Add-SIMD-three-different-ABDL-instruc.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0134-target-arm-A64-Add-SIMD-scalar-3-same-add-sub-and-co.patch

.pc/ubuntu/arm64/0134-target-arm-A64-Add-SIMD-scalar-3-same-add-sub-and-co.patch/target-arm

.pc/ubuntu/arm64/0134-target-arm-A64-Add-SIMD-scalar-3-same-add-sub-and-co.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0135-target-arm-A64-Add-top-level-decode-for-SIMD-3-same-.patch

.pc/ubuntu/arm64/0135-target-arm-A64-Add-top-level-decode-for-SIMD-3-same-.patch/target-arm

.pc/ubuntu/arm64/0135-target-arm-A64-Add-top-level-decode-for-SIMD-3-same-.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0136-target-arm-A64-Add-logic-ops-from-SIMD-3-same-group.patch

.pc/ubuntu/arm64/0136-target-arm-A64-Add-logic-ops-from-SIMD-3-same-group.patch/target-arm

.pc/ubuntu/arm64/0136-target-arm-A64-Add-logic-ops-from-SIMD-3-same-group.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0137-target-arm-A64-Add-integer-ops-from-SIMD-3-same-grou.patch

.pc/ubuntu/arm64/0137-target-arm-A64-Add-integer-ops-from-SIMD-3-same-grou.patch/target-arm

.pc/ubuntu/arm64/0137-target-arm-A64-Add-integer-ops-from-SIMD-3-same-grou.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0138-target-arm-A64-Add-simple-SIMD-3-same-floating-point.patch

.pc/ubuntu/arm64/0138-target-arm-A64-Add-simple-SIMD-3-same-floating-point.patch/target-arm

.pc/ubuntu/arm64/0138-target-arm-A64-Add-simple-SIMD-3-same-floating-point.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0139-target-arm-A64-Add-SIMD-shift-by-immediate.patch

.pc/ubuntu/arm64/0139-target-arm-A64-Add-SIMD-shift-by-immediate.patch/target-arm

.pc/ubuntu/arm64/0139-target-arm-A64-Add-SIMD-shift-by-immediate.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0140-target-arm-A64-Implement-SIMD-3-reg-same-shift-and-s.patch

.pc/ubuntu/arm64/0140-target-arm-A64-Implement-SIMD-3-reg-same-shift-and-s.patch/target-arm

.pc/ubuntu/arm64/0140-target-arm-A64-Implement-SIMD-3-reg-same-shift-and-s.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0141-target-arm-A64-Implement-remaining-non-pairwise-int-.patch

.pc/ubuntu/arm64/0141-target-arm-A64-Implement-remaining-non-pairwise-int-.patch/target-arm

.pc/ubuntu/arm64/0141-target-arm-A64-Implement-remaining-non-pairwise-int-.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0142-target-arm-A64-Implement-pairwise-integer-ops-from-3.patch

.pc/ubuntu/arm64/0142-target-arm-A64-Implement-pairwise-integer-ops-from-3.patch/target-arm

.pc/ubuntu/arm64/0142-target-arm-A64-Implement-pairwise-integer-ops-from-3.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0143-tcg-Add-TCGV_UNUSED_PTR-TCGV_IS_UNUSED_PTR-TCGV_EQUA.patch

.pc/ubuntu/arm64/0143-tcg-Add-TCGV_UNUSED_PTR-TCGV_IS_UNUSED_PTR-TCGV_EQUA.patch/tcg

.pc/ubuntu/arm64/0143-tcg-Add-TCGV_UNUSED_PTR-TCGV_IS_UNUSED_PTR-TCGV_EQUA.patch/tcg/tcg.h

.pc/ubuntu/arm64/0144-target-arm-A64-Implement-scalar-pairwise-ops.patch

.pc/ubuntu/arm64/0144-target-arm-A64-Implement-scalar-pairwise-ops.patch/target-arm

.pc/ubuntu/arm64/0144-target-arm-A64-Implement-scalar-pairwise-ops.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0145-target-arm-A64-Implement-remaining-integer-scalar-3-.patch

.pc/ubuntu/arm64/0145-target-arm-A64-Implement-remaining-integer-scalar-3-.patch/target-arm

.pc/ubuntu/arm64/0145-target-arm-A64-Implement-remaining-integer-scalar-3-.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0146-target-arm-A64-Add-SIMD-simple-64-bit-insns-from-sca.patch

.pc/ubuntu/arm64/0146-target-arm-A64-Add-SIMD-simple-64-bit-insns-from-sca.patch/target-arm

.pc/ubuntu/arm64/0146-target-arm-A64-Add-SIMD-simple-64-bit-insns-from-sca.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0147-target-arm-A64-Add-skeleton-decode-for-SIMD-2-reg-mi.patch

.pc/ubuntu/arm64/0147-target-arm-A64-Add-skeleton-decode-for-SIMD-2-reg-mi.patch/target-arm

.pc/ubuntu/arm64/0147-target-arm-A64-Add-skeleton-decode-for-SIMD-2-reg-mi.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0148-target-arm-A64-Implement-2-register-misc-compares-AB.patch

.pc/ubuntu/arm64/0148-target-arm-A64-Implement-2-register-misc-compares-AB.patch/target-arm

.pc/ubuntu/arm64/0148-target-arm-A64-Implement-2-register-misc-compares-AB.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0149-target-arm-A64-Implement-2-reg-misc-CNT-NOT-and-RBIT.patch

.pc/ubuntu/arm64/0149-target-arm-A64-Implement-2-reg-misc-CNT-NOT-and-RBIT.patch/target-arm

.pc/ubuntu/arm64/0149-target-arm-A64-Implement-2-reg-misc-CNT-NOT-and-RBIT.patch/target-arm/helper.h

.pc/ubuntu/arm64/0149-target-arm-A64-Implement-2-reg-misc-CNT-NOT-and-RBIT.patch/target-arm/neon_helper.c

.pc/ubuntu/arm64/0149-target-arm-A64-Implement-2-reg-misc-CNT-NOT-and-RBIT.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0150-target-arm-A64-Add-narrowing-2-reg-misc-instructions.patch

.pc/ubuntu/arm64/0150-target-arm-A64-Add-narrowing-2-reg-misc-instructions.patch/target-arm

.pc/ubuntu/arm64/0150-target-arm-A64-Add-narrowing-2-reg-misc-instructions.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0151-target-arm-A64-Add-2-reg-misc-REV-instructions.patch

.pc/ubuntu/arm64/0151-target-arm-A64-Add-2-reg-misc-REV-instructions.patch/target-arm

.pc/ubuntu/arm64/0151-target-arm-A64-Add-2-reg-misc-REV-instructions.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0152-target-arm-A64-Add-FNEG-and-FABS-to-the-SIMD-2-reg-m.patch

.pc/ubuntu/arm64/0152-target-arm-A64-Add-FNEG-and-FABS-to-the-SIMD-2-reg-m.patch/target-arm

.pc/ubuntu/arm64/0152-target-arm-A64-Add-FNEG-and-FABS-to-the-SIMD-2-reg-m.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0153-target-arm-Add-support-for-AArch32-64bit-VCVTB-and-V.patch

.pc/ubuntu/arm64/0153-target-arm-Add-support-for-AArch32-64bit-VCVTB-and-V.patch/target-arm

.pc/ubuntu/arm64/0153-target-arm-Add-support-for-AArch32-64bit-VCVTB-and-V.patch/target-arm/translate.c

.pc/ubuntu/arm64/0154-rules.mak-Support-.cc-as-a-C-source-file-suffix.patch

.pc/ubuntu/arm64/0154-rules.mak-Support-.cc-as-a-C-source-file-suffix.patch/rules.mak

.pc/ubuntu/arm64/0155-rules.mak-Link-with-C-if-we-have-a-C-compiler.patch

.pc/ubuntu/arm64/0155-rules.mak-Link-with-C-if-we-have-a-C-compiler.patch/rules.mak

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/LICENCE

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/README

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/a64

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/a64/assembler-a64.h

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/a64/constants-a64.h

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/a64/cpu-a64.h

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/a64/decoder-a64.cc

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/a64/decoder-a64.h

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/a64/disasm-a64.cc

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/a64/disasm-a64.h

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/a64/instructions-a64.cc

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/a64/instructions-a64.h

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/globals.h

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/platform.h

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/utils.cc

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/utils.h

.pc/ubuntu/arm64/0157-disas-libvixl-Fix-upstream-libvixl-compilation-issue.patch

.pc/ubuntu/arm64/0157-disas-libvixl-Fix-upstream-libvixl-compilation-issue.patch/disas

.pc/ubuntu/arm64/0157-disas-libvixl-Fix-upstream-libvixl-compilation-issue.patch/disas/libvixl

.pc/ubuntu/arm64/0157-disas-libvixl-Fix-upstream-libvixl-compilation-issue.patch/disas/libvixl/a64

.pc/ubuntu/arm64/0157-disas-libvixl-Fix-upstream-libvixl-compilation-issue.patch/disas/libvixl/a64/instructions-a64.h

.pc/ubuntu/arm64/0157-disas-libvixl-Fix-upstream-libvixl-compilation-issue.patch/disas/libvixl/globals.h

.pc/ubuntu/arm64/0157-disas-libvixl-Fix-upstream-libvixl-compilation-issue.patch/disas/libvixl/utils.h

.pc/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch

.pc/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch/configure

.pc/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch/disas

.pc/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch/disas.c

.pc/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch/disas/Makefile.objs

.pc/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch/disas/arm-a64.cc

.pc/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch/disas/libvixl

.pc/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch/disas/libvixl/Makefile.objs

.pc/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch/include

.pc/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch/include/disas

.pc/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch/include/disas/bfd.h

.pc/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch/target-arm

.pc/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/force-aarch64-uname-to-3.7.0-to-appease-glibc

.pc/ubuntu/arm64/force-aarch64-uname-to-3.7.0-to-appease-glibc/linux-user

.pc/ubuntu/arm64/force-aarch64-uname-to-3.7.0-to-appease-glibc/linux-user/main.c

debian/binfmts/qemu-aarch64

debian/patches/ubuntu/arm64/0050-target-arm-fix-build-with-gcc-4.8.2.patch

debian/patches/ubuntu/arm64/0051-target-arm-A64-add-support-for-ld-st-pair.patch

debian/patches/ubuntu/arm64/0052-target-arm-A64-add-support-for-ld-st-unsigned-imm.patch

debian/patches/ubuntu/arm64/0053-target-arm-A64-add-support-for-ld-st-with-reg-offset.patch

debian/patches/ubuntu/arm64/0054-target-arm-A64-add-support-for-ld-st-with-index.patch

debian/patches/ubuntu/arm64/0055-target-arm-A64-add-support-for-add-addi-sub-subi.patch

debian/patches/ubuntu/arm64/0056-target-arm-A64-add-support-for-move-wide-instruction.patch

debian/patches/ubuntu/arm64/0057-target-arm-A64-add-support-for-3-src-data-proc-insns.patch

debian/patches/ubuntu/arm64/0058-target-arm-A64-implement-SVC-BRK.patch

debian/patches/ubuntu/arm64/0059-target-arm-A64-Add-decoder-skeleton-for-FP-instructi.patch

debian/patches/ubuntu/arm64/0060-target-arm-A64-implement-FMOV.patch

debian/patches/ubuntu/arm64/0061-target-arm-Pull-add-one-cpreg-to-hashtable-into-its-.patch

debian/patches/ubuntu/arm64/0062-target-arm-Update-generic-cpreg-code-for-AArch64.patch

debian/patches/ubuntu/arm64/0063-target-arm-Remove-ARMCPU-CPUARMState-from-cpregs-API.patch

debian/patches/ubuntu/arm64/0064-target-arm-A64-Implement-MRS-MSR-SYS-SYSL.patch

debian/patches/ubuntu/arm64/0065-target-arm-A64-Implement-minimal-set-of-EL0-visible-.patch

debian/patches/ubuntu/arm64/0066-target-arm-Widen-thread-local-register-state-fields-.patch

debian/patches/ubuntu/arm64/0067-target-arm-A64-add-support-for-add-sub-with-carry.patch

debian/patches/ubuntu/arm64/0068-target-arm-A64-add-support-for-conditional-compare-i.patch

debian/patches/ubuntu/arm64/0069-target-arm-aarch64-add-support-for-ld-lit.patch

debian/patches/ubuntu/arm64/0070-target-arm-Widen-exclusive-access-support-struct-fie.patch

debian/patches/ubuntu/arm64/0071-target-arm-A64-support-for-ld-st-cl-exclusive.patch

debian/patches/ubuntu/arm64/0072-linux-user-AArch64-define-TARGET_CLONE_BACKWARDS.patch

debian/patches/ubuntu/arm64/0073-linux-user-AArch64-Use-correct-values-for-FPSR-FPCR-.patch

debian/patches/ubuntu/arm64/0074-.travis.yml-Add-aarch64-targets.patch

debian/patches/ubuntu/arm64/0075-default-configs-Add-config-for-aarch64-linux-user.patch

debian/patches/ubuntu/arm64/0076-target-arm-A64-Add-support-for-dumping-AArch64-VFP-r.patch

debian/patches/ubuntu/arm64/0077-target-arm-A64-Fix-vector-register-access-on-bigendi.patch

debian/patches/ubuntu/arm64/0078-target-arm-Use-VFP_BINOP-macro-for-min-max-minnum-ma.patch

debian/patches/ubuntu/arm64/0079-target-arm-A64-Add-Floating-point-data-processing-2-.patch

debian/patches/ubuntu/arm64/0080-target-arm-A64-Add-Floating-point-data-processing-3-.patch

debian/patches/ubuntu/arm64/0081-target-arm-A64-Add-fmov-scalar-immediate-instruction.patch

debian/patches/ubuntu/arm64/0082-target-arm-A64-Add-support-for-floating-point-compar.patch

debian/patches/ubuntu/arm64/0083-target-arm-A64-Add-support-for-floating-point-condit.patch

debian/patches/ubuntu/arm64/0084-target-arm-A64-Add-support-for-floating-point-cond-s.patch

debian/patches/ubuntu/arm64/0085-target-arm-Give-the-FPSCR-rounding-modes-names.patch

debian/patches/ubuntu/arm64/0086-softfloat-Fix-exception-flag-handling-for-float32_to.patch

debian/patches/ubuntu/arm64/0087-softfloat-Add-float-to-16bit-integer-conversions.patch

debian/patches/ubuntu/arm64/0088-softfloat-Add-16-bit-integer-to-float-conversions.patch

debian/patches/ubuntu/arm64/0089-softfloat-Make-the-int-to-float-functions-take-exact.patch

debian/patches/ubuntu/arm64/0090-softfloat-Fix-float64_to_uint64.patch

debian/patches/ubuntu/arm64/0091-softfloat-Only-raise-Invalid-when-conversions-to-int.patch

debian/patches/ubuntu/arm64/0092-softfloat-Fix-factor-2-error-for-scalbn-on-denormal-.patch

debian/patches/ubuntu/arm64/0093-softfloat-Add-float32_to_uint64.patch

debian/patches/ubuntu/arm64/0094-softfloat-Fix-float64_to_uint64_round_to_zero.patch

debian/patches/ubuntu/arm64/0095-softfloat-Fix-float64_to_uint32.patch

debian/patches/ubuntu/arm64/0096-softfloat-Fix-float64_to_uint32_round_to_zero.patch

debian/patches/ubuntu/arm64/0097-softfloat-Provide-complete-set-of-accessors-for-fp-s.patch

debian/patches/ubuntu/arm64/0098-softfloat-Factor-out-RoundAndPackFloat16-and-Normali.patch

debian/patches/ubuntu/arm64/0099-softfloat-Add-float16-float64-conversion-functions.patch

debian/patches/ubuntu/arm64/0100-softfloat-Refactor-code-handling-various-rounding-mo.patch

debian/patches/ubuntu/arm64/0101-softfloat-Add-support-for-ties-away-rounding.patch

debian/patches/ubuntu/arm64/0102-target-arm-Prepare-VFP_CONV_FIX-helpers-for-A64-uses.patch

debian/patches/ubuntu/arm64/0103-target-arm-Rename-A32-VFP-conversion-helpers.patch

debian/patches/ubuntu/arm64/0104-target-arm-Ignore-most-exceptions-from-scalbn-when-d.patch

debian/patches/ubuntu/arm64/0105-target-arm-A64-Add-extra-VFP-fixed-point-conversion-.patch

debian/patches/ubuntu/arm64/0106-target-arm-A64-Add-floating-point-fixed-point-instru.patch

debian/patches/ubuntu/arm64/0107-target-arm-A64-Add-floating-point-integer-conversion.patch

debian/patches/ubuntu/arm64/0108-target-arm-A64-Add-1-source-32-to-32-and-64-to-64-FP.patch

debian/patches/ubuntu/arm64/0109-target-arm-A64-Add-support-for-FCVT-between-half-sin.patch

debian/patches/ubuntu/arm64/0110-target-arm-remove-raw_read-write-duplication.patch

debian/patches/ubuntu/arm64/0111-arm-fix-compile-on-bigendian-host.patch

debian/patches/ubuntu/arm64/0112-target-arm-A64-Add-SIMD-ld-st-multiple.patch

debian/patches/ubuntu/arm64/0113-target-arm-A64-Add-SIMD-ld-st-single.patch

debian/patches/ubuntu/arm64/0114-target-arm-A64-Add-decode-skeleton-for-SIMD-data-pro.patch

debian/patches/ubuntu/arm64/0115-target-arm-A64-Add-SIMD-EXT.patch

debian/patches/ubuntu/arm64/0116-target-arm-A64-Add-SIMD-TBL-TBLX.patch

debian/patches/ubuntu/arm64/0117-target-arm-A64-Add-SIMD-ZIP-UZP-TRN.patch

debian/patches/ubuntu/arm64/0118-target-arm-A64-Add-SIMD-across-lanes-instructions.patch

debian/patches/ubuntu/arm64/0119-target-arm-A64-Add-SIMD-copy-operations.patch

debian/patches/ubuntu/arm64/0120-target-arm-A64-Add-SIMD-modified-immediate-group.patch

debian/patches/ubuntu/arm64/0121-target-arm-A64-Add-SIMD-scalar-copy-instructions.patch

debian/patches/ubuntu/arm64/0122-target-arm-Move-arm_rmode_to_sf-to-a-shared-location.patch

debian/patches/ubuntu/arm64/0123-target-arm-Add-AArch32-FP-VRINTA-VRINTN-VRINTP-and-V.patch

debian/patches/ubuntu/arm64/0124-target-arm-Add-support-for-AArch32-FP-VRINTR.patch

debian/patches/ubuntu/arm64/0125-target-arm-Add-support-for-AArch32-FP-VRINTZ.patch

debian/patches/ubuntu/arm64/0126-target-arm-Add-support-for-AArch32-FP-VRINTX.patch

debian/patches/ubuntu/arm64/0127-target-arm-Add-support-for-AArch32-SIMD-VRINTX.patch

debian/patches/ubuntu/arm64/0128-target-arm-Add-set_neon_rmode-helper.patch

debian/patches/ubuntu/arm64/0129-target-arm-Add-AArch32-SIMD-VRINTA-VRINTN-VRINTP-VRI.patch

debian/patches/ubuntu/arm64/0130-target-arm-Add-AArch32-FP-VCVTA-VCVTN-VCVTP-and-VCVT.patch

debian/patches/ubuntu/arm64/0131-target-arm-Add-AArch32-SIMD-VCVTA-VCVTN-VCVTP-and-VC.patch

debian/patches/ubuntu/arm64/0132-target-arm-A64-Add-SIMD-three-different-multiply-acc.patch

debian/patches/ubuntu/arm64/0133-target-arm-A64-Add-SIMD-three-different-ABDL-instruc.patch

debian/patches/ubuntu/arm64/0134-target-arm-A64-Add-SIMD-scalar-3-same-add-sub-and-co.patch

debian/patches/ubuntu/arm64/0135-target-arm-A64-Add-top-level-decode-for-SIMD-3-same-.patch

debian/patches/ubuntu/arm64/0136-target-arm-A64-Add-logic-ops-from-SIMD-3-same-group.patch

debian/patches/ubuntu/arm64/0137-target-arm-A64-Add-integer-ops-from-SIMD-3-same-grou.patch

debian/patches/ubuntu/arm64/0138-target-arm-A64-Add-simple-SIMD-3-same-floating-point.patch

debian/patches/ubuntu/arm64/0139-target-arm-A64-Add-SIMD-shift-by-immediate.patch

debian/patches/ubuntu/arm64/0140-target-arm-A64-Implement-SIMD-3-reg-same-shift-and-s.patch

debian/patches/ubuntu/arm64/0141-target-arm-A64-Implement-remaining-non-pairwise-int-.patch

debian/patches/ubuntu/arm64/0142-target-arm-A64-Implement-pairwise-integer-ops-from-3.patch

debian/patches/ubuntu/arm64/0143-tcg-Add-TCGV_UNUSED_PTR-TCGV_IS_UNUSED_PTR-TCGV_EQUA.patch

debian/patches/ubuntu/arm64/0144-target-arm-A64-Implement-scalar-pairwise-ops.patch

debian/patches/ubuntu/arm64/0145-target-arm-A64-Implement-remaining-integer-scalar-3-.patch

debian/patches/ubuntu/arm64/0146-target-arm-A64-Add-SIMD-simple-64-bit-insns-from-sca.patch

debian/patches/ubuntu/arm64/0147-target-arm-A64-Add-skeleton-decode-for-SIMD-2-reg-mi.patch

debian/patches/ubuntu/arm64/0148-target-arm-A64-Implement-2-register-misc-compares-AB.patch

debian/patches/ubuntu/arm64/0149-target-arm-A64-Implement-2-reg-misc-CNT-NOT-and-RBIT.patch

debian/patches/ubuntu/arm64/0150-target-arm-A64-Add-narrowing-2-reg-misc-instructions.patch

debian/patches/ubuntu/arm64/0151-target-arm-A64-Add-2-reg-misc-REV-instructions.patch

debian/patches/ubuntu/arm64/0152-target-arm-A64-Add-FNEG-and-FABS-to-the-SIMD-2-reg-m.patch

debian/patches/ubuntu/arm64/0153-target-arm-Add-support-for-AArch32-64bit-VCVTB-and-V.patch

debian/patches/ubuntu/arm64/0154-rules.mak-Support-.cc-as-a-C-source-file-suffix.patch

debian/patches/ubuntu/arm64/0155-rules.mak-Link-with-C-if-we-have-a-C-compiler.patch

debian/patches/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch

debian/patches/ubuntu/arm64/0157-disas-libvixl-Fix-upstream-libvixl-compilation-issue.patch

debian/patches/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch

debian/patches/ubuntu/arm64/force-aarch64-uname-to-3.7.0-to-appease-glibc

default-configs/aarch64-linux-user.mak

disas/arm-a64.cc

disas/libvixl

disas/libvixl/LICENCE

disas/libvixl/Makefile.objs

disas/libvixl/README

disas/libvixl/a64

disas/libvixl/a64/assembler-a64.h

disas/libvixl/a64/constants-a64.h

disas/libvixl/a64/cpu-a64.h

disas/libvixl/a64/decoder-a64.cc

disas/libvixl/a64/decoder-a64.h

disas/libvixl/a64/disasm-a64.cc

disas/libvixl/a64/disasm-a64.h

disas/libvixl/a64/instructions-a64.cc

disas/libvixl/a64/instructions-a64.h

disas/libvixl/globals.h

disas/libvixl/platform.h

disas/libvixl/utils.cc

disas/libvixl/utils.h

files modified:
.pc/applied-patches

.travis.yml

configure

debian/changelog

debian/patches/series

debian/rules

disas.c

disas/Makefile.objs

fpu/softfloat.c

include/disas/bfd.h

include/fpu/softfloat.h

linux-user/aarch64/syscall.h

linux-user/aarch64/target_cpu.h

linux-user/arm/target_cpu.h

linux-user/main.c

linux-user/signal.c

rules.mak

target-arm/cpu.h

target-arm/cpu64.c

target-arm/helper-a64.c

target-arm/helper-a64.h

target-arm/helper.c

target-arm/helper.h

target-arm/kvm-consts.h

target-arm/machine.c

target-arm/neon_helper.c

target-arm/translate-a64.c

target-arm/translate.c

target-arm/translate.h

tcg/tcg.h

Show diffs side-by-side

added added

removed removed

.pc/ubuntu/arm64/0094-softfloat-Fix-float64_to_uint64_round_to_zero.patch/fpu/softfloat.c

/*
 * QEMU float support
 *
 * Derived from SoftFloat.
 */

/*============================================================================

This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic

Package, Release 2b.

Written by John R. Hauser. This work was made possible in part by the

International Computer Science Institute, located at Suite 600, 1947 Center

Street, Berkeley, California 94704. Funding was partially provided by the

National Science Foundation under grant MIP-9311980. The original version

of this code was written as part of a project to build a fixed-point vector

processor in collaboration with the University of California at Berkeley,

overseen by Profs. Nelson Morgan and John Wawrzynek. More information

is available through the Web page `http://www.cs.berkeley.edu/~jhauser/

arithmetic/SoftFloat.html'.

THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has

been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES

RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS

AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,

COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE

EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE

INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR

OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.

Derivative works are acceptable, even for commercial purposes, so long as

(1) the source code for the derivative work includes prominent notice that

the work is derivative, and (2) the source code includes prominent notice with

these four paragraphs for those parts of this code that are retained.

=============================================================================*/

/* softfloat (and in particular the code in softfloat-specialize.h) is
 * target-dependent and needs the TARGET_* macros.
 */

#include "config.h"

#include "fpu/softfloat.h"

/*----------------------------------------------------------------------------

| Primitive arithmetic functions, including multi-word arithmetic, and

| division and square root approximations. (Can be specialized to target if

| desired.)

*----------------------------------------------------------------------------*/

#include "softfloat-macros.h"

/*----------------------------------------------------------------------------

| Functions and definitions to determine: (1) whether tininess for underflow

| is detected before or after rounding by default, (2) what (if anything)

| happens when exceptions are raised, (3) how signaling NaNs are distinguished

| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs

| are propagated from function inputs to output. These details are target-

| specific.

*----------------------------------------------------------------------------*/

#include "softfloat-specialize.h"

void set_float_rounding_mode(int val STATUS_PARAM)
{
    /* Store the caller-supplied rounding mode in the status accessed
     * through STATUS(). */
    STATUS(float_rounding_mode) = val;
}

void set_float_exception_flags(int val STATUS_PARAM)
{
    /* Overwrite (not OR into) the accumulated exception flags with `val'. */
    STATUS(float_exception_flags) = val;
}

void set_floatx80_rounding_precision(int val STATUS_PARAM)
{
    /* Store the caller-supplied floatx80 rounding precision in the status
     * accessed through STATUS(). */
    STATUS(floatx80_rounding_precision) = val;
}

/*----------------------------------------------------------------------------

| Returns the fraction bits of the half-precision floating-point value `a'.

*----------------------------------------------------------------------------*/

INLINE uint32_t extractFloat16Frac(float16 a)
{
    /* Keep only the low 10 bits of the raw encoding. */
    const uint32_t raw = float16_val(a);

    return raw & 0x3ff;
}

/*----------------------------------------------------------------------------

| Returns the exponent bits of the half-precision floating-point value `a'.

*----------------------------------------------------------------------------*/

INLINE int_fast16_t extractFloat16Exp(float16 a)
{
    /* Drop the 10 low bits, then keep the next 5 bits. */
    const uint32_t raw = float16_val(a);

    return (raw >> 10) & 0x1f;
}

/*----------------------------------------------------------------------------

| Returns the sign bit of the half-precision floating-point value `a'.

*----------------------------------------------------------------------------*/

INLINE flag extractFloat16Sign(float16 a)
{
    /* Shift the raw encoding right by 15 so that only bit 15 and above
     * remain in the result. */
    return float16_val(a) >> 15;
}

103

104

/*----------------------------------------------------------------------------

105

| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6

106

| and 7, and returns the properly rounded 32-bit integer corresponding to the

107

| input. If `zSign' is 1, the input is negated before being converted to an

108

| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input

109

| is simply rounded to an integer, with the inexact exception raised if the

110

| input cannot be represented exactly as an integer. However, if the fixed-

111

| point input is too large, the invalid exception is raised and the largest

112

| positive or negative integer is returned.

113

*----------------------------------------------------------------------------*/

114

115

static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
{
    int8 roundingMode;
    flag roundNearestEven;
    int8 roundIncrement, roundBits;
    uint32_t magnitude;
    int32_t z;

    /* Choose the value added to the 7 fraction bits before they are
     * shifted out: 0x40 rounds to nearest, 0 truncates, and 0x7F rounds
     * away from zero (used for the directed mode that points away from
     * zero for this sign). */
    roundingMode = STATUS(float_rounding_mode);
    roundNearestEven = (roundingMode == float_round_nearest_even);
    if (roundNearestEven) {
        roundIncrement = 0x40;
    } else if (roundingMode == float_round_to_zero) {
        roundIncrement = 0;
    } else if (zSign) {
        roundIncrement = (roundingMode == float_round_up) ? 0 : 0x7F;
    } else {
        roundIncrement = (roundingMode == float_round_down) ? 0 : 0x7F;
    }

    roundBits = absZ & 0x7F;
    absZ = (absZ + roundIncrement) >> 7;
    /* Exact tie under round-to-nearest-even: clear the low bit so the
     * result is even. */
    if (roundNearestEven && roundBits == 0x40) {
        absZ &= ~(uint64_t)1;
    }

    /* Apply the sign in unsigned arithmetic.  Negating an int32_t that
     * holds INT32_MIN (rounded magnitude exactly 2^31, i.e. the valid
     * result -2147483648) would be signed overflow, which is undefined
     * behaviour; unsigned subtraction wraps and yields the same bits. */
    magnitude = (uint32_t)absZ;
    if (zSign) {
        magnitude = 0u - magnitude;
    }
    z = (int32_t)magnitude;

    /* Out of range if the magnitude needs more than 32 bits, or if the
     * sign of the packed result disagrees with the requested sign. */
    if ((absZ >> 32) || (z && ((z < 0) ^ zSign))) {
        float_raise(float_flag_invalid STATUS_VAR);
        return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
    }
    if (roundBits) {
        STATUS(float_exception_flags) |= float_flag_inexact;
    }
    return z;
}

152

153

/*----------------------------------------------------------------------------

154

| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and

155

| `absZ1', with binary point between bits 63 and 64 (between the input words),

156

| and returns the properly rounded 64-bit integer corresponding to the input.

157

| If `zSign' is 1, the input is negated before being converted to an integer.

158

| Ordinarily, the fixed-point input is simply rounded to an integer, with

159

| the inexact exception raised if the input cannot be represented exactly as

160

| an integer. However, if the fixed-point input is too large, the invalid

161

| exception is raised and the largest positive or negative integer is

162

| returned.

163

*----------------------------------------------------------------------------*/

164

165

static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
{
    int8 roundingMode;
    flag roundNearestEven, increment;
    uint64_t magnitude;
    int64_t z;

    /* Decide whether the integer part absZ0 must be bumped by one.
     * absZ1 holds the fraction; its top bit set means fraction >= 1/2. */
    roundingMode = STATUS(float_rounding_mode);
    roundNearestEven = (roundingMode == float_round_nearest_even);
    if (roundNearestEven) {
        increment = ((int64_t) absZ1 < 0);
    } else if (roundingMode == float_round_to_zero) {
        increment = 0;
    } else if (zSign) {
        increment = (roundingMode == float_round_down) && absZ1;
    } else {
        increment = (roundingMode == float_round_up) && absZ1;
    }

    if (increment) {
        ++absZ0;
        if (absZ0 == 0) {
            /* The increment carried out of 64 bits. */
            goto overflow;
        }
        /* Exact tie under round-to-nearest-even: force an even result. */
        if (roundNearestEven && (uint64_t) (absZ1 << 1) == 0) {
            absZ0 &= ~(uint64_t)1;
        }
    }

    /* Apply the sign in unsigned arithmetic.  Negating an int64_t that
     * holds INT64_MIN (magnitude exactly 2^63, a valid negative result)
     * would be signed overflow, which is undefined behaviour; unsigned
     * subtraction wraps and yields the same bits. */
    magnitude = absZ0;
    if (zSign) {
        magnitude = (uint64_t)0 - magnitude;
    }
    z = (int64_t)magnitude;

    /* A non-zero result whose sign disagrees with zSign did not fit. */
    if (z && ((z < 0) ^ zSign)) {
        goto overflow;
    }
    if (absZ1) {
        STATUS(float_exception_flags) |= float_flag_inexact;
    }
    return z;

 overflow:
    float_raise(float_flag_invalid STATUS_VAR);
    return
          zSign ? (int64_t) LIT64( 0x8000000000000000 )
        : LIT64( 0x7FFFFFFFFFFFFFFF );
}

205

206

/*----------------------------------------------------------------------------

207

| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and

208

| `absZ1', with binary point between bits 63 and 64 (between the input words),

209

| and returns the properly rounded 64-bit unsigned integer corresponding to the

210

| input. Ordinarily, the fixed-point input is simply rounded to an integer,

211

| with the inexact exception raised if the input cannot be represented exactly

212

| as an integer. However, if the fixed-point input is too large, the invalid

213

| exception is raised and the largest unsigned integer is returned.

214

*----------------------------------------------------------------------------*/

215

216

static int64 roundAndPackUint64(flag zSign, uint64_t absZ0,

217

uint64_t absZ1 STATUS_PARAM)

218

{

219

int8 roundingMode;

220

flag roundNearestEven, increment;

221

222

roundingMode = STATUS(float_rounding_mode);

223

roundNearestEven = (roundingMode == float_round_nearest_even);

224

increment = ((int64_t)absZ1 < 0);

225

if (!roundNearestEven) {

226

if (roundingMode == float_round_to_zero) {

227

increment = 0;

228

} else if (absZ1) {

229

if (zSign) {

230

increment = (roundingMode == float_round_down) && absZ1;

231

} else {

232

increment = (roundingMode == float_round_up) && absZ1;

233

}

234

}

235

}

236

if (increment) {

237

++absZ0;

238

if (absZ0 == 0) {

239

float_raise(float_flag_invalid STATUS_VAR);

240

return LIT64(0xFFFFFFFFFFFFFFFF);

241

}

242

absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);

243

}

244

245

if (zSign && absZ0) {

246

float_raise(float_flag_invalid STATUS_VAR);

247

return 0;

248

}

249

250

if (absZ1) {

251

STATUS(float_exception_flags) |= float_flag_inexact;

252

}

253

return absZ0;

254

}

255

256

/*----------------------------------------------------------------------------

257

| Returns the fraction bits of the single-precision floating-point value `a'.

258

*----------------------------------------------------------------------------*/

259

260

INLINE uint32_t extractFloat32Frac( float32 a )
{
    /* Keep only the low 23 bits of the raw encoding. */
    const uint32_t raw = float32_val(a);

    return raw & 0x007FFFFF;
}

266

267

/*----------------------------------------------------------------------------

268

| Returns the exponent bits of the single-precision floating-point value `a'.

269

*----------------------------------------------------------------------------*/

270

271

INLINE int_fast16_t extractFloat32Exp(float32 a)
{
    /* Drop the 23 low bits, then keep the next 8 bits. */
    const uint32_t raw = float32_val(a);

    return (raw >> 23) & 0xFF;
}

277

278

/*----------------------------------------------------------------------------

279

| Returns the sign bit of the single-precision floating-point value `a'.

280

*----------------------------------------------------------------------------*/

281

282

INLINE flag extractFloat32Sign( float32 a )
{
    /* Shift the raw encoding right by 31 so only the top bit remains. */
    const uint32_t raw = float32_val(a);

    return raw >> 31;
}

288

289

/*----------------------------------------------------------------------------

290

| If `a' is denormal and we are in flush-to-zero mode then set the

291

| input-denormal exception and return zero. Otherwise just return the value.

292

*----------------------------------------------------------------------------*/

293

static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
{
    /* Nothing to do unless input flushing is enabled. */
    if (!STATUS(flush_inputs_to_zero)) {
        return a;
    }

    /* Only a zero exponent field with a non-zero fraction is flushed. */
    if (extractFloat32Exp(a) != 0 || extractFloat32Frac(a) == 0) {
        return a;
    }

    /* Report the flush and return a zero that keeps only the sign bit. */
    float_raise(float_flag_input_denormal STATUS_VAR);
    return make_float32(float32_val(a) & 0x80000000);
}

303

304

/*----------------------------------------------------------------------------

305

| Normalizes the subnormal single-precision floating-point value represented

306

| by the denormalized significand `aSig'. The normalized exponent and

307

| significand are stored at the locations pointed to by `zExpPtr' and

308

| `zSigPtr', respectively.

309

*----------------------------------------------------------------------------*/

310

311

static void

312

normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)

313

{

314

int8 shiftCount;

315

316

shiftCount = countLeadingZeros32( aSig ) - 8;

317

*zSigPtr = aSig<<shiftCount;

318

*zExpPtr = 1 - shiftCount;

319

320

}

321

322

/*----------------------------------------------------------------------------

323

| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a

324

| single-precision floating-point value, returning the result. After being

325

| shifted into the proper positions, the three fields are simply added

326

| together to form the result. This means that any integer portion of `zSig'

327

| will be added into the exponent. Since a properly normalized significand

328

| will have an integer portion equal to 1, the `zExp' input should be 1 less

329

| than the desired result exponent whenever `zSig' is a complete, normalized

330

| significand.

331

*----------------------------------------------------------------------------*/

332

333

INLINE float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
{
    /* Position each field, then add (not OR) them so that any integer
     * part of zSig carries into the exponent field. */
    const uint32_t signField = ((uint32_t) zSign) << 31;
    const uint32_t expField = ((uint32_t) zExp) << 23;

    return make_float32(signField + expField + zSig);
}

340

341

/*----------------------------------------------------------------------------

342

| Takes an abstract floating-point value having sign `zSign', exponent `zExp',

343

| and significand `zSig', and returns the proper single-precision floating-

344

| point value corresponding to the abstract input. Ordinarily, the abstract

345

| value is simply rounded and packed into the single-precision format, with

346

| the inexact exception raised if the abstract input cannot be represented

347

| exactly. However, if the abstract value is too large, the overflow and

348

| inexact exceptions are raised and an infinity or maximal finite value is

349

| returned. If the abstract value is too small, the input value is rounded to

350

| a subnormal number, and the underflow and inexact exceptions are raised if

351

| the abstract input cannot be represented exactly as a subnormal single-

352

| precision floating-point number.

353

| The input significand `zSig' has its binary point between bits 30

354

| and 29, which is 7 bits to the left of the usual location. This shifted

355

| significand must be normalized or smaller. If `zSig' is not normalized,

356

| `zExp' must be 0; in that case, the result returned is a subnormal number,

357

| and it must not require rounding. In the usual case that `zSig' is

358

| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.

359

| The handling of underflow and overflow follows the IEC/IEEE Standard for

360

| Binary Floating-Point Arithmetic.

361

*----------------------------------------------------------------------------*/

362

363

static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
{
    int8 roundingMode;
    flag roundNearestEven;
    int8 roundIncrement, roundBits;
    flag isTiny;

    roundingMode = STATUS(float_rounding_mode);
    roundNearestEven = (roundingMode == float_round_nearest_even);

    /* Value added below the 7 low bits that the final shift discards:
     * 0x40 for nearest-even, 0 when the result must not grow in
     * magnitude, 0x7F when any discarded bit must bump the result. */
    if (roundNearestEven) {
        roundIncrement = 0x40;
    } else if (roundingMode == float_round_to_zero) {
        roundIncrement = 0;
    } else if (zSign) {
        roundIncrement = (roundingMode == float_round_up) ? 0 : 0x7F;
    } else {
        roundIncrement = (roundingMode == float_round_down) ? 0 : 0x7F;
    }
    roundBits = zSig & 0x7F;

    /* The unsigned cast makes negative exponents compare as large values,
     * so this single test catches both the overflow and underflow ranges. */
    if (0xFD <= (uint16_t) zExp) {
        if ((0xFD < zExp)
            || ((zExp == 0xFD)
                && ((int32_t) (zSig + roundIncrement) < 0))) {
            float_raise(float_flag_overflow | float_flag_inexact STATUS_VAR);
            /* With a zero increment the all-ones significand is added into
             * exponent 0xFF; otherwise the significand passed is zero. */
            return packFloat32(zSign, 0xFF, -(roundIncrement == 0));
        }
        if (zExp < 0) {
            if (STATUS(flush_to_zero)) {
                float_raise(float_flag_output_denormal STATUS_VAR);
                return packFloat32(zSign, 0, 0);
            }
            isTiny =
                (STATUS(float_detect_tininess) == float_tininess_before_rounding)
                || (zExp < -1)
                || (zSig + roundIncrement < 0x80000000);
            shift32RightJamming(zSig, -zExp, &zSig);
            zExp = 0;
            roundBits = zSig & 0x7F;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow STATUS_VAR);
            }
        }
    }

    if (roundBits) {
        STATUS(float_exception_flags) |= float_flag_inexact;
    }
    zSig = (zSig + roundIncrement) >> 7;
    /* Exact tie under nearest-even: clear the low bit. */
    zSig &= ~((roundBits == 0x40) & roundNearestEven);
    if (zSig == 0) {
        zExp = 0;
    }
    return packFloat32(zSign, zExp, zSig);
}

418

419

/*----------------------------------------------------------------------------

420

| Takes an abstract floating-point value having sign `zSign', exponent `zExp',

421

| and significand `zSig', and returns the proper single-precision floating-

422

| point value corresponding to the abstract input. This routine is just like

423

| `roundAndPackFloat32' except that `zSig' does not have to be normalized.

424

| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''

425

| floating-point exponent.

426

*----------------------------------------------------------------------------*/

427

428

static float32

429

normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)

430

{

431

int8 shiftCount;

432

433

shiftCount = countLeadingZeros32( zSig ) - 1;

434

return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);

435

436

}

437

438

/*----------------------------------------------------------------------------

439

| Returns the fraction bits of the double-precision floating-point value `a'.

440

*----------------------------------------------------------------------------*/

441

442

INLINE uint64_t extractFloat64Frac(float64 a)
{
    /* Mask selecting the low 52 bits of the raw value. */
    const uint64_t fracMask = LIT64(0x000FFFFFFFFFFFFF);

    return float64_val(a) & fracMask;
}

448

449

/*----------------------------------------------------------------------------

450

| Returns the exponent bits of the double-precision floating-point value `a'.

451

*----------------------------------------------------------------------------*/

452

453

INLINE int_fast16_t extractFloat64Exp(float64 a)
{
    /* Bits 62..52 of the raw value, i.e. 11 bits after dropping the
     * low 52. */
    return (float64_val(a) >> 52) & 0x7FF;
}

459

460

/*----------------------------------------------------------------------------

461

| Returns the sign bit of the double-precision floating-point value `a'.

462

*----------------------------------------------------------------------------*/

463

464

INLINE flag extractFloat64Sign(float64 a)
{
    /* Bit 63 of the raw value. */
    return float64_val(a) >> 63;
}

470

471

/*----------------------------------------------------------------------------

472

| If `a' is denormal and we are in flush-to-zero mode then set the

473

| input-denormal exception and return zero. Otherwise just return the value.

474

*----------------------------------------------------------------------------*/

475

static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
{
    /* Nothing to do unless input flushing is enabled. */
    if (!STATUS(flush_inputs_to_zero)) {
        return a;
    }
    /* Only a zero exponent with a non-zero fraction is squashed. */
    if (extractFloat64Exp(a) != 0 || extractFloat64Frac(a) == 0) {
        return a;
    }
    float_raise(float_flag_input_denormal STATUS_VAR);
    /* Keep only bit 63, producing a zero with the sign of the input. */
    return make_float64(float64_val(a) & (1ULL << 63));
}

485

486

/*----------------------------------------------------------------------------

487

| Normalizes the subnormal double-precision floating-point value represented

488

| by the denormalized significand `aSig'. The normalized exponent and

489

| significand are stored at the locations pointed to by `zExpPtr' and

490

| `zSigPtr', respectively.

491

*----------------------------------------------------------------------------*/

492

493

static void

494

normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)

495

{

496

int8 shiftCount;

497

498

shiftCount = countLeadingZeros64( aSig ) - 11;

499

*zSigPtr = aSig<<shiftCount;

500

*zExpPtr = 1 - shiftCount;

501

502

}

503

504

/*----------------------------------------------------------------------------

505

| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a

506

| double-precision floating-point value, returning the result. After being

507

| shifted into the proper positions, the three fields are simply added

508

| together to form the result. This means that any integer portion of `zSig'

509

| will be added into the exponent. Since a properly normalized significand

510

| will have an integer portion equal to 1, the `zExp' input should be 1 less

511

| than the desired result exponent whenever `zSig' is a complete, normalized

512

| significand.

513

*----------------------------------------------------------------------------*/

514

515

INLINE float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
{
    /* The three fields are summed, not OR-ed, so any carry out of the
     * significand field propagates into the exponent field. */
    uint64_t signField = ((uint64_t) zSign) << 63;
    uint64_t expField = ((uint64_t) zExp) << 52;

    return make_float64(signField + expField + zSig);
}

522

523

/*----------------------------------------------------------------------------

524

| Takes an abstract floating-point value having sign `zSign', exponent `zExp',

525

| and significand `zSig', and returns the proper double-precision floating-

526

| point value corresponding to the abstract input. Ordinarily, the abstract

527

| value is simply rounded and packed into the double-precision format, with

528

| the inexact exception raised if the abstract input cannot be represented

529

| exactly. However, if the abstract value is too large, the overflow and

530

| inexact exceptions are raised and an infinity or maximal finite value is

531

| returned. If the abstract value is too small, the input value is rounded

532

| to a subnormal number, and the underflow and inexact exceptions are raised

533

| if the abstract input cannot be represented exactly as a subnormal double-

534

| precision floating-point number.

535

| The input significand `zSig' has its binary point between bits 62

536

| and 61, which is 10 bits to the left of the usual location. This shifted

537

| significand must be normalized or smaller. If `zSig' is not normalized,

538

| `zExp' must be 0; in that case, the result returned is a subnormal number,

539

| and it must not require rounding. In the usual case that `zSig' is

540

| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.

541

| The handling of underflow and overflow follows the IEC/IEEE Standard for

542

| Binary Floating-Point Arithmetic.

543

*----------------------------------------------------------------------------*/

544

545

static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
{
    int8 roundingMode;
    flag roundNearestEven;
    int_fast16_t roundIncrement, roundBits;
    flag isTiny;

    roundingMode = STATUS(float_rounding_mode);
    roundNearestEven = (roundingMode == float_round_nearest_even);

    /* Value added below the 10 low bits that the final shift discards:
     * 0x200 for nearest-even, 0 when the result must not grow in
     * magnitude, 0x3FF when any discarded bit must bump the result. */
    if (roundNearestEven) {
        roundIncrement = 0x200;
    } else if (roundingMode == float_round_to_zero) {
        roundIncrement = 0;
    } else if (zSign) {
        roundIncrement = (roundingMode == float_round_up) ? 0 : 0x3FF;
    } else {
        roundIncrement = (roundingMode == float_round_down) ? 0 : 0x3FF;
    }
    roundBits = zSig & 0x3FF;

    /* The unsigned cast makes negative exponents compare as large values,
     * so this single test catches both the overflow and underflow ranges. */
    if (0x7FD <= (uint16_t) zExp) {
        if ((0x7FD < zExp)
            || ((zExp == 0x7FD)
                && ((int64_t) (zSig + roundIncrement) < 0))) {
            float_raise(float_flag_overflow | float_flag_inexact STATUS_VAR);
            /* With a zero increment the all-ones significand is added into
             * exponent 0x7FF; otherwise the significand passed is zero. */
            return packFloat64(zSign, 0x7FF, -(roundIncrement == 0));
        }
        if (zExp < 0) {
            if (STATUS(flush_to_zero)) {
                float_raise(float_flag_output_denormal STATUS_VAR);
                return packFloat64(zSign, 0, 0);
            }
            isTiny =
                (STATUS(float_detect_tininess) == float_tininess_before_rounding)
                || (zExp < -1)
                || (zSig + roundIncrement < LIT64(0x8000000000000000));
            shift64RightJamming(zSig, -zExp, &zSig);
            zExp = 0;
            roundBits = zSig & 0x3FF;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow STATUS_VAR);
            }
        }
    }

    if (roundBits) {
        STATUS(float_exception_flags) |= float_flag_inexact;
    }
    zSig = (zSig + roundIncrement) >> 10;
    /* Exact tie under nearest-even: clear the low bit. */
    zSig &= ~((roundBits == 0x200) & roundNearestEven);
    if (zSig == 0) {
        zExp = 0;
    }
    return packFloat64(zSign, zExp, zSig);
}

600

601

/*----------------------------------------------------------------------------

602

| Takes an abstract floating-point value having sign `zSign', exponent `zExp',

603

| and significand `zSig', and returns the proper double-precision floating-

604

| point value corresponding to the abstract input. This routine is just like

605

| `roundAndPackFloat64' except that `zSig' does not have to be normalized.

606

| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''

607

| floating-point exponent.

608

*----------------------------------------------------------------------------*/

609

610

static float64

611

normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)

612

{

613

int8 shiftCount;

614

615

shiftCount = countLeadingZeros64( zSig ) - 1;

616

return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);

617

618

}

619

620

/*----------------------------------------------------------------------------

621

| Returns the fraction bits of the extended double-precision floating-point

622

| value `a'.

623

*----------------------------------------------------------------------------*/

624

625

INLINE uint64_t extractFloatx80Frac(floatx80 a)
{
    /* The `low' member is returned unmodified. */
    return a.low;
}

631

632

/*----------------------------------------------------------------------------

633

| Returns the exponent bits of the extended double-precision floating-point

634

| value `a'.

635

*----------------------------------------------------------------------------*/

636

637

INLINE int32 extractFloatx80Exp(floatx80 a)
{
    /* Low 15 bits of the `high' member. */
    return a.high & 0x7FFF;
}

643

644

/*----------------------------------------------------------------------------

645

| Returns the sign bit of the extended double-precision floating-point value

646

| `a'.

647

*----------------------------------------------------------------------------*/

648

649

INLINE flag extractFloatx80Sign(floatx80 a)
{
    /* Everything above the low 15 bits of the `high' member. */
    return a.high >> 15;
}

655

656

/*----------------------------------------------------------------------------

657

| Normalizes the subnormal extended double-precision floating-point value

658

| represented by the denormalized significand `aSig'. The normalized exponent

659

| and significand are stored at the locations pointed to by `zExpPtr' and

660

| `zSigPtr', respectively.

661

*----------------------------------------------------------------------------*/

662

663

static void

664

normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )

665

{

666

int8 shiftCount;

667

668

shiftCount = countLeadingZeros64( aSig );

669

*zSigPtr = aSig<<shiftCount;

670

*zExpPtr = 1 - shiftCount;

671

672

}

673

674

/*----------------------------------------------------------------------------

675

| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an

676

| extended double-precision floating-point value, returning the result.

677

*----------------------------------------------------------------------------*/

678

679

INLINE floatx80 packFloatx80(flag zSign, int32 zExp, uint64_t zSig)
{
    floatx80 result;

    /* Sign goes in bit 15 of `high'; the exponent is added below it. */
    result.high = (((uint16_t) zSign) << 15) + zExp;
    result.low = zSig;
    return result;
}

688

689

/*----------------------------------------------------------------------------

690

| Takes an abstract floating-point value having sign `zSign', exponent `zExp',

691

| and extended significand formed by the concatenation of `zSig0' and `zSig1',

692

| and returns the proper extended double-precision floating-point value

693

| corresponding to the abstract input. Ordinarily, the abstract value is

694

| rounded and packed into the extended double-precision format, with the

695

| inexact exception raised if the abstract input cannot be represented

696

| exactly. However, if the abstract value is too large, the overflow and

697

| inexact exceptions are raised and an infinity or maximal finite value is

698

| returned. If the abstract value is too small, the input value is rounded to

699

| a subnormal number, and the underflow and inexact exceptions are raised if

700

| the abstract input cannot be represented exactly as a subnormal extended

701

| double-precision floating-point number.

702

| If `roundingPrecision' is 32 or 64, the result is rounded to the same

703

| number of bits as single or double precision, respectively. Otherwise, the

704

| result is rounded to the full precision of the extended double-precision

705

| format.

706

| The input significand must be normalized or smaller. If the input

707

| significand is not normalized, `zExp' must be 0; in that case, the result

708

| returned is a subnormal number, and it must not require rounding. The

709

| handling of underflow and overflow follows the IEC/IEEE Standard for Binary

710

| Floating-Point Arithmetic.

711

*----------------------------------------------------------------------------*/

712

713

static floatx80

714

roundAndPackFloatx80(

715

int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1

716

STATUS_PARAM)

717

{

718

int8 roundingMode;

719

flag roundNearestEven, increment, isTiny;

720

int64 roundIncrement, roundMask, roundBits;

721

722

roundingMode = STATUS(float_rounding_mode);

723

roundNearestEven = ( roundingMode == float_round_nearest_even );

724

if ( roundingPrecision == 80 ) goto precision80;

725

if ( roundingPrecision == 64 ) {

726

roundIncrement = LIT64( 0x0000000000000400 );

727

roundMask = LIT64( 0x00000000000007FF );

728

}

729

else if ( roundingPrecision == 32 ) {

730

roundIncrement = LIT64( 0x0000008000000000 );

731

roundMask = LIT64( 0x000000FFFFFFFFFF );

732

}

733

else {

734

goto precision80;

735

}

736

zSig0 |= ( zSig1 != 0 );

737

if ( ! roundNearestEven ) {

738

if ( roundingMode == float_round_to_zero ) {

739

roundIncrement = 0;

740

}

741

else {

742

roundIncrement = roundMask;

743

if ( zSign ) {

744

if ( roundingMode == float_round_up ) roundIncrement = 0;

745

}

746

else {

747

if ( roundingMode == float_round_down ) roundIncrement = 0;

748

}

749

}

750

}

751

roundBits = zSig0 & roundMask;

752

if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {

753

if ( ( 0x7FFE < zExp )

754

|| ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )

755

) {

756

goto overflow;

757

}

758

if ( zExp <= 0 ) {

759

if (STATUS(flush_to_zero)) {

760

float_raise(float_flag_output_denormal STATUS_VAR);

761

return packFloatx80(zSign, 0, 0);

762

}

763

isTiny =

764

( STATUS(float_detect_tininess) == float_tininess_before_rounding )

765

|| ( zExp < 0 )

766

|| ( zSig0 <= zSig0 + roundIncrement );

767

shift64RightJamming( zSig0, 1 - zExp, &zSig0 );

768

zExp = 0;

769

roundBits = zSig0 & roundMask;

770

if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);

771

if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;

772

zSig0 += roundIncrement;

773

if ( (int64_t) zSig0 < 0 ) zExp = 1;

774

roundIncrement = roundMask + 1;

775

if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {

776

roundMask |= roundIncrement;

777

}

778

zSig0 &= ~ roundMask;

779

return packFloatx80( zSign, zExp, zSig0 );

780

}

781

}

782

if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;

783

zSig0 += roundIncrement;

784

if ( zSig0 < roundIncrement ) {

785

++zExp;

786

zSig0 = LIT64( 0x8000000000000000 );

787

}

788

roundIncrement = roundMask + 1;

789

if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {

790

roundMask |= roundIncrement;

791

}

792

zSig0 &= ~ roundMask;

793

if ( zSig0 == 0 ) zExp = 0;

794

return packFloatx80( zSign, zExp, zSig0 );

795

precision80:

796

increment = ( (int64_t) zSig1 < 0 );

797

if ( ! roundNearestEven ) {

798

if ( roundingMode == float_round_to_zero ) {

799

increment = 0;

800

}

801

else {

802

if ( zSign ) {

803

increment = ( roundingMode == float_round_down ) && zSig1;

804

}

805

else {

806

increment = ( roundingMode == float_round_up ) && zSig1;

807

}

808

}

809

}

810

if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {

811

if ( ( 0x7FFE < zExp )

812

|| ( ( zExp == 0x7FFE )

813

&& ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )

814

&& increment

815

)

816

) {

817

roundMask = 0;

818

overflow:

819

float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);

820

if ( ( roundingMode == float_round_to_zero )

821

|| ( zSign && ( roundingMode == float_round_up ) )

822

|| ( ! zSign && ( roundingMode == float_round_down ) )

823

) {

824

return packFloatx80( zSign, 0x7FFE, ~ roundMask );

825

}

826

return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );

827

}

828

if ( zExp <= 0 ) {

829

isTiny =

830

( STATUS(float_detect_tininess) == float_tininess_before_rounding )

831

|| ( zExp < 0 )

832

|| ! increment

833

|| ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );

834

shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );

835

zExp = 0;

836

if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);

837

if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;

838

if ( roundNearestEven ) {

839

increment = ( (int64_t) zSig1 < 0 );

840

}

841

else {

842

if ( zSign ) {

843

increment = ( roundingMode == float_round_down ) && zSig1;

844

}

845

else {

846

increment = ( roundingMode == float_round_up ) && zSig1;

847

}

848

}

849

if ( increment ) {

850

++zSig0;

851

zSig0 &=

852

~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );

853

if ( (int64_t) zSig0 < 0 ) zExp = 1;

854

}

855

return packFloatx80( zSign, zExp, zSig0 );

856

}

857

}

858

if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;

859

if ( increment ) {

860

++zSig0;

861

if ( zSig0 == 0 ) {

862

++zExp;

863

zSig0 = LIT64( 0x8000000000000000 );

864

}

865

else {

866

zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );

867

}

868

}

869

else {

870

if ( zSig0 == 0 ) zExp = 0;

871

}

872

return packFloatx80( zSign, zExp, zSig0 );

873

874

}

875

876

/*----------------------------------------------------------------------------

877

| Takes an abstract floating-point value having sign `zSign', exponent

878

| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',

879

| and returns the proper extended double-precision floating-point value

880

| corresponding to the abstract input. This routine is just like

881

| `roundAndPackFloatx80' except that the input significand does not have to be

882

| normalized.

883

*----------------------------------------------------------------------------*/

884

885

static floatx80

886

normalizeRoundAndPackFloatx80(

887

int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1

888

STATUS_PARAM)

889

{

890

int8 shiftCount;

891

892

if ( zSig0 == 0 ) {

893

zSig0 = zSig1;

894

zSig1 = 0;

895

zExp -= 64;

896

}

897

shiftCount = countLeadingZeros64( zSig0 );

898

shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );

899

zExp -= shiftCount;

900

return

901

roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);

902

903

}

904

905

/*----------------------------------------------------------------------------

906

| Returns the least-significant 64 fraction bits of the quadruple-precision

907

| floating-point value `a'.

908

*----------------------------------------------------------------------------*/

909

910

INLINE uint64_t extractFloat128Frac1(float128 a)
{
    /* The `low' member is returned unmodified. */
    return a.low;
}

916

917

/*----------------------------------------------------------------------------

918

| Returns the most-significant 48 fraction bits of the quadruple-precision

919

| floating-point value `a'.

920

*----------------------------------------------------------------------------*/

921

922

INLINE uint64_t extractFloat128Frac0(float128 a)
{
    /* Mask selecting the low 48 bits of the `high' member. */
    const uint64_t fracMask = LIT64(0x0000FFFFFFFFFFFF);

    return a.high & fracMask;
}

928

929

/*----------------------------------------------------------------------------

930

| Returns the exponent bits of the quadruple-precision floating-point value

931

| `a'.

932

*----------------------------------------------------------------------------*/

933

934

INLINE int32 extractFloat128Exp(float128 a)
{
    /* 15 bits of the `high' member, starting at bit 48. */
    return (a.high >> 48) & 0x7FFF;
}

940

941

/*----------------------------------------------------------------------------

942

| Returns the sign bit of the quadruple-precision floating-point value `a'.

943

*----------------------------------------------------------------------------*/

944

945

INLINE flag extractFloat128Sign(float128 a)
{
    /* Bit 63 of the `high' member. */
    return a.high >> 63;
}

951

952

/*----------------------------------------------------------------------------

953

| Normalizes the subnormal quadruple-precision floating-point value

954

| represented by the denormalized significand formed by the concatenation of

955

| `aSig0' and `aSig1'. The normalized exponent is stored at the location

956

| pointed to by `zExpPtr'. The most significant 49 bits of the normalized

957

| significand are stored at the location pointed to by `zSig0Ptr', and the

958

| least significant 64 bits of the normalized significand are stored at the

959

| location pointed to by `zSig1Ptr'.

960

*----------------------------------------------------------------------------*/

961

962

static void

963

normalizeFloat128Subnormal(

964

uint64_t aSig0,

965

uint64_t aSig1,

966

int32 *zExpPtr,

967

uint64_t *zSig0Ptr,

968

uint64_t *zSig1Ptr

969

)

970

{

971

int8 shiftCount;

972

973

if ( aSig0 == 0 ) {

974

shiftCount = countLeadingZeros64( aSig1 ) - 15;

975

if ( shiftCount < 0 ) {

976

*zSig0Ptr = aSig1>>( - shiftCount );

977

*zSig1Ptr = aSig1<<( shiftCount & 63 );

978

}

979

else {

980

*zSig0Ptr = aSig1<<shiftCount;

981

*zSig1Ptr = 0;

982

}

983

*zExpPtr = - shiftCount - 63;

984

}

985

else {

986

shiftCount = countLeadingZeros64( aSig0 ) - 15;

987

shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );

988

*zExpPtr = 1 - shiftCount;

989

}

990

991

}

992

993

/*----------------------------------------------------------------------------

994

| Packs the sign `zSign', the exponent `zExp', and the significand formed

995

| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision

996

| floating-point value, returning the result. After being shifted into the

997

| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply

998

| added together to form the most significant 64 bits of the result. This

999

| means that any integer portion of `zSig0' will be added into the exponent.

1000

| Since a properly normalized significand will have an integer portion equal

1001

| to 1, the `zExp' input should be 1 less than the desired result exponent

1002

| whenever `zSig0' and `zSig1' concatenated form a complete, normalized

1003

| significand.

1004

*----------------------------------------------------------------------------*/

1005

1006

INLINE float128

1007

packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )

1008

{

1009

float128 z;

1010

1011

z.low = zSig1;

1012

z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;

1013

return z;

1014

1015

}

1016

1017

/*----------------------------------------------------------------------------

1018

| Takes an abstract floating-point value having sign `zSign', exponent `zExp',

1019

| and extended significand formed by the concatenation of `zSig0', `zSig1',

1020

| and `zSig2', and returns the proper quadruple-precision floating-point value

1021

| corresponding to the abstract input. Ordinarily, the abstract value is

1022

| simply rounded and packed into the quadruple-precision format, with the

1023

| inexact exception raised if the abstract input cannot be represented

1024

| exactly. However, if the abstract value is too large, the overflow and

1025

| inexact exceptions are raised and an infinity or maximal finite value is

1026

| returned. If the abstract value is too small, the input value is rounded to

1027

| a subnormal number, and the underflow and inexact exceptions are raised if

1028

| the abstract input cannot be represented exactly as a subnormal quadruple-

1029

| precision floating-point number.

1030

| The input significand must be normalized or smaller. If the input

1031

| significand is not normalized, `zExp' must be 0; in that case, the result

1032

| returned is a subnormal number, and it must not require rounding. In the

1033

| usual case that the input significand is normalized, `zExp' must be 1 less

1034

| than the ``true'' floating-point exponent. The handling of underflow and

1035

| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

1036

*----------------------------------------------------------------------------*/

1037

1038

static float128

1039

roundAndPackFloat128(

1040

flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)

1041

{

1042

int8 roundingMode;

1043

flag roundNearestEven, increment, isTiny;

1044

1045

roundingMode = STATUS(float_rounding_mode);

1046

roundNearestEven = ( roundingMode == float_round_nearest_even );

1047

increment = ( (int64_t) zSig2 < 0 );

1048

if ( ! roundNearestEven ) {

1049

if ( roundingMode == float_round_to_zero ) {

1050

increment = 0;

1051

}

1052

else {

1053

if ( zSign ) {

1054

increment = ( roundingMode == float_round_down ) && zSig2;

1055

}

1056

else {

1057

increment = ( roundingMode == float_round_up ) && zSig2;

1058

}

1059

}

1060

}

1061

if ( 0x7FFD <= (uint32_t) zExp ) {

1062

if ( ( 0x7FFD < zExp )

1063

|| ( ( zExp == 0x7FFD )

1064

&& eq128(

1065

LIT64( 0x0001FFFFFFFFFFFF ),

1066

LIT64( 0xFFFFFFFFFFFFFFFF ),

1067

zSig0,

1068

zSig1

1069

)

1070

&& increment

1071

)

1072

) {

1073

float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);

1074

if ( ( roundingMode == float_round_to_zero )

1075

|| ( zSign && ( roundingMode == float_round_up ) )

1076

|| ( ! zSign && ( roundingMode == float_round_down ) )

1077

) {

1078

return

1079

packFloat128(

1080

zSign,

1081

0x7FFE,

1082

LIT64( 0x0000FFFFFFFFFFFF ),

1083

LIT64( 0xFFFFFFFFFFFFFFFF )

1084

);

1085

}

1086

return packFloat128( zSign, 0x7FFF, 0, 0 );

1087

}

1088

if ( zExp < 0 ) {

1089

if (STATUS(flush_to_zero)) {

1090

float_raise(float_flag_output_denormal STATUS_VAR);

1091

return packFloat128(zSign, 0, 0, 0);

1092

}

1093

isTiny =

1094

( STATUS(float_detect_tininess) == float_tininess_before_rounding )

1095

|| ( zExp < -1 )

1096

|| ! increment

1097

|| lt128(

1098

zSig0,

1099

zSig1,

1100

LIT64( 0x0001FFFFFFFFFFFF ),

1101

LIT64( 0xFFFFFFFFFFFFFFFF )

1102

);

1103

shift128ExtraRightJamming(

1104

zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );

1105

zExp = 0;

1106

if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);

1107

if ( roundNearestEven ) {

1108

increment = ( (int64_t) zSig2 < 0 );

1109

}

1110

else {

1111

if ( zSign ) {

1112

increment = ( roundingMode == float_round_down ) && zSig2;

1113

}

1114

else {

1115

increment = ( roundingMode == float_round_up ) && zSig2;

1116

}

1117

}

1118

}

1119

}

1120

if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;

1121

if ( increment ) {

1122

add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );

1123

zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );

1124

}

1125

else {

1126

if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;

1127

}

1128

return packFloat128( zSign, zExp, zSig0, zSig1 );

1129

1130

}

1131

1132

/*----------------------------------------------------------------------------

1133

| Takes an abstract floating-point value having sign `zSign', exponent `zExp',

1134

| and significand formed by the concatenation of `zSig0' and `zSig1', and

1135

| returns the proper quadruple-precision floating-point value corresponding

1136

| to the abstract input. This routine is just like `roundAndPackFloat128'

1137

| except that the input significand has fewer bits and does not have to be

1138

| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-

1139

| point exponent.

1140

*----------------------------------------------------------------------------*/

1141

1142

static float128

1143

normalizeRoundAndPackFloat128(

1144

flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)

1145

{

1146

int8 shiftCount;

1147

uint64_t zSig2;

1148

1149

if ( zSig0 == 0 ) {

1150

zSig0 = zSig1;

1151

zSig1 = 0;

1152

zExp -= 64;

1153

}

1154

shiftCount = countLeadingZeros64( zSig0 ) - 15;

1155

if ( 0 <= shiftCount ) {

1156

zSig2 = 0;

1157

shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );

1158

}

1159

else {

1160

shift128ExtraRightJamming(

1161

zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );

1162

}

1163

zExp -= shiftCount;

1164

return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);

1165

1166

}

1167

1168

/*----------------------------------------------------------------------------

1169

| Returns the result of converting the 32-bit two's complement integer `a'

1170

| to the single-precision floating-point format. The conversion is performed

1171

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

1172

*----------------------------------------------------------------------------*/

1173

1174

float32 int32_to_float32(int32_t a STATUS_PARAM)

1175

{

1176

flag zSign;

1177

1178

if ( a == 0 ) return float32_zero;

1179

if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );

1180

zSign = ( a < 0 );

1181

return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );

1182

1183

}

1184

1185

/*----------------------------------------------------------------------------

1186

| Returns the result of converting the 32-bit two's complement integer `a'

1187

| to the double-precision floating-point format. The conversion is performed

1188

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

1189

*----------------------------------------------------------------------------*/

1190

1191

float64 int32_to_float64(int32_t a STATUS_PARAM)

1192

{

1193

flag zSign;

1194

uint32 absA;

1195

int8 shiftCount;

1196

uint64_t zSig;

1197

1198

if ( a == 0 ) return float64_zero;

1199

zSign = ( a < 0 );

1200

absA = zSign ? - a : a;

1201

shiftCount = countLeadingZeros32( absA ) + 21;

1202

zSig = absA;

1203

return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );

1204

1205

}

1206

1207

/*----------------------------------------------------------------------------

1208

| Returns the result of converting the 32-bit two's complement integer `a'

1209

| to the extended double-precision floating-point format. The conversion

1210

| is performed according to the IEC/IEEE Standard for Binary Floating-Point

1211

| Arithmetic.

1212

*----------------------------------------------------------------------------*/

1213

1214

floatx80 int32_to_floatx80(int32_t a STATUS_PARAM)

1215

{

1216

flag zSign;

1217

uint32 absA;

1218

int8 shiftCount;

1219

uint64_t zSig;

1220

1221

if ( a == 0 ) return packFloatx80( 0, 0, 0 );

1222

zSign = ( a < 0 );

1223

absA = zSign ? - a : a;

1224

shiftCount = countLeadingZeros32( absA ) + 32;

1225

zSig = absA;

1226

return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );

1227

1228

}

1229

1230

/*----------------------------------------------------------------------------

1231

| Returns the result of converting the 32-bit two's complement integer `a' to

1232

| the quadruple-precision floating-point format. The conversion is performed

1233

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

1234

*----------------------------------------------------------------------------*/

1235

1236

float128 int32_to_float128(int32_t a STATUS_PARAM)

1237

{

1238

flag zSign;

1239

uint32 absA;

1240

int8 shiftCount;

1241

uint64_t zSig0;

1242

1243

if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );

1244

zSign = ( a < 0 );

1245

absA = zSign ? - a : a;

1246

shiftCount = countLeadingZeros32( absA ) + 17;

1247

zSig0 = absA;

1248

return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );

1249

1250

}

1251

1252

/*----------------------------------------------------------------------------

1253

| Returns the result of converting the 64-bit two's complement integer `a'

1254

| to the single-precision floating-point format. The conversion is performed

1255

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

1256

*----------------------------------------------------------------------------*/

1257

1258

float32 int64_to_float32(int64_t a STATUS_PARAM)

1259

{

1260

flag zSign;

1261

uint64 absA;

1262

int8 shiftCount;

1263

1264

if ( a == 0 ) return float32_zero;

1265

zSign = ( a < 0 );

1266

absA = zSign ? - a : a;

1267

shiftCount = countLeadingZeros64( absA ) - 40;

1268

if ( 0 <= shiftCount ) {

1269

return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );

1270

}

1271

else {

1272

shiftCount += 7;

1273

if ( shiftCount < 0 ) {

1274

shift64RightJamming( absA, - shiftCount, &absA );

1275

}

1276

else {

1277

absA <<= shiftCount;

1278

}

1279

return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );

1280

}

1281

1282

}

1283

1284

/*----------------------------------------------------------------------------
| Converts the 64-bit unsigned integer `a' to the single-precision
| floating-point format.  Values below 2^24 are packed directly; larger ones
| are aligned and rounded through `roundAndPackFloat32'.
*----------------------------------------------------------------------------*/

float32 uint64_to_float32(uint64_t a STATUS_PARAM)
{
    int8 shift;

    if (!a) {
        return float32_zero;
    }
    shift = countLeadingZeros64(a) - 40;
    if (shift >= 0) {
        /* 40 or more leading zeros: the value is below 2^24. */
        return packFloat32(0, 0x95 - shift, a << shift);
    }
    shift += 7;
    if (shift >= 0) {
        a <<= shift;
    } else {
        /* Keep a sticky bit for whatever is shifted out. */
        shift64RightJamming(a, -shift, &a);
    }
    return roundAndPackFloat32(0, 0x9C - shift, a STATUS_VAR);
}

1304

1305

/*----------------------------------------------------------------------------

1306

| Returns the result of converting the 64-bit two's complement integer `a'

1307

| to the double-precision floating-point format. The conversion is performed

1308

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

1309

*----------------------------------------------------------------------------*/

1310

1311

float64 int64_to_float64(int64_t a STATUS_PARAM)

1312

{

1313

flag zSign;

1314

1315

if ( a == 0 ) return float64_zero;

1316

if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {

1317

return packFloat64( 1, 0x43E, 0 );

1318

}

1319

zSign = ( a < 0 );

1320

return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );

1321

1322

}

1323

1324

/*----------------------------------------------------------------------------
| Converts the 64-bit unsigned integer `a' to the double-precision
| floating-point format, rounding through `normalizeRoundAndPackFloat64'.
*----------------------------------------------------------------------------*/

float64 uint64_to_float64(uint64_t a STATUS_PARAM)
{
    int zExp;

    if (!a) {
        return float64_zero;
    }
    zExp = 0x43C;
    if (a >> 63) {
        /* Top bit set: halve the value (keeping a sticky bit) and bump the
         * exponent to compensate before normalizing. */
        shift64RightJamming(a, 1, &a);
        zExp = 0x43D;
    }
    return normalizeRoundAndPackFloat64(0, zExp, a STATUS_VAR);
}

1337

1338

/*----------------------------------------------------------------------------

1339

| Returns the result of converting the 64-bit two's complement integer `a'

1340

| to the extended double-precision floating-point format. The conversion

1341

| is performed according to the IEC/IEEE Standard for Binary Floating-Point

1342

| Arithmetic.

1343

*----------------------------------------------------------------------------*/

1344

1345

floatx80 int64_to_floatx80(int64_t a STATUS_PARAM)

1346

{

1347

flag zSign;

1348

uint64 absA;

1349

int8 shiftCount;

1350

1351

if ( a == 0 ) return packFloatx80( 0, 0, 0 );

1352

zSign = ( a < 0 );

1353

absA = zSign ? - a : a;

1354

shiftCount = countLeadingZeros64( absA );

1355

return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );

1356

1357

}

1358

1359

/*----------------------------------------------------------------------------

1360

| Returns the result of converting the 64-bit two's complement integer `a' to

1361

| the quadruple-precision floating-point format. The conversion is performed

1362

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

1363

*----------------------------------------------------------------------------*/

1364

1365

float128 int64_to_float128(int64_t a STATUS_PARAM)

1366

{

1367

flag zSign;

1368

uint64 absA;

1369

int8 shiftCount;

1370

int32 zExp;

1371

uint64_t zSig0, zSig1;

1372

1373

if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );

1374

zSign = ( a < 0 );

1375

absA = zSign ? - a : a;

1376

shiftCount = countLeadingZeros64( absA ) + 49;

1377

zExp = 0x406E - shiftCount;

1378

if ( 64 <= shiftCount ) {

1379

zSig1 = 0;

1380

zSig0 = absA;

1381

shiftCount -= 64;

1382

}

1383

else {

1384

zSig1 = absA;

1385

zSig0 = 0;

1386

}

1387

shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );

1388

return packFloat128( zSign, zExp, zSig0, zSig1 );

1389

1390

}

1391

1392

/*----------------------------------------------------------------------------
| Converts the 64-bit unsigned integer `a' to the quadruple-precision
| floating-point format via `normalizeRoundAndPackFloat128'.
*----------------------------------------------------------------------------*/

float128 uint64_to_float128(uint64_t a STATUS_PARAM)
{
    if (a != 0) {
        return normalizeRoundAndPackFloat128(0, 0x406E, a, 0 STATUS_VAR);
    }
    return float128_zero;
}

1399

1400

/*----------------------------------------------------------------------------

1401

| Returns the result of converting the single-precision floating-point value

1402

| `a' to the 32-bit two's complement integer format. The conversion is

1403

| performed according to the IEC/IEEE Standard for Binary Floating-Point

1404

| Arithmetic---which means in particular that the conversion is rounded

1405

| according to the current rounding mode. If `a' is a NaN, the largest

1406

| positive integer is returned. Otherwise, if the conversion overflows, the

1407

| largest integer with the same sign as `a' is returned.

1408

*----------------------------------------------------------------------------*/

1409

1410

int32 float32_to_int32( float32 a STATUS_PARAM )

1411

{

1412

flag aSign;

1413

int_fast16_t aExp, shiftCount;

1414

uint32_t aSig;

1415

uint64_t aSig64;

1416

1417

a = float32_squash_input_denormal(a STATUS_VAR);

1418

aSig = extractFloat32Frac( a );

1419

aExp = extractFloat32Exp( a );

1420

aSign = extractFloat32Sign( a );

1421

if ( ( aExp == 0xFF ) && aSig ) aSign = 0;

1422

if ( aExp ) aSig |= 0x00800000;

1423

shiftCount = 0xAF - aExp;

1424

aSig64 = aSig;

1425

aSig64 <<= 32;

1426

if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );

1427

return roundAndPackInt32( aSign, aSig64 STATUS_VAR );

1428

1429

}

1430

1431

/*----------------------------------------------------------------------------

1432

| Returns the result of converting the single-precision floating-point value

1433

| `a' to the 32-bit two's complement integer format. The conversion is

1434

| performed according to the IEC/IEEE Standard for Binary Floating-Point

1435

| Arithmetic, except that the conversion is always rounded toward zero.

1436

| If `a' is a NaN, the largest positive integer is returned. Otherwise, if

1437

| the conversion overflows, the largest integer with the same sign as `a' is

1438

| returned.

1439

*----------------------------------------------------------------------------*/

1440

1441

int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )

1442

{

1443

flag aSign;

1444

int_fast16_t aExp, shiftCount;

1445

uint32_t aSig;

1446

int32_t z;

1447

a = float32_squash_input_denormal(a STATUS_VAR);

1448

1449

aSig = extractFloat32Frac( a );

1450

aExp = extractFloat32Exp( a );

1451

aSign = extractFloat32Sign( a );

1452

shiftCount = aExp - 0x9E;

1453

if ( 0 <= shiftCount ) {

1454

if ( float32_val(a) != 0xCF000000 ) {

1455

float_raise( float_flag_invalid STATUS_VAR);

1456

if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;

1457

}

1458

return (int32_t) 0x80000000;

1459

}

1460

else if ( aExp <= 0x7E ) {

1461

if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;

1462

return 0;

1463

}

1464

aSig = ( aSig | 0x00800000 )<<8;

1465

z = aSig>>( - shiftCount );

1466

if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {

1467

STATUS(float_exception_flags) |= float_flag_inexact;

1468

}

1469

if ( aSign ) z = - z;

1470

return z;

1471

1472

}

1473

1474

/*----------------------------------------------------------------------------

1475

| Returns the result of converting the single-precision floating-point value

1476

| `a' to the 16-bit two's complement integer format. The conversion is

1477

| performed according to the IEC/IEEE Standard for Binary Floating-Point

1478

| Arithmetic, except that the conversion is always rounded toward zero.

1479

| If `a' is a NaN, the largest positive integer is returned. Otherwise, if

1480

| the conversion overflows, the largest integer with the same sign as `a' is

1481

| returned.

1482

*----------------------------------------------------------------------------*/

1483

1484

int_fast16_t float32_to_int16_round_to_zero(float32 a STATUS_PARAM)

1485

{

1486

flag aSign;

1487

int_fast16_t aExp, shiftCount;

1488

uint32_t aSig;

1489

int32 z;

1490

1491

aSig = extractFloat32Frac( a );

1492

aExp = extractFloat32Exp( a );

1493

aSign = extractFloat32Sign( a );

1494

shiftCount = aExp - 0x8E;

1495

if ( 0 <= shiftCount ) {

1496

if ( float32_val(a) != 0xC7000000 ) {

1497

float_raise( float_flag_invalid STATUS_VAR);

1498

if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {

1499

return 0x7FFF;

1500

}

1501

}

1502

return (int32_t) 0xffff8000;

1503

}

1504

else if ( aExp <= 0x7E ) {

1505

if ( aExp | aSig ) {

1506

STATUS(float_exception_flags) |= float_flag_inexact;

1507

}

1508

return 0;

1509

}

1510

shiftCount -= 0x10;

1511

aSig = ( aSig | 0x00800000 )<<8;

1512

z = aSig>>( - shiftCount );

1513

if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {

1514

STATUS(float_exception_flags) |= float_flag_inexact;

1515

}

1516

if ( aSign ) {

1517

z = - z;

1518

}

1519

return z;

1520

1521

}

1522

1523

/*----------------------------------------------------------------------------

1524

| Returns the result of converting the single-precision floating-point value

1525

| `a' to the 64-bit two's complement integer format. The conversion is

1526

| performed according to the IEC/IEEE Standard for Binary Floating-Point

1527

| Arithmetic---which means in particular that the conversion is rounded

1528

| according to the current rounding mode. If `a' is a NaN, the largest

1529

| positive integer is returned. Otherwise, if the conversion overflows, the

1530

| largest integer with the same sign as `a' is returned.

1531

*----------------------------------------------------------------------------*/

1532

1533

int64 float32_to_int64( float32 a STATUS_PARAM )

1534

{

1535

flag aSign;

1536

int_fast16_t aExp, shiftCount;

1537

uint32_t aSig;

1538

uint64_t aSig64, aSigExtra;

1539

a = float32_squash_input_denormal(a STATUS_VAR);

1540

1541

aSig = extractFloat32Frac( a );

1542

aExp = extractFloat32Exp( a );

1543

aSign = extractFloat32Sign( a );

1544

shiftCount = 0xBE - aExp;

1545

if ( shiftCount < 0 ) {

1546

float_raise( float_flag_invalid STATUS_VAR);

1547

if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {

1548

return LIT64( 0x7FFFFFFFFFFFFFFF );

1549

}

1550

return (int64_t) LIT64( 0x8000000000000000 );

1551

}

1552

if ( aExp ) aSig |= 0x00800000;

1553

aSig64 = aSig;

1554

aSig64 <<= 40;

1555

shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );

1556

return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );

1557

1558

}

1559

1560

/*----------------------------------------------------------------------------

1561

| Returns the result of converting the single-precision floating-point value

1562

| `a' to the 64-bit unsigned integer format. The conversion is

1563

| performed according to the IEC/IEEE Standard for Binary Floating-Point

1564

| Arithmetic---which means in particular that the conversion is rounded

1565

| according to the current rounding mode. If `a' is a NaN, the largest

1566

| unsigned integer is returned. Otherwise, if the conversion overflows, the

1567

| largest unsigned integer is returned. If the 'a' is negative, the result

1568

| is rounded and zero is returned; values that do not round to zero will

1569

| raise the inexact exception flag.

1570

*----------------------------------------------------------------------------*/

1571

1572

uint64 float32_to_uint64(float32 a STATUS_PARAM)

1573

{

1574

flag aSign;

1575

int_fast16_t aExp, shiftCount;

1576

uint32_t aSig;

1577

uint64_t aSig64, aSigExtra;

1578

a = float32_squash_input_denormal(a STATUS_VAR);

1579

1580

aSig = extractFloat32Frac(a);

1581

aExp = extractFloat32Exp(a);

1582

aSign = extractFloat32Sign(a);

1583

if ((aSign) && (aExp > 126)) {

1584

float_raise(float_flag_invalid STATUS_VAR);

1585

if (float32_is_any_nan(a)) {

1586

return LIT64(0xFFFFFFFFFFFFFFFF);

1587

} else {

1588

return 0;

1589

}

1590

}

1591

shiftCount = 0xBE - aExp;

1592

if (aExp) {

1593

aSig |= 0x00800000;

1594

}

1595

if (shiftCount < 0) {

1596

float_raise(float_flag_invalid STATUS_VAR);

1597

return LIT64(0xFFFFFFFFFFFFFFFF);

1598

}

1599

1600

aSig64 = aSig;

1601

aSig64 <<= 40;

1602

shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);

1603

return roundAndPackUint64(aSign, aSig64, aSigExtra STATUS_VAR);

1604

}

1605

1606

/*----------------------------------------------------------------------------

1607

| Returns the result of converting the single-precision floating-point value

1608

| `a' to the 64-bit two's complement integer format. The conversion is

1609

| performed according to the IEC/IEEE Standard for Binary Floating-Point

1610

| Arithmetic, except that the conversion is always rounded toward zero. If

1611

| `a' is a NaN, the largest positive integer is returned. Otherwise, if the

1612

| conversion overflows, the largest integer with the same sign as `a' is

1613

| returned.

1614

*----------------------------------------------------------------------------*/

1615

1616

int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )

1617

{

1618

flag aSign;

1619

int_fast16_t aExp, shiftCount;

1620

uint32_t aSig;

1621

uint64_t aSig64;

1622

int64 z;

1623

a = float32_squash_input_denormal(a STATUS_VAR);

1624

1625

aSig = extractFloat32Frac( a );

1626

aExp = extractFloat32Exp( a );

1627

aSign = extractFloat32Sign( a );

1628

shiftCount = aExp - 0xBE;

1629

if ( 0 <= shiftCount ) {

1630

if ( float32_val(a) != 0xDF000000 ) {

1631

float_raise( float_flag_invalid STATUS_VAR);

1632

if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {

1633

return LIT64( 0x7FFFFFFFFFFFFFFF );

1634

}

1635

}

1636

return (int64_t) LIT64( 0x8000000000000000 );

1637

}

1638

else if ( aExp <= 0x7E ) {

1639

if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;

1640

return 0;

1641

}

1642

aSig64 = aSig | 0x00800000;

1643

aSig64 <<= 40;

1644

z = aSig64>>( - shiftCount );

1645

if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {

1646

STATUS(float_exception_flags) |= float_flag_inexact;

1647

}

1648

if ( aSign ) z = - z;

1649

return z;

1650

1651

}

1652

1653

/*----------------------------------------------------------------------------

1654

| Returns the result of converting the single-precision floating-point value

1655

| `a' to the double-precision floating-point format. The conversion is

1656

| performed according to the IEC/IEEE Standard for Binary Floating-Point

1657

| Arithmetic.

1658

*----------------------------------------------------------------------------*/

1659

1660

float64 float32_to_float64( float32 a STATUS_PARAM )

1661

{

1662

flag aSign;

1663

int_fast16_t aExp;

1664

uint32_t aSig;

1665

a = float32_squash_input_denormal(a STATUS_VAR);

1666

1667

aSig = extractFloat32Frac( a );

1668

aExp = extractFloat32Exp( a );

1669

aSign = extractFloat32Sign( a );

1670

if ( aExp == 0xFF ) {

1671

if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );

1672

return packFloat64( aSign, 0x7FF, 0 );

1673

}

1674

if ( aExp == 0 ) {

1675

if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );

1676

normalizeFloat32Subnormal( aSig, &aExp, &aSig );

1677

--aExp;

1678

}

1679

return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );

1680

1681

}

1682

1683

/*----------------------------------------------------------------------------

1684

| Returns the result of converting the single-precision floating-point value

1685

| `a' to the extended double-precision floating-point format. The conversion

1686

| is performed according to the IEC/IEEE Standard for Binary Floating-Point

1687

| Arithmetic.

1688

*----------------------------------------------------------------------------*/

1689

1690

floatx80 float32_to_floatx80( float32 a STATUS_PARAM )

1691

{

1692

flag aSign;

1693

int_fast16_t aExp;

1694

uint32_t aSig;

1695

1696

a = float32_squash_input_denormal(a STATUS_VAR);

1697

aSig = extractFloat32Frac( a );

1698

aExp = extractFloat32Exp( a );

1699

aSign = extractFloat32Sign( a );

1700

if ( aExp == 0xFF ) {

1701

if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );

1702

return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );

1703

}

1704

if ( aExp == 0 ) {

1705

if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );

1706

normalizeFloat32Subnormal( aSig, &aExp, &aSig );

1707

}

1708

aSig |= 0x00800000;

1709

return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );

1710

1711

}

1712

1713

/*----------------------------------------------------------------------------

1714

| Returns the result of converting the single-precision floating-point value

1715

| `a' to the double-precision floating-point format. The conversion is

1716

| performed according to the IEC/IEEE Standard for Binary Floating-Point

1717

| Arithmetic.

1718

*----------------------------------------------------------------------------*/

1719

1720

float128 float32_to_float128( float32 a STATUS_PARAM )

1721

{

1722

flag aSign;

1723

int_fast16_t aExp;

1724

uint32_t aSig;

1725

1726

a = float32_squash_input_denormal(a STATUS_VAR);

1727

aSig = extractFloat32Frac( a );

1728

aExp = extractFloat32Exp( a );

1729

aSign = extractFloat32Sign( a );

1730

if ( aExp == 0xFF ) {

1731

if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );

1732

return packFloat128( aSign, 0x7FFF, 0, 0 );

1733

}

1734

if ( aExp == 0 ) {

1735

if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );

1736

normalizeFloat32Subnormal( aSig, &aExp, &aSig );

1737

--aExp;

1738

}

1739

return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );

1740

1741

}

1742

1743

/*----------------------------------------------------------------------------

1744

| Rounds the single-precision floating-point value `a' to an integer, and

1745

| returns the result as a single-precision floating-point value. The

1746

| operation is performed according to the IEC/IEEE Standard for Binary

1747

| Floating-Point Arithmetic.

1748

*----------------------------------------------------------------------------*/

1749

1750

float32 float32_round_to_int( float32 a STATUS_PARAM)

1751

{

1752

flag aSign;

1753

int_fast16_t aExp;

1754

uint32_t lastBitMask, roundBitsMask;

1755

int8 roundingMode;

1756

uint32_t z;

1757

a = float32_squash_input_denormal(a STATUS_VAR);

1758

1759

aExp = extractFloat32Exp( a );

1760

if ( 0x96 <= aExp ) {

1761

if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {

1762

return propagateFloat32NaN( a, a STATUS_VAR );

1763

}

1764

return a;

1765

}

1766

if ( aExp <= 0x7E ) {

1767

if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;

1768

STATUS(float_exception_flags) |= float_flag_inexact;

1769

aSign = extractFloat32Sign( a );

1770

switch ( STATUS(float_rounding_mode) ) {

1771

case float_round_nearest_even:

1772

if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {

1773

return packFloat32( aSign, 0x7F, 0 );

1774

}

1775

break;

1776

case float_round_down:

1777

return make_float32(aSign ? 0xBF800000 : 0);

1778

case float_round_up:

1779

return make_float32(aSign ? 0x80000000 : 0x3F800000);

1780

}

1781

return packFloat32( aSign, 0, 0 );

1782

}

1783

lastBitMask = 1;

1784

lastBitMask <<= 0x96 - aExp;

1785

roundBitsMask = lastBitMask - 1;

1786

z = float32_val(a);

1787

roundingMode = STATUS(float_rounding_mode);

1788

if ( roundingMode == float_round_nearest_even ) {

1789

z += lastBitMask>>1;

1790

if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;

1791

}

1792

else if ( roundingMode != float_round_to_zero ) {

1793

if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {

1794

z += roundBitsMask;

1795

}

1796

}

1797

z &= ~ roundBitsMask;

1798

if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;

1799

return make_float32(z);

1800

1801

}

1802

1803

/*----------------------------------------------------------------------------

1804

| Returns the result of adding the absolute values of the single-precision

1805

| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated

1806

| before being returned. `zSign' is ignored if the result is a NaN.

1807

| The addition is performed according to the IEC/IEEE Standard for Binary

1808

| Floating-Point Arithmetic.

1809

*----------------------------------------------------------------------------*/

1810

1811

static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)

1812

{

1813

int_fast16_t aExp, bExp, zExp;

1814

uint32_t aSig, bSig, zSig;

1815

int_fast16_t expDiff;

1816

1817

aSig = extractFloat32Frac( a );

1818

aExp = extractFloat32Exp( a );

1819

bSig = extractFloat32Frac( b );

1820

bExp = extractFloat32Exp( b );

1821

expDiff = aExp - bExp;

1822

aSig <<= 6;

1823

bSig <<= 6;

1824

if ( 0 < expDiff ) {

1825

if ( aExp == 0xFF ) {

1826

if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );

1827

return a;

1828

}

1829

if ( bExp == 0 ) {

1830

--expDiff;

1831

}

1832

else {

1833

bSig |= 0x20000000;

1834

}

1835

shift32RightJamming( bSig, expDiff, &bSig );

1836

zExp = aExp;

1837

}

1838

else if ( expDiff < 0 ) {

1839

if ( bExp == 0xFF ) {

1840

if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );

1841

return packFloat32( zSign, 0xFF, 0 );

1842

}

1843

if ( aExp == 0 ) {

1844

++expDiff;

1845

}

1846

else {

1847

aSig |= 0x20000000;

1848

}

1849

shift32RightJamming( aSig, - expDiff, &aSig );

1850

zExp = bExp;

1851

}

1852

else {

1853

if ( aExp == 0xFF ) {

1854

if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );

1855

return a;

1856

}

1857

if ( aExp == 0 ) {

1858

if (STATUS(flush_to_zero)) {

1859

if (aSig | bSig) {

1860

float_raise(float_flag_output_denormal STATUS_VAR);

1861

}

1862

return packFloat32(zSign, 0, 0);

1863

}

1864

return packFloat32( zSign, 0, ( aSig + bSig )>>6 );

1865

}

1866

zSig = 0x40000000 + aSig + bSig;

1867

zExp = aExp;

1868

goto roundAndPack;

1869

}

1870

aSig |= 0x20000000;

1871

zSig = ( aSig + bSig )<<1;

1872

--zExp;

1873

if ( (int32_t) zSig < 0 ) {

1874

zSig = aSig + bSig;

1875

++zExp;

1876

}

1877

roundAndPack:

1878

return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );

1879

1880

}

1881

1882

/*----------------------------------------------------------------------------

1883

| Returns the result of subtracting the absolute values of the single-

1884

| precision floating-point values `a' and `b'. If `zSign' is 1, the

1885

| difference is negated before being returned. `zSign' is ignored if the

1886

| result is a NaN. The subtraction is performed according to the IEC/IEEE

1887

| Standard for Binary Floating-Point Arithmetic.

1888

*----------------------------------------------------------------------------*/

1889

1890

static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)

1891

{

1892

int_fast16_t aExp, bExp, zExp;

1893

uint32_t aSig, bSig, zSig;

1894

int_fast16_t expDiff;

1895

1896

aSig = extractFloat32Frac( a );

1897

aExp = extractFloat32Exp( a );

1898

bSig = extractFloat32Frac( b );

1899

bExp = extractFloat32Exp( b );

1900

expDiff = aExp - bExp;

1901

aSig <<= 7;

1902

bSig <<= 7;

1903

if ( 0 < expDiff ) goto aExpBigger;

1904

if ( expDiff < 0 ) goto bExpBigger;

1905

if ( aExp == 0xFF ) {

1906

if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );

1907

float_raise( float_flag_invalid STATUS_VAR);

1908

return float32_default_nan;

1909

}

1910

if ( aExp == 0 ) {

1911

aExp = 1;

1912

bExp = 1;

1913

}

1914

if ( bSig < aSig ) goto aBigger;

1915

if ( aSig < bSig ) goto bBigger;

1916

return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );

1917

bExpBigger:

1918

if ( bExp == 0xFF ) {

1919

if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );

1920

return packFloat32( zSign ^ 1, 0xFF, 0 );

1921

}

1922

if ( aExp == 0 ) {

1923

++expDiff;

1924

}

1925

else {

1926

aSig |= 0x40000000;

1927

}

1928

shift32RightJamming( aSig, - expDiff, &aSig );

1929

bSig |= 0x40000000;

1930

bBigger:

1931

zSig = bSig - aSig;

1932

zExp = bExp;

1933

zSign ^= 1;

1934

goto normalizeRoundAndPack;

1935

aExpBigger:

1936

if ( aExp == 0xFF ) {

1937

if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );

1938

return a;

1939

}

1940

if ( bExp == 0 ) {

1941

--expDiff;

1942

}

1943

else {

1944

bSig |= 0x40000000;

1945

}

1946

shift32RightJamming( bSig, expDiff, &bSig );

1947

aSig |= 0x40000000;

1948

aBigger:

1949

zSig = aSig - bSig;

1950

zExp = aExp;

1951

normalizeRoundAndPack:

1952

--zExp;

1953

return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );

1954

1955

}

1956

1957

/*----------------------------------------------------------------------------

1958

| Returns the result of adding the single-precision floating-point values `a'

1959

| and `b'. The operation is performed according to the IEC/IEEE Standard for

1960

| Binary Floating-Point Arithmetic.

1961

*----------------------------------------------------------------------------*/

1962

1963

float32 float32_add( float32 a, float32 b STATUS_PARAM )

1964

{

1965

flag aSign, bSign;

1966

a = float32_squash_input_denormal(a STATUS_VAR);

1967

b = float32_squash_input_denormal(b STATUS_VAR);

1968

1969

aSign = extractFloat32Sign( a );

1970

bSign = extractFloat32Sign( b );

1971

if ( aSign == bSign ) {

1972

return addFloat32Sigs( a, b, aSign STATUS_VAR);

1973

}

1974

else {

1975

return subFloat32Sigs( a, b, aSign STATUS_VAR );

1976

}

1977

1978

}

1979

1980

/*----------------------------------------------------------------------------

1981

| Returns the result of subtracting the single-precision floating-point values

1982

| `a' and `b'. The operation is performed according to the IEC/IEEE Standard

1983

| for Binary Floating-Point Arithmetic.

1984

*----------------------------------------------------------------------------*/

1985

1986

float32 float32_sub( float32 a, float32 b STATUS_PARAM )

1987

{

1988

flag aSign, bSign;

1989

a = float32_squash_input_denormal(a STATUS_VAR);

1990

b = float32_squash_input_denormal(b STATUS_VAR);

1991

1992

aSign = extractFloat32Sign( a );

1993

bSign = extractFloat32Sign( b );

1994

if ( aSign == bSign ) {

1995

return subFloat32Sigs( a, b, aSign STATUS_VAR );

1996

}

1997

else {

1998

return addFloat32Sigs( a, b, aSign STATUS_VAR );

1999

}

2000

2001

}

2002

2003

/*----------------------------------------------------------------------------

2004

| Returns the result of multiplying the single-precision floating-point values

2005

| `a' and `b'. The operation is performed according to the IEC/IEEE Standard

2006

| for Binary Floating-Point Arithmetic.

2007

*----------------------------------------------------------------------------*/

2008

2009

float32 float32_mul( float32 a, float32 b STATUS_PARAM )

2010

{

2011

flag aSign, bSign, zSign;

2012

int_fast16_t aExp, bExp, zExp;

2013

uint32_t aSig, bSig;

2014

uint64_t zSig64;

2015

uint32_t zSig;

2016

2017

a = float32_squash_input_denormal(a STATUS_VAR);

2018

b = float32_squash_input_denormal(b STATUS_VAR);

2019

2020

aSig = extractFloat32Frac( a );

2021

aExp = extractFloat32Exp( a );

2022

aSign = extractFloat32Sign( a );

2023

bSig = extractFloat32Frac( b );

2024

bExp = extractFloat32Exp( b );

2025

bSign = extractFloat32Sign( b );

2026

zSign = aSign ^ bSign;

2027

if ( aExp == 0xFF ) {

2028

if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {

2029

return propagateFloat32NaN( a, b STATUS_VAR );

2030

}

2031

if ( ( bExp | bSig ) == 0 ) {

2032

float_raise( float_flag_invalid STATUS_VAR);

2033

return float32_default_nan;

2034

}

2035

return packFloat32( zSign, 0xFF, 0 );

2036

}

2037

if ( bExp == 0xFF ) {

2038

if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );

2039

if ( ( aExp | aSig ) == 0 ) {

2040

float_raise( float_flag_invalid STATUS_VAR);

2041

return float32_default_nan;

2042

}

2043

return packFloat32( zSign, 0xFF, 0 );

2044

}

2045

if ( aExp == 0 ) {

2046

if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );

2047

normalizeFloat32Subnormal( aSig, &aExp, &aSig );

2048

}

2049

if ( bExp == 0 ) {

2050

if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );

2051

normalizeFloat32Subnormal( bSig, &bExp, &bSig );

2052

}

2053

zExp = aExp + bExp - 0x7F;

2054

aSig = ( aSig | 0x00800000 )<<7;

2055

bSig = ( bSig | 0x00800000 )<<8;

2056

shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );

2057

zSig = zSig64;

2058

if ( 0 <= (int32_t) ( zSig<<1 ) ) {

2059

zSig <<= 1;

2060

--zExp;

2061

}

2062

return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );

2063

2064

}

2065

2066

/*----------------------------------------------------------------------------

2067

| Returns the result of dividing the single-precision floating-point value `a'

2068

| by the corresponding value `b'. The operation is performed according to the

2069

| IEC/IEEE Standard for Binary Floating-Point Arithmetic.

2070

*----------------------------------------------------------------------------*/

2071

2072

float32 float32_div( float32 a, float32 b STATUS_PARAM )

2073

{

2074

flag aSign, bSign, zSign;

2075

int_fast16_t aExp, bExp, zExp;

2076

uint32_t aSig, bSig, zSig;

2077

a = float32_squash_input_denormal(a STATUS_VAR);

2078

b = float32_squash_input_denormal(b STATUS_VAR);

2079

2080

aSig = extractFloat32Frac( a );

2081

aExp = extractFloat32Exp( a );

2082

aSign = extractFloat32Sign( a );

2083

bSig = extractFloat32Frac( b );

2084

bExp = extractFloat32Exp( b );

2085

bSign = extractFloat32Sign( b );

2086

zSign = aSign ^ bSign;

2087

if ( aExp == 0xFF ) {

2088

if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );

2089

if ( bExp == 0xFF ) {

2090

if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );

2091

float_raise( float_flag_invalid STATUS_VAR);

2092

return float32_default_nan;

2093

}

2094

return packFloat32( zSign, 0xFF, 0 );

2095

}

2096

if ( bExp == 0xFF ) {

2097

if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );

2098

return packFloat32( zSign, 0, 0 );

2099

}

2100

if ( bExp == 0 ) {

2101

if ( bSig == 0 ) {

2102

if ( ( aExp | aSig ) == 0 ) {

2103

float_raise( float_flag_invalid STATUS_VAR);

2104

return float32_default_nan;

2105

}

2106

float_raise( float_flag_divbyzero STATUS_VAR);

2107

return packFloat32( zSign, 0xFF, 0 );

2108

}

2109

normalizeFloat32Subnormal( bSig, &bExp, &bSig );

2110

}

2111

if ( aExp == 0 ) {

2112

if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );

2113

normalizeFloat32Subnormal( aSig, &aExp, &aSig );

2114

}

2115

zExp = aExp - bExp + 0x7D;

2116

aSig = ( aSig | 0x00800000 )<<7;

2117

bSig = ( bSig | 0x00800000 )<<8;

2118

if ( bSig <= ( aSig + aSig ) ) {

2119

aSig >>= 1;

2120

++zExp;

2121

}

2122

zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;

2123

if ( ( zSig & 0x3F ) == 0 ) {

2124

zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );

2125

}

2126

return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );

2127

2128

}

2129

2130

/*----------------------------------------------------------------------------

2131

| Returns the remainder of the single-precision floating-point value `a'

2132

| with respect to the corresponding value `b'. The operation is performed

2133

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

2134

*----------------------------------------------------------------------------*/

2135

2136

float32 float32_rem( float32 a, float32 b STATUS_PARAM )

2137

{

2138

flag aSign, zSign;

2139

int_fast16_t aExp, bExp, expDiff;

2140

uint32_t aSig, bSig;

2141

uint32_t q;

2142

uint64_t aSig64, bSig64, q64;

2143

uint32_t alternateASig;

2144

int32_t sigMean;

2145

a = float32_squash_input_denormal(a STATUS_VAR);

2146

b = float32_squash_input_denormal(b STATUS_VAR);

2147

2148

aSig = extractFloat32Frac( a );

2149

aExp = extractFloat32Exp( a );

2150

aSign = extractFloat32Sign( a );

2151

bSig = extractFloat32Frac( b );

2152

bExp = extractFloat32Exp( b );

2153

if ( aExp == 0xFF ) {

2154

if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {

2155

return propagateFloat32NaN( a, b STATUS_VAR );

2156

}

2157

float_raise( float_flag_invalid STATUS_VAR);

2158

return float32_default_nan;

2159

}

2160

if ( bExp == 0xFF ) {

2161

if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );

2162

return a;

2163

}

2164

if ( bExp == 0 ) {

2165

if ( bSig == 0 ) {

2166

float_raise( float_flag_invalid STATUS_VAR);

2167

return float32_default_nan;

2168

}

2169

normalizeFloat32Subnormal( bSig, &bExp, &bSig );

2170

}

2171

if ( aExp == 0 ) {

2172

if ( aSig == 0 ) return a;

2173

normalizeFloat32Subnormal( aSig, &aExp, &aSig );

2174

}

2175

expDiff = aExp - bExp;

2176

aSig |= 0x00800000;

2177

bSig |= 0x00800000;

2178

if ( expDiff < 32 ) {

2179

aSig <<= 8;

2180

bSig <<= 8;

2181

if ( expDiff < 0 ) {

2182

if ( expDiff < -1 ) return a;

2183

aSig >>= 1;

2184

}

2185

q = ( bSig <= aSig );

2186

if ( q ) aSig -= bSig;

2187

if ( 0 < expDiff ) {

2188

q = ( ( (uint64_t) aSig )<<32 ) / bSig;

2189

q >>= 32 - expDiff;

2190

bSig >>= 2;

2191

aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;

2192

}

2193

else {

2194

aSig >>= 2;

2195

bSig >>= 2;

2196

}

2197

}

2198

else {

2199

if ( bSig <= aSig ) aSig -= bSig;

2200

aSig64 = ( (uint64_t) aSig )<<40;

2201

bSig64 = ( (uint64_t) bSig )<<40;

2202

expDiff -= 64;

2203

while ( 0 < expDiff ) {

2204

q64 = estimateDiv128To64( aSig64, 0, bSig64 );

2205

q64 = ( 2 < q64 ) ? q64 - 2 : 0;

2206

aSig64 = - ( ( bSig * q64 )<<38 );

2207

expDiff -= 62;

2208

}

2209

expDiff += 64;

2210

q64 = estimateDiv128To64( aSig64, 0, bSig64 );

2211

q64 = ( 2 < q64 ) ? q64 - 2 : 0;

2212

q = q64>>( 64 - expDiff );

2213

bSig <<= 6;

2214

aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;

2215

}

2216

do {

2217

alternateASig = aSig;

2218

++q;

2219

aSig -= bSig;

2220

} while ( 0 <= (int32_t) aSig );

2221

sigMean = aSig + alternateASig;

2222

if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {

2223

aSig = alternateASig;

2224

}

2225

zSign = ( (int32_t) aSig < 0 );

2226

if ( zSign ) aSig = - aSig;

2227

return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );

2228

2229

}

2230

2231

/*----------------------------------------------------------------------------

2232

| Returns the result of multiplying the single-precision floating-point values

2233

| `a' and `b' then adding 'c', with no intermediate rounding step after the

2234

| multiplication. The operation is performed according to the IEC/IEEE

2235

| Standard for Binary Floating-Point Arithmetic 754-2008.

2236

| The flags argument allows the caller to select negation of the

2237

| addend, the intermediate product, or the final result. (The difference

2238

| between this and having the caller do a separate negation is that negating

2239

| externally will flip the sign bit on NaNs.)

2240

*----------------------------------------------------------------------------*/

2241

2242

float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)

2243

{

2244

flag aSign, bSign, cSign, zSign;

2245

int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;

2246

uint32_t aSig, bSig, cSig;

2247

flag pInf, pZero, pSign;

2248

uint64_t pSig64, cSig64, zSig64;

2249

uint32_t pSig;

2250

int shiftcount;

2251

flag signflip, infzero;

2252

2253

a = float32_squash_input_denormal(a STATUS_VAR);

2254

b = float32_squash_input_denormal(b STATUS_VAR);

2255

c = float32_squash_input_denormal(c STATUS_VAR);

2256

aSig = extractFloat32Frac(a);

2257

aExp = extractFloat32Exp(a);

2258

aSign = extractFloat32Sign(a);

2259

bSig = extractFloat32Frac(b);

2260

bExp = extractFloat32Exp(b);

2261

bSign = extractFloat32Sign(b);

2262

cSig = extractFloat32Frac(c);

2263

cExp = extractFloat32Exp(c);

2264

cSign = extractFloat32Sign(c);

2265

2266

infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||

2267

(aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));

2268

2269

/* It is implementation-defined whether the cases of (0,inf,qnan)

2270

* and (inf,0,qnan) raise InvalidOperation or not (and what QNaN

2271

* they return if they do), so we have to hand this information

2272

* off to the target-specific pick-a-NaN routine.

2273

2274

if (((aExp == 0xff) && aSig) ||

2275

((bExp == 0xff) && bSig) ||

2276

((cExp == 0xff) && cSig)) {

2277

return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR);

2278

}

2279

2280

if (infzero) {

2281

float_raise(float_flag_invalid STATUS_VAR);

2282

return float32_default_nan;

2283

}

2284

2285

if (flags & float_muladd_negate_c) {

2286

cSign ^= 1;

2287

}

2288

2289

signflip = (flags & float_muladd_negate_result) ? 1 : 0;

2290

2291

/* Work out the sign and type of the product */

2292

pSign = aSign ^ bSign;

2293

if (flags & float_muladd_negate_product) {

2294

pSign ^= 1;

2295

}

2296

pInf = (aExp == 0xff) || (bExp == 0xff);

2297

pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);

2298

2299

if (cExp == 0xff) {

2300

if (pInf && (pSign ^ cSign)) {

2301

/* addition of opposite-signed infinities => InvalidOperation */

2302

float_raise(float_flag_invalid STATUS_VAR);

2303

return float32_default_nan;

2304

}

2305

/* Otherwise generate an infinity of the same sign */

2306

return packFloat32(cSign ^ signflip, 0xff, 0);

2307

}

2308

2309

if (pInf) {

2310

return packFloat32(pSign ^ signflip, 0xff, 0);

2311

}

2312

2313

if (pZero) {

2314

if (cExp == 0) {

2315

if (cSig == 0) {

2316

/* Adding two exact zeroes */

2317

if (pSign == cSign) {

2318

zSign = pSign;

2319

} else if (STATUS(float_rounding_mode) == float_round_down) {

2320

zSign = 1;

2321

} else {

2322

zSign = 0;

2323

}

2324

return packFloat32(zSign ^ signflip, 0, 0);

2325

}

2326

/* Exact zero plus a denorm */

2327

if (STATUS(flush_to_zero)) {

2328

float_raise(float_flag_output_denormal STATUS_VAR);

2329

return packFloat32(cSign ^ signflip, 0, 0);

2330

}

2331

}

2332

/* Zero plus something non-zero : just return the something */

2333

return packFloat32(cSign ^ signflip, cExp, cSig);

2334

}

2335

2336

if (aExp == 0) {

2337

normalizeFloat32Subnormal(aSig, &aExp, &aSig);

2338

}

2339

if (bExp == 0) {

2340

normalizeFloat32Subnormal(bSig, &bExp, &bSig);

2341

}

2342

2343

/* Calculate the actual result a * b + c */

2344

2345

/* Multiply first; this is easy. */

2346

/* NB: we subtract 0x7e where float32_mul() subtracts 0x7f

2347

* because we want the true exponent, not the "one-less-than"

2348

* flavour that roundAndPackFloat32() takes.

2349

2350

pExp = aExp + bExp - 0x7e;

2351

aSig = (aSig | 0x00800000) << 7;

2352

bSig = (bSig | 0x00800000) << 8;

2353

pSig64 = (uint64_t)aSig * bSig;

2354

if ((int64_t)(pSig64 << 1) >= 0) {

2355

pSig64 <<= 1;

2356

pExp--;

2357

}

2358

2359

zSign = pSign ^ signflip;

2360

2361

/* Now pSig64 is the significand of the multiply, with the explicit bit in

2362

* position 62.

2363

2364

if (cExp == 0) {

2365

if (!cSig) {

2366

/* Throw out the special case of c being an exact zero now */

2367

shift64RightJamming(pSig64, 32, &pSig64);

2368

pSig = pSig64;

2369

return roundAndPackFloat32(zSign, pExp - 1,

2370

pSig STATUS_VAR);

2371

}

2372

normalizeFloat32Subnormal(cSig, &cExp, &cSig);

2373

}

2374

2375

cSig64 = (uint64_t)cSig << (62 - 23);

2376

cSig64 |= LIT64(0x4000000000000000);

2377

expDiff = pExp - cExp;

2378

2379

if (pSign == cSign) {

2380

/* Addition */

2381

if (expDiff > 0) {

2382

/* scale c to match p */

2383

shift64RightJamming(cSig64, expDiff, &cSig64);

2384

zExp = pExp;

2385

} else if (expDiff < 0) {

2386

/* scale p to match c */

2387

shift64RightJamming(pSig64, -expDiff, &pSig64);

2388

zExp = cExp;

2389

} else {

2390

/* no scaling needed */

2391

zExp = cExp;

2392

}

2393

/* Add significands and make sure explicit bit ends up in posn 62 */

2394

zSig64 = pSig64 + cSig64;

2395

if ((int64_t)zSig64 < 0) {

2396

shift64RightJamming(zSig64, 1, &zSig64);

2397

} else {

2398

zExp--;

2399

}

2400

} else {

2401

/* Subtraction */

2402

if (expDiff > 0) {

2403

shift64RightJamming(cSig64, expDiff, &cSig64);

2404

zSig64 = pSig64 - cSig64;

2405

zExp = pExp;

2406

} else if (expDiff < 0) {

2407

shift64RightJamming(pSig64, -expDiff, &pSig64);

2408

zSig64 = cSig64 - pSig64;

2409

zExp = cExp;

2410

zSign ^= 1;

2411

} else {

2412

zExp = pExp;

2413

if (cSig64 < pSig64) {

2414

zSig64 = pSig64 - cSig64;

2415

} else if (pSig64 < cSig64) {

2416

zSig64 = cSig64 - pSig64;

2417

zSign ^= 1;

2418

} else {

2419

/* Exact zero */

2420

zSign = signflip;

2421

if (STATUS(float_rounding_mode) == float_round_down) {

2422

zSign ^= 1;

2423

}

2424

return packFloat32(zSign, 0, 0);

2425

}

2426

}

2427

--zExp;

2428

/* Normalize to put the explicit bit back into bit 62. */

2429

shiftcount = countLeadingZeros64(zSig64) - 1;

2430

zSig64 <<= shiftcount;

2431

zExp -= shiftcount;

2432

}

2433

shift64RightJamming(zSig64, 32, &zSig64);

2434

return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);

2435

}

2436

2437

2438

/*----------------------------------------------------------------------------

2439

| Returns the square root of the single-precision floating-point value `a'.

2440

| The operation is performed according to the IEC/IEEE Standard for Binary

2441

| Floating-Point Arithmetic.

2442

*----------------------------------------------------------------------------*/

2443

2444

float32 float32_sqrt( float32 a STATUS_PARAM )

2445

{

2446

flag aSign;

2447

int_fast16_t aExp, zExp;

2448

uint32_t aSig, zSig;

2449

uint64_t rem, term;

2450

a = float32_squash_input_denormal(a STATUS_VAR);

2451

2452

aSig = extractFloat32Frac( a );

2453

aExp = extractFloat32Exp( a );

2454

aSign = extractFloat32Sign( a );

2455

if ( aExp == 0xFF ) {

2456

if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );

2457

if ( ! aSign ) return a;

2458

float_raise( float_flag_invalid STATUS_VAR);

2459

return float32_default_nan;

2460

}

2461

if ( aSign ) {

2462

if ( ( aExp | aSig ) == 0 ) return a;

2463

float_raise( float_flag_invalid STATUS_VAR);

2464

return float32_default_nan;

2465

}

2466

if ( aExp == 0 ) {

2467

if ( aSig == 0 ) return float32_zero;

2468

normalizeFloat32Subnormal( aSig, &aExp, &aSig );

2469

}

2470

zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;

2471

aSig = ( aSig | 0x00800000 )<<8;

2472

zSig = estimateSqrt32( aExp, aSig ) + 2;

2473

if ( ( zSig & 0x7F ) <= 5 ) {

2474

if ( zSig < 2 ) {

2475

zSig = 0x7FFFFFFF;

2476

goto roundAndPack;

2477

}

2478

aSig >>= aExp & 1;

2479

term = ( (uint64_t) zSig ) * zSig;

2480

rem = ( ( (uint64_t) aSig )<<32 ) - term;

2481

while ( (int64_t) rem < 0 ) {

2482

--zSig;

2483

rem += ( ( (uint64_t) zSig )<<1 ) | 1;

2484

}

2485

zSig |= ( rem != 0 );

2486

}

2487

shift32RightJamming( zSig, 1, &zSig );

2488

roundAndPack:

2489

return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );

2490

2491

}

2492

2493

/*----------------------------------------------------------------------------

2494

| Returns the binary exponential of the single-precision floating-point value

2495

| `a'. The operation is performed according to the IEC/IEEE Standard for

2496

| Binary Floating-Point Arithmetic.

2497

2498

| Uses the following identities:

2499

2500

| 1. -------------------------------------------------------------------------

2501

| x x*ln(2)

2502

| 2 = e

2503

2504

| 2. -------------------------------------------------------------------------

2505

| 2 3 4 5 n

2506

| x x x x x x x

2507

| e = 1 + --- + --- + --- + --- + --- + ... + --- + ...

2508

| 1! 2! 3! 4! 5! n!

2509

*----------------------------------------------------------------------------*/

2510

2511

static const float64 float32_exp2_coefficients[15] =

2512

{

2513

const_float64( 0x3ff0000000000000ll ), /* 1 */

2514

const_float64( 0x3fe0000000000000ll ), /* 2 */

2515

const_float64( 0x3fc5555555555555ll ), /* 3 */

2516

const_float64( 0x3fa5555555555555ll ), /* 4 */

2517

const_float64( 0x3f81111111111111ll ), /* 5 */

2518

const_float64( 0x3f56c16c16c16c17ll ), /* 6 */

2519

const_float64( 0x3f2a01a01a01a01all ), /* 7 */

2520

const_float64( 0x3efa01a01a01a01all ), /* 8 */

2521

const_float64( 0x3ec71de3a556c734ll ), /* 9 */

2522

const_float64( 0x3e927e4fb7789f5cll ), /* 10 */

2523

const_float64( 0x3e5ae64567f544e4ll ), /* 11 */

2524

const_float64( 0x3e21eed8eff8d898ll ), /* 12 */

2525

const_float64( 0x3de6124613a86d09ll ), /* 13 */

2526

const_float64( 0x3da93974a8c07c9dll ), /* 14 */

2527

const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */

2528

};

2529

2530

float32 float32_exp2( float32 a STATUS_PARAM )

2531

{

2532

flag aSign;

2533

int_fast16_t aExp;

2534

uint32_t aSig;

2535

float64 r, x, xn;

2536

int i;

2537

a = float32_squash_input_denormal(a STATUS_VAR);

2538

2539

aSig = extractFloat32Frac( a );

2540

aExp = extractFloat32Exp( a );

2541

aSign = extractFloat32Sign( a );

2542

2543

if ( aExp == 0xFF) {

2544

if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );

2545

return (aSign) ? float32_zero : a;

2546

}

2547

if (aExp == 0) {

2548

if (aSig == 0) return float32_one;

2549

}

2550

2551

float_raise( float_flag_inexact STATUS_VAR);

2552

2553

/* ******************************* */

2554

/* using float64 for approximation */

2555

/* ******************************* */

2556

x = float32_to_float64(a STATUS_VAR);

2557

x = float64_mul(x, float64_ln2 STATUS_VAR);

2558

2559

xn = x;

2560

r = float64_one;

2561

for (i = 0 ; i < 15 ; i++) {

2562

float64 f;

2563

2564

f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);

2565

r = float64_add(r, f STATUS_VAR);

2566

2567

xn = float64_mul(xn, x STATUS_VAR);

2568

}

2569

2570

return float64_to_float32(r, status);

2571

}

2572

2573

/*----------------------------------------------------------------------------

2574

| Returns the binary log of the single-precision floating-point value `a'.

2575

| The operation is performed according to the IEC/IEEE Standard for Binary

2576

| Floating-Point Arithmetic.

2577

*----------------------------------------------------------------------------*/

2578

float32 float32_log2( float32 a STATUS_PARAM )

2579

{

2580

flag aSign, zSign;

2581

int_fast16_t aExp;

2582

uint32_t aSig, zSig, i;

2583

2584

a = float32_squash_input_denormal(a STATUS_VAR);

2585

aSig = extractFloat32Frac( a );

2586

aExp = extractFloat32Exp( a );

2587

aSign = extractFloat32Sign( a );

2588

2589

if ( aExp == 0 ) {

2590

if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );

2591

normalizeFloat32Subnormal( aSig, &aExp, &aSig );

2592

}

2593

if ( aSign ) {

2594

float_raise( float_flag_invalid STATUS_VAR);

2595

return float32_default_nan;

2596

}

2597

if ( aExp == 0xFF ) {

2598

if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );

2599

return a;

2600

}

2601

2602

aExp -= 0x7F;

2603

aSig |= 0x00800000;

2604

zSign = aExp < 0;

2605

zSig = aExp << 23;

2606

2607

for (i = 1 << 22; i > 0; i >>= 1) {

2608

aSig = ( (uint64_t)aSig * aSig ) >> 23;

2609

if ( aSig & 0x01000000 ) {

2610

aSig >>= 1;

2611

zSig |= i;

2612

}

2613

}

2614

2615

if ( zSign )

2616

zSig = -zSig;

2617

2618

return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );

2619

}

2620

2621

/*----------------------------------------------------------------------------

2622

| Returns 1 if the single-precision floating-point value `a' is equal to

2623

| the corresponding value `b', and 0 otherwise. The invalid exception is

2624

| raised if either operand is a NaN. Otherwise, the comparison is performed

2625

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

2626

*----------------------------------------------------------------------------*/

2627

2628

int float32_eq( float32 a, float32 b STATUS_PARAM )

2629

{

2630

uint32_t av, bv;

2631

a = float32_squash_input_denormal(a STATUS_VAR);

2632

b = float32_squash_input_denormal(b STATUS_VAR);

2633

2634

if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )

2635

|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )

2636

) {

2637

float_raise( float_flag_invalid STATUS_VAR);

2638

return 0;

2639

}

2640

av = float32_val(a);

2641

bv = float32_val(b);

2642

return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );

2643

}

2644

2645

/*----------------------------------------------------------------------------

2646

| Returns 1 if the single-precision floating-point value `a' is less than

2647

| or equal to the corresponding value `b', and 0 otherwise. The invalid

2648

| exception is raised if either operand is a NaN. The comparison is performed

2649

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

2650

*----------------------------------------------------------------------------*/

2651

2652

int float32_le( float32 a, float32 b STATUS_PARAM )

2653

{

2654

flag aSign, bSign;

2655

uint32_t av, bv;

2656

a = float32_squash_input_denormal(a STATUS_VAR);

2657

b = float32_squash_input_denormal(b STATUS_VAR);

2658

2659

if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )

2660

|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )

2661

) {

2662

float_raise( float_flag_invalid STATUS_VAR);

2663

return 0;

2664

}

2665

aSign = extractFloat32Sign( a );

2666

bSign = extractFloat32Sign( b );

2667

av = float32_val(a);

2668

bv = float32_val(b);

2669

if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );

2670

return ( av == bv ) || ( aSign ^ ( av < bv ) );

2671

2672

}

2673

2674

/*----------------------------------------------------------------------------

2675

| Returns 1 if the single-precision floating-point value `a' is less than

2676

| the corresponding value `b', and 0 otherwise. The invalid exception is

2677

| raised if either operand is a NaN. The comparison is performed according

2678

| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

2679

*----------------------------------------------------------------------------*/

2680

2681

int float32_lt( float32 a, float32 b STATUS_PARAM )

2682

{

2683

flag aSign, bSign;

2684

uint32_t av, bv;

2685

a = float32_squash_input_denormal(a STATUS_VAR);

2686

b = float32_squash_input_denormal(b STATUS_VAR);

2687

2688

if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )

2689

|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )

2690

) {

2691

float_raise( float_flag_invalid STATUS_VAR);

2692

return 0;

2693

}

2694

aSign = extractFloat32Sign( a );

2695

bSign = extractFloat32Sign( b );

2696

av = float32_val(a);

2697

bv = float32_val(b);

2698

if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );

2699

return ( av != bv ) && ( aSign ^ ( av < bv ) );

2700

2701

}

2702

2703

/*----------------------------------------------------------------------------

2704

| Returns 1 if the single-precision floating-point values `a' and `b' cannot

2705

| be compared, and 0 otherwise. The invalid exception is raised if either

2706

| operand is a NaN. The comparison is performed according to the IEC/IEEE

2707

| Standard for Binary Floating-Point Arithmetic.

2708

*----------------------------------------------------------------------------*/

2709

2710

int float32_unordered( float32 a, float32 b STATUS_PARAM )

2711

{

2712

a = float32_squash_input_denormal(a STATUS_VAR);

2713

b = float32_squash_input_denormal(b STATUS_VAR);

2714

2715

if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )

2716

|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )

2717

) {

2718

float_raise( float_flag_invalid STATUS_VAR);

2719

return 1;

2720

}

2721

return 0;

2722

}

2723

2724

/*----------------------------------------------------------------------------

2725

| Returns 1 if the single-precision floating-point value `a' is equal to

2726

| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an

2727

| exception. The comparison is performed according to the IEC/IEEE Standard

2728

| for Binary Floating-Point Arithmetic.

2729

*----------------------------------------------------------------------------*/

2730

2731

int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )

2732

{

2733

a = float32_squash_input_denormal(a STATUS_VAR);

2734

b = float32_squash_input_denormal(b STATUS_VAR);

2735

2736

if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )

2737

|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )

2738

) {

2739

if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {

2740

float_raise( float_flag_invalid STATUS_VAR);

2741

}

2742

return 0;

2743

}

2744

return ( float32_val(a) == float32_val(b) ) ||

2745

( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );

2746

}

2747

2748

/*----------------------------------------------------------------------------

2749

| Returns 1 if the single-precision floating-point value `a' is less than or

2750

| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not

2751

| cause an exception. Otherwise, the comparison is performed according to the

2752

| IEC/IEEE Standard for Binary Floating-Point Arithmetic.

2753

*----------------------------------------------------------------------------*/

2754

2755

int float32_le_quiet( float32 a, float32 b STATUS_PARAM )

2756

{

2757

flag aSign, bSign;

2758

uint32_t av, bv;

2759

a = float32_squash_input_denormal(a STATUS_VAR);

2760

b = float32_squash_input_denormal(b STATUS_VAR);

2761

2762

if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )

2763

|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )

2764

) {

2765

if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {

2766

float_raise( float_flag_invalid STATUS_VAR);

2767

}

2768

return 0;

2769

}

2770

aSign = extractFloat32Sign( a );

2771

bSign = extractFloat32Sign( b );

2772

av = float32_val(a);

2773

bv = float32_val(b);

2774

if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );

2775

return ( av == bv ) || ( aSign ^ ( av < bv ) );

2776

2777

}

2778

2779

/*----------------------------------------------------------------------------

2780

| Returns 1 if the single-precision floating-point value `a' is less than

2781

| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an

2782

| exception. Otherwise, the comparison is performed according to the IEC/IEEE

2783

| Standard for Binary Floating-Point Arithmetic.

2784

*----------------------------------------------------------------------------*/

2785

2786

int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )

2787

{

2788

flag aSign, bSign;

2789

uint32_t av, bv;

2790

a = float32_squash_input_denormal(a STATUS_VAR);

2791

b = float32_squash_input_denormal(b STATUS_VAR);

2792

2793

if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )

2794

|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )

2795

) {

2796

if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {

2797

float_raise( float_flag_invalid STATUS_VAR);

2798

}

2799

return 0;

2800

}

2801

aSign = extractFloat32Sign( a );

2802

bSign = extractFloat32Sign( b );

2803

av = float32_val(a);

2804

bv = float32_val(b);

2805

if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );

2806

return ( av != bv ) && ( aSign ^ ( av < bv ) );

2807

2808

}

2809

2810

/*----------------------------------------------------------------------------

2811

| Returns 1 if the single-precision floating-point values `a' and `b' cannot

2812

| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The

2813

| comparison is performed according to the IEC/IEEE Standard for Binary

2814

| Floating-Point Arithmetic.

2815

*----------------------------------------------------------------------------*/

2816

2817

int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )

2818

{

2819

a = float32_squash_input_denormal(a STATUS_VAR);

2820

b = float32_squash_input_denormal(b STATUS_VAR);

2821

2822

if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )

2823

|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )

2824

) {

2825

if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {

2826

float_raise( float_flag_invalid STATUS_VAR);

2827

}

2828

return 1;

2829

}

2830

return 0;

2831

}

2832

2833

/*----------------------------------------------------------------------------

2834

| Returns the result of converting the double-precision floating-point value

2835

| `a' to the 32-bit two's complement integer format. The conversion is

2836

| performed according to the IEC/IEEE Standard for Binary Floating-Point

2837

| Arithmetic---which means in particular that the conversion is rounded

2838

| according to the current rounding mode. If `a' is a NaN, the largest

2839

| positive integer is returned. Otherwise, if the conversion overflows, the

2840

| largest integer with the same sign as `a' is returned.

2841

*----------------------------------------------------------------------------*/

2842

2843

int32 float64_to_int32( float64 a STATUS_PARAM )

2844

{

2845

flag aSign;

2846

int_fast16_t aExp, shiftCount;

2847

uint64_t aSig;

2848

a = float64_squash_input_denormal(a STATUS_VAR);

2849

2850

aSig = extractFloat64Frac( a );

2851

aExp = extractFloat64Exp( a );

2852

aSign = extractFloat64Sign( a );

2853

if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;

2854

if ( aExp ) aSig |= LIT64( 0x0010000000000000 );

2855

shiftCount = 0x42C - aExp;

2856

if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );

2857

return roundAndPackInt32( aSign, aSig STATUS_VAR );

2858

2859

}

2860

2861

/*----------------------------------------------------------------------------

2862

| Returns the result of converting the double-precision floating-point value

2863

| `a' to the 32-bit two's complement integer format. The conversion is

2864

| performed according to the IEC/IEEE Standard for Binary Floating-Point

2865

| Arithmetic, except that the conversion is always rounded toward zero.

2866

| If `a' is a NaN, the largest positive integer is returned. Otherwise, if

2867

| the conversion overflows, the largest integer with the same sign as `a' is

2868

| returned.

2869

*----------------------------------------------------------------------------*/

2870

2871

int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )

2872

{

2873

flag aSign;

2874

int_fast16_t aExp, shiftCount;

2875

uint64_t aSig, savedASig;

2876

int32_t z;

2877

a = float64_squash_input_denormal(a STATUS_VAR);

2878

2879

aSig = extractFloat64Frac( a );

2880

aExp = extractFloat64Exp( a );

2881

aSign = extractFloat64Sign( a );

2882

if ( 0x41E < aExp ) {

2883

if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;

2884

goto invalid;

2885

}

2886

else if ( aExp < 0x3FF ) {

2887

if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;

2888

return 0;

2889

}

2890

aSig |= LIT64( 0x0010000000000000 );

2891

shiftCount = 0x433 - aExp;

2892

savedASig = aSig;

2893

aSig >>= shiftCount;

2894

z = aSig;

2895

if ( aSign ) z = - z;

2896

if ( ( z < 0 ) ^ aSign ) {

2897

invalid:

2898

float_raise( float_flag_invalid STATUS_VAR);

2899

return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;

2900

}

2901

if ( ( aSig<<shiftCount ) != savedASig ) {

2902

STATUS(float_exception_flags) |= float_flag_inexact;

2903

}

2904

return z;

2905

2906

}

2907

2908

/*----------------------------------------------------------------------------

2909

| Returns the result of converting the double-precision floating-point value

2910

| `a' to the 16-bit two's complement integer format. The conversion is

2911

| performed according to the IEC/IEEE Standard for Binary Floating-Point

2912

| Arithmetic, except that the conversion is always rounded toward zero.

2913

| If `a' is a NaN, the largest positive integer is returned. Otherwise, if

2914

| the conversion overflows, the largest integer with the same sign as `a' is

2915

| returned.

2916

*----------------------------------------------------------------------------*/

2917

2918

int_fast16_t float64_to_int16_round_to_zero(float64 a STATUS_PARAM)

2919

{

2920

flag aSign;

2921

int_fast16_t aExp, shiftCount;

2922

uint64_t aSig, savedASig;

2923

int32 z;

2924

2925

aSig = extractFloat64Frac( a );

2926

aExp = extractFloat64Exp( a );

2927

aSign = extractFloat64Sign( a );

2928

if ( 0x40E < aExp ) {

2929

if ( ( aExp == 0x7FF ) && aSig ) {

2930

aSign = 0;

2931

}

2932

goto invalid;

2933

}

2934

else if ( aExp < 0x3FF ) {

2935

if ( aExp || aSig ) {

2936

STATUS(float_exception_flags) |= float_flag_inexact;

2937

}

2938

return 0;

2939

}

2940

aSig |= LIT64( 0x0010000000000000 );

2941

shiftCount = 0x433 - aExp;

2942

savedASig = aSig;

2943

aSig >>= shiftCount;

2944

z = aSig;

2945

if ( aSign ) {

2946

z = - z;

2947

}

2948

if ( ( (int16_t)z < 0 ) ^ aSign ) {

2949

invalid:

2950

float_raise( float_flag_invalid STATUS_VAR);

2951

return aSign ? (int32_t) 0xffff8000 : 0x7FFF;

2952

}

2953

if ( ( aSig<<shiftCount ) != savedASig ) {

2954

STATUS(float_exception_flags) |= float_flag_inexact;

2955

}

2956

return z;

2957

}

2958

2959

/*----------------------------------------------------------------------------

2960

| Returns the result of converting the double-precision floating-point value

2961

| `a' to the 64-bit two's complement integer format. The conversion is

2962

| performed according to the IEC/IEEE Standard for Binary Floating-Point

2963

| Arithmetic---which means in particular that the conversion is rounded

2964

| according to the current rounding mode. If `a' is a NaN, the largest

2965

| positive integer is returned. Otherwise, if the conversion overflows, the

2966

| largest integer with the same sign as `a' is returned.

2967

*----------------------------------------------------------------------------*/

2968

2969

int64 float64_to_int64( float64 a STATUS_PARAM )

2970

{

2971

flag aSign;

2972

int_fast16_t aExp, shiftCount;

2973

uint64_t aSig, aSigExtra;

2974

a = float64_squash_input_denormal(a STATUS_VAR);

2975

2976

aSig = extractFloat64Frac( a );

2977

aExp = extractFloat64Exp( a );

2978

aSign = extractFloat64Sign( a );

2979

if ( aExp ) aSig |= LIT64( 0x0010000000000000 );

2980

shiftCount = 0x433 - aExp;

2981

if ( shiftCount <= 0 ) {

2982

if ( 0x43E < aExp ) {

2983

float_raise( float_flag_invalid STATUS_VAR);

2984

if ( ! aSign

2985

|| ( ( aExp == 0x7FF )

2986

&& ( aSig != LIT64( 0x0010000000000000 ) ) )

2987

) {

2988

return LIT64( 0x7FFFFFFFFFFFFFFF );

2989

}

2990

return (int64_t) LIT64( 0x8000000000000000 );

2991

}

2992

aSigExtra = 0;

2993

aSig <<= - shiftCount;

2994

}

2995

else {

2996

shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );

2997

}

2998

return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );

2999

3000

}

3001

3002

/*----------------------------------------------------------------------------

3003

| Returns the result of converting the double-precision floating-point value

3004

| `a' to the 64-bit two's complement integer format. The conversion is

3005

| performed according to the IEC/IEEE Standard for Binary Floating-Point

3006

| Arithmetic, except that the conversion is always rounded toward zero.

3007

| If `a' is a NaN, the largest positive integer is returned. Otherwise, if

3008

| the conversion overflows, the largest integer with the same sign as `a' is

3009

| returned.

3010

*----------------------------------------------------------------------------*/

3011

3012

int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )

3013

{

3014

flag aSign;

3015

int_fast16_t aExp, shiftCount;

3016

uint64_t aSig;

3017

int64 z;

3018

a = float64_squash_input_denormal(a STATUS_VAR);

3019

3020

aSig = extractFloat64Frac( a );

3021

aExp = extractFloat64Exp( a );

3022

aSign = extractFloat64Sign( a );

3023

if ( aExp ) aSig |= LIT64( 0x0010000000000000 );

3024

shiftCount = aExp - 0x433;

3025

if ( 0 <= shiftCount ) {

3026

if ( 0x43E <= aExp ) {

3027

if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {

3028

float_raise( float_flag_invalid STATUS_VAR);

3029

if ( ! aSign

3030

|| ( ( aExp == 0x7FF )

3031

&& ( aSig != LIT64( 0x0010000000000000 ) ) )

3032

) {

3033

return LIT64( 0x7FFFFFFFFFFFFFFF );

3034

}

3035

}

3036

return (int64_t) LIT64( 0x8000000000000000 );

3037

}

3038

z = aSig<<shiftCount;

3039

}

3040

else {

3041

if ( aExp < 0x3FE ) {

3042

if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;

3043

return 0;

3044

}

3045

z = aSig>>( - shiftCount );

3046

if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {

3047

STATUS(float_exception_flags) |= float_flag_inexact;

3048

}

3049

}

3050

if ( aSign ) z = - z;

3051

return z;

3052

3053

}

3054

3055

/*----------------------------------------------------------------------------

3056

| Returns the result of converting the double-precision floating-point value

3057

| `a' to the single-precision floating-point format. The conversion is

3058

| performed according to the IEC/IEEE Standard for Binary Floating-Point

3059

| Arithmetic.

3060

*----------------------------------------------------------------------------*/

3061

3062

float32 float64_to_float32( float64 a STATUS_PARAM )

3063

{

3064

flag aSign;

3065

int_fast16_t aExp;

3066

uint64_t aSig;

3067

uint32_t zSig;

3068

a = float64_squash_input_denormal(a STATUS_VAR);

3069

3070

aSig = extractFloat64Frac( a );

3071

aExp = extractFloat64Exp( a );

3072

aSign = extractFloat64Sign( a );

3073

if ( aExp == 0x7FF ) {

3074

if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );

3075

return packFloat32( aSign, 0xFF, 0 );

3076

}

3077

shift64RightJamming( aSig, 22, &aSig );

3078

zSig = aSig;

3079

if ( aExp || zSig ) {

3080

zSig |= 0x40000000;

3081

aExp -= 0x381;

3082

}

3083

return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );

3084

3085

}

3086

3087

3088

/*----------------------------------------------------------------------------

3089

| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a

3090

| half-precision floating-point value, returning the result. After being

3091

| shifted into the proper positions, the three fields are simply added

3092

| together to form the result. This means that any integer portion of `zSig'

3093

| will be added into the exponent. Since a properly normalized significand

3094

| will have an integer portion equal to 1, the `zExp' input should be 1 less

3095

| than the desired result exponent whenever `zSig' is a complete, normalized

3096

| significand.

3097

*----------------------------------------------------------------------------*/

3098

static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)

3099

{

3100

return make_float16(

3101

(((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);

3102

}

3103

3104

/* Half precision floats come in two formats: standard IEEE and "ARM" format.

3105

The latter gains extra exponent range by omitting the NaN/Inf encodings. */

3106

3107

float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)

3108

{

3109

flag aSign;

3110

int_fast16_t aExp;

3111

uint32_t aSig;

3112

3113

aSign = extractFloat16Sign(a);

3114

aExp = extractFloat16Exp(a);

3115

aSig = extractFloat16Frac(a);

3116

3117

if (aExp == 0x1f && ieee) {

3118

if (aSig) {

3119

return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);

3120

}

3121

return packFloat32(aSign, 0xff, 0);

3122

}

3123

if (aExp == 0) {

3124

int8 shiftCount;

3125

3126

if (aSig == 0) {

3127

return packFloat32(aSign, 0, 0);

3128

}

3129

3130

shiftCount = countLeadingZeros32( aSig ) - 21;

3131

aSig = aSig << shiftCount;

3132

aExp = -shiftCount;

3133

}

3134

return packFloat32( aSign, aExp + 0x70, aSig << 13);

3135

}

3136

3137

/*
 * Converts the single-precision value `a' to half precision.
 *
 * A non-zero `ieee' selects the standard IEEE half-precision format; zero
 * selects the alternative format, which has no Inf/NaN encodings and
 * therefore one more usable exponent value (maxexp 16 instead of 15).
 *
 * In the alternative format a NaN input raises invalid and yields a signed
 * zero, while an infinite or overflowing input raises invalid and yields the
 * largest-magnitude encoding (exponent 0x1f, fraction 0x3ff).  In IEEE
 * format overflow raises overflow|inexact and yields infinity.
 *
 * Rounding follows STATUS(float_rounding_mode); inexact is raised whenever
 * discarded mantissa bits are non-zero, and underflow additionally when the
 * result is tiny according to STATUS(float_detect_tininess).
 */
float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
{
    flag aSign;
    int_fast16_t aExp;
    uint32_t aSig;
    uint32_t mask;
    uint32_t increment;
    int8 roundingMode;
    int maxexp = ieee ? 15 : 16;
    bool rounding_bumps_exp;
    bool is_tiny = false;

    a = float32_squash_input_denormal(a STATUS_VAR);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    if ( aExp == 0xFF ) {
        if (aSig) {
            /* Input is a NaN */
            if (!ieee) {
                float_raise(float_flag_invalid STATUS_VAR);
                return packFloat16(aSign, 0, 0);
            }
            return commonNaNToFloat16(
                float32ToCommonNaN(a STATUS_VAR) STATUS_VAR);
        }
        /* Infinity */
        if (!ieee) {
            float_raise(float_flag_invalid STATUS_VAR);
            return packFloat16(aSign, 0x1f, 0x3ff);
        }
        return packFloat16(aSign, 0x1f, 0);
    }
    if (aExp == 0 && aSig == 0) {
        return packFloat16(aSign, 0, 0);
    }
    /* Decimal point between bits 22 and 23. Note that we add the 1 bit
     * even if the input is denormal; however this is harmless because
     * the largest possible single-precision denormal is still smaller
     * than the smallest representable half-precision denormal, and so we
     * will end up ignoring aSig and returning via the "always return zero"
     * codepath.
     */
    aSig |= 0x00800000;
    aExp -= 0x7f;
    /* Calculate the mask of bits of the mantissa which are not
     * representable in half-precision and will be lost.
     */
    if (aExp < -14) {
        /* Will be denormal in halfprec */
        mask = 0x00ffffff;
        if (aExp >= -24) {
            mask >>= 25 + aExp;
        }
    } else {
        /* Normal number in halfprec */
        mask = 0x00001fff;
    }

    /* Pick the rounding increment for the bits covered by `mask'. */
    roundingMode = STATUS(float_rounding_mode);
    switch (roundingMode) {
    case float_round_nearest_even:
        increment = (mask + 1) >> 1;
        if ((aSig & mask) == increment) {
            /* Exact tie: round up only if the kept LSB is odd. */
            increment = aSig & (increment << 1);
        }
        break;
    case float_round_up:
        increment = aSign ? 0 : mask;
        break;
    case float_round_down:
        increment = aSign ? mask : 0;
        break;
    default: /* round_to_zero */
        increment = 0;
        break;
    }

    /* True when rounding carries out of the 24-bit significand. */
    rounding_bumps_exp = (aSig + increment >= 0x01000000);

    if (aExp > maxexp || (aExp == maxexp && rounding_bumps_exp)) {
        if (ieee) {
            float_raise(float_flag_overflow | float_flag_inexact STATUS_VAR);
            return packFloat16(aSign, 0x1f, 0);
        } else {
            float_raise(float_flag_invalid STATUS_VAR);
            return packFloat16(aSign, 0x1f, 0x3ff);
        }
    }

    if (aExp < -14) {
        /* Note that flush-to-zero does not affect half-precision results */
        is_tiny =
            (STATUS(float_detect_tininess) == float_tininess_before_rounding)
            || (aExp < -15)
            || (!rounding_bumps_exp);
    }
    if (aSig & mask) {
        float_raise(float_flag_inexact STATUS_VAR);
        if (is_tiny) {
            float_raise(float_flag_underflow STATUS_VAR);
        }
    }

    aSig += increment;
    if (rounding_bumps_exp) {
        aSig >>= 1;
        aExp++;
    }

    if (aExp < -24) {
        /* Too small even for a half-precision denormal. */
        return packFloat16(aSign, 0, 0);
    }
    if (aExp < -14) {
        /* Denormal result: shift the significand into place. */
        aSig >>= -14 - aExp;
        aExp = -14;
    }
    return packFloat16(aSign, aExp + 14, aSig >> 13);
}

3257

3258

/*----------------------------------------------------------------------------

3259

| Returns the result of converting the double-precision floating-point value

3260

| `a' to the extended double-precision floating-point format. The conversion

3261

| is performed according to the IEC/IEEE Standard for Binary Floating-Point

3262

| Arithmetic.

3263

*----------------------------------------------------------------------------*/

3264

3265

floatx80 float64_to_floatx80( float64 a STATUS_PARAM )

3266

{

3267

flag aSign;

3268

int_fast16_t aExp;

3269

uint64_t aSig;

3270

3271

a = float64_squash_input_denormal(a STATUS_VAR);

3272

aSig = extractFloat64Frac( a );

3273

aExp = extractFloat64Exp( a );

3274

aSign = extractFloat64Sign( a );

3275

if ( aExp == 0x7FF ) {

3276

if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );

3277

return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );

3278

}

3279

if ( aExp == 0 ) {

3280

if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );

3281

normalizeFloat64Subnormal( aSig, &aExp, &aSig );

3282

}

3283

return

3284

packFloatx80(

3285

aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );

3286

3287

}

3288

3289

/*----------------------------------------------------------------------------

3290

| Returns the result of converting the double-precision floating-point value

3291

| `a' to the quadruple-precision floating-point format. The conversion is

3292

| performed according to the IEC/IEEE Standard for Binary Floating-Point

3293

| Arithmetic.

3294

*----------------------------------------------------------------------------*/

3295

3296

float128 float64_to_float128( float64 a STATUS_PARAM )

3297

{

3298

flag aSign;

3299

int_fast16_t aExp;

3300

uint64_t aSig, zSig0, zSig1;

3301

3302

a = float64_squash_input_denormal(a STATUS_VAR);

3303

aSig = extractFloat64Frac( a );

3304

aExp = extractFloat64Exp( a );

3305

aSign = extractFloat64Sign( a );

3306

if ( aExp == 0x7FF ) {

3307

if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );

3308

return packFloat128( aSign, 0x7FFF, 0, 0 );

3309

}

3310

if ( aExp == 0 ) {

3311

if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );

3312

normalizeFloat64Subnormal( aSig, &aExp, &aSig );

3313

--aExp;

3314

}

3315

shift128Right( aSig, 0, 4, &zSig0, &zSig1 );

3316

return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );

3317

3318

}

3319

3320

/*----------------------------------------------------------------------------

3321

| Rounds the double-precision floating-point value `a' to an integer, and

3322

| returns the result as a double-precision floating-point value. The

3323

| operation is performed according to the IEC/IEEE Standard for Binary

3324

| Floating-Point Arithmetic.

3325

*----------------------------------------------------------------------------*/

3326

3327

float64 float64_round_to_int( float64 a STATUS_PARAM )

3328

{

3329

flag aSign;

3330

int_fast16_t aExp;

3331

uint64_t lastBitMask, roundBitsMask;

3332

int8 roundingMode;

3333

uint64_t z;

3334

a = float64_squash_input_denormal(a STATUS_VAR);

3335

3336

aExp = extractFloat64Exp( a );

3337

if ( 0x433 <= aExp ) {

3338

if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {

3339

return propagateFloat64NaN( a, a STATUS_VAR );

3340

}

3341

return a;

3342

}

3343

if ( aExp < 0x3FF ) {

3344

if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;

3345

STATUS(float_exception_flags) |= float_flag_inexact;

3346

aSign = extractFloat64Sign( a );

3347

switch ( STATUS(float_rounding_mode) ) {

3348

case float_round_nearest_even:

3349

if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {

3350

return packFloat64( aSign, 0x3FF, 0 );

3351

}

3352

break;

3353

case float_round_down:

3354

return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);

3355

case float_round_up:

3356

return make_float64(

3357

aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));

3358

}

3359

return packFloat64( aSign, 0, 0 );

3360

}

3361

lastBitMask = 1;

3362

lastBitMask <<= 0x433 - aExp;

3363

roundBitsMask = lastBitMask - 1;

3364

z = float64_val(a);

3365

roundingMode = STATUS(float_rounding_mode);

3366

if ( roundingMode == float_round_nearest_even ) {

3367

z += lastBitMask>>1;

3368

if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;

3369

}

3370

else if ( roundingMode != float_round_to_zero ) {

3371

if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {

3372

z += roundBitsMask;

3373

}

3374

}

3375

z &= ~ roundBitsMask;

3376

if ( z != float64_val(a) )

3377

STATUS(float_exception_flags) |= float_flag_inexact;

3378

return make_float64(z);

3379

3380

}

3381

3382

/* Rounds `a' to an integral value toward zero by temporarily switching the
 * rounding mode to float_round_to_zero around float64_round_to_int(); the
 * caller's rounding mode is restored before returning. */
float64 float64_trunc_to_int( float64 a STATUS_PARAM)
{
    int savedMode = STATUS(float_rounding_mode);
    float64 truncated;

    STATUS(float_rounding_mode) = float_round_to_zero;
    truncated = float64_round_to_int(a STATUS_VAR);
    STATUS(float_rounding_mode) = savedMode;

    return truncated;
}

3392

3393

/*----------------------------------------------------------------------------

3394

| Returns the result of adding the absolute values of the double-precision

3395

| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated

3396

| before being returned. `zSign' is ignored if the result is a NaN.

3397

| The addition is performed according to the IEC/IEEE Standard for Binary

3398

| Floating-Point Arithmetic.

3399

*----------------------------------------------------------------------------*/

3400

3401

static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )

3402

{

3403

int_fast16_t aExp, bExp, zExp;

3404

uint64_t aSig, bSig, zSig;

3405

int_fast16_t expDiff;

3406

3407

aSig = extractFloat64Frac( a );

3408

aExp = extractFloat64Exp( a );

3409

bSig = extractFloat64Frac( b );

3410

bExp = extractFloat64Exp( b );

3411

expDiff = aExp - bExp;

3412

aSig <<= 9;

3413

bSig <<= 9;

3414

if ( 0 < expDiff ) {

3415

if ( aExp == 0x7FF ) {

3416

if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );

3417

return a;

3418

}

3419

if ( bExp == 0 ) {

3420

--expDiff;

3421

}

3422

else {

3423

bSig |= LIT64( 0x2000000000000000 );

3424

}

3425

shift64RightJamming( bSig, expDiff, &bSig );

3426

zExp = aExp;

3427

}

3428

else if ( expDiff < 0 ) {

3429

if ( bExp == 0x7FF ) {

3430

if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );

3431

return packFloat64( zSign, 0x7FF, 0 );

3432

}

3433

if ( aExp == 0 ) {

3434

++expDiff;

3435

}

3436

else {

3437

aSig |= LIT64( 0x2000000000000000 );

3438

}

3439

shift64RightJamming( aSig, - expDiff, &aSig );

3440

zExp = bExp;

3441

}

3442

else {

3443

if ( aExp == 0x7FF ) {

3444

if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );

3445

return a;

3446

}

3447

if ( aExp == 0 ) {

3448

if (STATUS(flush_to_zero)) {

3449

if (aSig | bSig) {

3450

float_raise(float_flag_output_denormal STATUS_VAR);

3451

}

3452

return packFloat64(zSign, 0, 0);

3453

}

3454

return packFloat64( zSign, 0, ( aSig + bSig )>>9 );

3455

}

3456

zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;

3457

zExp = aExp;

3458

goto roundAndPack;

3459

}

3460

aSig |= LIT64( 0x2000000000000000 );

3461

zSig = ( aSig + bSig )<<1;

3462

--zExp;

3463

if ( (int64_t) zSig < 0 ) {

3464

zSig = aSig + bSig;

3465

++zExp;

3466

}

3467

roundAndPack:

3468

return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );

3469

3470

}

3471

3472

/*----------------------------------------------------------------------------

3473

| Returns the result of subtracting the absolute values of the double-

3474

| precision floating-point values `a' and `b'. If `zSign' is 1, the

3475

| difference is negated before being returned. `zSign' is ignored if the

3476

| result is a NaN. The subtraction is performed according to the IEC/IEEE

3477

| Standard for Binary Floating-Point Arithmetic.

3478

*----------------------------------------------------------------------------*/

3479

3480

static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )

3481

{

3482

int_fast16_t aExp, bExp, zExp;

3483

uint64_t aSig, bSig, zSig;

3484

int_fast16_t expDiff;

3485

3486

aSig = extractFloat64Frac( a );

3487

aExp = extractFloat64Exp( a );

3488

bSig = extractFloat64Frac( b );

3489

bExp = extractFloat64Exp( b );

3490

expDiff = aExp - bExp;

3491

aSig <<= 10;

3492

bSig <<= 10;

3493

if ( 0 < expDiff ) goto aExpBigger;

3494

if ( expDiff < 0 ) goto bExpBigger;

3495

if ( aExp == 0x7FF ) {

3496

if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );

3497

float_raise( float_flag_invalid STATUS_VAR);

3498

return float64_default_nan;

3499

}

3500

if ( aExp == 0 ) {

3501

aExp = 1;

3502

bExp = 1;

3503

}

3504

if ( bSig < aSig ) goto aBigger;

3505

if ( aSig < bSig ) goto bBigger;

3506

return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );

3507

bExpBigger:

3508

if ( bExp == 0x7FF ) {

3509

if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );

3510

return packFloat64( zSign ^ 1, 0x7FF, 0 );

3511

}

3512

if ( aExp == 0 ) {

3513

++expDiff;

3514

}

3515

else {

3516

aSig |= LIT64( 0x4000000000000000 );

3517

}

3518

shift64RightJamming( aSig, - expDiff, &aSig );

3519

bSig |= LIT64( 0x4000000000000000 );

3520

bBigger:

3521

zSig = bSig - aSig;

3522

zExp = bExp;

3523

zSign ^= 1;

3524

goto normalizeRoundAndPack;

3525

aExpBigger:

3526

if ( aExp == 0x7FF ) {

3527

if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );

3528

return a;

3529

}

3530

if ( bExp == 0 ) {

3531

--expDiff;

3532

}

3533

else {

3534

bSig |= LIT64( 0x4000000000000000 );

3535

}

3536

shift64RightJamming( bSig, expDiff, &bSig );

3537

aSig |= LIT64( 0x4000000000000000 );

3538

aBigger:

3539

zSig = aSig - bSig;

3540

zExp = aExp;

3541

normalizeRoundAndPack:

3542

--zExp;

3543

return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );

3544

3545

}

3546

3547

/*----------------------------------------------------------------------------

3548

| Returns the result of adding the double-precision floating-point values `a'

3549

| and `b'. The operation is performed according to the IEC/IEEE Standard for

3550

| Binary Floating-Point Arithmetic.

3551

*----------------------------------------------------------------------------*/

3552

3553

float64 float64_add( float64 a, float64 b STATUS_PARAM )

3554

{

3555

flag aSign, bSign;

3556

a = float64_squash_input_denormal(a STATUS_VAR);

3557

b = float64_squash_input_denormal(b STATUS_VAR);

3558

3559

aSign = extractFloat64Sign( a );

3560

bSign = extractFloat64Sign( b );

3561

if ( aSign == bSign ) {

3562

return addFloat64Sigs( a, b, aSign STATUS_VAR );

3563

}

3564

else {

3565

return subFloat64Sigs( a, b, aSign STATUS_VAR );

3566

}

3567

3568

}

3569

3570

/*----------------------------------------------------------------------------

3571

| Returns the result of subtracting the double-precision floating-point values

3572

| `a' and `b'. The operation is performed according to the IEC/IEEE Standard

3573

| for Binary Floating-Point Arithmetic.

3574

*----------------------------------------------------------------------------*/

3575

3576

float64 float64_sub( float64 a, float64 b STATUS_PARAM )

3577

{

3578

flag aSign, bSign;

3579

a = float64_squash_input_denormal(a STATUS_VAR);

3580

b = float64_squash_input_denormal(b STATUS_VAR);

3581

3582

aSign = extractFloat64Sign( a );

3583

bSign = extractFloat64Sign( b );

3584

if ( aSign == bSign ) {

3585

return subFloat64Sigs( a, b, aSign STATUS_VAR );

3586

}

3587

else {

3588

return addFloat64Sigs( a, b, aSign STATUS_VAR );

3589

}

3590

3591

}

3592

3593

/*----------------------------------------------------------------------------

3594

| Returns the result of multiplying the double-precision floating-point values

3595

| `a' and `b'. The operation is performed according to the IEC/IEEE Standard

3596

| for Binary Floating-Point Arithmetic.

3597

*----------------------------------------------------------------------------*/

3598

3599

float64 float64_mul( float64 a, float64 b STATUS_PARAM )

3600

{

3601

flag aSign, bSign, zSign;

3602

int_fast16_t aExp, bExp, zExp;

3603

uint64_t aSig, bSig, zSig0, zSig1;

3604

3605

a = float64_squash_input_denormal(a STATUS_VAR);

3606

b = float64_squash_input_denormal(b STATUS_VAR);

3607

3608

aSig = extractFloat64Frac( a );

3609

aExp = extractFloat64Exp( a );

3610

aSign = extractFloat64Sign( a );

3611

bSig = extractFloat64Frac( b );

3612

bExp = extractFloat64Exp( b );

3613

bSign = extractFloat64Sign( b );

3614

zSign = aSign ^ bSign;

3615

if ( aExp == 0x7FF ) {

3616

if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {

3617

return propagateFloat64NaN( a, b STATUS_VAR );

3618

}

3619

if ( ( bExp | bSig ) == 0 ) {

3620

float_raise( float_flag_invalid STATUS_VAR);

3621

return float64_default_nan;

3622

}

3623

return packFloat64( zSign, 0x7FF, 0 );

3624

}

3625

if ( bExp == 0x7FF ) {

3626

if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );

3627

if ( ( aExp | aSig ) == 0 ) {

3628

float_raise( float_flag_invalid STATUS_VAR);

3629

return float64_default_nan;

3630

}

3631

return packFloat64( zSign, 0x7FF, 0 );

3632

}

3633

if ( aExp == 0 ) {

3634

if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );

3635

normalizeFloat64Subnormal( aSig, &aExp, &aSig );

3636

}

3637

if ( bExp == 0 ) {

3638

if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );

3639

normalizeFloat64Subnormal( bSig, &bExp, &bSig );

3640

}

3641

zExp = aExp + bExp - 0x3FF;

3642

aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;

3643

bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;

3644

mul64To128( aSig, bSig, &zSig0, &zSig1 );

3645

zSig0 |= ( zSig1 != 0 );

3646

if ( 0 <= (int64_t) ( zSig0<<1 ) ) {

3647

zSig0 <<= 1;

3648

--zExp;

3649

}

3650

return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );

3651

3652

}

3653

3654

/*----------------------------------------------------------------------------

3655

| Returns the result of dividing the double-precision floating-point value `a'

3656

| by the corresponding value `b'. The operation is performed according to

3657

| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

3658

*----------------------------------------------------------------------------*/

3659

3660

float64 float64_div( float64 a, float64 b STATUS_PARAM )

3661

{

3662

flag aSign, bSign, zSign;

3663

int_fast16_t aExp, bExp, zExp;

3664

uint64_t aSig, bSig, zSig;

3665

uint64_t rem0, rem1;

3666

uint64_t term0, term1;

3667

a = float64_squash_input_denormal(a STATUS_VAR);

3668

b = float64_squash_input_denormal(b STATUS_VAR);

3669

3670

aSig = extractFloat64Frac( a );

3671

aExp = extractFloat64Exp( a );

3672

aSign = extractFloat64Sign( a );

3673

bSig = extractFloat64Frac( b );

3674

bExp = extractFloat64Exp( b );

3675

bSign = extractFloat64Sign( b );

3676

zSign = aSign ^ bSign;

3677

if ( aExp == 0x7FF ) {

3678

if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );

3679

if ( bExp == 0x7FF ) {

3680

if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );

3681

float_raise( float_flag_invalid STATUS_VAR);

3682

return float64_default_nan;

3683

}

3684

return packFloat64( zSign, 0x7FF, 0 );

3685

}

3686

if ( bExp == 0x7FF ) {

3687

if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );

3688

return packFloat64( zSign, 0, 0 );

3689

}

3690

if ( bExp == 0 ) {

3691

if ( bSig == 0 ) {

3692

if ( ( aExp | aSig ) == 0 ) {

3693

float_raise( float_flag_invalid STATUS_VAR);

3694

return float64_default_nan;

3695

}

3696

float_raise( float_flag_divbyzero STATUS_VAR);

3697

return packFloat64( zSign, 0x7FF, 0 );

3698

}

3699

normalizeFloat64Subnormal( bSig, &bExp, &bSig );

3700

}

3701

if ( aExp == 0 ) {

3702

if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );

3703

normalizeFloat64Subnormal( aSig, &aExp, &aSig );

3704

}

3705

zExp = aExp - bExp + 0x3FD;

3706

aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;

3707

bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;

3708

if ( bSig <= ( aSig + aSig ) ) {

3709

aSig >>= 1;

3710

++zExp;

3711

}

3712

zSig = estimateDiv128To64( aSig, 0, bSig );

3713

if ( ( zSig & 0x1FF ) <= 2 ) {

3714

mul64To128( bSig, zSig, &term0, &term1 );

3715

sub128( aSig, 0, term0, term1, &rem0, &rem1 );

3716

while ( (int64_t) rem0 < 0 ) {

3717

--zSig;

3718

add128( rem0, rem1, 0, bSig, &rem0, &rem1 );

3719

}

3720

zSig |= ( rem1 != 0 );

3721

}

3722

return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );

3723

3724

}

3725

3726

/*----------------------------------------------------------------------------

3727

| Returns the remainder of the double-precision floating-point value `a'

3728

| with respect to the corresponding value `b'. The operation is performed

3729

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

3730

*----------------------------------------------------------------------------*/

3731

3732

float64 float64_rem( float64 a, float64 b STATUS_PARAM )

3733

{

3734

flag aSign, zSign;

3735

int_fast16_t aExp, bExp, expDiff;

3736

uint64_t aSig, bSig;

3737

uint64_t q, alternateASig;

3738

int64_t sigMean;

3739

3740

a = float64_squash_input_denormal(a STATUS_VAR);

3741

b = float64_squash_input_denormal(b STATUS_VAR);

3742

aSig = extractFloat64Frac( a );

3743

aExp = extractFloat64Exp( a );

3744

aSign = extractFloat64Sign( a );

3745

bSig = extractFloat64Frac( b );

3746

bExp = extractFloat64Exp( b );

3747

if ( aExp == 0x7FF ) {

3748

if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {

3749

return propagateFloat64NaN( a, b STATUS_VAR );

3750

}

3751

float_raise( float_flag_invalid STATUS_VAR);

3752

return float64_default_nan;

3753

}

3754

if ( bExp == 0x7FF ) {

3755

if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );

3756

return a;

3757

}

3758

if ( bExp == 0 ) {

3759

if ( bSig == 0 ) {

3760

float_raise( float_flag_invalid STATUS_VAR);

3761

return float64_default_nan;

3762

}

3763

normalizeFloat64Subnormal( bSig, &bExp, &bSig );

3764

}

3765

if ( aExp == 0 ) {

3766

if ( aSig == 0 ) return a;

3767

normalizeFloat64Subnormal( aSig, &aExp, &aSig );

3768

}

3769

expDiff = aExp - bExp;

3770

aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;

3771

bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;

3772

if ( expDiff < 0 ) {

3773

if ( expDiff < -1 ) return a;

3774

aSig >>= 1;

3775

}

3776

q = ( bSig <= aSig );

3777

if ( q ) aSig -= bSig;

3778

expDiff -= 64;

3779

while ( 0 < expDiff ) {

3780

q = estimateDiv128To64( aSig, 0, bSig );

3781

q = ( 2 < q ) ? q - 2 : 0;

3782

aSig = - ( ( bSig>>2 ) * q );

3783

expDiff -= 62;

3784

}

3785

expDiff += 64;

3786

if ( 0 < expDiff ) {

3787

q = estimateDiv128To64( aSig, 0, bSig );

3788

q = ( 2 < q ) ? q - 2 : 0;

3789

q >>= 64 - expDiff;

3790

bSig >>= 2;

3791

aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;

3792

}

3793

else {

3794

aSig >>= 2;

3795

bSig >>= 2;

3796

}

3797

do {

3798

alternateASig = aSig;

3799

++q;

3800

aSig -= bSig;

3801

} while ( 0 <= (int64_t) aSig );

3802

sigMean = aSig + alternateASig;

3803

if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {

3804

aSig = alternateASig;

3805

}

3806

zSign = ( (int64_t) aSig < 0 );

3807

if ( zSign ) aSig = - aSig;

3808

return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );

3809

3810

}

3811

3812

/*----------------------------------------------------------------------------

3813

| Returns the result of multiplying the double-precision floating-point values

3814

| `a' and `b' then adding 'c', with no intermediate rounding step after the

3815

| multiplication. The operation is performed according to the IEC/IEEE

3816

| Standard for Binary Floating-Point Arithmetic 754-2008.

3817

| The flags argument allows the caller to select negation of the

3818

| addend, the intermediate product, or the final result. (The difference

3819

| between this and having the caller do a separate negation is that negating

3820

| externally will flip the sign bit on NaNs.)

3821

*----------------------------------------------------------------------------*/

3822

3823

float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM)

3824

{

3825

flag aSign, bSign, cSign, zSign;

3826

int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;

3827

uint64_t aSig, bSig, cSig;

3828

flag pInf, pZero, pSign;

3829

uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;

3830

int shiftcount;

3831

flag signflip, infzero;

3832

3833

a = float64_squash_input_denormal(a STATUS_VAR);

3834

b = float64_squash_input_denormal(b STATUS_VAR);

3835

c = float64_squash_input_denormal(c STATUS_VAR);

3836

aSig = extractFloat64Frac(a);

3837

aExp = extractFloat64Exp(a);

3838

aSign = extractFloat64Sign(a);

3839

bSig = extractFloat64Frac(b);

3840

bExp = extractFloat64Exp(b);

3841

bSign = extractFloat64Sign(b);

3842

cSig = extractFloat64Frac(c);

3843

cExp = extractFloat64Exp(c);

3844

cSign = extractFloat64Sign(c);

3845

3846

infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||

3847

(aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));

3848

3849

/* It is implementation-defined whether the cases of (0,inf,qnan)

3850

* and (inf,0,qnan) raise InvalidOperation or not (and what QNaN

3851

* they return if they do), so we have to hand this information

3852

* off to the target-specific pick-a-NaN routine.

3853

3854

if (((aExp == 0x7ff) && aSig) ||

3855

((bExp == 0x7ff) && bSig) ||

3856

((cExp == 0x7ff) && cSig)) {

3857

return propagateFloat64MulAddNaN(a, b, c, infzero STATUS_VAR);

3858

}

3859

3860

if (infzero) {

3861

float_raise(float_flag_invalid STATUS_VAR);

3862

return float64_default_nan;

3863

}

3864

3865

if (flags & float_muladd_negate_c) {

3866

cSign ^= 1;

3867

}

3868

3869

signflip = (flags & float_muladd_negate_result) ? 1 : 0;

3870

3871

/* Work out the sign and type of the product */

3872

pSign = aSign ^ bSign;

3873

if (flags & float_muladd_negate_product) {

3874

pSign ^= 1;

3875

}

3876

pInf = (aExp == 0x7ff) || (bExp == 0x7ff);

3877

pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);

3878

3879

if (cExp == 0x7ff) {

3880

if (pInf && (pSign ^ cSign)) {

3881

/* addition of opposite-signed infinities => InvalidOperation */

3882

float_raise(float_flag_invalid STATUS_VAR);

3883

return float64_default_nan;

3884

}

3885

/* Otherwise generate an infinity of the same sign */

3886

return packFloat64(cSign ^ signflip, 0x7ff, 0);

3887

}

3888

3889

if (pInf) {

3890

return packFloat64(pSign ^ signflip, 0x7ff, 0);

3891

}

3892

3893

if (pZero) {

3894

if (cExp == 0) {

3895

if (cSig == 0) {

3896

/* Adding two exact zeroes */

3897

if (pSign == cSign) {

3898

zSign = pSign;

3899

} else if (STATUS(float_rounding_mode) == float_round_down) {

3900

zSign = 1;

3901

} else {

3902

zSign = 0;

3903

}

3904

return packFloat64(zSign ^ signflip, 0, 0);

3905

}

3906

/* Exact zero plus a denorm */

3907

if (STATUS(flush_to_zero)) {

3908

float_raise(float_flag_output_denormal STATUS_VAR);

3909

return packFloat64(cSign ^ signflip, 0, 0);

3910

}

3911

}

3912

/* Zero plus something non-zero : just return the something */

3913

return packFloat64(cSign ^ signflip, cExp, cSig);

3914

}

3915

3916

if (aExp == 0) {

3917

normalizeFloat64Subnormal(aSig, &aExp, &aSig);

3918

}

3919

if (bExp == 0) {

3920

normalizeFloat64Subnormal(bSig, &bExp, &bSig);

3921

}

3922

3923

/* Calculate the actual result a * b + c */

3924

3925

/* Multiply first; this is easy. */

3926

/* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff

3927

* because we want the true exponent, not the "one-less-than"

3928

* flavour that roundAndPackFloat64() takes.

3929

3930

pExp = aExp + bExp - 0x3fe;

3931

aSig = (aSig | LIT64(0x0010000000000000))<<10;

3932

bSig = (bSig | LIT64(0x0010000000000000))<<11;

3933

mul64To128(aSig, bSig, &pSig0, &pSig1);

3934

if ((int64_t)(pSig0 << 1) >= 0) {

3935

shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);

3936

pExp--;

3937

}

3938

3939

zSign = pSign ^ signflip;

3940

3941

/* Now [pSig0:pSig1] is the significand of the multiply, with the explicit

3942

* bit in position 126.

3943

3944

if (cExp == 0) {

3945

if (!cSig) {

3946

/* Throw out the special case of c being an exact zero now */

3947

shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);

3948

return roundAndPackFloat64(zSign, pExp - 1,

3949

pSig1 STATUS_VAR);

3950

}

3951

normalizeFloat64Subnormal(cSig, &cExp, &cSig);

3952

}

3953

3954

/* Shift cSig and add the explicit bit so [cSig0:cSig1] is the

3955

* significand of the addend, with the explicit bit in position 126.

3956

3957

cSig0 = cSig << (126 - 64 - 52);

3958

cSig1 = 0;

3959

cSig0 |= LIT64(0x4000000000000000);

3960

expDiff = pExp - cExp;

3961

3962

if (pSign == cSign) {

3963

/* Addition */

3964

if (expDiff > 0) {

3965

/* scale c to match p */

3966

shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);

3967

zExp = pExp;

3968

} else if (expDiff < 0) {

3969

/* scale p to match c */

3970

shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);

3971

zExp = cExp;

3972

} else {

3973

/* no scaling needed */

3974

zExp = cExp;

3975

}

3976

/* Add significands and make sure explicit bit ends up in posn 126 */

3977

add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);

3978

if ((int64_t)zSig0 < 0) {

3979

shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);

3980

} else {

3981

zExp--;

3982

}

3983

shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);

3984

return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR);

3985

} else {

3986

/* Subtraction */

3987

if (expDiff > 0) {

3988

shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);

3989

sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);

3990

zExp = pExp;

3991

} else if (expDiff < 0) {

3992

shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);

3993

sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);

3994

zExp = cExp;

3995

zSign ^= 1;

3996

} else {

3997

zExp = pExp;

3998

if (lt128(cSig0, cSig1, pSig0, pSig1)) {

3999

sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);

4000

} else if (lt128(pSig0, pSig1, cSig0, cSig1)) {

4001

sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);

4002

zSign ^= 1;

4003

} else {

4004

/* Exact zero */

4005

zSign = signflip;

4006

if (STATUS(float_rounding_mode) == float_round_down) {

4007

zSign ^= 1;

4008

}

4009

return packFloat64(zSign, 0, 0);

4010

}

4011

}

4012

--zExp;

4013

/* Do the equivalent of normalizeRoundAndPackFloat64() but

4014

* starting with the significand in a pair of uint64_t.

4015

4016

if (zSig0) {

4017

shiftcount = countLeadingZeros64(zSig0) - 1;

4018

shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);

4019

if (zSig1) {

4020

zSig0 |= 1;

4021

}

4022

zExp -= shiftcount;

4023

} else {

4024

shiftcount = countLeadingZeros64(zSig1);

4025

if (shiftcount == 0) {

4026

zSig0 = (zSig1 >> 1) | (zSig1 & 1);

4027

zExp -= 63;

4028

} else {

4029

shiftcount--;

4030

zSig0 = zSig1 << shiftcount;

4031

zExp -= (shiftcount + 64);

4032

}

4033

}

4034

return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR);

4035

}

4036

}

4037

4038

/*----------------------------------------------------------------------------

4039

| Returns the square root of the double-precision floating-point value `a'.

4040

| The operation is performed according to the IEC/IEEE Standard for Binary

4041

| Floating-Point Arithmetic.

4042

*----------------------------------------------------------------------------*/

4043

4044

float64 float64_sqrt( float64 a STATUS_PARAM )

4045

{

4046

flag aSign;

4047

int_fast16_t aExp, zExp;

4048

uint64_t aSig, zSig, doubleZSig;

4049

uint64_t rem0, rem1, term0, term1;

4050

a = float64_squash_input_denormal(a STATUS_VAR);

4051

4052

aSig = extractFloat64Frac( a );

4053

aExp = extractFloat64Exp( a );

4054

aSign = extractFloat64Sign( a );

4055

if ( aExp == 0x7FF ) {

4056

if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );

4057

if ( ! aSign ) return a;

4058

float_raise( float_flag_invalid STATUS_VAR);

4059

return float64_default_nan;

4060

}

4061

if ( aSign ) {

4062

if ( ( aExp | aSig ) == 0 ) return a;

4063

float_raise( float_flag_invalid STATUS_VAR);

4064

return float64_default_nan;

4065

}

4066

if ( aExp == 0 ) {

4067

if ( aSig == 0 ) return float64_zero;

4068

normalizeFloat64Subnormal( aSig, &aExp, &aSig );

4069

}

4070

zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;

4071

aSig |= LIT64( 0x0010000000000000 );

4072

zSig = estimateSqrt32( aExp, aSig>>21 );

4073

aSig <<= 9 - ( aExp & 1 );

4074

zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );

4075

if ( ( zSig & 0x1FF ) <= 5 ) {

4076

doubleZSig = zSig<<1;

4077

mul64To128( zSig, zSig, &term0, &term1 );

4078

sub128( aSig, 0, term0, term1, &rem0, &rem1 );

4079

while ( (int64_t) rem0 < 0 ) {

4080

--zSig;

4081

doubleZSig -= 2;

4082

add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );

4083

}

4084

zSig |= ( ( rem0 | rem1 ) != 0 );

4085

}

4086

return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );

4087

4088

}

4089

4090

/*----------------------------------------------------------------------------

4091

| Returns the binary log of the double-precision floating-point value `a'.

4092

| The operation is performed according to the IEC/IEEE Standard for Binary

4093

| Floating-Point Arithmetic.

4094

*----------------------------------------------------------------------------*/

4095

float64 float64_log2( float64 a STATUS_PARAM )

4096

{

4097

flag aSign, zSign;

4098

int_fast16_t aExp;

4099

uint64_t aSig, aSig0, aSig1, zSig, i;

4100

a = float64_squash_input_denormal(a STATUS_VAR);

4101

4102

aSig = extractFloat64Frac( a );

4103

aExp = extractFloat64Exp( a );

4104

aSign = extractFloat64Sign( a );

4105

4106

if ( aExp == 0 ) {

4107

if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );

4108

normalizeFloat64Subnormal( aSig, &aExp, &aSig );

4109

}

4110

if ( aSign ) {

4111

float_raise( float_flag_invalid STATUS_VAR);

4112

return float64_default_nan;

4113

}

4114

if ( aExp == 0x7FF ) {

4115

if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );

4116

return a;

4117

}

4118

4119

aExp -= 0x3FF;

4120

aSig |= LIT64( 0x0010000000000000 );

4121

zSign = aExp < 0;

4122

zSig = (uint64_t)aExp << 52;

4123

for (i = 1LL << 51; i > 0; i >>= 1) {

4124

mul64To128( aSig, aSig, &aSig0, &aSig1 );

4125

aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );

4126

if ( aSig & LIT64( 0x0020000000000000 ) ) {

4127

aSig >>= 1;

4128

zSig |= i;

4129

}

4130

}

4131

4132

if ( zSign )

4133

zSig = -zSig;

4134

return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );

4135

}

4136

4137

/*----------------------------------------------------------------------------

4138

| Returns 1 if the double-precision floating-point value `a' is equal to the

4139

| corresponding value `b', and 0 otherwise. The invalid exception is raised

4140

| if either operand is a NaN. Otherwise, the comparison is performed

4141

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

4142

*----------------------------------------------------------------------------*/

4143

4144

int float64_eq( float64 a, float64 b STATUS_PARAM )

4145

{

4146

uint64_t av, bv;

4147

a = float64_squash_input_denormal(a STATUS_VAR);

4148

b = float64_squash_input_denormal(b STATUS_VAR);

4149

4150

if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )

4151

|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )

4152

) {

4153

float_raise( float_flag_invalid STATUS_VAR);

4154

return 0;

4155

}

4156

av = float64_val(a);

4157

bv = float64_val(b);

4158

return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );

4159

4160

}

4161

4162

/*----------------------------------------------------------------------------

4163

| Returns 1 if the double-precision floating-point value `a' is less than or

4164

| equal to the corresponding value `b', and 0 otherwise. The invalid

4165

| exception is raised if either operand is a NaN. The comparison is performed

4166

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

4167

*----------------------------------------------------------------------------*/

4168

4169

int float64_le( float64 a, float64 b STATUS_PARAM )

4170

{

4171

flag aSign, bSign;

4172

uint64_t av, bv;

4173

a = float64_squash_input_denormal(a STATUS_VAR);

4174

b = float64_squash_input_denormal(b STATUS_VAR);

4175

4176

if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )

4177

|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )

4178

) {

4179

float_raise( float_flag_invalid STATUS_VAR);

4180

return 0;

4181

}

4182

aSign = extractFloat64Sign( a );

4183

bSign = extractFloat64Sign( b );

4184

av = float64_val(a);

4185

bv = float64_val(b);

4186

if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );

4187

return ( av == bv ) || ( aSign ^ ( av < bv ) );

4188

4189

}

4190

4191

/*----------------------------------------------------------------------------

4192

| Returns 1 if the double-precision floating-point value `a' is less than

4193

| the corresponding value `b', and 0 otherwise. The invalid exception is

4194

| raised if either operand is a NaN. The comparison is performed according

4195

| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

4196

*----------------------------------------------------------------------------*/

4197

4198

int float64_lt( float64 a, float64 b STATUS_PARAM )

4199

{

4200

flag aSign, bSign;

4201

uint64_t av, bv;

4202

4203

a = float64_squash_input_denormal(a STATUS_VAR);

4204

b = float64_squash_input_denormal(b STATUS_VAR);

4205

if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )

4206

|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )

4207

) {

4208

float_raise( float_flag_invalid STATUS_VAR);

4209

return 0;

4210

}

4211

aSign = extractFloat64Sign( a );

4212

bSign = extractFloat64Sign( b );

4213

av = float64_val(a);

4214

bv = float64_val(b);

4215

if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );

4216

return ( av != bv ) && ( aSign ^ ( av < bv ) );

4217

4218

}

4219

4220

/*----------------------------------------------------------------------------

4221

| Returns 1 if the double-precision floating-point values `a' and `b' cannot

4222

| be compared, and 0 otherwise. The invalid exception is raised if either

4223

| operand is a NaN. The comparison is performed according to the IEC/IEEE

4224

| Standard for Binary Floating-Point Arithmetic.

4225

*----------------------------------------------------------------------------*/

4226

4227

int float64_unordered( float64 a, float64 b STATUS_PARAM )

4228

{

4229

a = float64_squash_input_denormal(a STATUS_VAR);

4230

b = float64_squash_input_denormal(b STATUS_VAR);

4231

4232

if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )

4233

|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )

4234

) {

4235

float_raise( float_flag_invalid STATUS_VAR);

4236

return 1;

4237

}

4238

return 0;

4239

}

4240

4241

/*----------------------------------------------------------------------------

4242

| Returns 1 if the double-precision floating-point value `a' is equal to the

4243

| corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an

4244

| exception.The comparison is performed according to the IEC/IEEE Standard

4245

| for Binary Floating-Point Arithmetic.

4246

*----------------------------------------------------------------------------*/

4247

4248

int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )

4249

{

4250

uint64_t av, bv;

4251

a = float64_squash_input_denormal(a STATUS_VAR);

4252

b = float64_squash_input_denormal(b STATUS_VAR);

4253

4254

if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )

4255

|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )

4256

) {

4257

if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {

4258

float_raise( float_flag_invalid STATUS_VAR);

4259

}

4260

return 0;

4261

}

4262

av = float64_val(a);

4263

bv = float64_val(b);

4264

return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );

4265

4266

}

4267

4268

/*----------------------------------------------------------------------------

4269

| Returns 1 if the double-precision floating-point value `a' is less than or

4270

| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not

4271

| cause an exception. Otherwise, the comparison is performed according to the

4272

| IEC/IEEE Standard for Binary Floating-Point Arithmetic.

4273

*----------------------------------------------------------------------------*/

4274

4275

int float64_le_quiet( float64 a, float64 b STATUS_PARAM )

4276

{

4277

flag aSign, bSign;

4278

uint64_t av, bv;

4279

a = float64_squash_input_denormal(a STATUS_VAR);

4280

b = float64_squash_input_denormal(b STATUS_VAR);

4281

4282

if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )

4283

|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )

4284

) {

4285

if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {

4286

float_raise( float_flag_invalid STATUS_VAR);

4287

}

4288

return 0;

4289

}

4290

aSign = extractFloat64Sign( a );

4291

bSign = extractFloat64Sign( b );

4292

av = float64_val(a);

4293

bv = float64_val(b);

4294

if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );

4295

return ( av == bv ) || ( aSign ^ ( av < bv ) );

4296

4297

}

4298

4299

/*----------------------------------------------------------------------------

4300

| Returns 1 if the double-precision floating-point value `a' is less than

4301

| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an

4302

| exception. Otherwise, the comparison is performed according to the IEC/IEEE

4303

| Standard for Binary Floating-Point Arithmetic.

4304

*----------------------------------------------------------------------------*/

4305

4306

int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )

4307

{

4308

flag aSign, bSign;

4309

uint64_t av, bv;

4310

a = float64_squash_input_denormal(a STATUS_VAR);

4311

b = float64_squash_input_denormal(b STATUS_VAR);

4312

4313

if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )

4314

|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )

4315

) {

4316

if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {

4317

float_raise( float_flag_invalid STATUS_VAR);

4318

}

4319

return 0;

4320

}

4321

aSign = extractFloat64Sign( a );

4322

bSign = extractFloat64Sign( b );

4323

av = float64_val(a);

4324

bv = float64_val(b);

4325

if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );

4326

return ( av != bv ) && ( aSign ^ ( av < bv ) );

4327

4328

}

4329

4330

/*----------------------------------------------------------------------------

4331

| Returns 1 if the double-precision floating-point values `a' and `b' cannot

4332

| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The

4333

| comparison is performed according to the IEC/IEEE Standard for Binary

4334

| Floating-Point Arithmetic.

4335

*----------------------------------------------------------------------------*/

4336

4337

int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )

4338

{

4339

a = float64_squash_input_denormal(a STATUS_VAR);

4340

b = float64_squash_input_denormal(b STATUS_VAR);

4341

4342

if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )

4343

|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )

4344

) {

4345

if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {

4346

float_raise( float_flag_invalid STATUS_VAR);

4347

}

4348

return 1;

4349

}

4350

return 0;

4351

}

4352

4353

/*----------------------------------------------------------------------------

4354

| Returns the result of converting the extended double-precision floating-

4355

| point value `a' to the 32-bit two's complement integer format. The

4356

| conversion is performed according to the IEC/IEEE Standard for Binary

4357

| Floating-Point Arithmetic---which means in particular that the conversion

4358

| is rounded according to the current rounding mode. If `a' is a NaN, the

4359

| largest positive integer is returned. Otherwise, if the conversion

4360

| overflows, the largest integer with the same sign as `a' is returned.

4361

*----------------------------------------------------------------------------*/

4362

4363

int32 floatx80_to_int32( floatx80 a STATUS_PARAM )

4364

{

4365

flag aSign;

4366

int32 aExp, shiftCount;

4367

uint64_t aSig;

4368

4369

aSig = extractFloatx80Frac( a );

4370

aExp = extractFloatx80Exp( a );

4371

aSign = extractFloatx80Sign( a );

4372

if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;

4373

shiftCount = 0x4037 - aExp;

4374

if ( shiftCount <= 0 ) shiftCount = 1;

4375

shift64RightJamming( aSig, shiftCount, &aSig );

4376

return roundAndPackInt32( aSign, aSig STATUS_VAR );

4377

4378

}

4379

4380

/*----------------------------------------------------------------------------

4381

| Returns the result of converting the extended double-precision floating-

4382

| point value `a' to the 32-bit two's complement integer format. The

4383

| conversion is performed according to the IEC/IEEE Standard for Binary

4384

| Floating-Point Arithmetic, except that the conversion is always rounded

4385

| toward zero. If `a' is a NaN, the largest positive integer is returned.

4386

| Otherwise, if the conversion overflows, the largest integer with the same

4387

| sign as `a' is returned.

4388

*----------------------------------------------------------------------------*/

4389

4390

int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )

4391

{

4392

flag aSign;

4393

int32 aExp, shiftCount;

4394

uint64_t aSig, savedASig;

4395

int32_t z;

4396

4397

aSig = extractFloatx80Frac( a );

4398

aExp = extractFloatx80Exp( a );

4399

aSign = extractFloatx80Sign( a );

4400

if ( 0x401E < aExp ) {

4401

if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;

4402

goto invalid;

4403

}

4404

else if ( aExp < 0x3FFF ) {

4405

if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;

4406

return 0;

4407

}

4408

shiftCount = 0x403E - aExp;

4409

savedASig = aSig;

4410

aSig >>= shiftCount;

4411

z = aSig;

4412

if ( aSign ) z = - z;

4413

if ( ( z < 0 ) ^ aSign ) {

4414

invalid:

4415

float_raise( float_flag_invalid STATUS_VAR);

4416

return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;

4417

}

4418

if ( ( aSig<<shiftCount ) != savedASig ) {

4419

STATUS(float_exception_flags) |= float_flag_inexact;

4420

}

4421

return z;

4422

4423

}

4424

4425

/*----------------------------------------------------------------------------

4426

| Returns the result of converting the extended double-precision floating-

4427

| point value `a' to the 64-bit two's complement integer format. The

4428

| conversion is performed according to the IEC/IEEE Standard for Binary

4429

| Floating-Point Arithmetic---which means in particular that the conversion

4430

| is rounded according to the current rounding mode. If `a' is a NaN,

4431

| the largest positive integer is returned. Otherwise, if the conversion

4432

| overflows, the largest integer with the same sign as `a' is returned.

4433

*----------------------------------------------------------------------------*/

4434

4435

int64 floatx80_to_int64( floatx80 a STATUS_PARAM )

4436

{

4437

flag aSign;

4438

int32 aExp, shiftCount;

4439

uint64_t aSig, aSigExtra;

4440

4441

aSig = extractFloatx80Frac( a );

4442

aExp = extractFloatx80Exp( a );

4443

aSign = extractFloatx80Sign( a );

4444

shiftCount = 0x403E - aExp;

4445

if ( shiftCount <= 0 ) {

4446

if ( shiftCount ) {

4447

float_raise( float_flag_invalid STATUS_VAR);

4448

if ( ! aSign

4449

|| ( ( aExp == 0x7FFF )

4450

&& ( aSig != LIT64( 0x8000000000000000 ) ) )

4451

) {

4452

return LIT64( 0x7FFFFFFFFFFFFFFF );

4453

}

4454

return (int64_t) LIT64( 0x8000000000000000 );

4455

}

4456

aSigExtra = 0;

4457

}

4458

else {

4459

shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );

4460

}

4461

return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );

4462

4463

}

4464

4465

/*----------------------------------------------------------------------------

4466

| Returns the result of converting the extended double-precision floating-

4467

| point value `a' to the 64-bit two's complement integer format. The

4468

| conversion is performed according to the IEC/IEEE Standard for Binary

4469

| Floating-Point Arithmetic, except that the conversion is always rounded

4470

| toward zero. If `a' is a NaN, the largest positive integer is returned.

4471

| Otherwise, if the conversion overflows, the largest integer with the same

4472

| sign as `a' is returned.

4473

*----------------------------------------------------------------------------*/

4474

4475

int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )

4476

{

4477

flag aSign;

4478

int32 aExp, shiftCount;

4479

uint64_t aSig;

4480

int64 z;

4481

4482

aSig = extractFloatx80Frac( a );

4483

aExp = extractFloatx80Exp( a );

4484

aSign = extractFloatx80Sign( a );

4485

shiftCount = aExp - 0x403E;

4486

if ( 0 <= shiftCount ) {

4487

aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );

4488

if ( ( a.high != 0xC03E ) || aSig ) {

4489

float_raise( float_flag_invalid STATUS_VAR);

4490

if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {

4491

return LIT64( 0x7FFFFFFFFFFFFFFF );

4492

}

4493

}

4494

return (int64_t) LIT64( 0x8000000000000000 );

4495

}

4496

else if ( aExp < 0x3FFF ) {

4497

if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;

4498

return 0;

4499

}

4500

z = aSig>>( - shiftCount );

4501

if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {

4502

STATUS(float_exception_flags) |= float_flag_inexact;

4503

}

4504

if ( aSign ) z = - z;

4505

return z;

4506

4507

}

4508

4509

/*----------------------------------------------------------------------------

4510

| Returns the result of converting the extended double-precision floating-

4511

| point value `a' to the single-precision floating-point format. The

4512

| conversion is performed according to the IEC/IEEE Standard for Binary

4513

| Floating-Point Arithmetic.

4514

*----------------------------------------------------------------------------*/

4515

4516

float32 floatx80_to_float32( floatx80 a STATUS_PARAM )

4517

{

4518

flag aSign;

4519

int32 aExp;

4520

uint64_t aSig;

4521

4522

aSig = extractFloatx80Frac( a );

4523

aExp = extractFloatx80Exp( a );

4524

aSign = extractFloatx80Sign( a );

4525

if ( aExp == 0x7FFF ) {

4526

if ( (uint64_t) ( aSig<<1 ) ) {

4527

return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );

4528

}

4529

return packFloat32( aSign, 0xFF, 0 );

4530

}

4531

shift64RightJamming( aSig, 33, &aSig );

4532

if ( aExp || aSig ) aExp -= 0x3F81;

4533

return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );

4534

4535

}

4536

4537

/*----------------------------------------------------------------------------

4538

| Returns the result of converting the extended double-precision floating-

4539

| point value `a' to the double-precision floating-point format. The

4540

| conversion is performed according to the IEC/IEEE Standard for Binary

4541

| Floating-Point Arithmetic.

4542

*----------------------------------------------------------------------------*/

4543

4544

float64 floatx80_to_float64( floatx80 a STATUS_PARAM )

4545

{

4546

flag aSign;

4547

int32 aExp;

4548

uint64_t aSig, zSig;

4549

4550

aSig = extractFloatx80Frac( a );

4551

aExp = extractFloatx80Exp( a );

4552

aSign = extractFloatx80Sign( a );

4553

if ( aExp == 0x7FFF ) {

4554

if ( (uint64_t) ( aSig<<1 ) ) {

4555

return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );

4556

}

4557

return packFloat64( aSign, 0x7FF, 0 );

4558

}

4559

shift64RightJamming( aSig, 1, &zSig );

4560

if ( aExp || aSig ) aExp -= 0x3C01;

4561

return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );

4562

4563

}

4564

4565

/*----------------------------------------------------------------------------

4566

| Returns the result of converting the extended double-precision floating-

4567

| point value `a' to the quadruple-precision floating-point format. The

4568

| conversion is performed according to the IEC/IEEE Standard for Binary

4569

| Floating-Point Arithmetic.

4570

*----------------------------------------------------------------------------*/

4571

4572

float128 floatx80_to_float128( floatx80 a STATUS_PARAM )

4573

{

4574

flag aSign;

4575

int_fast16_t aExp;

4576

uint64_t aSig, zSig0, zSig1;

4577

4578

aSig = extractFloatx80Frac( a );

4579

aExp = extractFloatx80Exp( a );

4580

aSign = extractFloatx80Sign( a );

4581

if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {

4582

return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );

4583

}

4584

shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );

4585

return packFloat128( aSign, aExp, zSig0, zSig1 );

4586

4587

}

4588

4589

/*----------------------------------------------------------------------------

4590

| Rounds the extended double-precision floating-point value `a' to an integer,

4591

| and returns the result as an extended quadruple-precision floating-point

4592

| value. The operation is performed according to the IEC/IEEE Standard for

4593

| Binary Floating-Point Arithmetic.

4594

*----------------------------------------------------------------------------*/

4595

4596

floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )

4597

{

4598

flag aSign;

4599

int32 aExp;

4600

uint64_t lastBitMask, roundBitsMask;

4601

int8 roundingMode;

4602

floatx80 z;

4603

4604

aExp = extractFloatx80Exp( a );

4605

if ( 0x403E <= aExp ) {

4606

if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {

4607

return propagateFloatx80NaN( a, a STATUS_VAR );

4608

}

4609

return a;

4610

}

4611

if ( aExp < 0x3FFF ) {

4612

if ( ( aExp == 0 )

4613

&& ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {

4614

return a;

4615

}

4616

STATUS(float_exception_flags) |= float_flag_inexact;

4617

aSign = extractFloatx80Sign( a );

4618

switch ( STATUS(float_rounding_mode) ) {

4619

case float_round_nearest_even:

4620

if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )

4621

) {

4622

return

4623

packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );

4624

}

4625

break;

4626

case float_round_down:

4627

return

4628

aSign ?

4629

packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )

4630

: packFloatx80( 0, 0, 0 );

4631

case float_round_up:

4632

return

4633

aSign ? packFloatx80( 1, 0, 0 )

4634

: packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );

4635

}

4636

return packFloatx80( aSign, 0, 0 );

4637

}

4638

lastBitMask = 1;

4639

lastBitMask <<= 0x403E - aExp;

4640

roundBitsMask = lastBitMask - 1;

4641

z = a;

4642

roundingMode = STATUS(float_rounding_mode);

4643

if ( roundingMode == float_round_nearest_even ) {

4644

z.low += lastBitMask>>1;

4645

if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;

4646

}

4647

else if ( roundingMode != float_round_to_zero ) {

4648

if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {

4649

z.low += roundBitsMask;

4650

}

4651

}

4652

z.low &= ~ roundBitsMask;

4653

if ( z.low == 0 ) {

4654

++z.high;

4655

z.low = LIT64( 0x8000000000000000 );

4656

}

4657

if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;

4658

return z;

4659

4660

}

4661

4662

/*----------------------------------------------------------------------------

4663

| Returns the result of adding the absolute values of the extended double-

4664

| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is

4665

| negated before being returned. `zSign' is ignored if the result is a NaN.

4666

| The addition is performed according to the IEC/IEEE Standard for Binary

4667

| Floating-Point Arithmetic.

4668

*----------------------------------------------------------------------------*/

4669

4670

static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)

4671

{

4672

int32 aExp, bExp, zExp;

4673

uint64_t aSig, bSig, zSig0, zSig1;

4674

int32 expDiff;

4675

4676

aSig = extractFloatx80Frac( a );

4677

aExp = extractFloatx80Exp( a );

4678

bSig = extractFloatx80Frac( b );

4679

bExp = extractFloatx80Exp( b );

4680

expDiff = aExp - bExp;

4681

if ( 0 < expDiff ) {

4682

if ( aExp == 0x7FFF ) {

4683

if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );

4684

return a;

4685

}

4686

if ( bExp == 0 ) --expDiff;

4687

shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );

4688

zExp = aExp;

4689

}

4690

else if ( expDiff < 0 ) {

4691

if ( bExp == 0x7FFF ) {

4692

if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );

4693

return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );

4694

}

4695

if ( aExp == 0 ) ++expDiff;

4696

shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );

4697

zExp = bExp;

4698

}

4699

else {

4700

if ( aExp == 0x7FFF ) {

4701

if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {

4702

return propagateFloatx80NaN( a, b STATUS_VAR );

4703

}

4704

return a;

4705

}

4706

zSig1 = 0;

4707

zSig0 = aSig + bSig;

4708

if ( aExp == 0 ) {

4709

normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );

4710

goto roundAndPack;

4711

}

4712

zExp = aExp;

4713

goto shiftRight1;

4714

}

4715

zSig0 = aSig + bSig;

4716

if ( (int64_t) zSig0 < 0 ) goto roundAndPack;

4717

shiftRight1:

4718

shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );

4719

zSig0 |= LIT64( 0x8000000000000000 );

4720

++zExp;

4721

roundAndPack:

4722

return

4723

roundAndPackFloatx80(

4724

STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );

4725

4726

}

4727

4728

/*----------------------------------------------------------------------------

4729

| Returns the result of subtracting the absolute values of the extended

4730

| double-precision floating-point values `a' and `b'. If `zSign' is 1, the

4731

| difference is negated before being returned. `zSign' is ignored if the

4732

| result is a NaN. The subtraction is performed according to the IEC/IEEE

4733

| Standard for Binary Floating-Point Arithmetic.

4734

*----------------------------------------------------------------------------*/

4735

4736

static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )

4737

{

4738

int32 aExp, bExp, zExp;

4739

uint64_t aSig, bSig, zSig0, zSig1;

4740

int32 expDiff;

4741

floatx80 z;

4742

4743

aSig = extractFloatx80Frac( a );

4744

aExp = extractFloatx80Exp( a );

4745

bSig = extractFloatx80Frac( b );

4746

bExp = extractFloatx80Exp( b );

4747

expDiff = aExp - bExp;

4748

if ( 0 < expDiff ) goto aExpBigger;

4749

if ( expDiff < 0 ) goto bExpBigger;

4750

if ( aExp == 0x7FFF ) {

4751

if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {

4752

return propagateFloatx80NaN( a, b STATUS_VAR );

4753

}

4754

float_raise( float_flag_invalid STATUS_VAR);

4755

z.low = floatx80_default_nan_low;

4756

z.high = floatx80_default_nan_high;

4757

return z;

4758

}

4759

if ( aExp == 0 ) {

4760

aExp = 1;

4761

bExp = 1;

4762

}

4763

zSig1 = 0;

4764

if ( bSig < aSig ) goto aBigger;

4765

if ( aSig < bSig ) goto bBigger;

4766

return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );

4767

bExpBigger:

4768

if ( bExp == 0x7FFF ) {

4769

if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );

4770

return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );

4771

}

4772

if ( aExp == 0 ) ++expDiff;

4773

shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );

4774

bBigger:

4775

sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );

4776

zExp = bExp;

4777

zSign ^= 1;

4778

goto normalizeRoundAndPack;

4779

aExpBigger:

4780

if ( aExp == 0x7FFF ) {

4781

if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );

4782

return a;

4783

}

4784

if ( bExp == 0 ) --expDiff;

4785

shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );

4786

aBigger:

4787

sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );

4788

zExp = aExp;

4789

normalizeRoundAndPack:

4790

return

4791

normalizeRoundAndPackFloatx80(

4792

STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );

4793

4794

}

4795

4796

/*----------------------------------------------------------------------------

4797

| Returns the result of adding the extended double-precision floating-point

4798

| values `a' and `b'. The operation is performed according to the IEC/IEEE

4799

| Standard for Binary Floating-Point Arithmetic.

4800

*----------------------------------------------------------------------------*/

4801

4802

floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )

4803

{

4804

flag aSign, bSign;

4805

4806

aSign = extractFloatx80Sign( a );

4807

bSign = extractFloatx80Sign( b );

4808

if ( aSign == bSign ) {

4809

return addFloatx80Sigs( a, b, aSign STATUS_VAR );

4810

}

4811

else {

4812

return subFloatx80Sigs( a, b, aSign STATUS_VAR );

4813

}

4814

4815

}

4816

4817

/*----------------------------------------------------------------------------

4818

| Returns the result of subtracting the extended double-precision floating-

4819

| point values `a' and `b'. The operation is performed according to the

4820

| IEC/IEEE Standard for Binary Floating-Point Arithmetic.

4821

*----------------------------------------------------------------------------*/

4822

4823

floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )

4824

{

4825

flag aSign, bSign;

4826

4827

aSign = extractFloatx80Sign( a );

4828

bSign = extractFloatx80Sign( b );

4829

if ( aSign == bSign ) {

4830

return subFloatx80Sigs( a, b, aSign STATUS_VAR );

4831

}

4832

else {

4833

return addFloatx80Sigs( a, b, aSign STATUS_VAR );

4834

}

4835

4836

}

4837

4838

/*----------------------------------------------------------------------------

4839

| Returns the result of multiplying the extended double-precision floating-

4840

| point values `a' and `b'. The operation is performed according to the

4841

| IEC/IEEE Standard for Binary Floating-Point Arithmetic.

4842

*----------------------------------------------------------------------------*/

4843

4844

floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )

4845

{

4846

flag aSign, bSign, zSign;

4847

int32 aExp, bExp, zExp;

4848

uint64_t aSig, bSig, zSig0, zSig1;

4849

floatx80 z;

4850

4851

aSig = extractFloatx80Frac( a );

4852

aExp = extractFloatx80Exp( a );

4853

aSign = extractFloatx80Sign( a );

4854

bSig = extractFloatx80Frac( b );

4855

bExp = extractFloatx80Exp( b );

4856

bSign = extractFloatx80Sign( b );

4857

zSign = aSign ^ bSign;

4858

if ( aExp == 0x7FFF ) {

4859

if ( (uint64_t) ( aSig<<1 )

4860

|| ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {

4861

return propagateFloatx80NaN( a, b STATUS_VAR );

4862

}

4863

if ( ( bExp | bSig ) == 0 ) goto invalid;

4864

return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );

4865

}

4866

if ( bExp == 0x7FFF ) {

4867

if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );

4868

if ( ( aExp | aSig ) == 0 ) {

4869

invalid:

4870

float_raise( float_flag_invalid STATUS_VAR);

4871

z.low = floatx80_default_nan_low;

4872

z.high = floatx80_default_nan_high;

4873

return z;

4874

}

4875

return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );

4876

}

4877

if ( aExp == 0 ) {

4878

if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );

4879

normalizeFloatx80Subnormal( aSig, &aExp, &aSig );

4880

}

4881

if ( bExp == 0 ) {

4882

if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );

4883

normalizeFloatx80Subnormal( bSig, &bExp, &bSig );

4884

}

4885

zExp = aExp + bExp - 0x3FFE;

4886

mul64To128( aSig, bSig, &zSig0, &zSig1 );

4887

if ( 0 < (int64_t) zSig0 ) {

4888

shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );

4889

--zExp;

4890

}

4891

return

4892

roundAndPackFloatx80(

4893

STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );

4894

4895

}

4896

4897

/*----------------------------------------------------------------------------

4898

| Returns the result of dividing the extended double-precision floating-point

4899

| value `a' by the corresponding value `b'. The operation is performed

4900

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

4901

*----------------------------------------------------------------------------*/

4902

4903

floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )

4904

{

4905

flag aSign, bSign, zSign;

4906

int32 aExp, bExp, zExp;

4907

uint64_t aSig, bSig, zSig0, zSig1;

4908

uint64_t rem0, rem1, rem2, term0, term1, term2;

4909

floatx80 z;

4910

4911

aSig = extractFloatx80Frac( a );

4912

aExp = extractFloatx80Exp( a );

4913

aSign = extractFloatx80Sign( a );

4914

bSig = extractFloatx80Frac( b );

4915

bExp = extractFloatx80Exp( b );

4916

bSign = extractFloatx80Sign( b );

4917

zSign = aSign ^ bSign;

4918

if ( aExp == 0x7FFF ) {

4919

if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );

4920

if ( bExp == 0x7FFF ) {

4921

if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );

4922

goto invalid;

4923

}

4924

return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );

4925

}

4926

if ( bExp == 0x7FFF ) {

4927

if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );

4928

return packFloatx80( zSign, 0, 0 );

4929

}

4930

if ( bExp == 0 ) {

4931

if ( bSig == 0 ) {

4932

if ( ( aExp | aSig ) == 0 ) {

4933

invalid:

4934

float_raise( float_flag_invalid STATUS_VAR);

4935

z.low = floatx80_default_nan_low;

4936

z.high = floatx80_default_nan_high;

4937

return z;

4938

}

4939

float_raise( float_flag_divbyzero STATUS_VAR);

4940

return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );

4941

}

4942

normalizeFloatx80Subnormal( bSig, &bExp, &bSig );

4943

}

4944

if ( aExp == 0 ) {

4945

if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );

4946

normalizeFloatx80Subnormal( aSig, &aExp, &aSig );

4947

}

4948

zExp = aExp - bExp + 0x3FFE;

4949

rem1 = 0;

4950

if ( bSig <= aSig ) {

4951

shift128Right( aSig, 0, 1, &aSig, &rem1 );

4952

++zExp;

4953

}

4954

zSig0 = estimateDiv128To64( aSig, rem1, bSig );

4955

mul64To128( bSig, zSig0, &term0, &term1 );

4956

sub128( aSig, rem1, term0, term1, &rem0, &rem1 );

4957

while ( (int64_t) rem0 < 0 ) {

4958

--zSig0;

4959

add128( rem0, rem1, 0, bSig, &rem0, &rem1 );

4960

}

4961

zSig1 = estimateDiv128To64( rem1, 0, bSig );

4962

if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {

4963

mul64To128( bSig, zSig1, &term1, &term2 );

4964

sub128( rem1, 0, term1, term2, &rem1, &rem2 );

4965

while ( (int64_t) rem1 < 0 ) {

4966

--zSig1;

4967

add128( rem1, rem2, 0, bSig, &rem1, &rem2 );

4968

}

4969

zSig1 |= ( ( rem1 | rem2 ) != 0 );

4970

}

4971

return

4972

roundAndPackFloatx80(

4973

STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );

4974

4975

}

4976

4977

/*----------------------------------------------------------------------------

4978

| Returns the remainder of the extended double-precision floating-point value

4979

| `a' with respect to the corresponding value `b'. The operation is performed

4980

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

4981

*----------------------------------------------------------------------------*/

4982

4983

floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )

4984

{

4985

flag aSign, zSign;

4986

int32 aExp, bExp, expDiff;

4987

uint64_t aSig0, aSig1, bSig;

4988

uint64_t q, term0, term1, alternateASig0, alternateASig1;

4989

floatx80 z;

4990

4991

aSig0 = extractFloatx80Frac( a );

4992

aExp = extractFloatx80Exp( a );

4993

aSign = extractFloatx80Sign( a );

4994

bSig = extractFloatx80Frac( b );

4995

bExp = extractFloatx80Exp( b );

4996

if ( aExp == 0x7FFF ) {

4997

if ( (uint64_t) ( aSig0<<1 )

4998

|| ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {

4999

return propagateFloatx80NaN( a, b STATUS_VAR );

5000

}

5001

goto invalid;

5002

}

5003

if ( bExp == 0x7FFF ) {

5004

if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );

5005

return a;

5006

}

5007

if ( bExp == 0 ) {

5008

if ( bSig == 0 ) {

5009

invalid:

5010

float_raise( float_flag_invalid STATUS_VAR);

5011

z.low = floatx80_default_nan_low;

5012

z.high = floatx80_default_nan_high;

5013

return z;

5014

}

5015

normalizeFloatx80Subnormal( bSig, &bExp, &bSig );

5016

}

5017

if ( aExp == 0 ) {

5018

if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;

5019

normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );

5020

}

5021

bSig |= LIT64( 0x8000000000000000 );

5022

zSign = aSign;

5023

expDiff = aExp - bExp;

5024

aSig1 = 0;

5025

if ( expDiff < 0 ) {

5026

if ( expDiff < -1 ) return a;

5027

shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );

5028

expDiff = 0;

5029

}

5030

q = ( bSig <= aSig0 );

5031

if ( q ) aSig0 -= bSig;

5032

expDiff -= 64;

5033

while ( 0 < expDiff ) {

5034

q = estimateDiv128To64( aSig0, aSig1, bSig );

5035

q = ( 2 < q ) ? q - 2 : 0;

5036

mul64To128( bSig, q, &term0, &term1 );

5037

sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );

5038

shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );

5039

expDiff -= 62;

5040

}

5041

expDiff += 64;

5042

if ( 0 < expDiff ) {

5043

q = estimateDiv128To64( aSig0, aSig1, bSig );

5044

q = ( 2 < q ) ? q - 2 : 0;

5045

q >>= 64 - expDiff;

5046

mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );

5047

sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );

5048

shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );

5049

while ( le128( term0, term1, aSig0, aSig1 ) ) {

5050

++q;

5051

sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );

5052

}

5053

}

5054

else {

5055

term1 = 0;

5056

term0 = bSig;

5057

}

5058

sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );

5059

if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )

5060

|| ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )

5061

&& ( q & 1 ) )

5062

) {

5063

aSig0 = alternateASig0;

5064

aSig1 = alternateASig1;

5065

zSign = ! zSign;

5066

}

5067

return

5068

normalizeRoundAndPackFloatx80(

5069

80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );

5070

5071

}

5072

5073

/*----------------------------------------------------------------------------

5074

| Returns the square root of the extended double-precision floating-point

5075

| value `a'. The operation is performed according to the IEC/IEEE Standard

5076

| for Binary Floating-Point Arithmetic.

5077

*----------------------------------------------------------------------------*/

5078

5079

floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )

5080

{

5081

flag aSign;

5082

int32 aExp, zExp;

5083

uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;

5084

uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

5085

floatx80 z;

5086

5087

aSig0 = extractFloatx80Frac( a );

5088

aExp = extractFloatx80Exp( a );

5089

aSign = extractFloatx80Sign( a );

5090

if ( aExp == 0x7FFF ) {

5091

if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );

5092

if ( ! aSign ) return a;

5093

goto invalid;

5094

}

5095

if ( aSign ) {

5096

if ( ( aExp | aSig0 ) == 0 ) return a;

5097

invalid:

5098

float_raise( float_flag_invalid STATUS_VAR);

5099

z.low = floatx80_default_nan_low;

5100

z.high = floatx80_default_nan_high;

5101

return z;

5102

}

5103

if ( aExp == 0 ) {

5104

if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );

5105

normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );

5106

}

5107

zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;

5108

zSig0 = estimateSqrt32( aExp, aSig0>>32 );

5109

shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );

5110

zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );

5111

doubleZSig0 = zSig0<<1;

5112

mul64To128( zSig0, zSig0, &term0, &term1 );

5113

sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );

5114

while ( (int64_t) rem0 < 0 ) {

5115

--zSig0;

5116

doubleZSig0 -= 2;

5117

add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );

5118

}

5119

zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );

5120

if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {

5121

if ( zSig1 == 0 ) zSig1 = 1;

5122

mul64To128( doubleZSig0, zSig1, &term1, &term2 );

5123

sub128( rem1, 0, term1, term2, &rem1, &rem2 );

5124

mul64To128( zSig1, zSig1, &term2, &term3 );

5125

sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );

5126

while ( (int64_t) rem1 < 0 ) {

5127

--zSig1;

5128

shortShift128Left( 0, zSig1, 1, &term2, &term3 );

5129

term3 |= 1;

5130

term2 |= doubleZSig0;

5131

add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );

5132

}

5133

zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );

5134

}

5135

shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );

5136

zSig0 |= doubleZSig0;

5137

return

5138

roundAndPackFloatx80(

5139

STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );

5140

5141

}

5142

5143

/*----------------------------------------------------------------------------

5144

| Returns 1 if the extended double-precision floating-point value `a' is equal

5145

| to the corresponding value `b', and 0 otherwise. The invalid exception is

5146

| raised if either operand is a NaN. Otherwise, the comparison is performed

5147

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

5148

*----------------------------------------------------------------------------*/

5149

5150

int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )

5151

{

5152

5153

if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )

5154

&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )

5155

|| ( ( extractFloatx80Exp( b ) == 0x7FFF )

5156

&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )

5157

) {

5158

float_raise( float_flag_invalid STATUS_VAR);

5159

return 0;

5160

}

5161

return

5162

( a.low == b.low )

5163

&& ( ( a.high == b.high )

5164

|| ( ( a.low == 0 )

5165

&& ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )

5166

);

5167

5168

}

5169

5170

/*----------------------------------------------------------------------------

5171

| Returns 1 if the extended double-precision floating-point value `a' is

5172

| less than or equal to the corresponding value `b', and 0 otherwise. The

5173

| invalid exception is raised if either operand is a NaN. The comparison is

5174

| performed according to the IEC/IEEE Standard for Binary Floating-Point

5175

| Arithmetic.

5176

*----------------------------------------------------------------------------*/

5177

5178

int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )

5179

{

5180

flag aSign, bSign;

5181

5182

if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )

5183

&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )

5184

|| ( ( extractFloatx80Exp( b ) == 0x7FFF )

5185

&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )

5186

) {

5187

float_raise( float_flag_invalid STATUS_VAR);

5188

return 0;

5189

}

5190

aSign = extractFloatx80Sign( a );

5191

bSign = extractFloatx80Sign( b );

5192

if ( aSign != bSign ) {

5193

return

5194

aSign

5195

|| ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )

5196

== 0 );

5197

}

5198

return

5199

aSign ? le128( b.high, b.low, a.high, a.low )

5200

: le128( a.high, a.low, b.high, b.low );

5201

5202

}

5203

5204

/*----------------------------------------------------------------------------

5205

| Returns 1 if the extended double-precision floating-point value `a' is

5206

| less than the corresponding value `b', and 0 otherwise. The invalid

5207

| exception is raised if either operand is a NaN. The comparison is performed

5208

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

5209

*----------------------------------------------------------------------------*/

5210

5211

int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )

5212

{

5213

flag aSign, bSign;

5214

5215

if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )

5216

&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )

5217

|| ( ( extractFloatx80Exp( b ) == 0x7FFF )

5218

&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )

5219

) {

5220

float_raise( float_flag_invalid STATUS_VAR);

5221

return 0;

5222

}

5223

aSign = extractFloatx80Sign( a );

5224

bSign = extractFloatx80Sign( b );

5225

if ( aSign != bSign ) {

5226

return

5227

aSign

5228

&& ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )

5229

!= 0 );

5230

}

5231

return

5232

aSign ? lt128( b.high, b.low, a.high, a.low )

5233

: lt128( a.high, a.low, b.high, b.low );

5234

5235

}

5236

5237

/*----------------------------------------------------------------------------

5238

| Returns 1 if the extended double-precision floating-point values `a' and `b'

5239

| cannot be compared, and 0 otherwise. The invalid exception is raised if

5240

| either operand is a NaN. The comparison is performed according to the

5241

| IEC/IEEE Standard for Binary Floating-Point Arithmetic.

5242

*----------------------------------------------------------------------------*/

5243

int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )

5244

{

5245

if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )

5246

&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )

5247

|| ( ( extractFloatx80Exp( b ) == 0x7FFF )

5248

&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )

5249

) {

5250

float_raise( float_flag_invalid STATUS_VAR);

5251

return 1;

5252

}

5253

return 0;

5254

}

5255

5256

/*----------------------------------------------------------------------------

5257

| Returns 1 if the extended double-precision floating-point value `a' is

5258

| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not

5259

| cause an exception. The comparison is performed according to the IEC/IEEE

5260

| Standard for Binary Floating-Point Arithmetic.

5261

*----------------------------------------------------------------------------*/

5262

5263

int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )

5264

{

5265

5266

if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )

5267

&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )

5268

|| ( ( extractFloatx80Exp( b ) == 0x7FFF )

5269

&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )

5270

) {

5271

if ( floatx80_is_signaling_nan( a )

5272

|| floatx80_is_signaling_nan( b ) ) {

5273

float_raise( float_flag_invalid STATUS_VAR);

5274

}

5275

return 0;

5276

}

5277

return

5278

( a.low == b.low )

5279

&& ( ( a.high == b.high )

5280

|| ( ( a.low == 0 )

5281

&& ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )

5282

);

5283

5284

}

5285

5286

/*----------------------------------------------------------------------------

5287

| Returns 1 if the extended double-precision floating-point value `a' is less

5288

| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs

5289

| do not cause an exception. Otherwise, the comparison is performed according

5290

| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

5291

*----------------------------------------------------------------------------*/

5292

5293

int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )

5294

{

5295

flag aSign, bSign;

5296

5297

if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )

5298

&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )

5299

|| ( ( extractFloatx80Exp( b ) == 0x7FFF )

5300

&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )

5301

) {

5302

if ( floatx80_is_signaling_nan( a )

5303

|| floatx80_is_signaling_nan( b ) ) {

5304

float_raise( float_flag_invalid STATUS_VAR);

5305

}

5306

return 0;

5307

}

5308

aSign = extractFloatx80Sign( a );

5309

bSign = extractFloatx80Sign( b );

5310

if ( aSign != bSign ) {

5311

return

5312

aSign

5313

|| ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )

5314

== 0 );

5315

}

5316

return

5317

aSign ? le128( b.high, b.low, a.high, a.low )

5318

: le128( a.high, a.low, b.high, b.low );

5319

5320

}

5321

5322

/*----------------------------------------------------------------------------

5323

| Returns 1 if the extended double-precision floating-point value `a' is less

5324

| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause

5325

| an exception. Otherwise, the comparison is performed according to the

5326

| IEC/IEEE Standard for Binary Floating-Point Arithmetic.

5327

*----------------------------------------------------------------------------*/

5328

5329

int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )

5330

{

5331

flag aSign, bSign;

5332

5333

if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )

5334

&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )

5335

|| ( ( extractFloatx80Exp( b ) == 0x7FFF )

5336

&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )

5337

) {

5338

if ( floatx80_is_signaling_nan( a )

5339

|| floatx80_is_signaling_nan( b ) ) {

5340

float_raise( float_flag_invalid STATUS_VAR);

5341

}

5342

return 0;

5343

}

5344

aSign = extractFloatx80Sign( a );

5345

bSign = extractFloatx80Sign( b );

5346

if ( aSign != bSign ) {

5347

return

5348

aSign

5349

&& ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )

5350

!= 0 );

5351

}

5352

return

5353

aSign ? lt128( b.high, b.low, a.high, a.low )

5354

: lt128( a.high, a.low, b.high, b.low );

5355

5356

}

5357

5358

/*----------------------------------------------------------------------------

5359

| Returns 1 if the extended double-precision floating-point values `a' and `b'

5360

| cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.

5361

| The comparison is performed according to the IEC/IEEE Standard for Binary

5362

| Floating-Point Arithmetic.

5363

*----------------------------------------------------------------------------*/

5364

int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )

5365

{

5366

if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )

5367

&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )

5368

|| ( ( extractFloatx80Exp( b ) == 0x7FFF )

5369

&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )

5370

) {

5371

if ( floatx80_is_signaling_nan( a )

5372

|| floatx80_is_signaling_nan( b ) ) {

5373

float_raise( float_flag_invalid STATUS_VAR);

5374

}

5375

return 1;

5376

}

5377

return 0;

5378

}

5379

5380

/*----------------------------------------------------------------------------

5381

| Returns the result of converting the quadruple-precision floating-point

5382

| value `a' to the 32-bit two's complement integer format. The conversion

5383

| is performed according to the IEC/IEEE Standard for Binary Floating-Point

5384

| Arithmetic---which means in particular that the conversion is rounded

5385

| according to the current rounding mode. If `a' is a NaN, the largest

5386

| positive integer is returned. Otherwise, if the conversion overflows, the

5387

| largest integer with the same sign as `a' is returned.

5388

*----------------------------------------------------------------------------*/

5389

5390

int32 float128_to_int32( float128 a STATUS_PARAM )

5391

{

5392

flag aSign;

5393

int32 aExp, shiftCount;

5394

uint64_t aSig0, aSig1;

5395

5396

aSig1 = extractFloat128Frac1( a );

5397

aSig0 = extractFloat128Frac0( a );

5398

aExp = extractFloat128Exp( a );

5399

aSign = extractFloat128Sign( a );

5400

if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;

5401

if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );

5402

aSig0 |= ( aSig1 != 0 );

5403

shiftCount = 0x4028 - aExp;

5404

if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );

5405

return roundAndPackInt32( aSign, aSig0 STATUS_VAR );

5406

5407

}

5408

5409

/*----------------------------------------------------------------------------

5410

| Returns the result of converting the quadruple-precision floating-point

5411

| value `a' to the 32-bit two's complement integer format. The conversion

5412

| is performed according to the IEC/IEEE Standard for Binary Floating-Point

5413

| Arithmetic, except that the conversion is always rounded toward zero. If

5414

| `a' is a NaN, the largest positive integer is returned. Otherwise, if the

5415

| conversion overflows, the largest integer with the same sign as `a' is

5416

| returned.

5417

*----------------------------------------------------------------------------*/

5418

5419

int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )

5420

{

5421

flag aSign;

5422

int32 aExp, shiftCount;

5423

uint64_t aSig0, aSig1, savedASig;

5424

int32_t z;

5425

5426

aSig1 = extractFloat128Frac1( a );

5427

aSig0 = extractFloat128Frac0( a );

5428

aExp = extractFloat128Exp( a );

5429

aSign = extractFloat128Sign( a );

5430

aSig0 |= ( aSig1 != 0 );

5431

if ( 0x401E < aExp ) {

5432

if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;

5433

goto invalid;

5434

}

5435

else if ( aExp < 0x3FFF ) {

5436

if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;

5437

return 0;

5438

}

5439

aSig0 |= LIT64( 0x0001000000000000 );

5440

shiftCount = 0x402F - aExp;

5441

savedASig = aSig0;

5442

aSig0 >>= shiftCount;

5443

z = aSig0;

5444

if ( aSign ) z = - z;

5445

if ( ( z < 0 ) ^ aSign ) {

5446

invalid:

5447

float_raise( float_flag_invalid STATUS_VAR);

5448

return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;

5449

}

5450

if ( ( aSig0<<shiftCount ) != savedASig ) {

5451

STATUS(float_exception_flags) |= float_flag_inexact;

5452

}

5453

return z;

5454

5455

}

5456

5457

/*----------------------------------------------------------------------------

5458

| Returns the result of converting the quadruple-precision floating-point

5459

| value `a' to the 64-bit two's complement integer format. The conversion

5460

| is performed according to the IEC/IEEE Standard for Binary Floating-Point

5461

| Arithmetic---which means in particular that the conversion is rounded

5462

| according to the current rounding mode. If `a' is a NaN, the largest

5463

| positive integer is returned. Otherwise, if the conversion overflows, the

5464

| largest integer with the same sign as `a' is returned.

5465

*----------------------------------------------------------------------------*/

5466

5467

int64 float128_to_int64( float128 a STATUS_PARAM )

5468

{

5469

flag aSign;

5470

int32 aExp, shiftCount;

5471

uint64_t aSig0, aSig1;

5472

5473

aSig1 = extractFloat128Frac1( a );

5474

aSig0 = extractFloat128Frac0( a );

5475

aExp = extractFloat128Exp( a );

5476

aSign = extractFloat128Sign( a );

5477

if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );

5478

shiftCount = 0x402F - aExp;

5479

if ( shiftCount <= 0 ) {

5480

if ( 0x403E < aExp ) {

5481

float_raise( float_flag_invalid STATUS_VAR);

5482

if ( ! aSign

5483

|| ( ( aExp == 0x7FFF )

5484

&& ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )

5485

)

5486

) {

5487

return LIT64( 0x7FFFFFFFFFFFFFFF );

5488

}

5489

return (int64_t) LIT64( 0x8000000000000000 );

5490

}

5491

shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );

5492

}

5493

else {

5494

shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );

5495

}

5496

return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );

5497

5498

}

5499

5500

/*----------------------------------------------------------------------------

5501

| Returns the result of converting the quadruple-precision floating-point

5502

| value `a' to the 64-bit two's complement integer format. The conversion

5503

| is performed according to the IEC/IEEE Standard for Binary Floating-Point

5504

| Arithmetic, except that the conversion is always rounded toward zero.

5505

| If `a' is a NaN, the largest positive integer is returned. Otherwise, if

5506

| the conversion overflows, the largest integer with the same sign as `a' is

5507

| returned.

5508

*----------------------------------------------------------------------------*/

5509

5510

int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )

5511

{

5512

flag aSign;

5513

int32 aExp, shiftCount;

5514

uint64_t aSig0, aSig1;

5515

int64 z;

5516

5517

aSig1 = extractFloat128Frac1( a );

5518

aSig0 = extractFloat128Frac0( a );

5519

aExp = extractFloat128Exp( a );

5520

aSign = extractFloat128Sign( a );

5521

if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );

5522

shiftCount = aExp - 0x402F;

5523

if ( 0 < shiftCount ) {

5524

if ( 0x403E <= aExp ) {

5525

aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );

5526

if ( ( a.high == LIT64( 0xC03E000000000000 ) )

5527

&& ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {

5528

if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;

5529

}

5530

else {

5531

float_raise( float_flag_invalid STATUS_VAR);

5532

if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {

5533

return LIT64( 0x7FFFFFFFFFFFFFFF );

5534

}

5535

}

5536

return (int64_t) LIT64( 0x8000000000000000 );

5537

}

5538

z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );

5539

if ( (uint64_t) ( aSig1<<shiftCount ) ) {

5540

STATUS(float_exception_flags) |= float_flag_inexact;

5541

}

5542

}

5543

else {

5544

if ( aExp < 0x3FFF ) {

5545

if ( aExp | aSig0 | aSig1 ) {

5546

STATUS(float_exception_flags) |= float_flag_inexact;

5547

}

5548

return 0;

5549

}

5550

z = aSig0>>( - shiftCount );

5551

if ( aSig1

5552

|| ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {

5553

STATUS(float_exception_flags) |= float_flag_inexact;

5554

}

5555

}

5556

if ( aSign ) z = - z;

5557

return z;

5558

5559

}

5560

5561

/*----------------------------------------------------------------------------

5562

| Returns the result of converting the quadruple-precision floating-point

5563

| value `a' to the single-precision floating-point format. The conversion

5564

| is performed according to the IEC/IEEE Standard for Binary Floating-Point

5565

| Arithmetic.

5566

*----------------------------------------------------------------------------*/

5567

5568

float32 float128_to_float32( float128 a STATUS_PARAM )

5569

{

5570

flag aSign;

5571

int32 aExp;

5572

uint64_t aSig0, aSig1;

5573

uint32_t zSig;

5574

5575

aSig1 = extractFloat128Frac1( a );

5576

aSig0 = extractFloat128Frac0( a );

5577

aExp = extractFloat128Exp( a );

5578

aSign = extractFloat128Sign( a );

5579

if ( aExp == 0x7FFF ) {

5580

if ( aSig0 | aSig1 ) {

5581

return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );

5582

}

5583

return packFloat32( aSign, 0xFF, 0 );

5584

}

5585

aSig0 |= ( aSig1 != 0 );

5586

shift64RightJamming( aSig0, 18, &aSig0 );

5587

zSig = aSig0;

5588

if ( aExp || zSig ) {

5589

zSig |= 0x40000000;

5590

aExp -= 0x3F81;

5591

}

5592

return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );

5593

5594

}

5595

5596

/*----------------------------------------------------------------------------

5597

| Returns the result of converting the quadruple-precision floating-point

5598

| value `a' to the double-precision floating-point format. The conversion

5599

| is performed according to the IEC/IEEE Standard for Binary Floating-Point

5600

| Arithmetic.

5601

*----------------------------------------------------------------------------*/

5602

5603

float64 float128_to_float64( float128 a STATUS_PARAM )

5604

{

5605

flag aSign;

5606

int32 aExp;

5607

uint64_t aSig0, aSig1;

5608

5609

aSig1 = extractFloat128Frac1( a );

5610

aSig0 = extractFloat128Frac0( a );

5611

aExp = extractFloat128Exp( a );

5612

aSign = extractFloat128Sign( a );

5613

if ( aExp == 0x7FFF ) {

5614

if ( aSig0 | aSig1 ) {

5615

return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );

5616

}

5617

return packFloat64( aSign, 0x7FF, 0 );

5618

}

5619

shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );

5620

aSig0 |= ( aSig1 != 0 );

5621

if ( aExp || aSig0 ) {

5622

aSig0 |= LIT64( 0x4000000000000000 );

5623

aExp -= 0x3C01;

5624

}

5625

return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );

5626

5627

}

5628

5629

/*----------------------------------------------------------------------------

5630

| Returns the result of converting the quadruple-precision floating-point

5631

| value `a' to the extended double-precision floating-point format. The

5632

| conversion is performed according to the IEC/IEEE Standard for Binary

5633

| Floating-Point Arithmetic.

5634

*----------------------------------------------------------------------------*/

5635

5636

floatx80 float128_to_floatx80( float128 a STATUS_PARAM )

5637

{

5638

flag aSign;

5639

int32 aExp;

5640

uint64_t aSig0, aSig1;

5641

5642

aSig1 = extractFloat128Frac1( a );

5643

aSig0 = extractFloat128Frac0( a );

5644

aExp = extractFloat128Exp( a );

5645

aSign = extractFloat128Sign( a );

5646

if ( aExp == 0x7FFF ) {

5647

if ( aSig0 | aSig1 ) {

5648

return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );

5649

}

5650

return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );

5651

}

5652

if ( aExp == 0 ) {

5653

if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );

5654

normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );

5655

}

5656

else {

5657

aSig0 |= LIT64( 0x0001000000000000 );

5658

}

5659

shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );

5660

return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );

5661

5662

}

5663

5664

/*----------------------------------------------------------------------------

5665

| Rounds the quadruple-precision floating-point value `a' to an integer, and

5666

| returns the result as a quadruple-precision floating-point value. The

5667

| operation is performed according to the IEC/IEEE Standard for Binary

5668

| Floating-Point Arithmetic.

5669

*----------------------------------------------------------------------------*/

5670

5671

float128 float128_round_to_int( float128 a STATUS_PARAM )

5672

{

5673

flag aSign;

5674

int32 aExp;

5675

uint64_t lastBitMask, roundBitsMask;

5676

int8 roundingMode;

5677

float128 z;

5678

5679

aExp = extractFloat128Exp( a );

5680

if ( 0x402F <= aExp ) {

5681

if ( 0x406F <= aExp ) {

5682

if ( ( aExp == 0x7FFF )

5683

&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )

5684

) {

5685

return propagateFloat128NaN( a, a STATUS_VAR );

5686

}

5687

return a;

5688

}

5689

lastBitMask = 1;

5690

lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;

5691

roundBitsMask = lastBitMask - 1;

5692

z = a;

5693

roundingMode = STATUS(float_rounding_mode);

5694

if ( roundingMode == float_round_nearest_even ) {

5695

if ( lastBitMask ) {

5696

add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );

5697

if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;

5698

}

5699

else {

5700

if ( (int64_t) z.low < 0 ) {

5701

++z.high;

5702

if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;

5703

}

5704

}

5705

}

5706

else if ( roundingMode != float_round_to_zero ) {

5707

if ( extractFloat128Sign( z )

5708

^ ( roundingMode == float_round_up ) ) {

5709

add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );

5710

}

5711

}

5712

z.low &= ~ roundBitsMask;

5713

}

5714

else {

5715

if ( aExp < 0x3FFF ) {

5716

if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;

5717

STATUS(float_exception_flags) |= float_flag_inexact;

5718

aSign = extractFloat128Sign( a );

5719

switch ( STATUS(float_rounding_mode) ) {

5720

case float_round_nearest_even:

5721

if ( ( aExp == 0x3FFE )

5722

&& ( extractFloat128Frac0( a )

5723

| extractFloat128Frac1( a ) )

5724

) {

5725

return packFloat128( aSign, 0x3FFF, 0, 0 );

5726

}

5727

break;

5728

case float_round_down:

5729

return

5730

aSign ? packFloat128( 1, 0x3FFF, 0, 0 )

5731

: packFloat128( 0, 0, 0, 0 );

5732

case float_round_up:

5733

return

5734

aSign ? packFloat128( 1, 0, 0, 0 )

5735

: packFloat128( 0, 0x3FFF, 0, 0 );

5736

}

5737

return packFloat128( aSign, 0, 0, 0 );

5738

}

5739

lastBitMask = 1;

5740

lastBitMask <<= 0x402F - aExp;

5741

roundBitsMask = lastBitMask - 1;

5742

z.low = 0;

5743

z.high = a.high;

5744

roundingMode = STATUS(float_rounding_mode);

5745

if ( roundingMode == float_round_nearest_even ) {

5746

z.high += lastBitMask>>1;

5747

if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {

5748

z.high &= ~ lastBitMask;

5749

}

5750

}

5751

else if ( roundingMode != float_round_to_zero ) {

5752

if ( extractFloat128Sign( z )

5753

^ ( roundingMode == float_round_up ) ) {

5754

z.high |= ( a.low != 0 );

5755

z.high += roundBitsMask;

5756

}

5757

}

5758

z.high &= ~ roundBitsMask;

5759

}

5760

if ( ( z.low != a.low ) || ( z.high != a.high ) ) {

5761

STATUS(float_exception_flags) |= float_flag_inexact;

5762

}

5763

return z;

5764

5765

}

5766

5767

/*----------------------------------------------------------------------------

5768

| Returns the result of adding the absolute values of the quadruple-precision

5769

| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated

5770

| before being returned. `zSign' is ignored if the result is a NaN.

5771

| The addition is performed according to the IEC/IEEE Standard for Binary

5772

| Floating-Point Arithmetic.

5773

*----------------------------------------------------------------------------*/

5774

5775

static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)

5776

{

5777

int32 aExp, bExp, zExp;

5778

uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;

5779

int32 expDiff;

5780

5781

aSig1 = extractFloat128Frac1( a );

5782

aSig0 = extractFloat128Frac0( a );

5783

aExp = extractFloat128Exp( a );

5784

bSig1 = extractFloat128Frac1( b );

5785

bSig0 = extractFloat128Frac0( b );

5786

bExp = extractFloat128Exp( b );

5787

expDiff = aExp - bExp;

5788

if ( 0 < expDiff ) {

5789

if ( aExp == 0x7FFF ) {

5790

if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );

5791

return a;

5792

}

5793

if ( bExp == 0 ) {

5794

--expDiff;

5795

}

5796

else {

5797

bSig0 |= LIT64( 0x0001000000000000 );

5798

}

5799

shift128ExtraRightJamming(

5800

bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );

5801

zExp = aExp;

5802

}

5803

else if ( expDiff < 0 ) {

5804

if ( bExp == 0x7FFF ) {

5805

if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );

5806

return packFloat128( zSign, 0x7FFF, 0, 0 );

5807

}

5808

if ( aExp == 0 ) {

5809

++expDiff;

5810

}

5811

else {

5812

aSig0 |= LIT64( 0x0001000000000000 );

5813

}

5814

shift128ExtraRightJamming(

5815

aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );

5816

zExp = bExp;

5817

}

5818

else {

5819

if ( aExp == 0x7FFF ) {

5820

if ( aSig0 | aSig1 | bSig0 | bSig1 ) {

5821

return propagateFloat128NaN( a, b STATUS_VAR );

5822

}

5823

return a;

5824

}

5825

add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );

5826

if ( aExp == 0 ) {

5827

if (STATUS(flush_to_zero)) {

5828

if (zSig0 | zSig1) {

5829

float_raise(float_flag_output_denormal STATUS_VAR);

5830

}

5831

return packFloat128(zSign, 0, 0, 0);

5832

}

5833

return packFloat128( zSign, 0, zSig0, zSig1 );

5834

}

5835

zSig2 = 0;

5836

zSig0 |= LIT64( 0x0002000000000000 );

5837

zExp = aExp;

5838

goto shiftRight1;

5839

}

5840

aSig0 |= LIT64( 0x0001000000000000 );

5841

add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );

5842

--zExp;

5843

if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;

5844

++zExp;

5845

shiftRight1:

5846

shift128ExtraRightJamming(

5847

zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );

5848

roundAndPack:

5849

return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );

5850

5851

}

5852

5853

/*----------------------------------------------------------------------------

5854

| Returns the result of subtracting the absolute values of the quadruple-

5855

| precision floating-point values `a' and `b'. If `zSign' is 1, the

5856

| difference is negated before being returned. `zSign' is ignored if the

5857

| result is a NaN. The subtraction is performed according to the IEC/IEEE

5858

| Standard for Binary Floating-Point Arithmetic.

5859

*----------------------------------------------------------------------------*/

5860

5861

static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)

5862

{

5863

int32 aExp, bExp, zExp;

5864

uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;

5865

int32 expDiff;

5866

float128 z;

5867

5868

aSig1 = extractFloat128Frac1( a );

5869

aSig0 = extractFloat128Frac0( a );

5870

aExp = extractFloat128Exp( a );

5871

bSig1 = extractFloat128Frac1( b );

5872

bSig0 = extractFloat128Frac0( b );

5873

bExp = extractFloat128Exp( b );

5874

expDiff = aExp - bExp;

5875

shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );

5876

shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );

5877

if ( 0 < expDiff ) goto aExpBigger;

5878

if ( expDiff < 0 ) goto bExpBigger;

5879

if ( aExp == 0x7FFF ) {

5880

if ( aSig0 | aSig1 | bSig0 | bSig1 ) {

5881

return propagateFloat128NaN( a, b STATUS_VAR );

5882

}

5883

float_raise( float_flag_invalid STATUS_VAR);

5884

z.low = float128_default_nan_low;

5885

z.high = float128_default_nan_high;

5886

return z;

5887

}

5888

if ( aExp == 0 ) {

5889

aExp = 1;

5890

bExp = 1;

5891

}

5892

if ( bSig0 < aSig0 ) goto aBigger;

5893

if ( aSig0 < bSig0 ) goto bBigger;

5894

if ( bSig1 < aSig1 ) goto aBigger;

5895

if ( aSig1 < bSig1 ) goto bBigger;

5896

return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );

5897

bExpBigger:

5898

if ( bExp == 0x7FFF ) {

5899

if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );

5900

return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );

5901

}

5902

if ( aExp == 0 ) {

5903

++expDiff;

5904

}

5905

else {

5906

aSig0 |= LIT64( 0x4000000000000000 );

5907

}

5908

shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );

5909

bSig0 |= LIT64( 0x4000000000000000 );

5910

bBigger:

5911

sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );

5912

zExp = bExp;

5913

zSign ^= 1;

5914

goto normalizeRoundAndPack;

5915

aExpBigger:

5916

if ( aExp == 0x7FFF ) {

5917

if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );

5918

return a;

5919

}

5920

if ( bExp == 0 ) {

5921

--expDiff;

5922

}

5923

else {

5924

bSig0 |= LIT64( 0x4000000000000000 );

5925

}

5926

shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );

5927

aSig0 |= LIT64( 0x4000000000000000 );

5928

aBigger:

5929

sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );

5930

zExp = aExp;

5931

normalizeRoundAndPack:

5932

--zExp;

5933

return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );

5934

5935

}

5936

5937

/*----------------------------------------------------------------------------

5938

| Returns the result of adding the quadruple-precision floating-point values

5939

| `a' and `b'. The operation is performed according to the IEC/IEEE Standard

5940

| for Binary Floating-Point Arithmetic.

5941

*----------------------------------------------------------------------------*/

5942

5943

float128 float128_add( float128 a, float128 b STATUS_PARAM )

5944

{

5945

flag aSign, bSign;

5946

5947

aSign = extractFloat128Sign( a );

5948

bSign = extractFloat128Sign( b );

5949

if ( aSign == bSign ) {

5950

return addFloat128Sigs( a, b, aSign STATUS_VAR );

5951

}

5952

else {

5953

return subFloat128Sigs( a, b, aSign STATUS_VAR );

5954

}

5955

5956

}

5957

5958

/*----------------------------------------------------------------------------

5959

| Returns the result of subtracting the quadruple-precision floating-point

5960

| values `a' and `b'. The operation is performed according to the IEC/IEEE

5961

| Standard for Binary Floating-Point Arithmetic.

5962

*----------------------------------------------------------------------------*/

5963

5964

float128 float128_sub( float128 a, float128 b STATUS_PARAM )

5965

{

5966

flag aSign, bSign;

5967

5968

aSign = extractFloat128Sign( a );

5969

bSign = extractFloat128Sign( b );

5970

if ( aSign == bSign ) {

5971

return subFloat128Sigs( a, b, aSign STATUS_VAR );

5972

}

5973

else {

5974

return addFloat128Sigs( a, b, aSign STATUS_VAR );

5975

}

5976

5977

}

5978

5979

/*----------------------------------------------------------------------------

5980

| Returns the result of multiplying the quadruple-precision floating-point

5981

| values `a' and `b'. The operation is performed according to the IEC/IEEE

5982

| Standard for Binary Floating-Point Arithmetic.

5983

*----------------------------------------------------------------------------*/

5984

5985

float128 float128_mul( float128 a, float128 b STATUS_PARAM )

5986

{

5987

flag aSign, bSign, zSign;

5988

int32 aExp, bExp, zExp;

5989

uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;

5990

float128 z;

5991

5992

aSig1 = extractFloat128Frac1( a );

5993

aSig0 = extractFloat128Frac0( a );

5994

aExp = extractFloat128Exp( a );

5995

aSign = extractFloat128Sign( a );

5996

bSig1 = extractFloat128Frac1( b );

5997

bSig0 = extractFloat128Frac0( b );

5998

bExp = extractFloat128Exp( b );

5999

bSign = extractFloat128Sign( b );

6000

zSign = aSign ^ bSign;

6001

if ( aExp == 0x7FFF ) {

6002

if ( ( aSig0 | aSig1 )

6003

|| ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {

6004

return propagateFloat128NaN( a, b STATUS_VAR );

6005

}

6006

if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;

6007

return packFloat128( zSign, 0x7FFF, 0, 0 );

6008

}

6009

if ( bExp == 0x7FFF ) {

6010

if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );

6011

if ( ( aExp | aSig0 | aSig1 ) == 0 ) {

6012

invalid:

6013

float_raise( float_flag_invalid STATUS_VAR);

6014

z.low = float128_default_nan_low;

6015

z.high = float128_default_nan_high;

6016

return z;

6017

}

6018

return packFloat128( zSign, 0x7FFF, 0, 0 );

6019

}

6020

if ( aExp == 0 ) {

6021

if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );

6022

normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );

6023

}

6024

if ( bExp == 0 ) {

6025

if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );

6026

normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );

6027

}

6028

zExp = aExp + bExp - 0x4000;

6029

aSig0 |= LIT64( 0x0001000000000000 );

6030

shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );

6031

mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );

6032

add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );

6033

zSig2 |= ( zSig3 != 0 );

6034

if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {

6035

shift128ExtraRightJamming(

6036

zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );

6037

++zExp;

6038

}

6039

return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );

6040

6041

}

6042

6043

/*----------------------------------------------------------------------------

6044

| Returns the result of dividing the quadruple-precision floating-point value

6045

| `a' by the corresponding value `b'. The operation is performed according to

6046

| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

6047

*----------------------------------------------------------------------------*/

6048

6049

float128 float128_div( float128 a, float128 b STATUS_PARAM )

6050

{

6051

flag aSign, bSign, zSign;

6052

int32 aExp, bExp, zExp;

6053

uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;

6054

uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

6055

float128 z;

6056

6057

aSig1 = extractFloat128Frac1( a );

6058

aSig0 = extractFloat128Frac0( a );

6059

aExp = extractFloat128Exp( a );

6060

aSign = extractFloat128Sign( a );

6061

bSig1 = extractFloat128Frac1( b );

6062

bSig0 = extractFloat128Frac0( b );

6063

bExp = extractFloat128Exp( b );

6064

bSign = extractFloat128Sign( b );

6065

zSign = aSign ^ bSign;

6066

if ( aExp == 0x7FFF ) {

6067

if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );

6068

if ( bExp == 0x7FFF ) {

6069

if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );

6070

goto invalid;

6071

}

6072

return packFloat128( zSign, 0x7FFF, 0, 0 );

6073

}

6074

if ( bExp == 0x7FFF ) {

6075

if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );

6076

return packFloat128( zSign, 0, 0, 0 );

6077

}

6078

if ( bExp == 0 ) {

6079

if ( ( bSig0 | bSig1 ) == 0 ) {

6080

if ( ( aExp | aSig0 | aSig1 ) == 0 ) {

6081

invalid:

6082

float_raise( float_flag_invalid STATUS_VAR);

6083

z.low = float128_default_nan_low;

6084

z.high = float128_default_nan_high;

6085

return z;

6086

}

6087

float_raise( float_flag_divbyzero STATUS_VAR);

6088

return packFloat128( zSign, 0x7FFF, 0, 0 );

6089

}

6090

normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );

6091

}

6092

if ( aExp == 0 ) {

6093

if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );

6094

normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );

6095

}

6096

zExp = aExp - bExp + 0x3FFD;

6097

shortShift128Left(

6098

aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );

6099

shortShift128Left(

6100

bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );

6101

if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {

6102

shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );

6103

++zExp;

6104

}

6105

zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );

6106

mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );

6107

sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );

6108

while ( (int64_t) rem0 < 0 ) {

6109

--zSig0;

6110

add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );

6111

}

6112

zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );

6113

if ( ( zSig1 & 0x3FFF ) <= 4 ) {

6114

mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );

6115

sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );

6116

while ( (int64_t) rem1 < 0 ) {

6117

--zSig1;

6118

add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );

6119

}

6120

zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );

6121

}

6122

shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );

6123

return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );

6124

6125

}

6126

6127

/*----------------------------------------------------------------------------

6128

| Returns the remainder of the quadruple-precision floating-point value `a'

6129

| with respect to the corresponding value `b'. The operation is performed

6130

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

6131

*----------------------------------------------------------------------------*/

6132

6133

float128 float128_rem( float128 a, float128 b STATUS_PARAM )

6134

{

6135

flag aSign, zSign;

6136

int32 aExp, bExp, expDiff;

6137

uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;

6138

uint64_t allZero, alternateASig0, alternateASig1, sigMean1;

6139

int64_t sigMean0;

6140

float128 z;

6141

6142

aSig1 = extractFloat128Frac1( a );

6143

aSig0 = extractFloat128Frac0( a );

6144

aExp = extractFloat128Exp( a );

6145

aSign = extractFloat128Sign( a );

6146

bSig1 = extractFloat128Frac1( b );

6147

bSig0 = extractFloat128Frac0( b );

6148

bExp = extractFloat128Exp( b );

6149

if ( aExp == 0x7FFF ) {

6150

if ( ( aSig0 | aSig1 )

6151

|| ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {

6152

return propagateFloat128NaN( a, b STATUS_VAR );

6153

}

6154

goto invalid;

6155

}

6156

if ( bExp == 0x7FFF ) {

6157

if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );

6158

return a;

6159

}

6160

if ( bExp == 0 ) {

6161

if ( ( bSig0 | bSig1 ) == 0 ) {

6162

invalid:

6163

float_raise( float_flag_invalid STATUS_VAR);

6164

z.low = float128_default_nan_low;

6165

z.high = float128_default_nan_high;

6166

return z;

6167

}

6168

normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );

6169

}

6170

if ( aExp == 0 ) {

6171

if ( ( aSig0 | aSig1 ) == 0 ) return a;

6172

normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );

6173

}

6174

expDiff = aExp - bExp;

6175

if ( expDiff < -1 ) return a;

6176

shortShift128Left(

6177

aSig0 | LIT64( 0x0001000000000000 ),

6178

aSig1,

6179

15 - ( expDiff < 0 ),

6180

&aSig0,

6181

&aSig1

6182

);

6183

shortShift128Left(

6184

bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );

6185

q = le128( bSig0, bSig1, aSig0, aSig1 );

6186

if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );

6187

expDiff -= 64;

6188

while ( 0 < expDiff ) {

6189

q = estimateDiv128To64( aSig0, aSig1, bSig0 );

6190

q = ( 4 < q ) ? q - 4 : 0;

6191

mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );

6192

shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );

6193

shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );

6194

sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );

6195

expDiff -= 61;

6196

}

6197

if ( -64 < expDiff ) {

6198

q = estimateDiv128To64( aSig0, aSig1, bSig0 );

6199

q = ( 4 < q ) ? q - 4 : 0;

6200

q >>= - expDiff;

6201

shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );

6202

expDiff += 52;

6203

if ( expDiff < 0 ) {

6204

shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );

6205

}

6206

else {

6207

shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );

6208

}

6209

mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );

6210

sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );

6211

}

6212

else {

6213

shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );

6214

shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );

6215

}

6216

do {

6217

alternateASig0 = aSig0;

6218

alternateASig1 = aSig1;

6219

++q;

6220

sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );

6221

} while ( 0 <= (int64_t) aSig0 );

6222

add128(

6223

aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );

6224

if ( ( sigMean0 < 0 )

6225

|| ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {

6226

aSig0 = alternateASig0;

6227

aSig1 = alternateASig1;

6228

}

6229

zSign = ( (int64_t) aSig0 < 0 );

6230

if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );

6231

return

6232

normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );

6233

6234

}

6235

6236

/*----------------------------------------------------------------------------

6237

| Returns the square root of the quadruple-precision floating-point value `a'.

6238

| The operation is performed according to the IEC/IEEE Standard for Binary

6239

| Floating-Point Arithmetic.

6240

*----------------------------------------------------------------------------*/

6241

6242

float128 float128_sqrt( float128 a STATUS_PARAM )

6243

{

6244

flag aSign;

6245

int32 aExp, zExp;

6246

uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;

6247

uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

6248

float128 z;

6249

6250

aSig1 = extractFloat128Frac1( a );

6251

aSig0 = extractFloat128Frac0( a );

6252

aExp = extractFloat128Exp( a );

6253

aSign = extractFloat128Sign( a );

6254

if ( aExp == 0x7FFF ) {

6255

if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );

6256

if ( ! aSign ) return a;

6257

goto invalid;

6258

}

6259

if ( aSign ) {

6260

if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;

6261

invalid:

6262

float_raise( float_flag_invalid STATUS_VAR);

6263

z.low = float128_default_nan_low;

6264

z.high = float128_default_nan_high;

6265

return z;

6266

}

6267

if ( aExp == 0 ) {

6268

if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );

6269

normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );

6270

}

6271

zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;

6272

aSig0 |= LIT64( 0x0001000000000000 );

6273

zSig0 = estimateSqrt32( aExp, aSig0>>17 );

6274

shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );

6275

zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );

6276

doubleZSig0 = zSig0<<1;

6277

mul64To128( zSig0, zSig0, &term0, &term1 );

6278

sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );

6279

while ( (int64_t) rem0 < 0 ) {

6280

--zSig0;

6281

doubleZSig0 -= 2;

6282

add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );

6283

}

6284

zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );

6285

if ( ( zSig1 & 0x1FFF ) <= 5 ) {

6286

if ( zSig1 == 0 ) zSig1 = 1;

6287

mul64To128( doubleZSig0, zSig1, &term1, &term2 );

6288

sub128( rem1, 0, term1, term2, &rem1, &rem2 );

6289

mul64To128( zSig1, zSig1, &term2, &term3 );

6290

sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );

6291

while ( (int64_t) rem1 < 0 ) {

6292

--zSig1;

6293

shortShift128Left( 0, zSig1, 1, &term2, &term3 );

6294

term3 |= 1;

6295

term2 |= doubleZSig0;

6296

add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );

6297

}

6298

zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );

6299

}

6300

shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );

6301

return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );

6302

6303

}

6304

6305

/*----------------------------------------------------------------------------

6306

| Returns 1 if the quadruple-precision floating-point value `a' is equal to

6307

| the corresponding value `b', and 0 otherwise. The invalid exception is

6308

| raised if either operand is a NaN. Otherwise, the comparison is performed

6309

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

6310

*----------------------------------------------------------------------------*/

6311

6312

int float128_eq( float128 a, float128 b STATUS_PARAM )

6313

{

6314

6315

if ( ( ( extractFloat128Exp( a ) == 0x7FFF )

6316

&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )

6317

|| ( ( extractFloat128Exp( b ) == 0x7FFF )

6318

&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )

6319

) {

6320

float_raise( float_flag_invalid STATUS_VAR);

6321

return 0;

6322

}

6323

return

6324

( a.low == b.low )

6325

&& ( ( a.high == b.high )

6326

|| ( ( a.low == 0 )

6327

&& ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )

6328

);

6329

6330

}

6331

6332

/*----------------------------------------------------------------------------

6333

| Returns 1 if the quadruple-precision floating-point value `a' is less than

6334

| or equal to the corresponding value `b', and 0 otherwise. The invalid

6335

| exception is raised if either operand is a NaN. The comparison is performed

6336

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

6337

*----------------------------------------------------------------------------*/

6338

6339

int float128_le( float128 a, float128 b STATUS_PARAM )

6340

{

6341

flag aSign, bSign;

6342

6343

if ( ( ( extractFloat128Exp( a ) == 0x7FFF )

6344

&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )

6345

|| ( ( extractFloat128Exp( b ) == 0x7FFF )

6346

&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )

6347

) {

6348

float_raise( float_flag_invalid STATUS_VAR);

6349

return 0;

6350

}

6351

aSign = extractFloat128Sign( a );

6352

bSign = extractFloat128Sign( b );

6353

if ( aSign != bSign ) {

6354

return

6355

aSign

6356

|| ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )

6357

== 0 );

6358

}

6359

return

6360

aSign ? le128( b.high, b.low, a.high, a.low )

6361

: le128( a.high, a.low, b.high, b.low );

6362

6363

}

6364

6365

/*----------------------------------------------------------------------------

6366

| Returns 1 if the quadruple-precision floating-point value `a' is less than

6367

| the corresponding value `b', and 0 otherwise. The invalid exception is

6368

| raised if either operand is a NaN. The comparison is performed according

6369

| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

6370

*----------------------------------------------------------------------------*/

6371

6372

int float128_lt( float128 a, float128 b STATUS_PARAM )

6373

{

6374

flag aSign, bSign;

6375

6376

if ( ( ( extractFloat128Exp( a ) == 0x7FFF )

6377

&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )

6378

|| ( ( extractFloat128Exp( b ) == 0x7FFF )

6379

&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )

6380

) {

6381

float_raise( float_flag_invalid STATUS_VAR);

6382

return 0;

6383

}

6384

aSign = extractFloat128Sign( a );

6385

bSign = extractFloat128Sign( b );

6386

if ( aSign != bSign ) {

6387

return

6388

aSign

6389

&& ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )

6390

!= 0 );

6391

}

6392

return

6393

aSign ? lt128( b.high, b.low, a.high, a.low )

6394

: lt128( a.high, a.low, b.high, b.low );

6395

6396

}

6397

6398

/*----------------------------------------------------------------------------

6399

| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot

6400

| be compared, and 0 otherwise. The invalid exception is raised if either

6401

| operand is a NaN. The comparison is performed according to the IEC/IEEE

6402

| Standard for Binary Floating-Point Arithmetic.

6403

*----------------------------------------------------------------------------*/

6404

6405

int float128_unordered( float128 a, float128 b STATUS_PARAM )

6406

{

6407

if ( ( ( extractFloat128Exp( a ) == 0x7FFF )

6408

&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )

6409

|| ( ( extractFloat128Exp( b ) == 0x7FFF )

6410

&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )

6411

) {

6412

float_raise( float_flag_invalid STATUS_VAR);

6413

return 1;

6414

}

6415

return 0;

6416

}

6417

6418

/*----------------------------------------------------------------------------

6419

| Returns 1 if the quadruple-precision floating-point value `a' is equal to

6420

| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an

6421

| exception. The comparison is performed according to the IEC/IEEE Standard

6422

| for Binary Floating-Point Arithmetic.

6423

*----------------------------------------------------------------------------*/

6424

6425

int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )

6426

{

6427

6428

if ( ( ( extractFloat128Exp( a ) == 0x7FFF )

6429

&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )

6430

|| ( ( extractFloat128Exp( b ) == 0x7FFF )

6431

&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )

6432

) {

6433

if ( float128_is_signaling_nan( a )

6434

|| float128_is_signaling_nan( b ) ) {

6435

float_raise( float_flag_invalid STATUS_VAR);

6436

}

6437

return 0;

6438

}

6439

return

6440

( a.low == b.low )

6441

&& ( ( a.high == b.high )

6442

|| ( ( a.low == 0 )

6443

&& ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )

6444

);

6445

6446

}

6447

6448

/*----------------------------------------------------------------------------

6449

| Returns 1 if the quadruple-precision floating-point value `a' is less than

6450

| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not

6451

| cause an exception. Otherwise, the comparison is performed according to the

6452

| IEC/IEEE Standard for Binary Floating-Point Arithmetic.

6453

*----------------------------------------------------------------------------*/

6454

6455

int float128_le_quiet( float128 a, float128 b STATUS_PARAM )

6456

{

6457

flag aSign, bSign;

6458

6459

if ( ( ( extractFloat128Exp( a ) == 0x7FFF )

6460

&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )

6461

|| ( ( extractFloat128Exp( b ) == 0x7FFF )

6462

&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )

6463

) {

6464

if ( float128_is_signaling_nan( a )

6465

|| float128_is_signaling_nan( b ) ) {

6466

float_raise( float_flag_invalid STATUS_VAR);

6467

}

6468

return 0;

6469

}

6470

aSign = extractFloat128Sign( a );

6471

bSign = extractFloat128Sign( b );

6472

if ( aSign != bSign ) {

6473

return

6474

aSign

6475

|| ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )

6476

== 0 );

6477

}

6478

return

6479

aSign ? le128( b.high, b.low, a.high, a.low )

6480

: le128( a.high, a.low, b.high, b.low );

6481

6482

}

6483

6484

/*----------------------------------------------------------------------------

6485

| Returns 1 if the quadruple-precision floating-point value `a' is less than

6486

| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an

6487

| exception. Otherwise, the comparison is performed according to the IEC/IEEE

6488

| Standard for Binary Floating-Point Arithmetic.

6489

*----------------------------------------------------------------------------*/

6490

6491

int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )

6492

{

6493

flag aSign, bSign;

6494

6495

if ( ( ( extractFloat128Exp( a ) == 0x7FFF )

6496

&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )

6497

|| ( ( extractFloat128Exp( b ) == 0x7FFF )

6498

&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )

6499

) {

6500

if ( float128_is_signaling_nan( a )

6501

|| float128_is_signaling_nan( b ) ) {

6502

float_raise( float_flag_invalid STATUS_VAR);

6503

}

6504

return 0;

6505

}

6506

aSign = extractFloat128Sign( a );

6507

bSign = extractFloat128Sign( b );

6508

if ( aSign != bSign ) {

6509

return

6510

aSign

6511

&& ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )

6512

!= 0 );

6513

}

6514

return

6515

aSign ? lt128( b.high, b.low, a.high, a.low )

6516

: lt128( a.high, a.low, b.high, b.low );

6517

6518

}

6519

6520

/*----------------------------------------------------------------------------

6521

| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot

6522

| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The

6523

| comparison is performed according to the IEC/IEEE Standard for Binary

6524

| Floating-Point Arithmetic.

6525

*----------------------------------------------------------------------------*/

6526

6527

int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )

6528

{

6529

if ( ( ( extractFloat128Exp( a ) == 0x7FFF )

6530

&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )

6531

|| ( ( extractFloat128Exp( b ) == 0x7FFF )

6532

&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )

6533

) {

6534

if ( float128_is_signaling_nan( a )

6535

|| float128_is_signaling_nan( b ) ) {

6536

float_raise( float_flag_invalid STATUS_VAR);

6537

}

6538

return 1;

6539

}

6540

return 0;

6541

}

6542

6543

/* misc functions */

6544

/* Converts the 32-bit unsigned integer `a' to single precision by widening
 * it to 64 bits and delegating to int64_to_float32(). */
float32 uint32_to_float32(uint32_t a STATUS_PARAM)
{
    int64_t widened = a;

    return int64_to_float32(widened STATUS_VAR);
}

6548

6549

/* Converts the 32-bit unsigned integer `a' to double precision by widening
 * it to 64 bits and delegating to int64_to_float64(). */
float64 uint32_to_float64(uint32_t a STATUS_PARAM)
{
    int64_t widened = a;

    return int64_to_float64(widened STATUS_VAR);
}

6553

6554

/* Converts the single-precision value `a' to a 32-bit unsigned integer.
 * The value is first converted with float32_to_int64().  A result outside
 * [0, 0xffffffff] is clamped to the nearest bound; in that case the
 * exception flags are reset to their state on entry and only the invalid
 * exception is raised. */
uint32 float32_to_uint32( float32 a STATUS_PARAM )
{
    int saved_flags = get_float_exception_flags(status);
    int64_t wide = float32_to_int64(a STATUS_VAR);
    uint32 clamped;

    if (wide >= 0 && wide <= 0xffffffff) {
        return wide;
    }
    clamped = (wide < 0) ? 0 : 0xffffffff;
    set_float_exception_flags(saved_flags, status);
    float_raise(float_flag_invalid STATUS_VAR);
    return clamped;
}

6572

6573

/* Converts the single-precision value `a' to a 32-bit unsigned integer.
 * The value is first converted with float32_to_int64_round_to_zero().  A
 * result outside [0, 0xffffffff] is clamped to the nearest bound; in that
 * case the exception flags are reset to their state on entry and only the
 * invalid exception is raised. */
uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
{
    int saved_flags = get_float_exception_flags(status);
    int64_t wide = float32_to_int64_round_to_zero(a STATUS_VAR);
    uint32 clamped;

    if (wide >= 0 && wide <= 0xffffffff) {
        return wide;
    }
    clamped = (wide < 0) ? 0 : 0xffffffff;
    set_float_exception_flags(saved_flags, status);
    float_raise(float_flag_invalid STATUS_VAR);
    return clamped;
}

6591

6592

/* Converts the single-precision value `a' to a 16-bit signed integer.
 * The value is first converted with float32_to_int32().  A result outside
 * [-0x8000, 0x7fff] is clamped to the nearest bound; in that case the
 * exception flags are reset to their state on entry and only the invalid
 * exception is raised. */
int_fast16_t float32_to_int16(float32 a STATUS_PARAM)
{
    int saved_flags = get_float_exception_flags(status);
    int32_t wide = float32_to_int32(a STATUS_VAR);
    int_fast16_t clamped;

    if (wide >= -0x8000 && wide <= 0x7fff) {
        return wide;
    }
    clamped = (wide < -0x8000) ? -0x8000 : 0x7fff;

    set_float_exception_flags(saved_flags, status);
    float_raise(float_flag_invalid STATUS_VAR);
    return clamped;
}

6611

6612

/* Converts the single-precision value `a' to a 16-bit unsigned integer.
 * The value is first converted with float32_to_int32().  A result outside
 * [0, 0xffff] is clamped to the nearest bound; in that case the exception
 * flags are reset to their state on entry and only the invalid exception is
 * raised. */
uint_fast16_t float32_to_uint16(float32 a STATUS_PARAM)
{
    int saved_flags = get_float_exception_flags(status);
    int32_t wide = float32_to_int32(a STATUS_VAR);
    uint_fast16_t clamped;

    if (wide >= 0 && wide <= 0xffff) {
        return wide;
    }
    clamped = (wide < 0) ? 0 : 0xffff;

    set_float_exception_flags(saved_flags, status);
    float_raise(float_flag_invalid STATUS_VAR);
    return clamped;
}

6631

6632

/* Converts the single-precision value `a' to a 16-bit unsigned integer.
 * The value is first converted with float32_to_int64_round_to_zero().  A
 * result outside [0, 0xffff] is clamped to the nearest bound; in that case
 * the exception flags are reset to their state on entry and only the
 * invalid exception is raised. */
uint_fast16_t float32_to_uint16_round_to_zero(float32 a STATUS_PARAM)
{
    int saved_flags = get_float_exception_flags(status);
    int64_t wide = float32_to_int64_round_to_zero(a STATUS_VAR);
    uint_fast16_t clamped;

    if (wide >= 0 && wide <= 0xffff) {
        return wide;
    }
    clamped = (wide < 0) ? 0 : 0xffff;
    set_float_exception_flags(saved_flags, status);
    float_raise(float_flag_invalid STATUS_VAR);
    return clamped;
}

6650

6651

/* Converts the double-precision value `a' to a 32-bit unsigned integer.
 * The value is first converted with float64_to_int64().  A result outside
 * [0, 0xffffffff] is clamped to the nearest bound; in that case the
 * exception flags are reset to their state on entry and only the invalid
 * exception is raised, so that flags set by the intermediate 64-bit
 * conversion (such as inexact) do not leak out next to invalid.  This
 * matches the behaviour of float32_to_uint32(). */
uint32 float64_to_uint32( float64 a STATUS_PARAM )
{
    int64_t v;
    uint32 res;
    int old_exc_flags = get_float_exception_flags(status);

    v = float64_to_int64(a STATUS_VAR);
    if (v < 0) {
        res = 0;
    } else if (v > 0xffffffff) {
        res = 0xffffffff;
    } else {
        return v;
    }
    /* Out of range: drop whatever the wide conversion flagged and report
     * the conversion as invalid. */
    set_float_exception_flags(old_exc_flags, status);
    float_raise(float_flag_invalid STATUS_VAR);
    return res;
}

6668

6669

/* Converts the double-precision value `a' to a 32-bit unsigned integer.
 * The value is first converted with float64_to_int64_round_to_zero().  A
 * result outside [0, 0xffffffff] is clamped to the nearest bound; in that
 * case the exception flags are reset to their state on entry and only the
 * invalid exception is raised, so that flags set by the intermediate 64-bit
 * conversion (such as inexact) do not leak out next to invalid.  This
 * matches the behaviour of float32_to_uint32_round_to_zero(). */
uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
{
    int64_t v;
    uint32 res;
    int old_exc_flags = get_float_exception_flags(status);

    v = float64_to_int64_round_to_zero(a STATUS_VAR);
    if (v < 0) {
        res = 0;
    } else if (v > 0xffffffff) {
        res = 0xffffffff;
    } else {
        return v;
    }
    /* Out of range: drop whatever the wide conversion flagged and report
     * the conversion as invalid. */
    set_float_exception_flags(old_exc_flags, status);
    float_raise(float_flag_invalid STATUS_VAR);
    return res;
}

6686

6687

/* Converts the double-precision value `a' to a 16-bit signed integer.
 * The value is first converted with float64_to_int32().  A result outside
 * [-0x8000, 0x7fff] is clamped to the nearest bound; in that case the
 * exception flags are reset to their state on entry and only the invalid
 * exception is raised. */
int_fast16_t float64_to_int16(float64 a STATUS_PARAM)
{
    int saved_flags = get_float_exception_flags(status);
    int64_t wide = float64_to_int32(a STATUS_VAR);
    int_fast16_t clamped;

    if (wide >= -0x8000 && wide <= 0x7fff) {
        return wide;
    }
    clamped = (wide < -0x8000) ? -0x8000 : 0x7fff;

    set_float_exception_flags(saved_flags, status);
    float_raise(float_flag_invalid STATUS_VAR);
    return clamped;
}

6706

6707

/* Converts the double-precision value `a' to a 16-bit unsigned integer.
 * The value is first converted with float64_to_int32().  A result outside
 * [0, 0xffff] is clamped to the nearest bound; in that case the exception
 * flags are reset to their state on entry and only the invalid exception is
 * raised. */
uint_fast16_t float64_to_uint16(float64 a STATUS_PARAM)
{
    int saved_flags = get_float_exception_flags(status);
    int64_t wide = float64_to_int32(a STATUS_VAR);
    uint_fast16_t clamped;

    if (wide >= 0 && wide <= 0xffff) {
        return wide;
    }
    clamped = (wide < 0) ? 0 : 0xffff;

    set_float_exception_flags(saved_flags, status);
    float_raise(float_flag_invalid STATUS_VAR);
    return clamped;
}

6726

6727

/* Converts the double-precision value `a' to a 16-bit unsigned integer.
 * The value is first converted with float64_to_int64_round_to_zero().  A
 * result outside [0, 0xffff] is clamped to the nearest bound; in that case
 * the exception flags are reset to their state on entry and only the
 * invalid exception is raised. */
uint_fast16_t float64_to_uint16_round_to_zero(float64 a STATUS_PARAM)
{
    int saved_flags = get_float_exception_flags(status);
    int64_t wide = float64_to_int64_round_to_zero(a STATUS_VAR);
    uint_fast16_t clamped;

    if (wide >= 0 && wide <= 0xffff) {
        return wide;
    }
    clamped = (wide < 0) ? 0 : 0xffff;
    set_float_exception_flags(saved_flags, status);
    float_raise(float_flag_invalid STATUS_VAR);
    return clamped;
}

6745

6746

/*----------------------------------------------------------------------------

6747

| Returns the result of converting the double-precision floating-point value

6748

| `a' to the 64-bit unsigned integer format. The conversion is

6749

| performed according to the IEC/IEEE Standard for Binary Floating-Point

6750

| Arithmetic---which means in particular that the conversion is rounded

6751

| according to the current rounding mode. If `a' is a NaN, the largest

6752

| positive integer is returned. If the conversion overflows, the

6753

| largest unsigned integer is returned. If 'a' is negative, the value is

6754

| rounded and zero is returned; negative values that do not round to zero

6755

| will raise the inexact exception.

6756

*----------------------------------------------------------------------------*/

6757

6758

uint64_t float64_to_uint64(float64 a STATUS_PARAM)

6759

{

6760

flag aSign;

6761

int_fast16_t aExp, shiftCount;

6762

uint64_t aSig, aSigExtra;

6763

a = float64_squash_input_denormal(a STATUS_VAR);

6764

6765

aSig = extractFloat64Frac(a);

6766

aExp = extractFloat64Exp(a);

6767

aSign = extractFloat64Sign(a);

6768

if (aSign && (aExp > 1022)) {

6769

float_raise(float_flag_invalid STATUS_VAR);

6770

if (float64_is_any_nan(a)) {

6771

return LIT64(0xFFFFFFFFFFFFFFFF);

6772

} else {

6773

return 0;

6774

}

6775

}

6776

if (aExp) {

6777

aSig |= LIT64(0x0010000000000000);

6778

}

6779

shiftCount = 0x433 - aExp;

6780

if (shiftCount <= 0) {

6781

if (0x43E < aExp) {

6782

float_raise(float_flag_invalid STATUS_VAR);

6783

return LIT64(0xFFFFFFFFFFFFFFFF);

6784

}

6785

aSigExtra = 0;

6786

aSig <<= -shiftCount;

6787

} else {

6788

shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);

6789

}

6790

return roundAndPackUint64(aSign, aSig, aSigExtra STATUS_VAR);

6791

}

6792

6793

/*----------------------------------------------------------------------------
| Converts the double-precision floating-point value `a' to the 64-bit
| unsigned integer format, rounding toward zero regardless of the current
| rounding mode.  The conversion itself is done by float64_to_uint64() with
| the rounding mode temporarily forced to float_round_to_zero; the rounding
| mode found on entry is restored before returning.
|
| The result must not be derived by adding the raw encodings of `a' and of
| INT64_MIN as integers: the sum of two IEEE bit patterns is not the
| encoding of the sum of the values.
*----------------------------------------------------------------------------*/

uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
{
    signed char current_rounding_mode = STATUS(float_rounding_mode);
    uint64_t v;

    set_float_rounding_mode(float_round_to_zero STATUS_VAR);
    v = float64_to_uint64(a STATUS_VAR);
    set_float_rounding_mode(current_rounding_mode STATUS_VAR);

    return v;
}

6803

6804

#define COMPARE(s, nan_exp) \

6805

INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \

6806

int is_quiet STATUS_PARAM ) \

6807

{ \

6808

flag aSign, bSign; \

6809

uint ## s ## _t av, bv; \

6810

a = float ## s ## _squash_input_denormal(a STATUS_VAR); \

6811

b = float ## s ## _squash_input_denormal(b STATUS_VAR); \

6812

6813

if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \

6814

extractFloat ## s ## Frac( a ) ) || \

6815

( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \

6816

extractFloat ## s ## Frac( b ) )) { \

6817

if (!is_quiet || \

6818

float ## s ## _is_signaling_nan( a ) || \

6819

float ## s ## _is_signaling_nan( b ) ) { \

6820

float_raise( float_flag_invalid STATUS_VAR); \

6821

} \

6822

return float_relation_unordered; \

6823

} \

6824

aSign = extractFloat ## s ## Sign( a ); \

6825

bSign = extractFloat ## s ## Sign( b ); \

6826

av = float ## s ## _val(a); \

6827

bv = float ## s ## _val(b); \

6828

if ( aSign != bSign ) { \

6829

if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \

6830

/* zero case */ \

6831

return float_relation_equal; \

6832

} else { \

6833

return 1 - (2 * aSign); \

6834

} \

6835

} else { \

6836

if (av == bv) { \

6837

return float_relation_equal; \

6838

} else { \

6839

return 1 - 2 * (aSign ^ ( av < bv )); \

6840

} \

6841

} \

6842

} \

6843

6844

int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \

6845

{ \

6846

return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \

6847

} \

6848

6849

int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \

6850

{ \

6851

return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \

6852

}

6853

6854

COMPARE(32, 0xff)

6855

COMPARE(64, 0x7ff)

6856

6857

INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,

6858

int is_quiet STATUS_PARAM )

6859

{

6860

flag aSign, bSign;

6861

6862

if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&

6863

( extractFloatx80Frac( a )<<1 ) ) ||

6864

( ( extractFloatx80Exp( b ) == 0x7fff ) &&

6865

( extractFloatx80Frac( b )<<1 ) )) {

6866

if (!is_quiet ||

6867

floatx80_is_signaling_nan( a ) ||

6868

floatx80_is_signaling_nan( b ) ) {

6869

float_raise( float_flag_invalid STATUS_VAR);

6870

}

6871

return float_relation_unordered;

6872

}

6873

aSign = extractFloatx80Sign( a );

6874

bSign = extractFloatx80Sign( b );

6875

if ( aSign != bSign ) {

6876

6877

if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&

6878

( ( a.low | b.low ) == 0 ) ) {

6879

/* zero case */

6880

return float_relation_equal;

6881

} else {

6882

return 1 - (2 * aSign);

6883

}

6884

} else {

6885

if (a.low == b.low && a.high == b.high) {

6886

return float_relation_equal;

6887

} else {

6888

return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));

6889

}

6890

}

6891

}

6892

6893

/* Compares `a' and `b' through floatx80_compare_internal() with
 * is_quiet == 0, i.e. any NaN operand raises the invalid exception. */
int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
{
    const int is_quiet = 0;

    return floatx80_compare_internal(a, b, is_quiet STATUS_VAR);
}

6897

6898

/* Compares `a' and `b' through floatx80_compare_internal() with
 * is_quiet == 1, i.e. only signaling NaNs raise the invalid exception. */
int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
{
    const int is_quiet = 1;

    return floatx80_compare_internal(a, b, is_quiet STATUS_VAR);
}

6902

6903

INLINE int float128_compare_internal( float128 a, float128 b,

6904

int is_quiet STATUS_PARAM )

6905

{

6906

flag aSign, bSign;

6907

6908

if (( ( extractFloat128Exp( a ) == 0x7fff ) &&

6909

( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||

6910

( ( extractFloat128Exp( b ) == 0x7fff ) &&

6911

( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {

6912

if (!is_quiet ||

6913

float128_is_signaling_nan( a ) ||

6914

float128_is_signaling_nan( b ) ) {

6915

float_raise( float_flag_invalid STATUS_VAR);

6916

}

6917

return float_relation_unordered;

6918

}

6919

aSign = extractFloat128Sign( a );

6920

bSign = extractFloat128Sign( b );

6921

if ( aSign != bSign ) {

6922

if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {

6923

/* zero case */

6924

return float_relation_equal;

6925

} else {

6926

return 1 - (2 * aSign);

6927

}

6928

} else {

6929

if (a.low == b.low && a.high == b.high) {

6930

return float_relation_equal;

6931

} else {

6932

return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));

6933

}

6934

}

6935

}

6936

6937

/*
 * Compare two float128 values via float128_compare_internal() with
 * is_quiet = 0, so any NaN operand raises float_flag_invalid there.
 * Returns whatever relation code the internal helper produces.
 */
int float128_compare(float128 a, float128 b STATUS_PARAM)
{
    int relation;

    relation = float128_compare_internal(a, b, 0 STATUS_VAR);
    return relation;
}

6941

6942

/*
 * Compare two float128 values via float128_compare_internal() with
 * is_quiet = 1, so float_flag_invalid is raised there only when one of the
 * operands is a signaling NaN.  Returns the helper's relation code.
 */
int float128_compare_quiet(float128 a, float128 b STATUS_PARAM)
{
    int relation;

    relation = float128_compare_internal(a, b, 1 STATUS_VAR);
    return relation;
}

6946

6947

/* min() and max() functions. These can't be implemented as
 * 'compare and pick one input' because that would mishandle
 * NaNs and +0 vs -0.
 *
 * minnum() and maxnum() functions. These are similar to the min()
 * and max() functions but if one of the arguments is a QNaN and
 * the other is numerical then the numerical argument is returned.
 * minnum() and maxnum() correspond to the IEEE 754-2008 minNum()
 * and maxNum() operations. min() and max() are the typical min/max
 * semantics provided by many CPUs which predate that specification.
 */

6957

6958

/*
 * MINMAX(s, nan_exp) emits, for the float<s> type, one shared inline worker
 * float<s>_minmax(a, b, ismin, isieee) plus four public entry points:
 *
 *     float<s>_min()     ismin = 1, isieee = 0
 *     float<s>_max()     ismin = 0, isieee = 0
 *     float<s>_minnum()  ismin = 1, isieee = 1
 *     float<s>_maxnum()  ismin = 0, isieee = 1
 *
 * The worker first passes both inputs through
 * float<s>_squash_input_denormal().  If either input is a NaN it returns
 * propagateFloat<s>NaN(a, b), except that with isieee set a quiet NaN paired
 * with a non-NaN operand yields the non-NaN operand.  For two non-NaN
 * inputs it decides whether a is the lesser operand -- by sign when the
 * signs differ, otherwise by comparing the raw bit patterns and inverting
 * the result for negative values -- and returns the lesser or the greater
 * operand according to ismin.
 *
 * The nan_exp macro argument is not referenced by the expansion.
 */
#define MINMAX(s, nan_exp)                                              \
INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
                                        int ismin, int isieee           \
                                        STATUS_PARAM)                   \
{                                                                       \
    flag sign_a, sign_b;                                                \
    uint ## s ## _t bits_a, bits_b;                                     \
    int a_is_lesser;                                                    \
                                                                        \
    a = float ## s ## _squash_input_denormal(a STATUS_VAR);             \
    b = float ## s ## _squash_input_denormal(b STATUS_VAR);             \
                                                                        \
    if (float ## s ## _is_any_nan(a) || float ## s ## _is_any_nan(b)) { \
        if (isieee) {                                                   \
            /* minnum/maxnum: a lone quiet NaN loses to the number */   \
            if (float ## s ## _is_quiet_nan(a) &&                       \
                !float ## s ## _is_any_nan(b)) {                        \
                return b;                                               \
            } else if (float ## s ## _is_quiet_nan(b) &&                \
                       !float ## s ## _is_any_nan(a)) {                 \
                return a;                                               \
            }                                                           \
        }                                                               \
        return propagateFloat ## s ## NaN(a, b STATUS_VAR);             \
    }                                                                   \
                                                                        \
    sign_a = extractFloat ## s ## Sign(a);                              \
    sign_b = extractFloat ## s ## Sign(b);                              \
    bits_a = float ## s ## _val(a);                                     \
    bits_b = float ## s ## _val(b);                                     \
                                                                        \
    if (sign_a != sign_b) {                                             \
        /* differing signs: the negative operand is the lesser one */   \
        a_is_lesser = sign_a;                                           \
    } else {                                                            \
        /* same sign: raw ordering, inverted for negative values */     \
        a_is_lesser = sign_a ^ (bits_a < bits_b);                       \
    }                                                                   \
                                                                        \
    if (ismin) {                                                        \
        return a_is_lesser ? a : b;                                     \
    }                                                                   \
    return a_is_lesser ? b : a;                                         \
}                                                                       \
                                                                        \
float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM)  \
{                                                                       \
    return float ## s ## _minmax(a, b, 1, 0 STATUS_VAR);                \
}                                                                       \
                                                                        \
float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM)  \
{                                                                       \
    return float ## s ## _minmax(a, b, 0, 0 STATUS_VAR);                \
}                                                                       \
                                                                        \
float ## s float ## s ## _minnum(float ## s a, float ## s b STATUS_PARAM) \
{                                                                       \
    return float ## s ## _minmax(a, b, 1, 1 STATUS_VAR);                \
}                                                                       \
                                                                        \
float ## s float ## s ## _maxnum(float ## s a, float ## s b STATUS_PARAM) \
{                                                                       \
    return float ## s ## _minmax(a, b, 0, 1 STATUS_VAR);                \
}

7017

7018

/* Instantiate float32_minmax() and the public float32_min(), float32_max(),
 * float32_minnum() and float32_maxnum().  The second argument (0xff) is not
 * referenced by the MINMAX expansion. */
MINMAX(32, 0xff)

7019

/* Instantiate float64_minmax() and the public float64_min(), float64_max(),
 * float64_minnum() and float64_maxnum().  The second argument (0x7ff) is not
 * referenced by the MINMAX expansion. */
MINMAX(64, 0x7ff)

7020

7021

7022

/* Multiply A by 2 raised to the power N. */

7023

/*
 * Scale the float32 value a by 2 raised to the power n.
 *
 * The input is first passed through float32_squash_input_denormal().
 * An exponent field of 0xFF is returned early: through
 * propagateFloat32NaN() when the fraction is non-zero, unchanged otherwise.
 * A value whose exponent and fraction are both zero is also returned
 * unchanged.  n is clamped to [-0x200, 0x200] before it is added to the
 * int16_t exponent, and the result is produced by
 * normalizeRoundAndPackFloat32().
 */
float32 float32_scalbn(float32 a, int n STATUS_PARAM)
{
    flag sign;
    int16_t expo;
    uint32_t frac;

    a = float32_squash_input_denormal(a STATUS_VAR);
    frac = extractFloat32Frac(a);
    expo = extractFloat32Exp(a);
    sign = extractFloat32Sign(a);

    if (expo == 0xFF) {
        /* non-zero fraction: NaN path; zero fraction: return as is */
        return frac ? propagateFloat32NaN(a, a STATUS_VAR) : a;
    }

    if (expo == 0) {
        if (frac == 0) {
            return a;
        }
        /* zero exponent field with non-zero fraction: bump the exponent
         * instead of setting bit 23 */
        expo++;
    } else {
        /* presumably the implicit leading significand bit */
        frac |= 0x00800000;
    }

    n = (n > 0x200) ? 0x200 : ((n < -0x200) ? -0x200 : n);

    expo += n - 1;
    frac <<= 7;
    return normalizeRoundAndPackFloat32(sign, expo, frac STATUS_VAR);
}

7058

7059

/*
 * Scale the float64 value a by 2 raised to the power n.
 *
 * The input is first passed through float64_squash_input_denormal().
 * An exponent field of 0x7FF is returned early: through
 * propagateFloat64NaN() when the fraction is non-zero, unchanged otherwise.
 * A value whose exponent and fraction are both zero is also returned
 * unchanged.  n is clamped to [-0x1000, 0x1000] before it is added to the
 * int16_t exponent, and the result is produced by
 * normalizeRoundAndPackFloat64().
 */
float64 float64_scalbn(float64 a, int n STATUS_PARAM)
{
    flag sign;
    int16_t expo;
    uint64_t frac;

    a = float64_squash_input_denormal(a STATUS_VAR);
    frac = extractFloat64Frac(a);
    expo = extractFloat64Exp(a);
    sign = extractFloat64Sign(a);

    if (expo == 0x7FF) {
        /* non-zero fraction: NaN path; zero fraction: return as is */
        return frac ? propagateFloat64NaN(a, a STATUS_VAR) : a;
    }

    if (expo == 0) {
        if (frac == 0) {
            return a;
        }
        /* zero exponent field with non-zero fraction: bump the exponent
         * instead of setting bit 52 */
        expo++;
    } else {
        /* presumably the implicit leading significand bit */
        frac |= LIT64(0x0010000000000000);
    }

    n = (n > 0x1000) ? 0x1000 : ((n < -0x1000) ? -0x1000 : n);

    expo += n - 1;
    frac <<= 10;
    return normalizeRoundAndPackFloat64(sign, expo, frac STATUS_VAR);
}

7094

7095

/*
 * Scale the floatx80 value a by 2 raised to the power n.
 *
 * An exponent field of 0x7FFF is returned early: through
 * propagateFloatx80NaN() when the significand is non-zero after shifting
 * out its top bit, unchanged otherwise.  A value whose exponent and
 * significand are both zero is also returned unchanged.  n is clamped to
 * [-0x10000, 0x10000] before it is added to the exponent, and the result is
 * produced by normalizeRoundAndPackFloatx80() using the status field
 * floatx80_rounding_precision.
 */
floatx80 floatx80_scalbn(floatx80 a, int n STATUS_PARAM)
{
    flag sign;
    int32_t expo;
    uint64_t sig;

    sig = extractFloatx80Frac(a);
    expo = extractFloatx80Exp(a);
    sign = extractFloatx80Sign(a);

    if (expo == 0x7FFF) {
        /* anything below the top significand bit set: NaN path */
        return (sig << 1) ? propagateFloatx80NaN(a, a STATUS_VAR) : a;
    }

    if (expo == 0) {
        if (sig == 0) {
            return a;
        }
        /* zero exponent field with non-zero significand */
        expo++;
    }

    n = (n > 0x10000) ? 0x10000 : ((n < -0x10000) ? -0x10000 : n);

    expo += n;
    return normalizeRoundAndPackFloatx80(STATUS(floatx80_rounding_precision),
                                         sign, expo, sig, 0 STATUS_VAR);
}

7129

7130

/*
 * Scale the float128 value a by 2 raised to the power n.
 *
 * An exponent field of 0x7FFF is returned early: through
 * propagateFloat128NaN() when either fraction word is non-zero, unchanged
 * otherwise.  A value whose exponent and both fraction words are zero is
 * also returned unchanged.  n is clamped to [-0x10000, 0x10000] before it
 * is added to the exponent, and the result is produced by
 * normalizeRoundAndPackFloat128().
 */
float128 float128_scalbn(float128 a, int n STATUS_PARAM)
{
    flag sign;
    int32_t expo;
    uint64_t sig_hi, sig_lo;

    sig_lo = extractFloat128Frac1(a);
    sig_hi = extractFloat128Frac0(a);
    expo = extractFloat128Exp(a);
    sign = extractFloat128Sign(a);

    if (expo == 0x7FFF) {
        /* non-zero fraction: NaN path; zero fraction: return as is */
        return (sig_hi | sig_lo) ? propagateFloat128NaN(a, a STATUS_VAR) : a;
    }

    if (expo == 0) {
        if (sig_hi == 0 && sig_lo == 0) {
            return a;
        }
        /* zero exponent field with non-zero fraction: bump the exponent
         * instead of setting bit 48 of the high word */
        expo++;
    } else {
        /* presumably the implicit leading significand bit */
        sig_hi |= LIT64(0x0001000000000000);
    }

    n = (n > 0x10000) ? 0x10000 : ((n < -0x10000) ? -0x10000 : n);

    expo += n - 1;
    return normalizeRoundAndPackFloat128(sign, expo, sig_hi, sig_lo
                                         STATUS_VAR);
}

Older »