~ubuntu-branches/ubuntu/vivid/qemu/vivid

Viewing changes to .pc/ubuntu/arm64/0101-softfloat-Add-support-for-ties-away-rounding.patch/fpu/softfloat.c

Committer: Package Import Robot
Author(s): dann frazier
Date: 2014-02-11 15:41:53 UTC
Revision ID: package-import@ubuntu.com-20140211154153-2d001tf0ium08u81

Tags: 1.7.0+dfsg-3ubuntu2

* Backport changes to enable qemu-user-static support for aarch64
* debian/control: add ppc64el to Architectures
* debian/rules: only install qemu-system-aarch64 on arm64.
  Fixes an FTBFS (failure to build from source) when built twice in a row on
  non-arm64 due to a stale debian/qemu-system-aarch64 directory.

files added:
.pc/ubuntu/arm64/0050-target-arm-fix-build-with-gcc-4.8.2.patch

.pc/ubuntu/arm64/0050-target-arm-fix-build-with-gcc-4.8.2.patch/target-arm

.pc/ubuntu/arm64/0050-target-arm-fix-build-with-gcc-4.8.2.patch/target-arm/cpu64.c

.pc/ubuntu/arm64/0051-target-arm-A64-add-support-for-ld-st-pair.patch

.pc/ubuntu/arm64/0051-target-arm-A64-add-support-for-ld-st-pair.patch/target-arm

.pc/ubuntu/arm64/0051-target-arm-A64-add-support-for-ld-st-pair.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0052-target-arm-A64-add-support-for-ld-st-unsigned-imm.patch

.pc/ubuntu/arm64/0052-target-arm-A64-add-support-for-ld-st-unsigned-imm.patch/target-arm

.pc/ubuntu/arm64/0052-target-arm-A64-add-support-for-ld-st-unsigned-imm.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0053-target-arm-A64-add-support-for-ld-st-with-reg-offset.patch

.pc/ubuntu/arm64/0053-target-arm-A64-add-support-for-ld-st-with-reg-offset.patch/target-arm

.pc/ubuntu/arm64/0053-target-arm-A64-add-support-for-ld-st-with-reg-offset.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0054-target-arm-A64-add-support-for-ld-st-with-index.patch

.pc/ubuntu/arm64/0054-target-arm-A64-add-support-for-ld-st-with-index.patch/target-arm

.pc/ubuntu/arm64/0054-target-arm-A64-add-support-for-ld-st-with-index.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0055-target-arm-A64-add-support-for-add-addi-sub-subi.patch

.pc/ubuntu/arm64/0055-target-arm-A64-add-support-for-add-addi-sub-subi.patch/target-arm

.pc/ubuntu/arm64/0055-target-arm-A64-add-support-for-add-addi-sub-subi.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0056-target-arm-A64-add-support-for-move-wide-instruction.patch

.pc/ubuntu/arm64/0056-target-arm-A64-add-support-for-move-wide-instruction.patch/target-arm

.pc/ubuntu/arm64/0056-target-arm-A64-add-support-for-move-wide-instruction.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0057-target-arm-A64-add-support-for-3-src-data-proc-insns.patch

.pc/ubuntu/arm64/0057-target-arm-A64-add-support-for-3-src-data-proc-insns.patch/target-arm

.pc/ubuntu/arm64/0057-target-arm-A64-add-support-for-3-src-data-proc-insns.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0058-target-arm-A64-implement-SVC-BRK.patch

.pc/ubuntu/arm64/0058-target-arm-A64-implement-SVC-BRK.patch/target-arm

.pc/ubuntu/arm64/0058-target-arm-A64-implement-SVC-BRK.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0059-target-arm-A64-Add-decoder-skeleton-for-FP-instructi.patch

.pc/ubuntu/arm64/0059-target-arm-A64-Add-decoder-skeleton-for-FP-instructi.patch/target-arm

.pc/ubuntu/arm64/0059-target-arm-A64-Add-decoder-skeleton-for-FP-instructi.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0060-target-arm-A64-implement-FMOV.patch

.pc/ubuntu/arm64/0060-target-arm-A64-implement-FMOV.patch/target-arm

.pc/ubuntu/arm64/0060-target-arm-A64-implement-FMOV.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0061-target-arm-Pull-add-one-cpreg-to-hashtable-into-its-.patch

.pc/ubuntu/arm64/0061-target-arm-Pull-add-one-cpreg-to-hashtable-into-its-.patch/target-arm

.pc/ubuntu/arm64/0061-target-arm-Pull-add-one-cpreg-to-hashtable-into-its-.patch/target-arm/helper.c

.pc/ubuntu/arm64/0062-target-arm-Update-generic-cpreg-code-for-AArch64.patch

.pc/ubuntu/arm64/0062-target-arm-Update-generic-cpreg-code-for-AArch64.patch/target-arm

.pc/ubuntu/arm64/0062-target-arm-Update-generic-cpreg-code-for-AArch64.patch/target-arm/cpu.h

.pc/ubuntu/arm64/0062-target-arm-Update-generic-cpreg-code-for-AArch64.patch/target-arm/helper.c

.pc/ubuntu/arm64/0062-target-arm-Update-generic-cpreg-code-for-AArch64.patch/target-arm/kvm-consts.h

.pc/ubuntu/arm64/0063-target-arm-Remove-ARMCPU-CPUARMState-from-cpregs-API.patch

.pc/ubuntu/arm64/0063-target-arm-Remove-ARMCPU-CPUARMState-from-cpregs-API.patch/target-arm

.pc/ubuntu/arm64/0063-target-arm-Remove-ARMCPU-CPUARMState-from-cpregs-API.patch/target-arm/cpu.h

.pc/ubuntu/arm64/0063-target-arm-Remove-ARMCPU-CPUARMState-from-cpregs-API.patch/target-arm/helper.c

.pc/ubuntu/arm64/0063-target-arm-Remove-ARMCPU-CPUARMState-from-cpregs-API.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0063-target-arm-Remove-ARMCPU-CPUARMState-from-cpregs-API.patch/target-arm/translate.c

.pc/ubuntu/arm64/0063-target-arm-Remove-ARMCPU-CPUARMState-from-cpregs-API.patch/target-arm/translate.h

.pc/ubuntu/arm64/0064-target-arm-A64-Implement-MRS-MSR-SYS-SYSL.patch

.pc/ubuntu/arm64/0064-target-arm-A64-Implement-MRS-MSR-SYS-SYSL.patch/target-arm

.pc/ubuntu/arm64/0064-target-arm-A64-Implement-MRS-MSR-SYS-SYSL.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0065-target-arm-A64-Implement-minimal-set-of-EL0-visible-.patch

.pc/ubuntu/arm64/0065-target-arm-A64-Implement-minimal-set-of-EL0-visible-.patch/target-arm

.pc/ubuntu/arm64/0065-target-arm-A64-Implement-minimal-set-of-EL0-visible-.patch/target-arm/cpu.h

.pc/ubuntu/arm64/0065-target-arm-A64-Implement-minimal-set-of-EL0-visible-.patch/target-arm/helper.c

.pc/ubuntu/arm64/0065-target-arm-A64-Implement-minimal-set-of-EL0-visible-.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0066-target-arm-Widen-thread-local-register-state-fields-.patch

.pc/ubuntu/arm64/0066-target-arm-Widen-thread-local-register-state-fields-.patch/linux-user

.pc/ubuntu/arm64/0066-target-arm-Widen-thread-local-register-state-fields-.patch/linux-user/aarch64

.pc/ubuntu/arm64/0066-target-arm-Widen-thread-local-register-state-fields-.patch/linux-user/aarch64/target_cpu.h

.pc/ubuntu/arm64/0066-target-arm-Widen-thread-local-register-state-fields-.patch/linux-user/arm

.pc/ubuntu/arm64/0066-target-arm-Widen-thread-local-register-state-fields-.patch/linux-user/arm/target_cpu.h

.pc/ubuntu/arm64/0066-target-arm-Widen-thread-local-register-state-fields-.patch/linux-user/main.c

.pc/ubuntu/arm64/0066-target-arm-Widen-thread-local-register-state-fields-.patch/target-arm

.pc/ubuntu/arm64/0066-target-arm-Widen-thread-local-register-state-fields-.patch/target-arm/cpu.h

.pc/ubuntu/arm64/0066-target-arm-Widen-thread-local-register-state-fields-.patch/target-arm/helper.c

.pc/ubuntu/arm64/0067-target-arm-A64-add-support-for-add-sub-with-carry.patch

.pc/ubuntu/arm64/0067-target-arm-A64-add-support-for-add-sub-with-carry.patch/target-arm

.pc/ubuntu/arm64/0067-target-arm-A64-add-support-for-add-sub-with-carry.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0068-target-arm-A64-add-support-for-conditional-compare-i.patch

.pc/ubuntu/arm64/0068-target-arm-A64-add-support-for-conditional-compare-i.patch/target-arm

.pc/ubuntu/arm64/0068-target-arm-A64-add-support-for-conditional-compare-i.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0069-target-arm-aarch64-add-support-for-ld-lit.patch

.pc/ubuntu/arm64/0069-target-arm-aarch64-add-support-for-ld-lit.patch/target-arm

.pc/ubuntu/arm64/0069-target-arm-aarch64-add-support-for-ld-lit.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0070-target-arm-Widen-exclusive-access-support-struct-fie.patch

.pc/ubuntu/arm64/0070-target-arm-Widen-exclusive-access-support-struct-fie.patch/linux-user

.pc/ubuntu/arm64/0070-target-arm-Widen-exclusive-access-support-struct-fie.patch/linux-user/main.c

.pc/ubuntu/arm64/0070-target-arm-Widen-exclusive-access-support-struct-fie.patch/target-arm

.pc/ubuntu/arm64/0070-target-arm-Widen-exclusive-access-support-struct-fie.patch/target-arm/cpu.h

.pc/ubuntu/arm64/0070-target-arm-Widen-exclusive-access-support-struct-fie.patch/target-arm/machine.c

.pc/ubuntu/arm64/0070-target-arm-Widen-exclusive-access-support-struct-fie.patch/target-arm/translate.c

.pc/ubuntu/arm64/0071-target-arm-A64-support-for-ld-st-cl-exclusive.patch

.pc/ubuntu/arm64/0071-target-arm-A64-support-for-ld-st-cl-exclusive.patch/linux-user

.pc/ubuntu/arm64/0071-target-arm-A64-support-for-ld-st-cl-exclusive.patch/linux-user/main.c

.pc/ubuntu/arm64/0071-target-arm-A64-support-for-ld-st-cl-exclusive.patch/target-arm

.pc/ubuntu/arm64/0071-target-arm-A64-support-for-ld-st-cl-exclusive.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0072-linux-user-AArch64-define-TARGET_CLONE_BACKWARDS.patch

.pc/ubuntu/arm64/0072-linux-user-AArch64-define-TARGET_CLONE_BACKWARDS.patch/linux-user

.pc/ubuntu/arm64/0072-linux-user-AArch64-define-TARGET_CLONE_BACKWARDS.patch/linux-user/aarch64

.pc/ubuntu/arm64/0072-linux-user-AArch64-define-TARGET_CLONE_BACKWARDS.patch/linux-user/aarch64/syscall.h

.pc/ubuntu/arm64/0073-linux-user-AArch64-Use-correct-values-for-FPSR-FPCR-.patch

.pc/ubuntu/arm64/0073-linux-user-AArch64-Use-correct-values-for-FPSR-FPCR-.patch/linux-user

.pc/ubuntu/arm64/0073-linux-user-AArch64-Use-correct-values-for-FPSR-FPCR-.patch/linux-user/signal.c

.pc/ubuntu/arm64/0074-.travis.yml-Add-aarch64-targets.patch

.pc/ubuntu/arm64/0074-.travis.yml-Add-aarch64-targets.patch/.travis.yml

.pc/ubuntu/arm64/0075-default-configs-Add-config-for-aarch64-linux-user.patch

.pc/ubuntu/arm64/0075-default-configs-Add-config-for-aarch64-linux-user.patch/default-configs

.pc/ubuntu/arm64/0075-default-configs-Add-config-for-aarch64-linux-user.patch/default-configs/aarch64-linux-user.mak

.pc/ubuntu/arm64/0076-target-arm-A64-Add-support-for-dumping-AArch64-VFP-r.patch

.pc/ubuntu/arm64/0076-target-arm-A64-Add-support-for-dumping-AArch64-VFP-r.patch/target-arm

.pc/ubuntu/arm64/0076-target-arm-A64-Add-support-for-dumping-AArch64-VFP-r.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0077-target-arm-A64-Fix-vector-register-access-on-bigendi.patch

.pc/ubuntu/arm64/0077-target-arm-A64-Fix-vector-register-access-on-bigendi.patch/target-arm

.pc/ubuntu/arm64/0077-target-arm-A64-Fix-vector-register-access-on-bigendi.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0078-target-arm-Use-VFP_BINOP-macro-for-min-max-minnum-ma.patch

.pc/ubuntu/arm64/0078-target-arm-Use-VFP_BINOP-macro-for-min-max-minnum-ma.patch/target-arm

.pc/ubuntu/arm64/0078-target-arm-Use-VFP_BINOP-macro-for-min-max-minnum-ma.patch/target-arm/helper.c

.pc/ubuntu/arm64/0078-target-arm-Use-VFP_BINOP-macro-for-min-max-minnum-ma.patch/target-arm/helper.h

.pc/ubuntu/arm64/0078-target-arm-Use-VFP_BINOP-macro-for-min-max-minnum-ma.patch/target-arm/neon_helper.c

.pc/ubuntu/arm64/0078-target-arm-Use-VFP_BINOP-macro-for-min-max-minnum-ma.patch/target-arm/translate.c

.pc/ubuntu/arm64/0079-target-arm-A64-Add-Floating-point-data-processing-2-.patch

.pc/ubuntu/arm64/0079-target-arm-A64-Add-Floating-point-data-processing-2-.patch/target-arm

.pc/ubuntu/arm64/0079-target-arm-A64-Add-Floating-point-data-processing-2-.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0080-target-arm-A64-Add-Floating-point-data-processing-3-.patch

.pc/ubuntu/arm64/0080-target-arm-A64-Add-Floating-point-data-processing-3-.patch/target-arm

.pc/ubuntu/arm64/0080-target-arm-A64-Add-Floating-point-data-processing-3-.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0081-target-arm-A64-Add-fmov-scalar-immediate-instruction.patch

.pc/ubuntu/arm64/0081-target-arm-A64-Add-fmov-scalar-immediate-instruction.patch/target-arm

.pc/ubuntu/arm64/0081-target-arm-A64-Add-fmov-scalar-immediate-instruction.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0082-target-arm-A64-Add-support-for-floating-point-compar.patch

.pc/ubuntu/arm64/0082-target-arm-A64-Add-support-for-floating-point-compar.patch/target-arm

.pc/ubuntu/arm64/0082-target-arm-A64-Add-support-for-floating-point-compar.patch/target-arm/helper-a64.c

.pc/ubuntu/arm64/0082-target-arm-A64-Add-support-for-floating-point-compar.patch/target-arm/helper-a64.h

.pc/ubuntu/arm64/0082-target-arm-A64-Add-support-for-floating-point-compar.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0083-target-arm-A64-Add-support-for-floating-point-condit.patch

.pc/ubuntu/arm64/0083-target-arm-A64-Add-support-for-floating-point-condit.patch/target-arm

.pc/ubuntu/arm64/0083-target-arm-A64-Add-support-for-floating-point-condit.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0084-target-arm-A64-Add-support-for-floating-point-cond-s.patch

.pc/ubuntu/arm64/0084-target-arm-A64-Add-support-for-floating-point-cond-s.patch/target-arm

.pc/ubuntu/arm64/0084-target-arm-A64-Add-support-for-floating-point-cond-s.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0085-target-arm-Give-the-FPSCR-rounding-modes-names.patch

.pc/ubuntu/arm64/0085-target-arm-Give-the-FPSCR-rounding-modes-names.patch/target-arm

.pc/ubuntu/arm64/0085-target-arm-Give-the-FPSCR-rounding-modes-names.patch/target-arm/cpu.h

.pc/ubuntu/arm64/0085-target-arm-Give-the-FPSCR-rounding-modes-names.patch/target-arm/helper.c

.pc/ubuntu/arm64/0086-softfloat-Fix-exception-flag-handling-for-float32_to.patch

.pc/ubuntu/arm64/0086-softfloat-Fix-exception-flag-handling-for-float32_to.patch/fpu

.pc/ubuntu/arm64/0086-softfloat-Fix-exception-flag-handling-for-float32_to.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0087-softfloat-Add-float-to-16bit-integer-conversions.patch

.pc/ubuntu/arm64/0087-softfloat-Add-float-to-16bit-integer-conversions.patch/fpu

.pc/ubuntu/arm64/0087-softfloat-Add-float-to-16bit-integer-conversions.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0087-softfloat-Add-float-to-16bit-integer-conversions.patch/include

.pc/ubuntu/arm64/0087-softfloat-Add-float-to-16bit-integer-conversions.patch/include/fpu

.pc/ubuntu/arm64/0087-softfloat-Add-float-to-16bit-integer-conversions.patch/include/fpu/softfloat.h

.pc/ubuntu/arm64/0088-softfloat-Add-16-bit-integer-to-float-conversions.patch

.pc/ubuntu/arm64/0088-softfloat-Add-16-bit-integer-to-float-conversions.patch/include

.pc/ubuntu/arm64/0088-softfloat-Add-16-bit-integer-to-float-conversions.patch/include/fpu

.pc/ubuntu/arm64/0088-softfloat-Add-16-bit-integer-to-float-conversions.patch/include/fpu/softfloat.h

.pc/ubuntu/arm64/0089-softfloat-Make-the-int-to-float-functions-take-exact.patch

.pc/ubuntu/arm64/0089-softfloat-Make-the-int-to-float-functions-take-exact.patch/fpu

.pc/ubuntu/arm64/0089-softfloat-Make-the-int-to-float-functions-take-exact.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0089-softfloat-Make-the-int-to-float-functions-take-exact.patch/include

.pc/ubuntu/arm64/0089-softfloat-Make-the-int-to-float-functions-take-exact.patch/include/fpu

.pc/ubuntu/arm64/0089-softfloat-Make-the-int-to-float-functions-take-exact.patch/include/fpu/softfloat.h

.pc/ubuntu/arm64/0090-softfloat-Fix-float64_to_uint64.patch

.pc/ubuntu/arm64/0090-softfloat-Fix-float64_to_uint64.patch/fpu

.pc/ubuntu/arm64/0090-softfloat-Fix-float64_to_uint64.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0091-softfloat-Only-raise-Invalid-when-conversions-to-int.patch

.pc/ubuntu/arm64/0091-softfloat-Only-raise-Invalid-when-conversions-to-int.patch/fpu

.pc/ubuntu/arm64/0091-softfloat-Only-raise-Invalid-when-conversions-to-int.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0092-softfloat-Fix-factor-2-error-for-scalbn-on-denormal-.patch

.pc/ubuntu/arm64/0092-softfloat-Fix-factor-2-error-for-scalbn-on-denormal-.patch/fpu

.pc/ubuntu/arm64/0092-softfloat-Fix-factor-2-error-for-scalbn-on-denormal-.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0093-softfloat-Add-float32_to_uint64.patch

.pc/ubuntu/arm64/0093-softfloat-Add-float32_to_uint64.patch/fpu

.pc/ubuntu/arm64/0093-softfloat-Add-float32_to_uint64.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0093-softfloat-Add-float32_to_uint64.patch/include

.pc/ubuntu/arm64/0093-softfloat-Add-float32_to_uint64.patch/include/fpu

.pc/ubuntu/arm64/0093-softfloat-Add-float32_to_uint64.patch/include/fpu/softfloat.h

.pc/ubuntu/arm64/0094-softfloat-Fix-float64_to_uint64_round_to_zero.patch

.pc/ubuntu/arm64/0094-softfloat-Fix-float64_to_uint64_round_to_zero.patch/fpu

.pc/ubuntu/arm64/0094-softfloat-Fix-float64_to_uint64_round_to_zero.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0095-softfloat-Fix-float64_to_uint32.patch

.pc/ubuntu/arm64/0095-softfloat-Fix-float64_to_uint32.patch/fpu

.pc/ubuntu/arm64/0095-softfloat-Fix-float64_to_uint32.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0096-softfloat-Fix-float64_to_uint32_round_to_zero.patch

.pc/ubuntu/arm64/0096-softfloat-Fix-float64_to_uint32_round_to_zero.patch/fpu

.pc/ubuntu/arm64/0096-softfloat-Fix-float64_to_uint32_round_to_zero.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0097-softfloat-Provide-complete-set-of-accessors-for-fp-s.patch

.pc/ubuntu/arm64/0097-softfloat-Provide-complete-set-of-accessors-for-fp-s.patch/fpu

.pc/ubuntu/arm64/0097-softfloat-Provide-complete-set-of-accessors-for-fp-s.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0097-softfloat-Provide-complete-set-of-accessors-for-fp-s.patch/include

.pc/ubuntu/arm64/0097-softfloat-Provide-complete-set-of-accessors-for-fp-s.patch/include/fpu

.pc/ubuntu/arm64/0097-softfloat-Provide-complete-set-of-accessors-for-fp-s.patch/include/fpu/softfloat.h

.pc/ubuntu/arm64/0098-softfloat-Factor-out-RoundAndPackFloat16-and-Normali.patch

.pc/ubuntu/arm64/0098-softfloat-Factor-out-RoundAndPackFloat16-and-Normali.patch/fpu

.pc/ubuntu/arm64/0098-softfloat-Factor-out-RoundAndPackFloat16-and-Normali.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0099-softfloat-Add-float16-float64-conversion-functions.patch

.pc/ubuntu/arm64/0099-softfloat-Add-float16-float64-conversion-functions.patch/fpu

.pc/ubuntu/arm64/0099-softfloat-Add-float16-float64-conversion-functions.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0099-softfloat-Add-float16-float64-conversion-functions.patch/include

.pc/ubuntu/arm64/0099-softfloat-Add-float16-float64-conversion-functions.patch/include/fpu

.pc/ubuntu/arm64/0099-softfloat-Add-float16-float64-conversion-functions.patch/include/fpu/softfloat.h

.pc/ubuntu/arm64/0100-softfloat-Refactor-code-handling-various-rounding-mo.patch

.pc/ubuntu/arm64/0100-softfloat-Refactor-code-handling-various-rounding-mo.patch/fpu

.pc/ubuntu/arm64/0100-softfloat-Refactor-code-handling-various-rounding-mo.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0101-softfloat-Add-support-for-ties-away-rounding.patch

.pc/ubuntu/arm64/0101-softfloat-Add-support-for-ties-away-rounding.patch/fpu

.pc/ubuntu/arm64/0101-softfloat-Add-support-for-ties-away-rounding.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0101-softfloat-Add-support-for-ties-away-rounding.patch/include

.pc/ubuntu/arm64/0101-softfloat-Add-support-for-ties-away-rounding.patch/include/fpu

.pc/ubuntu/arm64/0101-softfloat-Add-support-for-ties-away-rounding.patch/include/fpu/softfloat.h

.pc/ubuntu/arm64/0102-target-arm-Prepare-VFP_CONV_FIX-helpers-for-A64-uses.patch

.pc/ubuntu/arm64/0102-target-arm-Prepare-VFP_CONV_FIX-helpers-for-A64-uses.patch/target-arm

.pc/ubuntu/arm64/0102-target-arm-Prepare-VFP_CONV_FIX-helpers-for-A64-uses.patch/target-arm/helper.c

.pc/ubuntu/arm64/0103-target-arm-Rename-A32-VFP-conversion-helpers.patch

.pc/ubuntu/arm64/0103-target-arm-Rename-A32-VFP-conversion-helpers.patch/target-arm

.pc/ubuntu/arm64/0103-target-arm-Rename-A32-VFP-conversion-helpers.patch/target-arm/helper.c

.pc/ubuntu/arm64/0103-target-arm-Rename-A32-VFP-conversion-helpers.patch/target-arm/helper.h

.pc/ubuntu/arm64/0103-target-arm-Rename-A32-VFP-conversion-helpers.patch/target-arm/translate.c

.pc/ubuntu/arm64/0104-target-arm-Ignore-most-exceptions-from-scalbn-when-d.patch

.pc/ubuntu/arm64/0104-target-arm-Ignore-most-exceptions-from-scalbn-when-d.patch/target-arm

.pc/ubuntu/arm64/0104-target-arm-Ignore-most-exceptions-from-scalbn-when-d.patch/target-arm/helper.c

.pc/ubuntu/arm64/0105-target-arm-A64-Add-extra-VFP-fixed-point-conversion-.patch

.pc/ubuntu/arm64/0105-target-arm-A64-Add-extra-VFP-fixed-point-conversion-.patch/target-arm

.pc/ubuntu/arm64/0105-target-arm-A64-Add-extra-VFP-fixed-point-conversion-.patch/target-arm/helper.c

.pc/ubuntu/arm64/0105-target-arm-A64-Add-extra-VFP-fixed-point-conversion-.patch/target-arm/helper.h

.pc/ubuntu/arm64/0106-target-arm-A64-Add-floating-point-fixed-point-instru.patch

.pc/ubuntu/arm64/0106-target-arm-A64-Add-floating-point-fixed-point-instru.patch/target-arm

.pc/ubuntu/arm64/0106-target-arm-A64-Add-floating-point-fixed-point-instru.patch/target-arm/helper.c

.pc/ubuntu/arm64/0106-target-arm-A64-Add-floating-point-fixed-point-instru.patch/target-arm/helper.h

.pc/ubuntu/arm64/0106-target-arm-A64-Add-floating-point-fixed-point-instru.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0107-target-arm-A64-Add-floating-point-integer-conversion.patch

.pc/ubuntu/arm64/0107-target-arm-A64-Add-floating-point-integer-conversion.patch/target-arm

.pc/ubuntu/arm64/0107-target-arm-A64-Add-floating-point-integer-conversion.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0108-target-arm-A64-Add-1-source-32-to-32-and-64-to-64-FP.patch

.pc/ubuntu/arm64/0108-target-arm-A64-Add-1-source-32-to-32-and-64-to-64-FP.patch/target-arm

.pc/ubuntu/arm64/0108-target-arm-A64-Add-1-source-32-to-32-and-64-to-64-FP.patch/target-arm/helper.c

.pc/ubuntu/arm64/0108-target-arm-A64-Add-1-source-32-to-32-and-64-to-64-FP.patch/target-arm/helper.h

.pc/ubuntu/arm64/0108-target-arm-A64-Add-1-source-32-to-32-and-64-to-64-FP.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0109-target-arm-A64-Add-support-for-FCVT-between-half-sin.patch

.pc/ubuntu/arm64/0109-target-arm-A64-Add-support-for-FCVT-between-half-sin.patch/target-arm

.pc/ubuntu/arm64/0109-target-arm-A64-Add-support-for-FCVT-between-half-sin.patch/target-arm/helper.c

.pc/ubuntu/arm64/0109-target-arm-A64-Add-support-for-FCVT-between-half-sin.patch/target-arm/helper.h

.pc/ubuntu/arm64/0109-target-arm-A64-Add-support-for-FCVT-between-half-sin.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0110-target-arm-remove-raw_read-write-duplication.patch

.pc/ubuntu/arm64/0110-target-arm-remove-raw_read-write-duplication.patch/target-arm

.pc/ubuntu/arm64/0110-target-arm-remove-raw_read-write-duplication.patch/target-arm/helper.c

.pc/ubuntu/arm64/0111-arm-fix-compile-on-bigendian-host.patch

.pc/ubuntu/arm64/0111-arm-fix-compile-on-bigendian-host.patch/target-arm

.pc/ubuntu/arm64/0111-arm-fix-compile-on-bigendian-host.patch/target-arm/cpu.h

.pc/ubuntu/arm64/0112-target-arm-A64-Add-SIMD-ld-st-multiple.patch

.pc/ubuntu/arm64/0112-target-arm-A64-Add-SIMD-ld-st-multiple.patch/target-arm

.pc/ubuntu/arm64/0112-target-arm-A64-Add-SIMD-ld-st-multiple.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0113-target-arm-A64-Add-SIMD-ld-st-single.patch

.pc/ubuntu/arm64/0113-target-arm-A64-Add-SIMD-ld-st-single.patch/target-arm

.pc/ubuntu/arm64/0113-target-arm-A64-Add-SIMD-ld-st-single.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0114-target-arm-A64-Add-decode-skeleton-for-SIMD-data-pro.patch

.pc/ubuntu/arm64/0114-target-arm-A64-Add-decode-skeleton-for-SIMD-data-pro.patch/target-arm

.pc/ubuntu/arm64/0114-target-arm-A64-Add-decode-skeleton-for-SIMD-data-pro.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0115-target-arm-A64-Add-SIMD-EXT.patch

.pc/ubuntu/arm64/0115-target-arm-A64-Add-SIMD-EXT.patch/target-arm

.pc/ubuntu/arm64/0115-target-arm-A64-Add-SIMD-EXT.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0116-target-arm-A64-Add-SIMD-TBL-TBLX.patch

.pc/ubuntu/arm64/0116-target-arm-A64-Add-SIMD-TBL-TBLX.patch/target-arm

.pc/ubuntu/arm64/0116-target-arm-A64-Add-SIMD-TBL-TBLX.patch/target-arm/helper-a64.c

.pc/ubuntu/arm64/0116-target-arm-A64-Add-SIMD-TBL-TBLX.patch/target-arm/helper-a64.h

.pc/ubuntu/arm64/0116-target-arm-A64-Add-SIMD-TBL-TBLX.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0117-target-arm-A64-Add-SIMD-ZIP-UZP-TRN.patch

.pc/ubuntu/arm64/0117-target-arm-A64-Add-SIMD-ZIP-UZP-TRN.patch/target-arm

.pc/ubuntu/arm64/0117-target-arm-A64-Add-SIMD-ZIP-UZP-TRN.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0118-target-arm-A64-Add-SIMD-across-lanes-instructions.patch

.pc/ubuntu/arm64/0118-target-arm-A64-Add-SIMD-across-lanes-instructions.patch/target-arm

.pc/ubuntu/arm64/0118-target-arm-A64-Add-SIMD-across-lanes-instructions.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0119-target-arm-A64-Add-SIMD-copy-operations.patch

.pc/ubuntu/arm64/0119-target-arm-A64-Add-SIMD-copy-operations.patch/target-arm

.pc/ubuntu/arm64/0119-target-arm-A64-Add-SIMD-copy-operations.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0120-target-arm-A64-Add-SIMD-modified-immediate-group.patch

.pc/ubuntu/arm64/0120-target-arm-A64-Add-SIMD-modified-immediate-group.patch/target-arm

.pc/ubuntu/arm64/0120-target-arm-A64-Add-SIMD-modified-immediate-group.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0121-target-arm-A64-Add-SIMD-scalar-copy-instructions.patch

.pc/ubuntu/arm64/0121-target-arm-A64-Add-SIMD-scalar-copy-instructions.patch/target-arm

.pc/ubuntu/arm64/0121-target-arm-A64-Add-SIMD-scalar-copy-instructions.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0122-target-arm-Move-arm_rmode_to_sf-to-a-shared-location.patch

.pc/ubuntu/arm64/0122-target-arm-Move-arm_rmode_to_sf-to-a-shared-location.patch/target-arm

.pc/ubuntu/arm64/0122-target-arm-Move-arm_rmode_to_sf-to-a-shared-location.patch/target-arm/cpu.h

.pc/ubuntu/arm64/0122-target-arm-Move-arm_rmode_to_sf-to-a-shared-location.patch/target-arm/helper.c

.pc/ubuntu/arm64/0122-target-arm-Move-arm_rmode_to_sf-to-a-shared-location.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0123-target-arm-Add-AArch32-FP-VRINTA-VRINTN-VRINTP-and-V.patch

.pc/ubuntu/arm64/0123-target-arm-Add-AArch32-FP-VRINTA-VRINTN-VRINTP-and-V.patch/target-arm

.pc/ubuntu/arm64/0123-target-arm-Add-AArch32-FP-VRINTA-VRINTN-VRINTP-and-V.patch/target-arm/translate.c

.pc/ubuntu/arm64/0124-target-arm-Add-support-for-AArch32-FP-VRINTR.patch

.pc/ubuntu/arm64/0124-target-arm-Add-support-for-AArch32-FP-VRINTR.patch/target-arm

.pc/ubuntu/arm64/0124-target-arm-Add-support-for-AArch32-FP-VRINTR.patch/target-arm/translate.c

.pc/ubuntu/arm64/0125-target-arm-Add-support-for-AArch32-FP-VRINTZ.patch

.pc/ubuntu/arm64/0125-target-arm-Add-support-for-AArch32-FP-VRINTZ.patch/target-arm

.pc/ubuntu/arm64/0125-target-arm-Add-support-for-AArch32-FP-VRINTZ.patch/target-arm/translate.c

.pc/ubuntu/arm64/0126-target-arm-Add-support-for-AArch32-FP-VRINTX.patch

.pc/ubuntu/arm64/0126-target-arm-Add-support-for-AArch32-FP-VRINTX.patch/target-arm

.pc/ubuntu/arm64/0126-target-arm-Add-support-for-AArch32-FP-VRINTX.patch/target-arm/translate.c

.pc/ubuntu/arm64/0127-target-arm-Add-support-for-AArch32-SIMD-VRINTX.patch

.pc/ubuntu/arm64/0127-target-arm-Add-support-for-AArch32-SIMD-VRINTX.patch/target-arm

.pc/ubuntu/arm64/0127-target-arm-Add-support-for-AArch32-SIMD-VRINTX.patch/target-arm/translate.c

.pc/ubuntu/arm64/0128-target-arm-Add-set_neon_rmode-helper.patch

.pc/ubuntu/arm64/0128-target-arm-Add-set_neon_rmode-helper.patch/target-arm

.pc/ubuntu/arm64/0128-target-arm-Add-set_neon_rmode-helper.patch/target-arm/helper.c

.pc/ubuntu/arm64/0128-target-arm-Add-set_neon_rmode-helper.patch/target-arm/helper.h

.pc/ubuntu/arm64/0129-target-arm-Add-AArch32-SIMD-VRINTA-VRINTN-VRINTP-VRI.patch

.pc/ubuntu/arm64/0129-target-arm-Add-AArch32-SIMD-VRINTA-VRINTN-VRINTP-VRI.patch/target-arm

.pc/ubuntu/arm64/0129-target-arm-Add-AArch32-SIMD-VRINTA-VRINTN-VRINTP-VRI.patch/target-arm/translate.c

.pc/ubuntu/arm64/0130-target-arm-Add-AArch32-FP-VCVTA-VCVTN-VCVTP-and-VCVT.patch

.pc/ubuntu/arm64/0130-target-arm-Add-AArch32-FP-VCVTA-VCVTN-VCVTP-and-VCVT.patch/target-arm

.pc/ubuntu/arm64/0130-target-arm-Add-AArch32-FP-VCVTA-VCVTN-VCVTP-and-VCVT.patch/target-arm/translate.c

.pc/ubuntu/arm64/0131-target-arm-Add-AArch32-SIMD-VCVTA-VCVTN-VCVTP-and-VC.patch

.pc/ubuntu/arm64/0131-target-arm-Add-AArch32-SIMD-VCVTA-VCVTN-VCVTP-and-VC.patch/target-arm

.pc/ubuntu/arm64/0131-target-arm-Add-AArch32-SIMD-VCVTA-VCVTN-VCVTP-and-VC.patch/target-arm/translate.c

.pc/ubuntu/arm64/0132-target-arm-A64-Add-SIMD-three-different-multiply-acc.patch

.pc/ubuntu/arm64/0132-target-arm-A64-Add-SIMD-three-different-multiply-acc.patch/target-arm

.pc/ubuntu/arm64/0132-target-arm-A64-Add-SIMD-three-different-multiply-acc.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0133-target-arm-A64-Add-SIMD-three-different-ABDL-instruc.patch

.pc/ubuntu/arm64/0133-target-arm-A64-Add-SIMD-three-different-ABDL-instruc.patch/target-arm

.pc/ubuntu/arm64/0133-target-arm-A64-Add-SIMD-three-different-ABDL-instruc.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0134-target-arm-A64-Add-SIMD-scalar-3-same-add-sub-and-co.patch

.pc/ubuntu/arm64/0134-target-arm-A64-Add-SIMD-scalar-3-same-add-sub-and-co.patch/target-arm

.pc/ubuntu/arm64/0134-target-arm-A64-Add-SIMD-scalar-3-same-add-sub-and-co.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0135-target-arm-A64-Add-top-level-decode-for-SIMD-3-same-.patch

.pc/ubuntu/arm64/0135-target-arm-A64-Add-top-level-decode-for-SIMD-3-same-.patch/target-arm

.pc/ubuntu/arm64/0135-target-arm-A64-Add-top-level-decode-for-SIMD-3-same-.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0136-target-arm-A64-Add-logic-ops-from-SIMD-3-same-group.patch

.pc/ubuntu/arm64/0136-target-arm-A64-Add-logic-ops-from-SIMD-3-same-group.patch/target-arm

.pc/ubuntu/arm64/0136-target-arm-A64-Add-logic-ops-from-SIMD-3-same-group.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0137-target-arm-A64-Add-integer-ops-from-SIMD-3-same-grou.patch

.pc/ubuntu/arm64/0137-target-arm-A64-Add-integer-ops-from-SIMD-3-same-grou.patch/target-arm

.pc/ubuntu/arm64/0137-target-arm-A64-Add-integer-ops-from-SIMD-3-same-grou.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0138-target-arm-A64-Add-simple-SIMD-3-same-floating-point.patch

.pc/ubuntu/arm64/0138-target-arm-A64-Add-simple-SIMD-3-same-floating-point.patch/target-arm

.pc/ubuntu/arm64/0138-target-arm-A64-Add-simple-SIMD-3-same-floating-point.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0139-target-arm-A64-Add-SIMD-shift-by-immediate.patch

.pc/ubuntu/arm64/0139-target-arm-A64-Add-SIMD-shift-by-immediate.patch/target-arm

.pc/ubuntu/arm64/0139-target-arm-A64-Add-SIMD-shift-by-immediate.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0140-target-arm-A64-Implement-SIMD-3-reg-same-shift-and-s.patch

.pc/ubuntu/arm64/0140-target-arm-A64-Implement-SIMD-3-reg-same-shift-and-s.patch/target-arm

.pc/ubuntu/arm64/0140-target-arm-A64-Implement-SIMD-3-reg-same-shift-and-s.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0141-target-arm-A64-Implement-remaining-non-pairwise-int-.patch

.pc/ubuntu/arm64/0141-target-arm-A64-Implement-remaining-non-pairwise-int-.patch/target-arm

.pc/ubuntu/arm64/0141-target-arm-A64-Implement-remaining-non-pairwise-int-.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0142-target-arm-A64-Implement-pairwise-integer-ops-from-3.patch

.pc/ubuntu/arm64/0142-target-arm-A64-Implement-pairwise-integer-ops-from-3.patch/target-arm

.pc/ubuntu/arm64/0142-target-arm-A64-Implement-pairwise-integer-ops-from-3.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0143-tcg-Add-TCGV_UNUSED_PTR-TCGV_IS_UNUSED_PTR-TCGV_EQUA.patch

.pc/ubuntu/arm64/0143-tcg-Add-TCGV_UNUSED_PTR-TCGV_IS_UNUSED_PTR-TCGV_EQUA.patch/tcg

.pc/ubuntu/arm64/0143-tcg-Add-TCGV_UNUSED_PTR-TCGV_IS_UNUSED_PTR-TCGV_EQUA.patch/tcg/tcg.h

.pc/ubuntu/arm64/0144-target-arm-A64-Implement-scalar-pairwise-ops.patch

.pc/ubuntu/arm64/0144-target-arm-A64-Implement-scalar-pairwise-ops.patch/target-arm

.pc/ubuntu/arm64/0144-target-arm-A64-Implement-scalar-pairwise-ops.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0145-target-arm-A64-Implement-remaining-integer-scalar-3-.patch

.pc/ubuntu/arm64/0145-target-arm-A64-Implement-remaining-integer-scalar-3-.patch/target-arm

.pc/ubuntu/arm64/0145-target-arm-A64-Implement-remaining-integer-scalar-3-.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0146-target-arm-A64-Add-SIMD-simple-64-bit-insns-from-sca.patch

.pc/ubuntu/arm64/0146-target-arm-A64-Add-SIMD-simple-64-bit-insns-from-sca.patch/target-arm

.pc/ubuntu/arm64/0146-target-arm-A64-Add-SIMD-simple-64-bit-insns-from-sca.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0147-target-arm-A64-Add-skeleton-decode-for-SIMD-2-reg-mi.patch

.pc/ubuntu/arm64/0147-target-arm-A64-Add-skeleton-decode-for-SIMD-2-reg-mi.patch/target-arm

.pc/ubuntu/arm64/0147-target-arm-A64-Add-skeleton-decode-for-SIMD-2-reg-mi.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0148-target-arm-A64-Implement-2-register-misc-compares-AB.patch

.pc/ubuntu/arm64/0148-target-arm-A64-Implement-2-register-misc-compares-AB.patch/target-arm

.pc/ubuntu/arm64/0148-target-arm-A64-Implement-2-register-misc-compares-AB.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0149-target-arm-A64-Implement-2-reg-misc-CNT-NOT-and-RBIT.patch

.pc/ubuntu/arm64/0149-target-arm-A64-Implement-2-reg-misc-CNT-NOT-and-RBIT.patch/target-arm

.pc/ubuntu/arm64/0149-target-arm-A64-Implement-2-reg-misc-CNT-NOT-and-RBIT.patch/target-arm/helper.h

.pc/ubuntu/arm64/0149-target-arm-A64-Implement-2-reg-misc-CNT-NOT-and-RBIT.patch/target-arm/neon_helper.c

.pc/ubuntu/arm64/0149-target-arm-A64-Implement-2-reg-misc-CNT-NOT-and-RBIT.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0150-target-arm-A64-Add-narrowing-2-reg-misc-instructions.patch

.pc/ubuntu/arm64/0150-target-arm-A64-Add-narrowing-2-reg-misc-instructions.patch/target-arm

.pc/ubuntu/arm64/0150-target-arm-A64-Add-narrowing-2-reg-misc-instructions.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0151-target-arm-A64-Add-2-reg-misc-REV-instructions.patch

.pc/ubuntu/arm64/0151-target-arm-A64-Add-2-reg-misc-REV-instructions.patch/target-arm

.pc/ubuntu/arm64/0151-target-arm-A64-Add-2-reg-misc-REV-instructions.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0152-target-arm-A64-Add-FNEG-and-FABS-to-the-SIMD-2-reg-m.patch

.pc/ubuntu/arm64/0152-target-arm-A64-Add-FNEG-and-FABS-to-the-SIMD-2-reg-m.patch/target-arm

.pc/ubuntu/arm64/0152-target-arm-A64-Add-FNEG-and-FABS-to-the-SIMD-2-reg-m.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0153-target-arm-Add-support-for-AArch32-64bit-VCVTB-and-V.patch

.pc/ubuntu/arm64/0153-target-arm-Add-support-for-AArch32-64bit-VCVTB-and-V.patch/target-arm

.pc/ubuntu/arm64/0153-target-arm-Add-support-for-AArch32-64bit-VCVTB-and-V.patch/target-arm/translate.c

.pc/ubuntu/arm64/0154-rules.mak-Support-.cc-as-a-C-source-file-suffix.patch

.pc/ubuntu/arm64/0154-rules.mak-Support-.cc-as-a-C-source-file-suffix.patch/rules.mak

.pc/ubuntu/arm64/0155-rules.mak-Link-with-C-if-we-have-a-C-compiler.patch

.pc/ubuntu/arm64/0155-rules.mak-Link-with-C-if-we-have-a-C-compiler.patch/rules.mak

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/LICENCE

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/README

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/a64

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/a64/assembler-a64.h

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/a64/constants-a64.h

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/a64/cpu-a64.h

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/a64/decoder-a64.cc

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/a64/decoder-a64.h

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/a64/disasm-a64.cc

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/a64/disasm-a64.h

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/a64/instructions-a64.cc

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/a64/instructions-a64.h

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/globals.h

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/platform.h

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/utils.cc

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/utils.h

.pc/ubuntu/arm64/0157-disas-libvixl-Fix-upstream-libvixl-compilation-issue.patch

.pc/ubuntu/arm64/0157-disas-libvixl-Fix-upstream-libvixl-compilation-issue.patch/disas

.pc/ubuntu/arm64/0157-disas-libvixl-Fix-upstream-libvixl-compilation-issue.patch/disas/libvixl

.pc/ubuntu/arm64/0157-disas-libvixl-Fix-upstream-libvixl-compilation-issue.patch/disas/libvixl/a64

.pc/ubuntu/arm64/0157-disas-libvixl-Fix-upstream-libvixl-compilation-issue.patch/disas/libvixl/a64/instructions-a64.h

.pc/ubuntu/arm64/0157-disas-libvixl-Fix-upstream-libvixl-compilation-issue.patch/disas/libvixl/globals.h

.pc/ubuntu/arm64/0157-disas-libvixl-Fix-upstream-libvixl-compilation-issue.patch/disas/libvixl/utils.h

.pc/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch

.pc/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch/configure

.pc/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch/disas

.pc/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch/disas.c

.pc/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch/disas/Makefile.objs

.pc/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch/disas/arm-a64.cc

.pc/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch/disas/libvixl

.pc/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch/disas/libvixl/Makefile.objs

.pc/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch/include

.pc/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch/include/disas

.pc/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch/include/disas/bfd.h

.pc/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch/target-arm

.pc/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/force-aarch64-uname-to-3.7.0-to-appease-glibc

.pc/ubuntu/arm64/force-aarch64-uname-to-3.7.0-to-appease-glibc/linux-user

.pc/ubuntu/arm64/force-aarch64-uname-to-3.7.0-to-appease-glibc/linux-user/main.c

debian/binfmts/qemu-aarch64

debian/patches/ubuntu/arm64/0050-target-arm-fix-build-with-gcc-4.8.2.patch

debian/patches/ubuntu/arm64/0051-target-arm-A64-add-support-for-ld-st-pair.patch

debian/patches/ubuntu/arm64/0052-target-arm-A64-add-support-for-ld-st-unsigned-imm.patch

debian/patches/ubuntu/arm64/0053-target-arm-A64-add-support-for-ld-st-with-reg-offset.patch

debian/patches/ubuntu/arm64/0054-target-arm-A64-add-support-for-ld-st-with-index.patch

debian/patches/ubuntu/arm64/0055-target-arm-A64-add-support-for-add-addi-sub-subi.patch

debian/patches/ubuntu/arm64/0056-target-arm-A64-add-support-for-move-wide-instruction.patch

debian/patches/ubuntu/arm64/0057-target-arm-A64-add-support-for-3-src-data-proc-insns.patch

debian/patches/ubuntu/arm64/0058-target-arm-A64-implement-SVC-BRK.patch

debian/patches/ubuntu/arm64/0059-target-arm-A64-Add-decoder-skeleton-for-FP-instructi.patch

debian/patches/ubuntu/arm64/0060-target-arm-A64-implement-FMOV.patch

debian/patches/ubuntu/arm64/0061-target-arm-Pull-add-one-cpreg-to-hashtable-into-its-.patch

debian/patches/ubuntu/arm64/0062-target-arm-Update-generic-cpreg-code-for-AArch64.patch

debian/patches/ubuntu/arm64/0063-target-arm-Remove-ARMCPU-CPUARMState-from-cpregs-API.patch

debian/patches/ubuntu/arm64/0064-target-arm-A64-Implement-MRS-MSR-SYS-SYSL.patch

debian/patches/ubuntu/arm64/0065-target-arm-A64-Implement-minimal-set-of-EL0-visible-.patch

debian/patches/ubuntu/arm64/0066-target-arm-Widen-thread-local-register-state-fields-.patch

debian/patches/ubuntu/arm64/0067-target-arm-A64-add-support-for-add-sub-with-carry.patch

debian/patches/ubuntu/arm64/0068-target-arm-A64-add-support-for-conditional-compare-i.patch

debian/patches/ubuntu/arm64/0069-target-arm-aarch64-add-support-for-ld-lit.patch

debian/patches/ubuntu/arm64/0070-target-arm-Widen-exclusive-access-support-struct-fie.patch

debian/patches/ubuntu/arm64/0071-target-arm-A64-support-for-ld-st-cl-exclusive.patch

debian/patches/ubuntu/arm64/0072-linux-user-AArch64-define-TARGET_CLONE_BACKWARDS.patch

debian/patches/ubuntu/arm64/0073-linux-user-AArch64-Use-correct-values-for-FPSR-FPCR-.patch

debian/patches/ubuntu/arm64/0074-.travis.yml-Add-aarch64-targets.patch

debian/patches/ubuntu/arm64/0075-default-configs-Add-config-for-aarch64-linux-user.patch

debian/patches/ubuntu/arm64/0076-target-arm-A64-Add-support-for-dumping-AArch64-VFP-r.patch

debian/patches/ubuntu/arm64/0077-target-arm-A64-Fix-vector-register-access-on-bigendi.patch

debian/patches/ubuntu/arm64/0078-target-arm-Use-VFP_BINOP-macro-for-min-max-minnum-ma.patch

debian/patches/ubuntu/arm64/0079-target-arm-A64-Add-Floating-point-data-processing-2-.patch

debian/patches/ubuntu/arm64/0080-target-arm-A64-Add-Floating-point-data-processing-3-.patch

debian/patches/ubuntu/arm64/0081-target-arm-A64-Add-fmov-scalar-immediate-instruction.patch

debian/patches/ubuntu/arm64/0082-target-arm-A64-Add-support-for-floating-point-compar.patch

debian/patches/ubuntu/arm64/0083-target-arm-A64-Add-support-for-floating-point-condit.patch

debian/patches/ubuntu/arm64/0084-target-arm-A64-Add-support-for-floating-point-cond-s.patch

debian/patches/ubuntu/arm64/0085-target-arm-Give-the-FPSCR-rounding-modes-names.patch

debian/patches/ubuntu/arm64/0086-softfloat-Fix-exception-flag-handling-for-float32_to.patch

debian/patches/ubuntu/arm64/0087-softfloat-Add-float-to-16bit-integer-conversions.patch

debian/patches/ubuntu/arm64/0088-softfloat-Add-16-bit-integer-to-float-conversions.patch

debian/patches/ubuntu/arm64/0089-softfloat-Make-the-int-to-float-functions-take-exact.patch

debian/patches/ubuntu/arm64/0090-softfloat-Fix-float64_to_uint64.patch

debian/patches/ubuntu/arm64/0091-softfloat-Only-raise-Invalid-when-conversions-to-int.patch

debian/patches/ubuntu/arm64/0092-softfloat-Fix-factor-2-error-for-scalbn-on-denormal-.patch

debian/patches/ubuntu/arm64/0093-softfloat-Add-float32_to_uint64.patch

debian/patches/ubuntu/arm64/0094-softfloat-Fix-float64_to_uint64_round_to_zero.patch

debian/patches/ubuntu/arm64/0095-softfloat-Fix-float64_to_uint32.patch

debian/patches/ubuntu/arm64/0096-softfloat-Fix-float64_to_uint32_round_to_zero.patch

debian/patches/ubuntu/arm64/0097-softfloat-Provide-complete-set-of-accessors-for-fp-s.patch

debian/patches/ubuntu/arm64/0098-softfloat-Factor-out-RoundAndPackFloat16-and-Normali.patch

debian/patches/ubuntu/arm64/0099-softfloat-Add-float16-float64-conversion-functions.patch

debian/patches/ubuntu/arm64/0100-softfloat-Refactor-code-handling-various-rounding-mo.patch

debian/patches/ubuntu/arm64/0101-softfloat-Add-support-for-ties-away-rounding.patch

debian/patches/ubuntu/arm64/0102-target-arm-Prepare-VFP_CONV_FIX-helpers-for-A64-uses.patch

debian/patches/ubuntu/arm64/0103-target-arm-Rename-A32-VFP-conversion-helpers.patch

debian/patches/ubuntu/arm64/0104-target-arm-Ignore-most-exceptions-from-scalbn-when-d.patch

debian/patches/ubuntu/arm64/0105-target-arm-A64-Add-extra-VFP-fixed-point-conversion-.patch

debian/patches/ubuntu/arm64/0106-target-arm-A64-Add-floating-point-fixed-point-instru.patch

debian/patches/ubuntu/arm64/0107-target-arm-A64-Add-floating-point-integer-conversion.patch

debian/patches/ubuntu/arm64/0108-target-arm-A64-Add-1-source-32-to-32-and-64-to-64-FP.patch

debian/patches/ubuntu/arm64/0109-target-arm-A64-Add-support-for-FCVT-between-half-sin.patch

debian/patches/ubuntu/arm64/0110-target-arm-remove-raw_read-write-duplication.patch

debian/patches/ubuntu/arm64/0111-arm-fix-compile-on-bigendian-host.patch

debian/patches/ubuntu/arm64/0112-target-arm-A64-Add-SIMD-ld-st-multiple.patch

debian/patches/ubuntu/arm64/0113-target-arm-A64-Add-SIMD-ld-st-single.patch

debian/patches/ubuntu/arm64/0114-target-arm-A64-Add-decode-skeleton-for-SIMD-data-pro.patch

debian/patches/ubuntu/arm64/0115-target-arm-A64-Add-SIMD-EXT.patch

debian/patches/ubuntu/arm64/0116-target-arm-A64-Add-SIMD-TBL-TBLX.patch

debian/patches/ubuntu/arm64/0117-target-arm-A64-Add-SIMD-ZIP-UZP-TRN.patch

debian/patches/ubuntu/arm64/0118-target-arm-A64-Add-SIMD-across-lanes-instructions.patch

debian/patches/ubuntu/arm64/0119-target-arm-A64-Add-SIMD-copy-operations.patch

debian/patches/ubuntu/arm64/0120-target-arm-A64-Add-SIMD-modified-immediate-group.patch

debian/patches/ubuntu/arm64/0121-target-arm-A64-Add-SIMD-scalar-copy-instructions.patch

debian/patches/ubuntu/arm64/0122-target-arm-Move-arm_rmode_to_sf-to-a-shared-location.patch

debian/patches/ubuntu/arm64/0123-target-arm-Add-AArch32-FP-VRINTA-VRINTN-VRINTP-and-V.patch

debian/patches/ubuntu/arm64/0124-target-arm-Add-support-for-AArch32-FP-VRINTR.patch

debian/patches/ubuntu/arm64/0125-target-arm-Add-support-for-AArch32-FP-VRINTZ.patch

debian/patches/ubuntu/arm64/0126-target-arm-Add-support-for-AArch32-FP-VRINTX.patch

debian/patches/ubuntu/arm64/0127-target-arm-Add-support-for-AArch32-SIMD-VRINTX.patch

debian/patches/ubuntu/arm64/0128-target-arm-Add-set_neon_rmode-helper.patch

debian/patches/ubuntu/arm64/0129-target-arm-Add-AArch32-SIMD-VRINTA-VRINTN-VRINTP-VRI.patch

debian/patches/ubuntu/arm64/0130-target-arm-Add-AArch32-FP-VCVTA-VCVTN-VCVTP-and-VCVT.patch

debian/patches/ubuntu/arm64/0131-target-arm-Add-AArch32-SIMD-VCVTA-VCVTN-VCVTP-and-VC.patch

debian/patches/ubuntu/arm64/0132-target-arm-A64-Add-SIMD-three-different-multiply-acc.patch

debian/patches/ubuntu/arm64/0133-target-arm-A64-Add-SIMD-three-different-ABDL-instruc.patch

debian/patches/ubuntu/arm64/0134-target-arm-A64-Add-SIMD-scalar-3-same-add-sub-and-co.patch

debian/patches/ubuntu/arm64/0135-target-arm-A64-Add-top-level-decode-for-SIMD-3-same-.patch

debian/patches/ubuntu/arm64/0136-target-arm-A64-Add-logic-ops-from-SIMD-3-same-group.patch

debian/patches/ubuntu/arm64/0137-target-arm-A64-Add-integer-ops-from-SIMD-3-same-grou.patch

debian/patches/ubuntu/arm64/0138-target-arm-A64-Add-simple-SIMD-3-same-floating-point.patch

debian/patches/ubuntu/arm64/0139-target-arm-A64-Add-SIMD-shift-by-immediate.patch

debian/patches/ubuntu/arm64/0140-target-arm-A64-Implement-SIMD-3-reg-same-shift-and-s.patch

debian/patches/ubuntu/arm64/0141-target-arm-A64-Implement-remaining-non-pairwise-int-.patch

debian/patches/ubuntu/arm64/0142-target-arm-A64-Implement-pairwise-integer-ops-from-3.patch

debian/patches/ubuntu/arm64/0143-tcg-Add-TCGV_UNUSED_PTR-TCGV_IS_UNUSED_PTR-TCGV_EQUA.patch

debian/patches/ubuntu/arm64/0144-target-arm-A64-Implement-scalar-pairwise-ops.patch

debian/patches/ubuntu/arm64/0145-target-arm-A64-Implement-remaining-integer-scalar-3-.patch

debian/patches/ubuntu/arm64/0146-target-arm-A64-Add-SIMD-simple-64-bit-insns-from-sca.patch

debian/patches/ubuntu/arm64/0147-target-arm-A64-Add-skeleton-decode-for-SIMD-2-reg-mi.patch

debian/patches/ubuntu/arm64/0148-target-arm-A64-Implement-2-register-misc-compares-AB.patch

debian/patches/ubuntu/arm64/0149-target-arm-A64-Implement-2-reg-misc-CNT-NOT-and-RBIT.patch

debian/patches/ubuntu/arm64/0150-target-arm-A64-Add-narrowing-2-reg-misc-instructions.patch

debian/patches/ubuntu/arm64/0151-target-arm-A64-Add-2-reg-misc-REV-instructions.patch

debian/patches/ubuntu/arm64/0152-target-arm-A64-Add-FNEG-and-FABS-to-the-SIMD-2-reg-m.patch

debian/patches/ubuntu/arm64/0153-target-arm-Add-support-for-AArch32-64bit-VCVTB-and-V.patch

debian/patches/ubuntu/arm64/0154-rules.mak-Support-.cc-as-a-C-source-file-suffix.patch

debian/patches/ubuntu/arm64/0155-rules.mak-Link-with-C-if-we-have-a-C-compiler.patch

debian/patches/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch

debian/patches/ubuntu/arm64/0157-disas-libvixl-Fix-upstream-libvixl-compilation-issue.patch

debian/patches/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch

debian/patches/ubuntu/arm64/force-aarch64-uname-to-3.7.0-to-appease-glibc

default-configs/aarch64-linux-user.mak

disas/arm-a64.cc

disas/libvixl

disas/libvixl/LICENCE

disas/libvixl/Makefile.objs

disas/libvixl/README

disas/libvixl/a64

disas/libvixl/a64/assembler-a64.h

disas/libvixl/a64/constants-a64.h

disas/libvixl/a64/cpu-a64.h

disas/libvixl/a64/decoder-a64.cc

disas/libvixl/a64/decoder-a64.h

disas/libvixl/a64/disasm-a64.cc

disas/libvixl/a64/disasm-a64.h

disas/libvixl/a64/instructions-a64.cc

disas/libvixl/a64/instructions-a64.h

disas/libvixl/globals.h

disas/libvixl/platform.h

disas/libvixl/utils.cc

disas/libvixl/utils.h

files modified:
.pc/applied-patches

.travis.yml

configure

debian/changelog

debian/patches/series

debian/rules

disas.c

disas/Makefile.objs

fpu/softfloat.c

include/disas/bfd.h

include/fpu/softfloat.h

linux-user/aarch64/syscall.h

linux-user/aarch64/target_cpu.h

linux-user/arm/target_cpu.h

linux-user/main.c

linux-user/signal.c

rules.mak

target-arm/cpu.h

target-arm/cpu64.c

target-arm/helper-a64.c

target-arm/helper-a64.h

target-arm/helper.c

target-arm/helper.h

target-arm/kvm-consts.h

target-arm/machine.c

target-arm/neon_helper.c

target-arm/translate-a64.c

target-arm/translate.c

target-arm/translate.h

tcg/tcg.h

Show diffs side-by-side

added added

removed removed

.pc/ubuntu/arm64/0101-softfloat-Add-support-for-ties-away-rounding.patch/fpu/softfloat.c

/*
 * QEMU float support
 *
 * Derived from SoftFloat.
 */

/*============================================================================

This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic

Package, Release 2b.

Written by John R. Hauser. This work was made possible in part by the

International Computer Science Institute, located at Suite 600, 1947 Center

Street, Berkeley, California 94704. Funding was partially provided by the

National Science Foundation under grant MIP-9311980. The original version

of this code was written as part of a project to build a fixed-point vector

processor in collaboration with the University of California at Berkeley,

overseen by Profs. Nelson Morgan and John Wawrzynek. More information

is available through the Web page `http://www.cs.berkeley.edu/~jhauser/

arithmetic/SoftFloat.html'.

THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has

been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES

RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS

AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,

COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE

EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE

INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR

OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.

Derivative works are acceptable, even for commercial purposes, so long as

(1) the source code for the derivative work includes prominent notice that

the work is derivative, and (2) the source code includes prominent notice with

these four paragraphs for those parts of this code that are retained.

=============================================================================*/

/* softfloat (and in particular the code in softfloat-specialize.h) is
 * target-dependent and needs the TARGET_* macros.
 */

#include "config.h"

#include "fpu/softfloat.h"

/* We only need stdlib for abort() */

#include <stdlib.h>

/*----------------------------------------------------------------------------

| Primitive arithmetic functions, including multi-word arithmetic, and

| division and square root approximations. (Can be specialized to target if

| desired.)

*----------------------------------------------------------------------------*/

#include "softfloat-macros.h"

/*----------------------------------------------------------------------------

| Functions and definitions to determine: (1) whether tininess for underflow

| is detected before or after rounding by default, (2) what (if anything)

| happens when exceptions are raised, (3) how signaling NaNs are distinguished

| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs

| are propagated from function inputs to output. These details are target-

| specific.

*----------------------------------------------------------------------------*/

#include "softfloat-specialize.h"

/*----------------------------------------------------------------------------
| Returns the fraction field of the half-precision floating-point value `a',
| i.e. the raw encoding masked with 0x3ff.
*----------------------------------------------------------------------------*/

INLINE uint32_t extractFloat16Frac(float16 a)
{
    const uint32_t fracMask = 0x3ff;

    return float16_val(a) & fracMask;
}

/*----------------------------------------------------------------------------
| Returns the exponent field of the half-precision floating-point value `a':
| the raw encoding shifted right by 10 and masked with 0x1f.
*----------------------------------------------------------------------------*/

INLINE int_fast16_t extractFloat16Exp(float16 a)
{
    const int_fast16_t shifted = float16_val(a) >> 10;

    return shifted & 0x1f;
}

/*----------------------------------------------------------------------------
| Returns the sign bit of the half-precision floating-point value `a', taken
| from bit 15 of the raw encoding.
*----------------------------------------------------------------------------*/

INLINE flag extractFloat16Sign(float16 a)
{
    const flag signBit = float16_val(a) >> 15;

    return signBit;
}

/*----------------------------------------------------------------------------

| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6

| and 7, and returns the properly rounded 32-bit integer corresponding to the

| input. If `zSign' is 1, the input is negated before being converted to an

| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input

| is simply rounded to an integer, with the inexact exception raised if the

| input cannot be represented exactly as an integer. However, if the fixed-

| point input is too large, the invalid exception is raised and the largest

100

| positive or negative integer is returned.

101

*----------------------------------------------------------------------------*/

102

103

static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)

104

{

105

int8 roundingMode;

106

flag roundNearestEven;

107

int8 roundIncrement, roundBits;

108

int32_t z;

109

110

roundingMode = STATUS(float_rounding_mode);

111

roundNearestEven = ( roundingMode == float_round_nearest_even );

112

switch (roundingMode) {

113

case float_round_nearest_even:

114

roundIncrement = 0x40;

115

break;

116

case float_round_to_zero:

117

roundIncrement = 0;

118

break;

119

case float_round_up:

120

roundIncrement = zSign ? 0 : 0x7f;

121

break;

122

case float_round_down:

123

roundIncrement = zSign ? 0x7f : 0;

124

break;

125

default:

126

abort();

127

}

128

roundBits = absZ & 0x7F;

129

absZ = ( absZ + roundIncrement )>>7;

130

absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );

131

z = absZ;

132

if ( zSign ) z = - z;

133

if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {

134

float_raise( float_flag_invalid STATUS_VAR);

135

return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;

136

}

137

if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;

138

return z;

139

140

}

141

142

/*----------------------------------------------------------------------------

143

| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and

144

| `absZ1', with binary point between bits 63 and 64 (between the input words),

145

| and returns the properly rounded 64-bit integer corresponding to the input.

146

| If `zSign' is 1, the input is negated before being converted to an integer.

147

| Ordinarily, the fixed-point input is simply rounded to an integer, with

148

| the inexact exception raised if the input cannot be represented exactly as

149

| an integer. However, if the fixed-point input is too large, the invalid

150

| exception is raised and the largest positive or negative integer is

151

| returned.

152

*----------------------------------------------------------------------------*/

153

154

static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)

155

{

156

int8 roundingMode;

157

flag roundNearestEven, increment;

158

int64_t z;

159

160

roundingMode = STATUS(float_rounding_mode);

161

roundNearestEven = ( roundingMode == float_round_nearest_even );

162

switch (roundingMode) {

163

case float_round_nearest_even:

164

increment = ((int64_t) absZ1 < 0);

165

break;

166

case float_round_to_zero:

167

increment = 0;

168

break;

169

case float_round_up:

170

increment = !zSign && absZ1;

171

break;

172

case float_round_down:

173

increment = zSign && absZ1;

174

break;

175

default:

176

abort();

177

}

178

if ( increment ) {

179

++absZ0;

180

if ( absZ0 == 0 ) goto overflow;

181

absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );

182

}

183

z = absZ0;

184

if ( zSign ) z = - z;

185

if ( z && ( ( z < 0 ) ^ zSign ) ) {

186

overflow:

187

float_raise( float_flag_invalid STATUS_VAR);

188

return

189

zSign ? (int64_t) LIT64( 0x8000000000000000 )

190

: LIT64( 0x7FFFFFFFFFFFFFFF );

191

}

192

if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;

193

return z;

194

195

}

196

197

/*----------------------------------------------------------------------------

198

| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and

199

| `absZ1', with binary point between bits 63 and 64 (between the input words),

200

| and returns the properly rounded 64-bit unsigned integer corresponding to the

201

| input. Ordinarily, the fixed-point input is simply rounded to an integer,

202

| with the inexact exception raised if the input cannot be represented exactly

203

| as an integer. However, if the fixed-point input is too large, the invalid

204

| exception is raised and the largest unsigned integer is returned.

205

*----------------------------------------------------------------------------*/

206

207

static int64 roundAndPackUint64(flag zSign, uint64_t absZ0,

208

uint64_t absZ1 STATUS_PARAM)

209

{

210

int8 roundingMode;

211

flag roundNearestEven, increment;

212

213

roundingMode = STATUS(float_rounding_mode);

214

roundNearestEven = (roundingMode == float_round_nearest_even);

215

switch (roundingMode) {

216

case float_round_nearest_even:

217

increment = ((int64_t)absZ1 < 0);

218

break;

219

case float_round_to_zero:

220

increment = 0;

221

break;

222

case float_round_up:

223

increment = !zSign && absZ1;

224

break;

225

case float_round_down:

226

increment = zSign && absZ1;

227

break;

228

default:

229

abort();

230

}

231

if (increment) {

232

++absZ0;

233

if (absZ0 == 0) {

234

float_raise(float_flag_invalid STATUS_VAR);

235

return LIT64(0xFFFFFFFFFFFFFFFF);

236

}

237

absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);

238

}

239

240

if (zSign && absZ0) {

241

float_raise(float_flag_invalid STATUS_VAR);

242

return 0;

243

}

244

245

if (absZ1) {

246

STATUS(float_exception_flags) |= float_flag_inexact;

247

}

248

return absZ0;

249

}

250

251

/*----------------------------------------------------------------------------

252

| Returns the fraction bits of the single-precision floating-point value `a'.

253

*----------------------------------------------------------------------------*/

254

255

INLINE uint32_t extractFloat32Frac( float32 a )

256

{

257

258

return float32_val(a) & 0x007FFFFF;

259

260

}

261

262

/*----------------------------------------------------------------------------

263

| Returns the exponent bits of the single-precision floating-point value `a'.

264

*----------------------------------------------------------------------------*/

265

266

INLINE int_fast16_t extractFloat32Exp(float32 a)

267

{

268

269

return ( float32_val(a)>>23 ) & 0xFF;

270

271

}

272

273

/*----------------------------------------------------------------------------

274

| Returns the sign bit of the single-precision floating-point value `a'.

275

*----------------------------------------------------------------------------*/

276

277

INLINE flag extractFloat32Sign( float32 a )

278

{

279

280

return float32_val(a)>>31;

281

282

}

283

284

/*----------------------------------------------------------------------------

285

| If `a' is denormal and we are in flush-to-zero mode then set the

286

| input-denormal exception and return zero. Otherwise just return the value.

287

*----------------------------------------------------------------------------*/

288

static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)

289

{

290

if (STATUS(flush_inputs_to_zero)) {

291

if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {

292

float_raise(float_flag_input_denormal STATUS_VAR);

293

return make_float32(float32_val(a) & 0x80000000);

294

}

295

}

296

return a;

297

}

298

299

/*----------------------------------------------------------------------------

300

| Normalizes the subnormal single-precision floating-point value represented

301

| by the denormalized significand `aSig'. The normalized exponent and

302

| significand are stored at the locations pointed to by `zExpPtr' and

303

| `zSigPtr', respectively.

304

*----------------------------------------------------------------------------*/

305

306

static void

307

normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)

308

{

309

int8 shiftCount;

310

311

shiftCount = countLeadingZeros32( aSig ) - 8;

312

*zSigPtr = aSig<<shiftCount;

313

*zExpPtr = 1 - shiftCount;

314

315

}

316

317

/*----------------------------------------------------------------------------

318

| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a

319

| single-precision floating-point value, returning the result. After being

320

| shifted into the proper positions, the three fields are simply added

321

| together to form the result. This means that any integer portion of `zSig'

322

| will be added into the exponent. Since a properly normalized significand

323

| will have an integer portion equal to 1, the `zExp' input should be 1 less

324

| than the desired result exponent whenever `zSig' is a complete, normalized

325

| significand.

326

*----------------------------------------------------------------------------*/

327

328

INLINE float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)

329

{

330

331

return make_float32(

332

( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);

333

334

}

335

336

/*----------------------------------------------------------------------------

337

| Takes an abstract floating-point value having sign `zSign', exponent `zExp',

338

| and significand `zSig', and returns the proper single-precision floating-

339

| point value corresponding to the abstract input. Ordinarily, the abstract

340

| value is simply rounded and packed into the single-precision format, with

341

| the inexact exception raised if the abstract input cannot be represented

342

| exactly. However, if the abstract value is too large, the overflow and

343

| inexact exceptions are raised and an infinity or maximal finite value is

344

| returned. If the abstract value is too small, the input value is rounded to

345

| a subnormal number, and the underflow and inexact exceptions are raised if

346

| the abstract input cannot be represented exactly as a subnormal single-

347

| precision floating-point number.

348

| The input significand `zSig' has its binary point between bits 30

349

| and 29, which is 7 bits to the left of the usual location. This shifted

350

| significand must be normalized or smaller. If `zSig' is not normalized,

351

| `zExp' must be 0; in that case, the result returned is a subnormal number,

352

| and it must not require rounding. In the usual case that `zSig' is

353

| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.

354

| The handling of underflow and overflow follows the IEC/IEEE Standard for

355

| Binary Floating-Point Arithmetic.

356

*----------------------------------------------------------------------------*/

357

358

static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
{
    /* The low 7 bits of `zSig' are discarded by the final shift; `increment'
       is what gets added to the significand before they are dropped. */
    int8 mode = STATUS(float_rounding_mode);
    flag tiesToEven = (mode == float_round_nearest_even);
    int8 increment, lostBits;
    flag tiny;

    switch (mode) {
    case float_round_nearest_even:
        increment = 0x40;
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        increment = zSign ? 0x7f : 0;
        break;
    default:
        abort();
    }
    lostBits = zSig & 0x7F;

    /* The unsigned cast makes this single comparison true for exponents that
       are too large as well as for negative ones. */
    if (0xFD <= (uint16_t) zExp) {
        if ((0xFD < zExp)
            || ((zExp == 0xFD) && ((int32_t) (zSig + increment) < 0))) {
            float_raise(float_flag_overflow | float_flag_inexact STATUS_VAR);
            /* With a zero increment an all-ones significand is passed; added
               to exponent 0xFF by packFloat32 it wraps to the largest finite
               encoding.  Otherwise the result is infinity. */
            return packFloat32(zSign, 0xFF, (increment == 0) ? 0xFFFFFFFF : 0);
        }
        if (zExp < 0) {
            if (STATUS(flush_to_zero)) {
                float_raise(float_flag_output_denormal STATUS_VAR);
                return packFloat32(zSign, 0, 0);
            }
            tiny = (STATUS(float_detect_tininess) == float_tininess_before_rounding)
                || (zExp < -1)
                || (zSig + increment < 0x80000000);
            shift32RightJamming(zSig, -zExp, &zSig);
            zExp = 0;
            lostBits = zSig & 0x7F;
            if (tiny && lostBits) {
                float_raise(float_flag_underflow STATUS_VAR);
            }
        }
    }
    if (lostBits) {
        STATUS(float_exception_flags) |= float_flag_inexact;
    }
    zSig = (zSig + increment) >> 7;
    /* Discarded bits exactly 0x40 is a tie: clear the LSB in nearest-even mode. */
    if (tiesToEven && (lostBits == 0x40)) {
        zSig &= ~(uint32_t) 1;
    }
    if (zSig == 0) {
        zExp = 0;
    }
    return packFloat32(zSign, zExp, zSig);
}

415

416

/*----------------------------------------------------------------------------

417

| Takes an abstract floating-point value having sign `zSign', exponent `zExp',

418

| and significand `zSig', and returns the proper single-precision floating-

419

| point value corresponding to the abstract input. This routine is just like

420

| `roundAndPackFloat32' except that `zSig' does not have to be normalized.

421

| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''

422

| floating-point exponent.

423

*----------------------------------------------------------------------------*/

424

425

static float32

426

normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)

427

{

428

int8 shiftCount;

429

430

shiftCount = countLeadingZeros32( zSig ) - 1;

431

return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);

432

433

}

434

435

/*----------------------------------------------------------------------------

436

| Returns the fraction bits of the double-precision floating-point value `a'.

437

*----------------------------------------------------------------------------*/

438

439

INLINE uint64_t extractFloat64Frac( float64 a )
{
    /* Keep only the low 52 bits of the raw encoding. */
    uint64_t frac = float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );

    return frac;
}

445

446

/*----------------------------------------------------------------------------

447

| Returns the exponent bits of the double-precision floating-point value `a'.

448

*----------------------------------------------------------------------------*/

449

450

INLINE int_fast16_t extractFloat64Exp(float64 a)
{
    /* Bits 52..62 of the raw encoding hold the 11-bit exponent field. */
    int_fast16_t biasedExp = (float64_val(a) >> 52) & 0x7FF;

    return biasedExp;
}

456

457

/*----------------------------------------------------------------------------

458

| Returns the sign bit of the double-precision floating-point value `a'.

459

*----------------------------------------------------------------------------*/

460

461

INLINE flag extractFloat64Sign( float64 a )
{
    /* The sign is the top bit (bit 63) of the raw encoding. */
    flag sign = float64_val(a) >> 63;

    return sign;
}

467

468

/*----------------------------------------------------------------------------

469

| If `a' is denormal and we are in flush-to-zero mode then set the

470

| input-denormal exception and return zero. Otherwise just return the value.

471

*----------------------------------------------------------------------------*/

472

static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
{
    /* Nothing to do unless input flushing is enabled. */
    if (!STATUS(flush_inputs_to_zero)) {
        return a;
    }
    /* Only a zero exponent with a non-zero fraction is flushed. */
    if (extractFloat64Exp(a) != 0 || extractFloat64Frac(a) == 0) {
        return a;
    }
    float_raise(float_flag_input_denormal STATUS_VAR);
    /* Keep only the sign bit, giving a zero of the same sign. */
    return make_float64(float64_val(a) & (1ULL << 63));
}

482

483

/*----------------------------------------------------------------------------

484

| Normalizes the subnormal double-precision floating-point value represented

485

| by the denormalized significand `aSig'. The normalized exponent and

486

| significand are stored at the locations pointed to by `zExpPtr' and

487

| `zSigPtr', respectively.

488

*----------------------------------------------------------------------------*/

489

490

static void

491

normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)

492

{

493

int8 shiftCount;

494

495

shiftCount = countLeadingZeros64( aSig ) - 11;

496

*zSigPtr = aSig<<shiftCount;

497

*zExpPtr = 1 - shiftCount;

498

499

}

500

501

/*----------------------------------------------------------------------------

502

| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a

503

| double-precision floating-point value, returning the result. After being

504

| shifted into the proper positions, the three fields are simply added

505

| together to form the result. This means that any integer portion of `zSig'

506

| will be added into the exponent. Since a properly normalized significand

507

| will have an integer portion equal to 1, the `zExp' input should be 1 less

508

| than the desired result exponent whenever `zSig' is a complete, normalized

509

| significand.

510

*----------------------------------------------------------------------------*/

511

512

INLINE float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
{
    /* The three fields are summed, not OR-ed, so any carry out of the
       significand field propagates into the exponent field. */
    uint64_t signField = (uint64_t) zSign << 63;
    uint64_t expField = (uint64_t) zExp << 52;

    return make_float64(signField + expField + zSig);
}

519

520

/*----------------------------------------------------------------------------

521

| Takes an abstract floating-point value having sign `zSign', exponent `zExp',

522

| and significand `zSig', and returns the proper double-precision floating-

523

| point value corresponding to the abstract input. Ordinarily, the abstract

524

| value is simply rounded and packed into the double-precision format, with

525

| the inexact exception raised if the abstract input cannot be represented

526

| exactly. However, if the abstract value is too large, the overflow and

527

| inexact exceptions are raised and an infinity or maximal finite value is

528

| returned. If the abstract value is too small, the input value is rounded

529

| to a subnormal number, and the underflow and inexact exceptions are raised

530

| if the abstract input cannot be represented exactly as a subnormal double-

531

| precision floating-point number.

532

| The input significand `zSig' has its binary point between bits 62

533

| and 61, which is 10 bits to the left of the usual location. This shifted

534

| significand must be normalized or smaller. If `zSig' is not normalized,

535

| `zExp' must be 0; in that case, the result returned is a subnormal number,

536

| and it must not require rounding. In the usual case that `zSig' is

537

| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.

538

| The handling of underflow and overflow follows the IEC/IEEE Standard for

539

| Binary Floating-Point Arithmetic.

540

*----------------------------------------------------------------------------*/

541

542

static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
{
    /* The low 10 bits of `zSig' are discarded by the final shift; `increment'
       is what gets added to the significand before they are dropped. */
    int8 mode = STATUS(float_rounding_mode);
    flag tiesToEven = (mode == float_round_nearest_even);
    int_fast16_t increment, lostBits;
    flag tiny;

    switch (mode) {
    case float_round_nearest_even:
        increment = 0x200;
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = zSign ? 0 : 0x3ff;
        break;
    case float_round_down:
        increment = zSign ? 0x3ff : 0;
        break;
    default:
        abort();
    }
    lostBits = zSig & 0x3FF;

    /* The unsigned cast makes this single comparison true for exponents that
       are too large as well as for negative ones. */
    if (0x7FD <= (uint16_t) zExp) {
        if ((0x7FD < zExp)
            || ((zExp == 0x7FD) && ((int64_t) (zSig + increment) < 0))) {
            float_raise(float_flag_overflow | float_flag_inexact STATUS_VAR);
            /* With a zero increment an all-ones significand is passed; added
               to exponent 0x7FF by packFloat64 it wraps to the largest finite
               encoding.  Otherwise the result is infinity. */
            return packFloat64(zSign, 0x7FF, (increment == 0) ? ~(uint64_t) 0 : 0);
        }
        if (zExp < 0) {
            if (STATUS(flush_to_zero)) {
                float_raise(float_flag_output_denormal STATUS_VAR);
                return packFloat64(zSign, 0, 0);
            }
            tiny = (STATUS(float_detect_tininess) == float_tininess_before_rounding)
                || (zExp < -1)
                || (zSig + increment < LIT64( 0x8000000000000000 ));
            shift64RightJamming(zSig, -zExp, &zSig);
            zExp = 0;
            lostBits = zSig & 0x3FF;
            if (tiny && lostBits) {
                float_raise(float_flag_underflow STATUS_VAR);
            }
        }
    }
    if (lostBits) {
        STATUS(float_exception_flags) |= float_flag_inexact;
    }
    zSig = (zSig + increment) >> 10;
    /* Discarded bits exactly 0x200 is a tie: clear the LSB in nearest-even mode. */
    if (tiesToEven && (lostBits == 0x200)) {
        zSig &= ~(uint64_t) 1;
    }
    if (zSig == 0) {
        zExp = 0;
    }
    return packFloat64(zSign, zExp, zSig);
}

598

599

/*----------------------------------------------------------------------------

600

| Takes an abstract floating-point value having sign `zSign', exponent `zExp',

601

| and significand `zSig', and returns the proper double-precision floating-

602

| point value corresponding to the abstract input. This routine is just like

603

| `roundAndPackFloat64' except that `zSig' does not have to be normalized.

604

| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''

605

| floating-point exponent.

606

*----------------------------------------------------------------------------*/

607

608

static float64

609

normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)

610

{

611

int8 shiftCount;

612

613

shiftCount = countLeadingZeros64( zSig ) - 1;

614

return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);

615

616

}

617

618

/*----------------------------------------------------------------------------

619

| Returns the fraction bits of the extended double-precision floating-point

620

| value `a'.

621

*----------------------------------------------------------------------------*/

622

623

INLINE uint64_t extractFloatx80Frac( floatx80 a )
{
    /* The whole `low' member is returned as the fraction. */
    uint64_t frac = a.low;

    return frac;
}

629

630

/*----------------------------------------------------------------------------

631

| Returns the exponent bits of the extended double-precision floating-point

632

| value `a'.

633

*----------------------------------------------------------------------------*/

634

635

INLINE int32 extractFloatx80Exp( floatx80 a )
{
    /* The low 15 bits of `high' are the exponent field. */
    int32 biasedExp = a.high & 0x7FFF;

    return biasedExp;
}

641

642

/*----------------------------------------------------------------------------

643

| Returns the sign bit of the extended double-precision floating-point value

644

| `a'.

645

*----------------------------------------------------------------------------*/

646

647

INLINE flag extractFloatx80Sign( floatx80 a )
{
    /* The sign is taken from bit 15 of `high' and above. */
    flag sign = a.high >> 15;

    return sign;
}

653

654

/*----------------------------------------------------------------------------

655

| Normalizes the subnormal extended double-precision floating-point value

656

| represented by the denormalized significand `aSig'. The normalized exponent

657

| and significand are stored at the locations pointed to by `zExpPtr' and

658

| `zSigPtr', respectively.

659

*----------------------------------------------------------------------------*/

660

661

static void

662

normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )

663

{

664

int8 shiftCount;

665

666

shiftCount = countLeadingZeros64( aSig );

667

*zSigPtr = aSig<<shiftCount;

668

*zExpPtr = 1 - shiftCount;

669

670

}

671

672

/*----------------------------------------------------------------------------

673

| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an

674

| extended double-precision floating-point value, returning the result.

675

*----------------------------------------------------------------------------*/

676

677

INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
{
    floatx80 result;
    uint16_t signBit = (uint16_t) zSign;

    /* `high' is the sign shifted to bit 15 plus the exponent; `low' is the
       significand unchanged. */
    result.high = (signBit << 15) + zExp;
    result.low = zSig;
    return result;
}

686

687

/*----------------------------------------------------------------------------

688

| Takes an abstract floating-point value having sign `zSign', exponent `zExp',

689

| and extended significand formed by the concatenation of `zSig0' and `zSig1',

690

| and returns the proper extended double-precision floating-point value

691

| corresponding to the abstract input. Ordinarily, the abstract value is

692

| rounded and packed into the extended double-precision format, with the

693

| inexact exception raised if the abstract input cannot be represented

694

| exactly. However, if the abstract value is too large, the overflow and

695

| inexact exceptions are raised and an infinity or maximal finite value is

696

| returned. If the abstract value is too small, the input value is rounded to

697

| a subnormal number, and the underflow and inexact exceptions are raised if

698

| the abstract input cannot be represented exactly as a subnormal extended

699

| double-precision floating-point number.

700

| If `roundingPrecision' is 32 or 64, the result is rounded to the same

701

| number of bits as single or double precision, respectively. Otherwise, the

702

| result is rounded to the full precision of the extended double-precision

703

| format.

704

| The input significand must be normalized or smaller. If the input

705

| significand is not normalized, `zExp' must be 0; in that case, the result

706

| returned is a subnormal number, and it must not require rounding. The

707

| handling of underflow and overflow follows the IEC/IEEE Standard for Binary

708

| Floating-Point Arithmetic.

709

*----------------------------------------------------------------------------*/

710

711

static floatx80

712

roundAndPackFloatx80(

713

int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1

714

STATUS_PARAM)

715

{

716

int8 roundingMode;

717

flag roundNearestEven, increment, isTiny;

718

int64 roundIncrement, roundMask, roundBits;

719

720

roundingMode = STATUS(float_rounding_mode);

721

roundNearestEven = ( roundingMode == float_round_nearest_even );

722

if ( roundingPrecision == 80 ) goto precision80;

723

if ( roundingPrecision == 64 ) {

724

roundIncrement = LIT64( 0x0000000000000400 );

725

roundMask = LIT64( 0x00000000000007FF );

726

}

727

else if ( roundingPrecision == 32 ) {

728

roundIncrement = LIT64( 0x0000008000000000 );

729

roundMask = LIT64( 0x000000FFFFFFFFFF );

730

}

731

else {

732

goto precision80;

733

}

734

zSig0 |= ( zSig1 != 0 );

735

switch (roundingMode) {

736

case float_round_nearest_even:

737

break;

738

case float_round_to_zero:

739

roundIncrement = 0;

740

break;

741

case float_round_up:

742

roundIncrement = zSign ? 0 : roundMask;

743

break;

744

case float_round_down:

745

roundIncrement = zSign ? roundMask : 0;

746

break;

747

default:

748

abort();

749

}

750

roundBits = zSig0 & roundMask;

751

if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {

752

if ( ( 0x7FFE < zExp )

753

|| ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )

754

) {

755

goto overflow;

756

}

757

if ( zExp <= 0 ) {

758

if (STATUS(flush_to_zero)) {

759

float_raise(float_flag_output_denormal STATUS_VAR);

760

return packFloatx80(zSign, 0, 0);

761

}

762

isTiny =

763

( STATUS(float_detect_tininess) == float_tininess_before_rounding )

764

|| ( zExp < 0 )

765

|| ( zSig0 <= zSig0 + roundIncrement );

766

shift64RightJamming( zSig0, 1 - zExp, &zSig0 );

767

zExp = 0;

768

roundBits = zSig0 & roundMask;

769

if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);

770

if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;

771

zSig0 += roundIncrement;

772

if ( (int64_t) zSig0 < 0 ) zExp = 1;

773

roundIncrement = roundMask + 1;

774

if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {

775

roundMask |= roundIncrement;

776

}

777

zSig0 &= ~ roundMask;

778

return packFloatx80( zSign, zExp, zSig0 );

779

}

780

}

781

if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;

782

zSig0 += roundIncrement;

783

if ( zSig0 < roundIncrement ) {

784

++zExp;

785

zSig0 = LIT64( 0x8000000000000000 );

786

}

787

roundIncrement = roundMask + 1;

788

if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {

789

roundMask |= roundIncrement;

790

}

791

zSig0 &= ~ roundMask;

792

if ( zSig0 == 0 ) zExp = 0;

793

return packFloatx80( zSign, zExp, zSig0 );

794

precision80:

795

switch (roundingMode) {

796

case float_round_nearest_even:

797

increment = ((int64_t)zSig1 < 0);

798

break;

799

case float_round_to_zero:

800

increment = 0;

801

break;

802

case float_round_up:

803

increment = !zSign && zSig1;

804

break;

805

case float_round_down:

806

increment = zSign && zSig1;

807

break;

808

default:

809

abort();

810

}

811

if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {

812

if ( ( 0x7FFE < zExp )

813

|| ( ( zExp == 0x7FFE )

814

&& ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )

815

&& increment

816

)

817

) {

818

roundMask = 0;

819

overflow:

820

float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);

821

if ( ( roundingMode == float_round_to_zero )

822

|| ( zSign && ( roundingMode == float_round_up ) )

823

|| ( ! zSign && ( roundingMode == float_round_down ) )

824

) {

825

return packFloatx80( zSign, 0x7FFE, ~ roundMask );

826

}

827

return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );

828

}

829

if ( zExp <= 0 ) {

830

isTiny =

831

( STATUS(float_detect_tininess) == float_tininess_before_rounding )

832

|| ( zExp < 0 )

833

|| ! increment

834

|| ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );

835

shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );

836

zExp = 0;

837

if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);

838

if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;

839

switch (roundingMode) {

840

case float_round_nearest_even:

841

increment = ((int64_t)zSig1 < 0);

842

break;

843

case float_round_to_zero:

844

increment = 0;

845

break;

846

case float_round_up:

847

increment = !zSign && zSig1;

848

break;

849

case float_round_down:

850

increment = zSign && zSig1;

851

break;

852

default:

853

abort();

854

}

855

if ( increment ) {

856

++zSig0;

857

zSig0 &=

858

~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );

859

if ( (int64_t) zSig0 < 0 ) zExp = 1;

860

}

861

return packFloatx80( zSign, zExp, zSig0 );

862

}

863

}

864

if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;

865

if ( increment ) {

866

++zSig0;

867

if ( zSig0 == 0 ) {

868

++zExp;

869

zSig0 = LIT64( 0x8000000000000000 );

870

}

871

else {

872

zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );

873

}

874

}

875

else {

876

if ( zSig0 == 0 ) zExp = 0;

877

}

878

return packFloatx80( zSign, zExp, zSig0 );

879

880

}

881

882

/*----------------------------------------------------------------------------

883

| Takes an abstract floating-point value having sign `zSign', exponent

884

| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',

885

| and returns the proper extended double-precision floating-point value

886

| corresponding to the abstract input. This routine is just like

887

| `roundAndPackFloatx80' except that the input significand does not have to be

888

| normalized.

889

*----------------------------------------------------------------------------*/

890

891

static floatx80

892

normalizeRoundAndPackFloatx80(

893

int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1

894

STATUS_PARAM)

895

{

896

int8 shiftCount;

897

898

if ( zSig0 == 0 ) {

899

zSig0 = zSig1;

900

zSig1 = 0;

901

zExp -= 64;

902

}

903

shiftCount = countLeadingZeros64( zSig0 );

904

shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );

905

zExp -= shiftCount;

906

return

907

roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);

908

909

}

910

911

/*----------------------------------------------------------------------------

912

| Returns the least-significant 64 fraction bits of the quadruple-precision

913

| floating-point value `a'.

914

*----------------------------------------------------------------------------*/

915

916

INLINE uint64_t extractFloat128Frac1( float128 a )
{
    /* The whole `low' member is fraction. */
    uint64_t fracLow = a.low;

    return fracLow;
}

922

923

/*----------------------------------------------------------------------------

924

| Returns the most-significant 48 fraction bits of the quadruple-precision

925

| floating-point value `a'.

926

*----------------------------------------------------------------------------*/

927

928

INLINE uint64_t extractFloat128Frac0( float128 a )
{
    /* Keep only the low 48 bits of `high'. */
    uint64_t fracHigh = a.high & LIT64( 0x0000FFFFFFFFFFFF );

    return fracHigh;
}

934

935

/*----------------------------------------------------------------------------

936

| Returns the exponent bits of the quadruple-precision floating-point value

937

| `a'.

938

*----------------------------------------------------------------------------*/

939

940

INLINE int32 extractFloat128Exp( float128 a )
{
    /* Bits 48..62 of `high' are the 15-bit exponent field. */
    int32 biasedExp = (a.high >> 48) & 0x7FFF;

    return biasedExp;
}

946

947

/*----------------------------------------------------------------------------

948

| Returns the sign bit of the quadruple-precision floating-point value `a'.

949

*----------------------------------------------------------------------------*/

950

951

INLINE flag extractFloat128Sign( float128 a )
{
    /* The sign is taken from bit 63 of `high'. */
    flag sign = a.high >> 63;

    return sign;
}

957

958

/*----------------------------------------------------------------------------

959

| Normalizes the subnormal quadruple-precision floating-point value

960

| represented by the denormalized significand formed by the concatenation of

961

| `aSig0' and `aSig1'. The normalized exponent is stored at the location

962

| pointed to by `zExpPtr'. The most significant 49 bits of the normalized

963

| significand are stored at the location pointed to by `zSig0Ptr', and the

964

| least significant 64 bits of the normalized significand are stored at the

965

| location pointed to by `zSig1Ptr'.

966

*----------------------------------------------------------------------------*/

967

968

static void

969

normalizeFloat128Subnormal(

970

uint64_t aSig0,

971

uint64_t aSig1,

972

int32 *zExpPtr,

973

uint64_t *zSig0Ptr,

974

uint64_t *zSig1Ptr

975

)

976

{

977

int8 shiftCount;

978

979

if ( aSig0 == 0 ) {

980

shiftCount = countLeadingZeros64( aSig1 ) - 15;

981

if ( shiftCount < 0 ) {

982

*zSig0Ptr = aSig1>>( - shiftCount );

983

*zSig1Ptr = aSig1<<( shiftCount & 63 );

984

}

985

else {

986

*zSig0Ptr = aSig1<<shiftCount;

987

*zSig1Ptr = 0;

988

}

989

*zExpPtr = - shiftCount - 63;

990

}

991

else {

992

shiftCount = countLeadingZeros64( aSig0 ) - 15;

993

shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );

994

*zExpPtr = 1 - shiftCount;

995

}

996

997

}

998

999

/*----------------------------------------------------------------------------

1000

| Packs the sign `zSign', the exponent `zExp', and the significand formed

1001

| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision

1002

| floating-point value, returning the result. After being shifted into the

1003

| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply

1004

| added together to form the most significant 64 bits of the result. This

1005

| means that any integer portion of `zSig0' will be added into the exponent.

1006

| Since a properly normalized significand will have an integer portion equal

1007

| to 1, the `zExp' input should be 1 less than the desired result exponent

1008

| whenever `zSig0' and `zSig1' concatenated form a complete, normalized

1009

| significand.

1010

*----------------------------------------------------------------------------*/

1011

1012

INLINE float128

1013

packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )

1014

{

1015

float128 z;

1016

1017

z.low = zSig1;

1018

z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;

1019

return z;

1020

1021

}

1022

1023

/*----------------------------------------------------------------------------

1024

| Takes an abstract floating-point value having sign `zSign', exponent `zExp',

1025

| and extended significand formed by the concatenation of `zSig0', `zSig1',

1026

| and `zSig2', and returns the proper quadruple-precision floating-point value

1027

| corresponding to the abstract input. Ordinarily, the abstract value is

1028

| simply rounded and packed into the quadruple-precision format, with the

1029

| inexact exception raised if the abstract input cannot be represented

1030

| exactly. However, if the abstract value is too large, the overflow and

1031

| inexact exceptions are raised and an infinity or maximal finite value is

1032

| returned. If the abstract value is too small, the input value is rounded to

1033

| a subnormal number, and the underflow and inexact exceptions are raised if

1034

| the abstract input cannot be represented exactly as a subnormal quadruple-

1035

| precision floating-point number.

1036

| The input significand must be normalized or smaller. If the input

1037

| significand is not normalized, `zExp' must be 0; in that case, the result

1038

| returned is a subnormal number, and it must not require rounding. In the

1039

| usual case that the input significand is normalized, `zExp' must be 1 less

1040

| than the ``true'' floating-point exponent. The handling of underflow and

1041

| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

1042

*----------------------------------------------------------------------------*/

1043

1044

static float128

1045

roundAndPackFloat128(

1046

flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)

1047

{

1048

int8 roundingMode;

1049

flag roundNearestEven, increment, isTiny;

1050

1051

roundingMode = STATUS(float_rounding_mode);

1052

roundNearestEven = ( roundingMode == float_round_nearest_even );

1053

switch (roundingMode) {

1054

case float_round_nearest_even:

1055

increment = ((int64_t)zSig2 < 0);

1056

break;

1057

case float_round_to_zero:

1058

increment = 0;

1059

break;

1060

case float_round_up:

1061

increment = !zSign && zSig2;

1062

break;

1063

case float_round_down:

1064

increment = zSign && zSig2;

1065

break;

1066

default:

1067

abort();

1068

}

1069

if ( 0x7FFD <= (uint32_t) zExp ) {

1070

if ( ( 0x7FFD < zExp )

1071

|| ( ( zExp == 0x7FFD )

1072

&& eq128(

1073

LIT64( 0x0001FFFFFFFFFFFF ),

1074

LIT64( 0xFFFFFFFFFFFFFFFF ),

1075

zSig0,

1076

zSig1

1077

)

1078

&& increment

1079

)

1080

) {

1081

float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);

1082

if ( ( roundingMode == float_round_to_zero )

1083

|| ( zSign && ( roundingMode == float_round_up ) )

1084

|| ( ! zSign && ( roundingMode == float_round_down ) )

1085

) {

1086

return

1087

packFloat128(

1088

zSign,

1089

0x7FFE,

1090

LIT64( 0x0000FFFFFFFFFFFF ),

1091

LIT64( 0xFFFFFFFFFFFFFFFF )

1092

);

1093

}

1094

return packFloat128( zSign, 0x7FFF, 0, 0 );

1095

}

1096

if ( zExp < 0 ) {

1097

if (STATUS(flush_to_zero)) {

1098

float_raise(float_flag_output_denormal STATUS_VAR);

1099

return packFloat128(zSign, 0, 0, 0);

1100

}

1101

isTiny =

1102

( STATUS(float_detect_tininess) == float_tininess_before_rounding )

1103

|| ( zExp < -1 )

1104

|| ! increment

1105

|| lt128(

1106

zSig0,

1107

zSig1,

1108

LIT64( 0x0001FFFFFFFFFFFF ),

1109

LIT64( 0xFFFFFFFFFFFFFFFF )

1110

);

1111

shift128ExtraRightJamming(

1112

zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );

1113

zExp = 0;

1114

if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);

1115

switch (roundingMode) {

1116

case float_round_nearest_even:

1117

increment = ((int64_t)zSig2 < 0);

1118

break;

1119

case float_round_to_zero:

1120

increment = 0;

1121

break;

1122

case float_round_up:

1123

increment = !zSign && zSig2;

1124

break;

1125

case float_round_down:

1126

increment = zSign && zSig2;

1127

break;

1128

default:

1129

abort();

1130

}

1131

}

1132

}

1133

if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;

1134

if ( increment ) {

1135

add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );

1136

zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );

1137

}

1138

else {

1139

if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;

1140

}

1141

return packFloat128( zSign, zExp, zSig0, zSig1 );

1142

1143

}

1144

1145

/*----------------------------------------------------------------------------

1146

| Takes an abstract floating-point value having sign `zSign', exponent `zExp',

1147

| and significand formed by the concatenation of `zSig0' and `zSig1', and

1148

| returns the proper quadruple-precision floating-point value corresponding

1149

| to the abstract input. This routine is just like `roundAndPackFloat128'

1150

| except that the input significand has fewer bits and does not have to be

1151

| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-

1152

| point exponent.

1153

*----------------------------------------------------------------------------*/

1154

1155

static float128

1156

normalizeRoundAndPackFloat128(

1157

flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)

1158

{

1159

int8 shiftCount;

1160

uint64_t zSig2;

1161

1162

if ( zSig0 == 0 ) {

1163

zSig0 = zSig1;

1164

zSig1 = 0;

1165

zExp -= 64;

1166

}

1167

shiftCount = countLeadingZeros64( zSig0 ) - 15;

1168

if ( 0 <= shiftCount ) {

1169

zSig2 = 0;

1170

shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );

1171

}

1172

else {

1173

shift128ExtraRightJamming(

1174

zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );

1175

}

1176

zExp -= shiftCount;

1177

return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);

1178

1179

}

1180

1181

/*----------------------------------------------------------------------------

1182

| Returns the result of converting the 32-bit two's complement integer `a'

1183

| to the single-precision floating-point format. The conversion is performed

1184

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

1185

*----------------------------------------------------------------------------*/

1186

1187

float32 int32_to_float32(int32_t a STATUS_PARAM)

1188

{

1189

flag zSign;

1190

1191

if ( a == 0 ) return float32_zero;

1192

if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );

1193

zSign = ( a < 0 );

1194

return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );

1195

1196

}

1197

1198

/*----------------------------------------------------------------------------

1199

| Returns the result of converting the 32-bit two's complement integer `a'

1200

| to the double-precision floating-point format. The conversion is performed

1201

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

1202

*----------------------------------------------------------------------------*/

1203

1204

float64 int32_to_float64(int32_t a STATUS_PARAM)

1205

{

1206

flag zSign;

1207

uint32 absA;

1208

int8 shiftCount;

1209

uint64_t zSig;

1210

1211

if ( a == 0 ) return float64_zero;

1212

zSign = ( a < 0 );

1213

absA = zSign ? - a : a;

1214

shiftCount = countLeadingZeros32( absA ) + 21;

1215

zSig = absA;

1216

return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );

1217

1218

}

1219

1220

/*----------------------------------------------------------------------------

1221

| Returns the result of converting the 32-bit two's complement integer `a'

1222

| to the extended double-precision floating-point format. The conversion

1223

| is performed according to the IEC/IEEE Standard for Binary Floating-Point

1224

| Arithmetic.

1225

*----------------------------------------------------------------------------*/

1226

1227

floatx80 int32_to_floatx80(int32_t a STATUS_PARAM)

1228

{

1229

flag zSign;

1230

uint32 absA;

1231

int8 shiftCount;

1232

uint64_t zSig;

1233

1234

if ( a == 0 ) return packFloatx80( 0, 0, 0 );

1235

zSign = ( a < 0 );

1236

absA = zSign ? - a : a;

1237

shiftCount = countLeadingZeros32( absA ) + 32;

1238

zSig = absA;

1239

return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );

1240

1241

}

1242

1243

/*----------------------------------------------------------------------------

1244

| Returns the result of converting the 32-bit two's complement integer `a' to

1245

| the quadruple-precision floating-point format. The conversion is performed

1246

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

1247

*----------------------------------------------------------------------------*/

1248

1249

float128 int32_to_float128(int32_t a STATUS_PARAM)

1250

{

1251

flag zSign;

1252

uint32 absA;

1253

int8 shiftCount;

1254

uint64_t zSig0;

1255

1256

if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );

1257

zSign = ( a < 0 );

1258

absA = zSign ? - a : a;

1259

shiftCount = countLeadingZeros32( absA ) + 17;

1260

zSig0 = absA;

1261

return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );

1262

1263

}

1264

1265

/*----------------------------------------------------------------------------

1266

| Returns the result of converting the 64-bit two's complement integer `a'

1267

| to the single-precision floating-point format. The conversion is performed

1268

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

1269

*----------------------------------------------------------------------------*/

1270

1271

float32 int64_to_float32(int64_t a STATUS_PARAM)

1272

{

1273

flag zSign;

1274

uint64 absA;

1275

int8 shiftCount;

1276

1277

if ( a == 0 ) return float32_zero;

1278

zSign = ( a < 0 );

1279

absA = zSign ? - a : a;

1280

shiftCount = countLeadingZeros64( absA ) - 40;

1281

if ( 0 <= shiftCount ) {

1282

return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );

1283

}

1284

else {

1285

shiftCount += 7;

1286

if ( shiftCount < 0 ) {

1287

shift64RightJamming( absA, - shiftCount, &absA );

1288

}

1289

else {

1290

absA <<= shiftCount;

1291

}

1292

return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );

1293

}

1294

1295

}

1296

1297

/*----------------------------------------------------------------------------
| Converts the 64-bit unsigned integer `a' to the single-precision
| floating-point format.  Values with at least 40 leading zero bits are
| packed directly; larger values are handed to `roundAndPackFloat32'.
*----------------------------------------------------------------------------*/

float32 uint64_to_float32(uint64_t a STATUS_PARAM)
{
    int8 lshift;

    if (!a) {
        return float32_zero;
    }

    lshift = countLeadingZeros64(a) - 40;
    if (lshift >= 0) {
        /* At most 24 significant bits: pack without rounding. */
        return packFloat32(0, 0x95 - lshift, a << lshift);
    }

    /* Leave 7 extra low-order bits for the rounding routine. */
    lshift += 7;
    if (lshift >= 0) {
        a <<= lshift;
    } else {
        shift64RightJamming(a, -lshift, &a);
    }
    return roundAndPackFloat32(0, 0x9C - lshift, a STATUS_VAR);
}

1317

1318

/*----------------------------------------------------------------------------

1319

| Returns the result of converting the 64-bit two's complement integer `a'

1320

| to the double-precision floating-point format. The conversion is performed

1321

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

1322

*----------------------------------------------------------------------------*/

1323

1324

float64 int64_to_float64(int64_t a STATUS_PARAM)

1325

{

1326

flag zSign;

1327

1328

if ( a == 0 ) return float64_zero;

1329

if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {

1330

return packFloat64( 1, 0x43E, 0 );

1331

}

1332

zSign = ( a < 0 );

1333

return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );

1334

1335

}

1336

1337

/*----------------------------------------------------------------------------
| Converts the 64-bit unsigned integer `a' to the double-precision
| floating-point format via `normalizeRoundAndPackFloat64'.  When the top
| bit of `a' is set, the value is first shifted right by one (jamming the
| lost bit) and the exponent is raised by one to compensate.
*----------------------------------------------------------------------------*/

float64 uint64_to_float64(uint64_t a STATUS_PARAM)
{
    int zExp;

    if (!a) {
        return float64_zero;
    }

    if (a & LIT64(0x8000000000000000)) {
        shift64RightJamming(a, 1, &a);
        zExp = 0x43C + 1;
    } else {
        zExp = 0x43C;
    }
    return normalizeRoundAndPackFloat64(0, zExp, a STATUS_VAR);
}

1350

1351

/*----------------------------------------------------------------------------

1352

| Returns the result of converting the 64-bit two's complement integer `a'

1353

| to the extended double-precision floating-point format. The conversion

1354

| is performed according to the IEC/IEEE Standard for Binary Floating-Point

1355

| Arithmetic.

1356

*----------------------------------------------------------------------------*/

1357

1358

floatx80 int64_to_floatx80(int64_t a STATUS_PARAM)

1359

{

1360

flag zSign;

1361

uint64 absA;

1362

int8 shiftCount;

1363

1364

if ( a == 0 ) return packFloatx80( 0, 0, 0 );

1365

zSign = ( a < 0 );

1366

absA = zSign ? - a : a;

1367

shiftCount = countLeadingZeros64( absA );

1368

return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );

1369

1370

}

1371

1372

/*----------------------------------------------------------------------------

1373

| Returns the result of converting the 64-bit two's complement integer `a' to

1374

| the quadruple-precision floating-point format. The conversion is performed

1375

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

1376

*----------------------------------------------------------------------------*/

1377

1378

float128 int64_to_float128(int64_t a STATUS_PARAM)

1379

{

1380

flag zSign;

1381

uint64 absA;

1382

int8 shiftCount;

1383

int32 zExp;

1384

uint64_t zSig0, zSig1;

1385

1386

if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );

1387

zSign = ( a < 0 );

1388

absA = zSign ? - a : a;

1389

shiftCount = countLeadingZeros64( absA ) + 49;

1390

zExp = 0x406E - shiftCount;

1391

if ( 64 <= shiftCount ) {

1392

zSig1 = 0;

1393

zSig0 = absA;

1394

shiftCount -= 64;

1395

}

1396

else {

1397

zSig1 = absA;

1398

zSig0 = 0;

1399

}

1400

shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );

1401

return packFloat128( zSign, zExp, zSig0, zSig1 );

1402

1403

}

1404

1405

/*----------------------------------------------------------------------------
| Converts the 64-bit unsigned integer `a' to the quadruple-precision
| floating-point format.  Zero maps to `float128_zero'; any other value is
| placed in the upper significand word and handed to
| `normalizeRoundAndPackFloat128'.
*----------------------------------------------------------------------------*/

float128 uint64_to_float128(uint64_t a STATUS_PARAM)
{
    if (!a) {
        return float128_zero;
    }

    return normalizeRoundAndPackFloat128(0, 0x406E, a, 0 STATUS_VAR);
}

1412

1413

/*----------------------------------------------------------------------------

1414

| Returns the result of converting the single-precision floating-point value

1415

| `a' to the 32-bit two's complement integer format. The conversion is

1416

| performed according to the IEC/IEEE Standard for Binary Floating-Point

1417

| Arithmetic---which means in particular that the conversion is rounded

1418

| according to the current rounding mode. If `a' is a NaN, the largest

1419

| positive integer is returned. Otherwise, if the conversion overflows, the

1420

| largest integer with the same sign as `a' is returned.

1421

*----------------------------------------------------------------------------*/

1422

1423

int32 float32_to_int32( float32 a STATUS_PARAM )

1424

{

1425

flag aSign;

1426

int_fast16_t aExp, shiftCount;

1427

uint32_t aSig;

1428

uint64_t aSig64;

1429

1430

a = float32_squash_input_denormal(a STATUS_VAR);

1431

aSig = extractFloat32Frac( a );

1432

aExp = extractFloat32Exp( a );

1433

aSign = extractFloat32Sign( a );

1434

if ( ( aExp == 0xFF ) && aSig ) aSign = 0;

1435

if ( aExp ) aSig |= 0x00800000;

1436

shiftCount = 0xAF - aExp;

1437

aSig64 = aSig;

1438

aSig64 <<= 32;

1439

if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );

1440

return roundAndPackInt32( aSign, aSig64 STATUS_VAR );

1441

1442

}

1443

1444

/*----------------------------------------------------------------------------

1445

| Returns the result of converting the single-precision floating-point value

1446

| `a' to the 32-bit two's complement integer format. The conversion is

1447

| performed according to the IEC/IEEE Standard for Binary Floating-Point

1448

| Arithmetic, except that the conversion is always rounded toward zero.

1449

| If `a' is a NaN, the largest positive integer is returned. Otherwise, if

1450

| the conversion overflows, the largest integer with the same sign as `a' is

1451

| returned.

1452

*----------------------------------------------------------------------------*/

1453

1454

int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )

1455

{

1456

flag aSign;

1457

int_fast16_t aExp, shiftCount;

1458

uint32_t aSig;

1459

int32_t z;

1460

a = float32_squash_input_denormal(a STATUS_VAR);

1461

1462

aSig = extractFloat32Frac( a );

1463

aExp = extractFloat32Exp( a );

1464

aSign = extractFloat32Sign( a );

1465

shiftCount = aExp - 0x9E;

1466

if ( 0 <= shiftCount ) {

1467

if ( float32_val(a) != 0xCF000000 ) {

1468

float_raise( float_flag_invalid STATUS_VAR);

1469

if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;

1470

}

1471

return (int32_t) 0x80000000;

1472

}

1473

else if ( aExp <= 0x7E ) {

1474

if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;

1475

return 0;

1476

}

1477

aSig = ( aSig | 0x00800000 )<<8;

1478

z = aSig>>( - shiftCount );

1479

if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {

1480

STATUS(float_exception_flags) |= float_flag_inexact;

1481

}

1482

if ( aSign ) z = - z;

1483

return z;

1484

1485

}

1486

1487

/*----------------------------------------------------------------------------

1488

| Returns the result of converting the single-precision floating-point value

1489

| `a' to the 16-bit two's complement integer format. The conversion is

1490

| performed according to the IEC/IEEE Standard for Binary Floating-Point

1491

| Arithmetic, except that the conversion is always rounded toward zero.

1492

| If `a' is a NaN, the largest positive integer is returned. Otherwise, if

1493

| the conversion overflows, the largest integer with the same sign as `a' is

1494

| returned.

1495

*----------------------------------------------------------------------------*/

1496

1497

int_fast16_t float32_to_int16_round_to_zero(float32 a STATUS_PARAM)

1498

{

1499

flag aSign;

1500

int_fast16_t aExp, shiftCount;

1501

uint32_t aSig;

1502

int32 z;

1503

1504

aSig = extractFloat32Frac( a );

1505

aExp = extractFloat32Exp( a );

1506

aSign = extractFloat32Sign( a );

1507

shiftCount = aExp - 0x8E;

1508

if ( 0 <= shiftCount ) {

1509

if ( float32_val(a) != 0xC7000000 ) {

1510

float_raise( float_flag_invalid STATUS_VAR);

1511

if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {

1512

return 0x7FFF;

1513

}

1514

}

1515

return (int32_t) 0xffff8000;

1516

}

1517

else if ( aExp <= 0x7E ) {

1518

if ( aExp | aSig ) {

1519

STATUS(float_exception_flags) |= float_flag_inexact;

1520

}

1521

return 0;

1522

}

1523

shiftCount -= 0x10;

1524

aSig = ( aSig | 0x00800000 )<<8;

1525

z = aSig>>( - shiftCount );

1526

if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {

1527

STATUS(float_exception_flags) |= float_flag_inexact;

1528

}

1529

if ( aSign ) {

1530

z = - z;

1531

}

1532

return z;

1533

1534

}

1535

1536

/*----------------------------------------------------------------------------

1537

| Returns the result of converting the single-precision floating-point value

1538

| `a' to the 64-bit two's complement integer format. The conversion is

1539

| performed according to the IEC/IEEE Standard for Binary Floating-Point

1540

| Arithmetic---which means in particular that the conversion is rounded

1541

| according to the current rounding mode. If `a' is a NaN, the largest

1542

| positive integer is returned. Otherwise, if the conversion overflows, the

1543

| largest integer with the same sign as `a' is returned.

1544

*----------------------------------------------------------------------------*/

1545

1546

int64 float32_to_int64( float32 a STATUS_PARAM )

1547

{

1548

flag aSign;

1549

int_fast16_t aExp, shiftCount;

1550

uint32_t aSig;

1551

uint64_t aSig64, aSigExtra;

1552

a = float32_squash_input_denormal(a STATUS_VAR);

1553

1554

aSig = extractFloat32Frac( a );

1555

aExp = extractFloat32Exp( a );

1556

aSign = extractFloat32Sign( a );

1557

shiftCount = 0xBE - aExp;

1558

if ( shiftCount < 0 ) {

1559

float_raise( float_flag_invalid STATUS_VAR);

1560

if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {

1561

return LIT64( 0x7FFFFFFFFFFFFFFF );

1562

}

1563

return (int64_t) LIT64( 0x8000000000000000 );

1564

}

1565

if ( aExp ) aSig |= 0x00800000;

1566

aSig64 = aSig;

1567

aSig64 <<= 40;

1568

shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );

1569

return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );

1570

1571

}

1572

1573

/*----------------------------------------------------------------------------

1574

| Returns the result of converting the single-precision floating-point value

1575

| `a' to the 64-bit unsigned integer format. The conversion is

1576

| performed according to the IEC/IEEE Standard for Binary Floating-Point

1577

| Arithmetic---which means in particular that the conversion is rounded

1578

| according to the current rounding mode. If `a' is a NaN, the largest

1579

| unsigned integer is returned. Otherwise, if the conversion overflows, the

1580

| largest unsigned integer is returned. If the 'a' is negative, the result

1581

| is rounded and zero is returned; values that do not round to zero will

1582

| raise the inexact exception flag.

1583

*----------------------------------------------------------------------------*/

1584

1585

uint64 float32_to_uint64(float32 a STATUS_PARAM)

1586

{

1587

flag aSign;

1588

int_fast16_t aExp, shiftCount;

1589

uint32_t aSig;

1590

uint64_t aSig64, aSigExtra;

1591

a = float32_squash_input_denormal(a STATUS_VAR);

1592

1593

aSig = extractFloat32Frac(a);

1594

aExp = extractFloat32Exp(a);

1595

aSign = extractFloat32Sign(a);

1596

if ((aSign) && (aExp > 126)) {

1597

float_raise(float_flag_invalid STATUS_VAR);

1598

if (float32_is_any_nan(a)) {

1599

return LIT64(0xFFFFFFFFFFFFFFFF);

1600

} else {

1601

return 0;

1602

}

1603

}

1604

shiftCount = 0xBE - aExp;

1605

if (aExp) {

1606

aSig |= 0x00800000;

1607

}

1608

if (shiftCount < 0) {

1609

float_raise(float_flag_invalid STATUS_VAR);

1610

return LIT64(0xFFFFFFFFFFFFFFFF);

1611

}

1612

1613

aSig64 = aSig;

1614

aSig64 <<= 40;

1615

shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);

1616

return roundAndPackUint64(aSign, aSig64, aSigExtra STATUS_VAR);

1617

}

1618

1619

/*----------------------------------------------------------------------------

1620

| Returns the result of converting the single-precision floating-point value

1621

| `a' to the 64-bit two's complement integer format. The conversion is

1622

| performed according to the IEC/IEEE Standard for Binary Floating-Point

1623

| Arithmetic, except that the conversion is always rounded toward zero. If

1624

| `a' is a NaN, the largest positive integer is returned. Otherwise, if the

1625

| conversion overflows, the largest integer with the same sign as `a' is

1626

| returned.

1627

*----------------------------------------------------------------------------*/

1628

1629

int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )

1630

{

1631

flag aSign;

1632

int_fast16_t aExp, shiftCount;

1633

uint32_t aSig;

1634

uint64_t aSig64;

1635

int64 z;

1636

a = float32_squash_input_denormal(a STATUS_VAR);

1637

1638

aSig = extractFloat32Frac( a );

1639

aExp = extractFloat32Exp( a );

1640

aSign = extractFloat32Sign( a );

1641

shiftCount = aExp - 0xBE;

1642

if ( 0 <= shiftCount ) {

1643

if ( float32_val(a) != 0xDF000000 ) {

1644

float_raise( float_flag_invalid STATUS_VAR);

1645

if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {

1646

return LIT64( 0x7FFFFFFFFFFFFFFF );

1647

}

1648

}

1649

return (int64_t) LIT64( 0x8000000000000000 );

1650

}

1651

else if ( aExp <= 0x7E ) {

1652

if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;

1653

return 0;

1654

}

1655

aSig64 = aSig | 0x00800000;

1656

aSig64 <<= 40;

1657

z = aSig64>>( - shiftCount );

1658

if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {

1659

STATUS(float_exception_flags) |= float_flag_inexact;

1660

}

1661

if ( aSign ) z = - z;

1662

return z;

1663

1664

}

1665

1666

/*----------------------------------------------------------------------------

1667

| Returns the result of converting the single-precision floating-point value

1668

| `a' to the double-precision floating-point format. The conversion is

1669

| performed according to the IEC/IEEE Standard for Binary Floating-Point

1670

| Arithmetic.

1671

*----------------------------------------------------------------------------*/

1672

1673

float64 float32_to_float64( float32 a STATUS_PARAM )

1674

{

1675

flag aSign;

1676

int_fast16_t aExp;

1677

uint32_t aSig;

1678

a = float32_squash_input_denormal(a STATUS_VAR);

1679

1680

aSig = extractFloat32Frac( a );

1681

aExp = extractFloat32Exp( a );

1682

aSign = extractFloat32Sign( a );

1683

if ( aExp == 0xFF ) {

1684

if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );

1685

return packFloat64( aSign, 0x7FF, 0 );

1686

}

1687

if ( aExp == 0 ) {

1688

if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );

1689

normalizeFloat32Subnormal( aSig, &aExp, &aSig );

1690

--aExp;

1691

}

1692

return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );

1693

1694

}

1695

1696

/*----------------------------------------------------------------------------

1697

| Returns the result of converting the single-precision floating-point value

1698

| `a' to the extended double-precision floating-point format. The conversion

1699

| is performed according to the IEC/IEEE Standard for Binary Floating-Point

1700

| Arithmetic.

1701

*----------------------------------------------------------------------------*/

1702

1703

floatx80 float32_to_floatx80( float32 a STATUS_PARAM )

1704

{

1705

flag aSign;

1706

int_fast16_t aExp;

1707

uint32_t aSig;

1708

1709

a = float32_squash_input_denormal(a STATUS_VAR);

1710

aSig = extractFloat32Frac( a );

1711

aExp = extractFloat32Exp( a );

1712

aSign = extractFloat32Sign( a );

1713

if ( aExp == 0xFF ) {

1714

if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );

1715

return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );

1716

}

1717

if ( aExp == 0 ) {

1718

if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );

1719

normalizeFloat32Subnormal( aSig, &aExp, &aSig );

1720

}

1721

aSig |= 0x00800000;

1722

return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );

1723

1724

}

1725

1726

/*----------------------------------------------------------------------------

1727

| Returns the result of converting the single-precision floating-point value

1728

| `a' to the quadruple-precision floating-point format. The conversion is

1729

| performed according to the IEC/IEEE Standard for Binary Floating-Point

1730

| Arithmetic.

1731

*----------------------------------------------------------------------------*/

1732

1733

float128 float32_to_float128( float32 a STATUS_PARAM )
{
    flag sign;
    int_fast16_t expField;
    uint32_t frac;

    a = float32_squash_input_denormal(a STATUS_VAR);
    frac = extractFloat32Frac(a);
    expField = extractFloat32Exp(a);
    sign = extractFloat32Sign(a);

    if (expField == 0xFF) {
        /* NaN goes through the common-NaN helpers; otherwise infinity. */
        if (frac) {
            return commonNaNToFloat128(
                float32ToCommonNaN(a STATUS_VAR) STATUS_VAR);
        }
        return packFloat128(sign, 0x7FFF, 0, 0);
    }

    if (expField == 0) {
        if (frac == 0) {
            return packFloat128(sign, 0, 0, 0);
        }
        /* Subnormal input: normalize, then compensate the exponent by one
         * because packFloat128 is given the leading bit as part of `frac'. */
        normalizeFloat32Subnormal(frac, &expField, &frac);
        expField -= 1;
    }

    return packFloat128(sign, expField + 0x3F80, ((uint64_t) frac) << 25, 0);
}

1755

1756

/*----------------------------------------------------------------------------

1757

| Rounds the single-precision floating-point value `a' to an integer, and

1758

| returns the result as a single-precision floating-point value. The

1759

| operation is performed according to the IEC/IEEE Standard for Binary

1760

| Floating-Point Arithmetic.

1761

*----------------------------------------------------------------------------*/

1762

1763

float32 float32_round_to_int( float32 a STATUS_PARAM)

1764

{

1765

flag aSign;

1766

int_fast16_t aExp;

1767

uint32_t lastBitMask, roundBitsMask;

1768

uint32_t z;

1769

a = float32_squash_input_denormal(a STATUS_VAR);

1770

1771

aExp = extractFloat32Exp( a );

1772

if ( 0x96 <= aExp ) {

1773

if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {

1774

return propagateFloat32NaN( a, a STATUS_VAR );

1775

}

1776

return a;

1777

}

1778

if ( aExp <= 0x7E ) {

1779

if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;

1780

STATUS(float_exception_flags) |= float_flag_inexact;

1781

aSign = extractFloat32Sign( a );

1782

switch ( STATUS(float_rounding_mode) ) {

1783

case float_round_nearest_even:

1784

if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {

1785

return packFloat32( aSign, 0x7F, 0 );

1786

}

1787

break;

1788

case float_round_down:

1789

return make_float32(aSign ? 0xBF800000 : 0);

1790

case float_round_up:

1791

return make_float32(aSign ? 0x80000000 : 0x3F800000);

1792

}

1793

return packFloat32( aSign, 0, 0 );

1794

}

1795

lastBitMask = 1;

1796

lastBitMask <<= 0x96 - aExp;

1797

roundBitsMask = lastBitMask - 1;

1798

z = float32_val(a);

1799

switch (STATUS(float_rounding_mode)) {

1800

case float_round_nearest_even:

1801

z += lastBitMask>>1;

1802

if ((z & roundBitsMask) == 0) {

1803

z &= ~lastBitMask;

1804

}

1805

break;

1806

case float_round_to_zero:

1807

break;

1808

case float_round_up:

1809

if (!extractFloat32Sign(make_float32(z))) {

1810

z += roundBitsMask;

1811

}

1812

break;

1813

case float_round_down:

1814

if (extractFloat32Sign(make_float32(z))) {

1815

z += roundBitsMask;

1816

}

1817

break;

1818

default:

1819

abort();

1820

}

1821

z &= ~ roundBitsMask;

1822

if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;

1823

return make_float32(z);

1824

1825

}

1826

1827

/*----------------------------------------------------------------------------

1828

| Returns the result of adding the absolute values of the single-precision

1829

| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated

1830

| before being returned. `zSign' is ignored if the result is a NaN.

1831

| The addition is performed according to the IEC/IEEE Standard for Binary

1832

| Floating-Point Arithmetic.

1833

*----------------------------------------------------------------------------*/

1834

1835

static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)

1836

{

1837

int_fast16_t aExp, bExp, zExp;

1838

uint32_t aSig, bSig, zSig;

1839

int_fast16_t expDiff;

1840

1841

aSig = extractFloat32Frac( a );

1842

aExp = extractFloat32Exp( a );

1843

bSig = extractFloat32Frac( b );

1844

bExp = extractFloat32Exp( b );

1845

expDiff = aExp - bExp;

1846

aSig <<= 6;

1847

bSig <<= 6;

1848

if ( 0 < expDiff ) {

1849

if ( aExp == 0xFF ) {

1850

if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );

1851

return a;

1852

}

1853

if ( bExp == 0 ) {

1854

--expDiff;

1855

}

1856

else {

1857

bSig |= 0x20000000;

1858

}

1859

shift32RightJamming( bSig, expDiff, &bSig );

1860

zExp = aExp;

1861

}

1862

else if ( expDiff < 0 ) {

1863

if ( bExp == 0xFF ) {

1864

if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );

1865

return packFloat32( zSign, 0xFF, 0 );

1866

}

1867

if ( aExp == 0 ) {

1868

++expDiff;

1869

}

1870

else {

1871

aSig |= 0x20000000;

1872

}

1873

shift32RightJamming( aSig, - expDiff, &aSig );

1874

zExp = bExp;

1875

}

1876

else {

1877

if ( aExp == 0xFF ) {

1878

if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );

1879

return a;

1880

}

1881

if ( aExp == 0 ) {

1882

if (STATUS(flush_to_zero)) {

1883

if (aSig | bSig) {

1884

float_raise(float_flag_output_denormal STATUS_VAR);

1885

}

1886

return packFloat32(zSign, 0, 0);

1887

}

1888

return packFloat32( zSign, 0, ( aSig + bSig )>>6 );

1889

}

1890

zSig = 0x40000000 + aSig + bSig;

1891

zExp = aExp;

1892

goto roundAndPack;

1893

}

1894

aSig |= 0x20000000;

1895

zSig = ( aSig + bSig )<<1;

1896

--zExp;

1897

if ( (int32_t) zSig < 0 ) {

1898

zSig = aSig + bSig;

1899

++zExp;

1900

}

1901

roundAndPack:

1902

return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );

1903

1904

}

1905

1906

/*----------------------------------------------------------------------------

1907

| Returns the result of subtracting the absolute values of the single-

1908

| precision floating-point values `a' and `b'. If `zSign' is 1, the

1909

| difference is negated before being returned. `zSign' is ignored if the

1910

| result is a NaN. The subtraction is performed according to the IEC/IEEE

1911

| Standard for Binary Floating-Point Arithmetic.

1912

*----------------------------------------------------------------------------*/

1913

1914

static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)

1915

{

1916

int_fast16_t aExp, bExp, zExp;

1917

uint32_t aSig, bSig, zSig;

1918

int_fast16_t expDiff;

1919

1920

aSig = extractFloat32Frac( a );

1921

aExp = extractFloat32Exp( a );

1922

bSig = extractFloat32Frac( b );

1923

bExp = extractFloat32Exp( b );

1924

expDiff = aExp - bExp;

1925

aSig <<= 7;

1926

bSig <<= 7;

1927

if ( 0 < expDiff ) goto aExpBigger;

1928

if ( expDiff < 0 ) goto bExpBigger;

1929

if ( aExp == 0xFF ) {

1930

if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );

1931

float_raise( float_flag_invalid STATUS_VAR);

1932

return float32_default_nan;

1933

}

1934

if ( aExp == 0 ) {

1935

aExp = 1;

1936

bExp = 1;

1937

}

1938

if ( bSig < aSig ) goto aBigger;

1939

if ( aSig < bSig ) goto bBigger;

1940

return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );

1941

bExpBigger:

1942

if ( bExp == 0xFF ) {

1943

if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );

1944

return packFloat32( zSign ^ 1, 0xFF, 0 );

1945

}

1946

if ( aExp == 0 ) {

1947

++expDiff;

1948

}

1949

else {

1950

aSig |= 0x40000000;

1951

}

1952

shift32RightJamming( aSig, - expDiff, &aSig );

1953

bSig |= 0x40000000;

1954

bBigger:

1955

zSig = bSig - aSig;

1956

zExp = bExp;

1957

zSign ^= 1;

1958

goto normalizeRoundAndPack;

1959

aExpBigger:

1960

if ( aExp == 0xFF ) {

1961

if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );

1962

return a;

1963

}

1964

if ( bExp == 0 ) {

1965

--expDiff;

1966

}

1967

else {

1968

bSig |= 0x40000000;

1969

}

1970

shift32RightJamming( bSig, expDiff, &bSig );

1971

aSig |= 0x40000000;

1972

aBigger:

1973

zSig = aSig - bSig;

1974

zExp = aExp;

1975

normalizeRoundAndPack:

1976

--zExp;

1977

return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );

1978

1979

}

1980

1981

/*----------------------------------------------------------------------------

1982

| Returns the result of adding the single-precision floating-point values `a'

1983

| and `b'. The operation is performed according to the IEC/IEEE Standard for

1984

| Binary Floating-Point Arithmetic.

1985

*----------------------------------------------------------------------------*/

1986

1987

float32 float32_add( float32 a, float32 b STATUS_PARAM )

1988

{

1989

flag aSign, bSign;

1990

a = float32_squash_input_denormal(a STATUS_VAR);

1991

b = float32_squash_input_denormal(b STATUS_VAR);

1992

1993

aSign = extractFloat32Sign( a );

1994

bSign = extractFloat32Sign( b );

1995

if ( aSign == bSign ) {

1996

return addFloat32Sigs( a, b, aSign STATUS_VAR);

1997

}

1998

else {

1999

return subFloat32Sigs( a, b, aSign STATUS_VAR );

2000

}

2001

2002

}

2003

2004

/*----------------------------------------------------------------------------

2005

| Returns the result of subtracting the single-precision floating-point values

2006

| `a' and `b'. The operation is performed according to the IEC/IEEE Standard

2007

| for Binary Floating-Point Arithmetic.

2008

*----------------------------------------------------------------------------*/

2009

2010

float32 float32_sub( float32 a, float32 b STATUS_PARAM )

2011

{

2012

flag aSign, bSign;

2013

a = float32_squash_input_denormal(a STATUS_VAR);

2014

b = float32_squash_input_denormal(b STATUS_VAR);

2015

2016

aSign = extractFloat32Sign( a );

2017

bSign = extractFloat32Sign( b );

2018

if ( aSign == bSign ) {

2019

return subFloat32Sigs( a, b, aSign STATUS_VAR );

2020

}

2021

else {

2022

return addFloat32Sigs( a, b, aSign STATUS_VAR );

2023

}

2024

2025

}

2026

2027

/*----------------------------------------------------------------------------

2028

| Returns the result of multiplying the single-precision floating-point values

2029

| `a' and `b'. The operation is performed according to the IEC/IEEE Standard

2030

| for Binary Floating-Point Arithmetic.

2031

*----------------------------------------------------------------------------*/

2032

2033

float32 float32_mul( float32 a, float32 b STATUS_PARAM )

2034

{

2035

flag aSign, bSign, zSign;

2036

int_fast16_t aExp, bExp, zExp;

2037

uint32_t aSig, bSig;

2038

uint64_t zSig64;

2039

uint32_t zSig;

2040

2041

a = float32_squash_input_denormal(a STATUS_VAR);

2042

b = float32_squash_input_denormal(b STATUS_VAR);

2043

2044

aSig = extractFloat32Frac( a );

2045

aExp = extractFloat32Exp( a );

2046

aSign = extractFloat32Sign( a );

2047

bSig = extractFloat32Frac( b );

2048

bExp = extractFloat32Exp( b );

2049

bSign = extractFloat32Sign( b );

2050

zSign = aSign ^ bSign;

2051

if ( aExp == 0xFF ) {

2052

if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {

2053

return propagateFloat32NaN( a, b STATUS_VAR );

2054

}

2055

if ( ( bExp | bSig ) == 0 ) {

2056

float_raise( float_flag_invalid STATUS_VAR);

2057

return float32_default_nan;

2058

}

2059

return packFloat32( zSign, 0xFF, 0 );

2060

}

2061

if ( bExp == 0xFF ) {

2062

if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );

2063

if ( ( aExp | aSig ) == 0 ) {

2064

float_raise( float_flag_invalid STATUS_VAR);

2065

return float32_default_nan;

2066

}

2067

return packFloat32( zSign, 0xFF, 0 );

2068

}

2069

if ( aExp == 0 ) {

2070

if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );

2071

normalizeFloat32Subnormal( aSig, &aExp, &aSig );

2072

}

2073

if ( bExp == 0 ) {

2074

if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );

2075

normalizeFloat32Subnormal( bSig, &bExp, &bSig );

2076

}

2077

zExp = aExp + bExp - 0x7F;

2078

aSig = ( aSig | 0x00800000 )<<7;

2079

bSig = ( bSig | 0x00800000 )<<8;

2080

shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );

2081

zSig = zSig64;

2082

if ( 0 <= (int32_t) ( zSig<<1 ) ) {

2083

zSig <<= 1;

2084

--zExp;

2085

}

2086

return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );

2087

2088

}

2089

2090

/*----------------------------------------------------------------------------

2091

| Returns the result of dividing the single-precision floating-point value `a'

2092

| by the corresponding value `b'. The operation is performed according to the

2093

| IEC/IEEE Standard for Binary Floating-Point Arithmetic.

2094

*----------------------------------------------------------------------------*/

2095

2096

float32 float32_div( float32 a, float32 b STATUS_PARAM )

2097

{

2098

flag aSign, bSign, zSign;

2099

int_fast16_t aExp, bExp, zExp;

2100

uint32_t aSig, bSig, zSig;

2101

a = float32_squash_input_denormal(a STATUS_VAR);

2102

b = float32_squash_input_denormal(b STATUS_VAR);

2103

2104

aSig = extractFloat32Frac( a );

2105

aExp = extractFloat32Exp( a );

2106

aSign = extractFloat32Sign( a );

2107

bSig = extractFloat32Frac( b );

2108

bExp = extractFloat32Exp( b );

2109

bSign = extractFloat32Sign( b );

2110

zSign = aSign ^ bSign;

2111

if ( aExp == 0xFF ) {

2112

if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );

2113

if ( bExp == 0xFF ) {

2114

if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );

2115

float_raise( float_flag_invalid STATUS_VAR);

2116

return float32_default_nan;

2117

}

2118

return packFloat32( zSign, 0xFF, 0 );

2119

}

2120

if ( bExp == 0xFF ) {

2121

if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );

2122

return packFloat32( zSign, 0, 0 );

2123

}

2124

if ( bExp == 0 ) {

2125

if ( bSig == 0 ) {

2126

if ( ( aExp | aSig ) == 0 ) {

2127

float_raise( float_flag_invalid STATUS_VAR);

2128

return float32_default_nan;

2129

}

2130

float_raise( float_flag_divbyzero STATUS_VAR);

2131

return packFloat32( zSign, 0xFF, 0 );

2132

}

2133

normalizeFloat32Subnormal( bSig, &bExp, &bSig );

2134

}

2135

if ( aExp == 0 ) {

2136

if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );

2137

normalizeFloat32Subnormal( aSig, &aExp, &aSig );

2138

}

2139

zExp = aExp - bExp + 0x7D;

2140

aSig = ( aSig | 0x00800000 )<<7;

2141

bSig = ( bSig | 0x00800000 )<<8;

2142

if ( bSig <= ( aSig + aSig ) ) {

2143

aSig >>= 1;

2144

++zExp;

2145

}

2146

zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;

2147

if ( ( zSig & 0x3F ) == 0 ) {

2148

zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );

2149

}

2150

return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );

2151

2152

}

2153

2154

/*----------------------------------------------------------------------------

2155

| Returns the remainder of the single-precision floating-point value `a'

2156

| with respect to the corresponding value `b'. The operation is performed

2157

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

2158

*----------------------------------------------------------------------------*/

2159

2160

float32 float32_rem( float32 a, float32 b STATUS_PARAM )

2161

{

2162

flag aSign, zSign;

2163

int_fast16_t aExp, bExp, expDiff;

2164

uint32_t aSig, bSig;

2165

uint32_t q;

2166

uint64_t aSig64, bSig64, q64;

2167

uint32_t alternateASig;

2168

int32_t sigMean;

2169

a = float32_squash_input_denormal(a STATUS_VAR);

2170

b = float32_squash_input_denormal(b STATUS_VAR);

2171

2172

aSig = extractFloat32Frac( a );

2173

aExp = extractFloat32Exp( a );

2174

aSign = extractFloat32Sign( a );

2175

bSig = extractFloat32Frac( b );

2176

bExp = extractFloat32Exp( b );

2177

if ( aExp == 0xFF ) {

2178

if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {

2179

return propagateFloat32NaN( a, b STATUS_VAR );

2180

}

2181

float_raise( float_flag_invalid STATUS_VAR);

2182

return float32_default_nan;

2183

}

2184

if ( bExp == 0xFF ) {

2185

if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );

2186

return a;

2187

}

2188

if ( bExp == 0 ) {

2189

if ( bSig == 0 ) {

2190

float_raise( float_flag_invalid STATUS_VAR);

2191

return float32_default_nan;

2192

}

2193

normalizeFloat32Subnormal( bSig, &bExp, &bSig );

2194

}

2195

if ( aExp == 0 ) {

2196

if ( aSig == 0 ) return a;

2197

normalizeFloat32Subnormal( aSig, &aExp, &aSig );

2198

}

2199

expDiff = aExp - bExp;

2200

aSig |= 0x00800000;

2201

bSig |= 0x00800000;

2202

if ( expDiff < 32 ) {

2203

aSig <<= 8;

2204

bSig <<= 8;

2205

if ( expDiff < 0 ) {

2206

if ( expDiff < -1 ) return a;

2207

aSig >>= 1;

2208

}

2209

q = ( bSig <= aSig );

2210

if ( q ) aSig -= bSig;

2211

if ( 0 < expDiff ) {

2212

q = ( ( (uint64_t) aSig )<<32 ) / bSig;

2213

q >>= 32 - expDiff;

2214

bSig >>= 2;

2215

aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;

2216

}

2217

else {

2218

aSig >>= 2;

2219

bSig >>= 2;

2220

}

2221

}

2222

else {

2223

if ( bSig <= aSig ) aSig -= bSig;

2224

aSig64 = ( (uint64_t) aSig )<<40;

2225

bSig64 = ( (uint64_t) bSig )<<40;

2226

expDiff -= 64;

2227

while ( 0 < expDiff ) {

2228

q64 = estimateDiv128To64( aSig64, 0, bSig64 );

2229

q64 = ( 2 < q64 ) ? q64 - 2 : 0;

2230

aSig64 = - ( ( bSig * q64 )<<38 );

2231

expDiff -= 62;

2232

}

2233

expDiff += 64;

2234

q64 = estimateDiv128To64( aSig64, 0, bSig64 );

2235

q64 = ( 2 < q64 ) ? q64 - 2 : 0;

2236

q = q64>>( 64 - expDiff );

2237

bSig <<= 6;

2238

aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;

2239

}

2240

do {

2241

alternateASig = aSig;

2242

++q;

2243

aSig -= bSig;

2244

} while ( 0 <= (int32_t) aSig );

2245

sigMean = aSig + alternateASig;

2246

if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {

2247

aSig = alternateASig;

2248

}

2249

zSign = ( (int32_t) aSig < 0 );

2250

if ( zSign ) aSig = - aSig;

2251

return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );

2252

2253

}

2254

2255

/*----------------------------------------------------------------------------

2256

| Returns the result of multiplying the single-precision floating-point values

2257

| `a' and `b' then adding 'c', with no intermediate rounding step after the

2258

| multiplication. The operation is performed according to the IEC/IEEE

2259

| Standard for Binary Floating-Point Arithmetic 754-2008.

2260

| The flags argument allows the caller to select negation of the

2261

| addend, the intermediate product, or the final result. (The difference

2262

| between this and having the caller do a separate negation is that negating

2263

| externally will flip the sign bit on NaNs.)

2264

*----------------------------------------------------------------------------*/

2265

2266

float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)

2267

{

2268

flag aSign, bSign, cSign, zSign;

2269

int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;

2270

uint32_t aSig, bSig, cSig;

2271

flag pInf, pZero, pSign;

2272

uint64_t pSig64, cSig64, zSig64;

2273

uint32_t pSig;

2274

int shiftcount;

2275

flag signflip, infzero;

2276

2277

a = float32_squash_input_denormal(a STATUS_VAR);

2278

b = float32_squash_input_denormal(b STATUS_VAR);

2279

c = float32_squash_input_denormal(c STATUS_VAR);

2280

aSig = extractFloat32Frac(a);

2281

aExp = extractFloat32Exp(a);

2282

aSign = extractFloat32Sign(a);

2283

bSig = extractFloat32Frac(b);

2284

bExp = extractFloat32Exp(b);

2285

bSign = extractFloat32Sign(b);

2286

cSig = extractFloat32Frac(c);

2287

cExp = extractFloat32Exp(c);

2288

cSign = extractFloat32Sign(c);

2289

2290

infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||

2291

(aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));

2292

2293

/* It is implementation-defined whether the cases of (0,inf,qnan)

2294

* and (inf,0,qnan) raise InvalidOperation or not (and what QNaN

2295

* they return if they do), so we have to hand this information

2296

* off to the target-specific pick-a-NaN routine.

2297

2298

if (((aExp == 0xff) && aSig) ||

2299

((bExp == 0xff) && bSig) ||

2300

((cExp == 0xff) && cSig)) {

2301

return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR);

2302

}

2303

2304

if (infzero) {

2305

float_raise(float_flag_invalid STATUS_VAR);

2306

return float32_default_nan;

2307

}

2308

2309

if (flags & float_muladd_negate_c) {

2310

cSign ^= 1;

2311

}

2312

2313

signflip = (flags & float_muladd_negate_result) ? 1 : 0;

2314

2315

/* Work out the sign and type of the product */

2316

pSign = aSign ^ bSign;

2317

if (flags & float_muladd_negate_product) {

2318

pSign ^= 1;

2319

}

2320

pInf = (aExp == 0xff) || (bExp == 0xff);

2321

pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);

2322

2323

if (cExp == 0xff) {

2324

if (pInf && (pSign ^ cSign)) {

2325

/* addition of opposite-signed infinities => InvalidOperation */

2326

float_raise(float_flag_invalid STATUS_VAR);

2327

return float32_default_nan;

2328

}

2329

/* Otherwise generate an infinity of the same sign */

2330

return packFloat32(cSign ^ signflip, 0xff, 0);

2331

}

2332

2333

if (pInf) {

2334

return packFloat32(pSign ^ signflip, 0xff, 0);

2335

}

2336

2337

if (pZero) {

2338

if (cExp == 0) {

2339

if (cSig == 0) {

2340

/* Adding two exact zeroes */

2341

if (pSign == cSign) {

2342

zSign = pSign;

2343

} else if (STATUS(float_rounding_mode) == float_round_down) {

2344

zSign = 1;

2345

} else {

2346

zSign = 0;

2347

}

2348

return packFloat32(zSign ^ signflip, 0, 0);

2349

}

2350

/* Exact zero plus a denorm */

2351

if (STATUS(flush_to_zero)) {

2352

float_raise(float_flag_output_denormal STATUS_VAR);

2353

return packFloat32(cSign ^ signflip, 0, 0);

2354

}

2355

}

2356

/* Zero plus something non-zero : just return the something */

2357

return packFloat32(cSign ^ signflip, cExp, cSig);

2358

}

2359

2360

if (aExp == 0) {

2361

normalizeFloat32Subnormal(aSig, &aExp, &aSig);

2362

}

2363

if (bExp == 0) {

2364

normalizeFloat32Subnormal(bSig, &bExp, &bSig);

2365

}

2366

2367

/* Calculate the actual result a * b + c */

2368

2369

/* Multiply first; this is easy. */

2370

/* NB: we subtract 0x7e where float32_mul() subtracts 0x7f

2371

* because we want the true exponent, not the "one-less-than"

2372

* flavour that roundAndPackFloat32() takes.

2373

2374

pExp = aExp + bExp - 0x7e;

2375

aSig = (aSig | 0x00800000) << 7;

2376

bSig = (bSig | 0x00800000) << 8;

2377

pSig64 = (uint64_t)aSig * bSig;

2378

if ((int64_t)(pSig64 << 1) >= 0) {

2379

pSig64 <<= 1;

2380

pExp--;

2381

}

2382

2383

zSign = pSign ^ signflip;

2384

2385

/* Now pSig64 is the significand of the multiply, with the explicit bit in

2386

* position 62.

2387

2388

if (cExp == 0) {

2389

if (!cSig) {

2390

/* Throw out the special case of c being an exact zero now */

2391

shift64RightJamming(pSig64, 32, &pSig64);

2392

pSig = pSig64;

2393

return roundAndPackFloat32(zSign, pExp - 1,

2394

pSig STATUS_VAR);

2395

}

2396

normalizeFloat32Subnormal(cSig, &cExp, &cSig);

2397

}

2398

2399

cSig64 = (uint64_t)cSig << (62 - 23);

2400

cSig64 |= LIT64(0x4000000000000000);

2401

expDiff = pExp - cExp;

2402

2403

if (pSign == cSign) {

2404

/* Addition */

2405

if (expDiff > 0) {

2406

/* scale c to match p */

2407

shift64RightJamming(cSig64, expDiff, &cSig64);

2408

zExp = pExp;

2409

} else if (expDiff < 0) {

2410

/* scale p to match c */

2411

shift64RightJamming(pSig64, -expDiff, &pSig64);

2412

zExp = cExp;

2413

} else {

2414

/* no scaling needed */

2415

zExp = cExp;

2416

}

2417

/* Add significands and make sure explicit bit ends up in posn 62 */

2418

zSig64 = pSig64 + cSig64;

2419

if ((int64_t)zSig64 < 0) {

2420

shift64RightJamming(zSig64, 1, &zSig64);

2421

} else {

2422

zExp--;

2423

}

2424

} else {

2425

/* Subtraction */

2426

if (expDiff > 0) {

2427

shift64RightJamming(cSig64, expDiff, &cSig64);

2428

zSig64 = pSig64 - cSig64;

2429

zExp = pExp;

2430

} else if (expDiff < 0) {

2431

shift64RightJamming(pSig64, -expDiff, &pSig64);

2432

zSig64 = cSig64 - pSig64;

2433

zExp = cExp;

2434

zSign ^= 1;

2435

} else {

2436

zExp = pExp;

2437

if (cSig64 < pSig64) {

2438

zSig64 = pSig64 - cSig64;

2439

} else if (pSig64 < cSig64) {

2440

zSig64 = cSig64 - pSig64;

2441

zSign ^= 1;

2442

} else {

2443

/* Exact zero */

2444

zSign = signflip;

2445

if (STATUS(float_rounding_mode) == float_round_down) {

2446

zSign ^= 1;

2447

}

2448

return packFloat32(zSign, 0, 0);

2449

}

2450

}

2451

--zExp;

2452

/* Normalize to put the explicit bit back into bit 62. */

2453

shiftcount = countLeadingZeros64(zSig64) - 1;

2454

zSig64 <<= shiftcount;

2455

zExp -= shiftcount;

2456

}

2457

shift64RightJamming(zSig64, 32, &zSig64);

2458

return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);

2459

}

2460

2461

2462

/*----------------------------------------------------------------------------

2463

| Returns the square root of the single-precision floating-point value `a'.

2464

| The operation is performed according to the IEC/IEEE Standard for Binary

2465

| Floating-Point Arithmetic.

2466

*----------------------------------------------------------------------------*/

2467

2468

float32 float32_sqrt( float32 a STATUS_PARAM )

2469

{

2470

flag aSign;

2471

int_fast16_t aExp, zExp;

2472

uint32_t aSig, zSig;

2473

uint64_t rem, term;

2474

a = float32_squash_input_denormal(a STATUS_VAR);

2475

2476

aSig = extractFloat32Frac( a );

2477

aExp = extractFloat32Exp( a );

2478

aSign = extractFloat32Sign( a );

2479

if ( aExp == 0xFF ) {

2480

if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );

2481

if ( ! aSign ) return a;

2482

float_raise( float_flag_invalid STATUS_VAR);

2483

return float32_default_nan;

2484

}

2485

if ( aSign ) {

2486

if ( ( aExp | aSig ) == 0 ) return a;

2487

float_raise( float_flag_invalid STATUS_VAR);

2488

return float32_default_nan;

2489

}

2490

if ( aExp == 0 ) {

2491

if ( aSig == 0 ) return float32_zero;

2492

normalizeFloat32Subnormal( aSig, &aExp, &aSig );

2493

}

2494

zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;

2495

aSig = ( aSig | 0x00800000 )<<8;

2496

zSig = estimateSqrt32( aExp, aSig ) + 2;

2497

if ( ( zSig & 0x7F ) <= 5 ) {

2498

if ( zSig < 2 ) {

2499

zSig = 0x7FFFFFFF;

2500

goto roundAndPack;

2501

}

2502

aSig >>= aExp & 1;

2503

term = ( (uint64_t) zSig ) * zSig;

2504

rem = ( ( (uint64_t) aSig )<<32 ) - term;

2505

while ( (int64_t) rem < 0 ) {

2506

--zSig;

2507

rem += ( ( (uint64_t) zSig )<<1 ) | 1;

2508

}

2509

zSig |= ( rem != 0 );

2510

}

2511

shift32RightJamming( zSig, 1, &zSig );

2512

roundAndPack:

2513

return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );

2514

2515

}

2516

2517

/*----------------------------------------------------------------------------

2518

| Returns the binary exponential of the single-precision floating-point value

2519

| `a'. The operation is performed according to the IEC/IEEE Standard for

2520

| Binary Floating-Point Arithmetic.

2521

2522

| Uses the following identities:

2523

2524

| 1. -------------------------------------------------------------------------

2525

| x x*ln(2)

2526

| 2 = e

2527

2528

| 2. -------------------------------------------------------------------------

2529

| 2 3 4 5 n

2530

| x x x x x x x

2531

| e = 1 + --- + --- + --- + --- + --- + ... + --- + ...

2532

| 1! 2! 3! 4! 5! n!

2533

*----------------------------------------------------------------------------*/

2534

2535

static const float64 float32_exp2_coefficients[15] =

2536

{

2537

const_float64( 0x3ff0000000000000ll ), /* 1 */

2538

const_float64( 0x3fe0000000000000ll ), /* 2 */

2539

const_float64( 0x3fc5555555555555ll ), /* 3 */

2540

const_float64( 0x3fa5555555555555ll ), /* 4 */

2541

const_float64( 0x3f81111111111111ll ), /* 5 */

2542

const_float64( 0x3f56c16c16c16c17ll ), /* 6 */

2543

const_float64( 0x3f2a01a01a01a01all ), /* 7 */

2544

const_float64( 0x3efa01a01a01a01all ), /* 8 */

2545

const_float64( 0x3ec71de3a556c734ll ), /* 9 */

2546

const_float64( 0x3e927e4fb7789f5cll ), /* 10 */

2547

const_float64( 0x3e5ae64567f544e4ll ), /* 11 */

2548

const_float64( 0x3e21eed8eff8d898ll ), /* 12 */

2549

const_float64( 0x3de6124613a86d09ll ), /* 13 */

2550

const_float64( 0x3da93974a8c07c9dll ), /* 14 */

2551

const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */

2552

};

2553

2554

/*
 * Computes 2**a for the single-precision value `a'.
 *
 * NaNs are propagated, +infinity is returned unchanged, -infinity gives
 * zero and a zero input gives exactly one.  Every other input raises the
 * inexact flag and is evaluated in float64 as a 15-term power series in
 * x = a * ln(2) using float32_exp2_coefficients[], then converted back to
 * float32.
 */
float32 float32_exp2( float32 a STATUS_PARAM )
{
    flag aSign;
    int_fast16_t aExp;
    uint32_t aSig;
    float64 r, x, xn;
    int i;

    a = float32_squash_input_denormal(a STATUS_VAR);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );

    if ( aExp == 0xFF ) {
        if ( aSig ) {
            return propagateFloat32NaN( a, float32_zero STATUS_VAR );
        }
        /* 2**(-inf) is 0, 2**(+inf) is +inf. */
        return (aSign) ? float32_zero : a;
    }
    if ( aExp == 0 && aSig == 0 ) {
        /* 2**0 is exactly 1. */
        return float32_one;
    }

    float_raise( float_flag_inexact STATUS_VAR );

    /* ******************************* */
    /* using float64 for approximation */
    /* ******************************* */
    x = float32_to_float64(a STATUS_VAR);
    x = float64_mul(x, float64_ln2 STATUS_VAR);

    /* r accumulates 1 + sum of x**n * coefficient[n-1]; xn holds x**n. */
    xn = x;
    r = float64_one;
    for (i = 0 ; i < 15 ; i++) {
        float64 f;

        f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
        r = float64_add(r, f STATUS_VAR);

        xn = float64_mul(xn, x STATUS_VAR);
    }

    /* Pass the status through STATUS_VAR like every other call here,
     * rather than naming the macro's hidden parameter directly. */
    return float64_to_float32(r STATUS_VAR);
}

2596

2597

/*----------------------------------------------------------------------------

2598

| Returns the binary log of the single-precision floating-point value `a'.

2599

| The operation is performed according to the IEC/IEEE Standard for Binary

2600

| Floating-Point Arithmetic.

2601

*----------------------------------------------------------------------------*/

2602

float32 float32_log2( float32 a STATUS_PARAM )

2603

{

2604

flag aSign, zSign;

2605

int_fast16_t aExp;

2606

uint32_t aSig, zSig, i;

2607

2608

a = float32_squash_input_denormal(a STATUS_VAR);

2609

aSig = extractFloat32Frac( a );

2610

aExp = extractFloat32Exp( a );

2611

aSign = extractFloat32Sign( a );

2612

2613

if ( aExp == 0 ) {

2614

if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );

2615

normalizeFloat32Subnormal( aSig, &aExp, &aSig );

2616

}

2617

if ( aSign ) {

2618

float_raise( float_flag_invalid STATUS_VAR);

2619

return float32_default_nan;

2620

}

2621

if ( aExp == 0xFF ) {

2622

if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );

2623

return a;

2624

}

2625

2626

aExp -= 0x7F;

2627

aSig |= 0x00800000;

2628

zSign = aExp < 0;

2629

zSig = aExp << 23;

2630

2631

for (i = 1 << 22; i > 0; i >>= 1) {

2632

aSig = ( (uint64_t)aSig * aSig ) >> 23;

2633

if ( aSig & 0x01000000 ) {

2634

aSig >>= 1;

2635

zSig |= i;

2636

}

2637

}

2638

2639

if ( zSign )

2640

zSig = -zSig;

2641

2642

return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );

2643

}

2644

2645

/*----------------------------------------------------------------------------

2646

| Returns 1 if the single-precision floating-point value `a' is equal to

2647

| the corresponding value `b', and 0 otherwise. The invalid exception is

2648

| raised if either operand is a NaN. Otherwise, the comparison is performed

2649

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

2650

*----------------------------------------------------------------------------*/

2651

2652

int float32_eq( float32 a, float32 b STATUS_PARAM )

2653

{

2654

uint32_t av, bv;

2655

a = float32_squash_input_denormal(a STATUS_VAR);

2656

b = float32_squash_input_denormal(b STATUS_VAR);

2657

2658

if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )

2659

|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )

2660

) {

2661

float_raise( float_flag_invalid STATUS_VAR);

2662

return 0;

2663

}

2664

av = float32_val(a);

2665

bv = float32_val(b);

2666

return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );

2667

}

2668

2669

/*----------------------------------------------------------------------------

2670

| Returns 1 if the single-precision floating-point value `a' is less than

2671

| or equal to the corresponding value `b', and 0 otherwise. The invalid

2672

| exception is raised if either operand is a NaN. The comparison is performed

2673

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

2674

*----------------------------------------------------------------------------*/

2675

2676

int float32_le( float32 a, float32 b STATUS_PARAM )

2677

{

2678

flag aSign, bSign;

2679

uint32_t av, bv;

2680

a = float32_squash_input_denormal(a STATUS_VAR);

2681

b = float32_squash_input_denormal(b STATUS_VAR);

2682

2683

if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )

2684

|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )

2685

) {

2686

float_raise( float_flag_invalid STATUS_VAR);

2687

return 0;

2688

}

2689

aSign = extractFloat32Sign( a );

2690

bSign = extractFloat32Sign( b );

2691

av = float32_val(a);

2692

bv = float32_val(b);

2693

if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );

2694

return ( av == bv ) || ( aSign ^ ( av < bv ) );

2695

2696

}

2697

2698

/*----------------------------------------------------------------------------

2699

| Returns 1 if the single-precision floating-point value `a' is less than

2700

| the corresponding value `b', and 0 otherwise. The invalid exception is

2701

| raised if either operand is a NaN. The comparison is performed according

2702

| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

2703

*----------------------------------------------------------------------------*/

2704

2705

int float32_lt( float32 a, float32 b STATUS_PARAM )

2706

{

2707

flag aSign, bSign;

2708

uint32_t av, bv;

2709

a = float32_squash_input_denormal(a STATUS_VAR);

2710

b = float32_squash_input_denormal(b STATUS_VAR);

2711

2712

if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )

2713

|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )

2714

) {

2715

float_raise( float_flag_invalid STATUS_VAR);

2716

return 0;

2717

}

2718

aSign = extractFloat32Sign( a );

2719

bSign = extractFloat32Sign( b );

2720

av = float32_val(a);

2721

bv = float32_val(b);

2722

if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );

2723

return ( av != bv ) && ( aSign ^ ( av < bv ) );

2724

2725

}

2726

2727

/*----------------------------------------------------------------------------

2728

| Returns 1 if the single-precision floating-point values `a' and `b' cannot

2729

| be compared, and 0 otherwise. The invalid exception is raised if either

2730

| operand is a NaN. The comparison is performed according to the IEC/IEEE

2731

| Standard for Binary Floating-Point Arithmetic.

2732

*----------------------------------------------------------------------------*/

2733

2734

int float32_unordered( float32 a, float32 b STATUS_PARAM )

2735

{

2736

a = float32_squash_input_denormal(a STATUS_VAR);

2737

b = float32_squash_input_denormal(b STATUS_VAR);

2738

2739

if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )

2740

|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )

2741

) {

2742

float_raise( float_flag_invalid STATUS_VAR);

2743

return 1;

2744

}

2745

return 0;

2746

}

2747

2748

/*----------------------------------------------------------------------------

2749

| Returns 1 if the single-precision floating-point value `a' is equal to

2750

| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an

2751

| exception. The comparison is performed according to the IEC/IEEE Standard

2752

| for Binary Floating-Point Arithmetic.

2753

*----------------------------------------------------------------------------*/

2754

2755

int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )

2756

{

2757

a = float32_squash_input_denormal(a STATUS_VAR);

2758

b = float32_squash_input_denormal(b STATUS_VAR);

2759

2760

if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )

2761

|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )

2762

) {

2763

if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {

2764

float_raise( float_flag_invalid STATUS_VAR);

2765

}

2766

return 0;

2767

}

2768

return ( float32_val(a) == float32_val(b) ) ||

2769

( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );

2770

}

2771

2772

/*----------------------------------------------------------------------------

2773

| Returns 1 if the single-precision floating-point value `a' is less than or

2774

| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not

2775

| cause an exception. Otherwise, the comparison is performed according to the

2776

| IEC/IEEE Standard for Binary Floating-Point Arithmetic.

2777

*----------------------------------------------------------------------------*/

2778

2779

int float32_le_quiet( float32 a, float32 b STATUS_PARAM )

2780

{

2781

flag aSign, bSign;

2782

uint32_t av, bv;

2783

a = float32_squash_input_denormal(a STATUS_VAR);

2784

b = float32_squash_input_denormal(b STATUS_VAR);

2785

2786

if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )

2787

|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )

2788

) {

2789

if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {

2790

float_raise( float_flag_invalid STATUS_VAR);

2791

}

2792

return 0;

2793

}

2794

aSign = extractFloat32Sign( a );

2795

bSign = extractFloat32Sign( b );

2796

av = float32_val(a);

2797

bv = float32_val(b);

2798

if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );

2799

return ( av == bv ) || ( aSign ^ ( av < bv ) );

2800

2801

}

2802

2803

/*----------------------------------------------------------------------------

2804

| Returns 1 if the single-precision floating-point value `a' is less than

2805

| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an

2806

| exception. Otherwise, the comparison is performed according to the IEC/IEEE

2807

| Standard for Binary Floating-Point Arithmetic.

2808

*----------------------------------------------------------------------------*/

2809

2810

int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
{
    flag signA, signB;
    uint32_t bitsA, bitsB;
    int aIsNaN, bIsNaN;

    a = float32_squash_input_denormal(a STATUS_VAR);
    b = float32_squash_input_denormal(b STATUS_VAR);

    /* A NaN has an all-ones exponent field and a non-zero fraction. */
    aIsNaN = (extractFloat32Exp(a) == 0xFF) && extractFloat32Frac(a);
    bIsNaN = (extractFloat32Exp(b) == 0xFF) && extractFloat32Frac(b);
    if (aIsNaN || bIsNaN) {
        /* Unordered operands compare false; only a signaling NaN raises
         * the invalid exception.
         */
        if (float32_is_signaling_nan(a) || float32_is_signaling_nan(b)) {
            float_raise(float_flag_invalid STATUS_VAR);
        }
        return 0;
    }

    signA = extractFloat32Sign(a);
    signB = extractFloat32Sign(b);
    bitsA = float32_val(a);
    bitsB = float32_val(b);

    if (signA != signB) {
        /* Opposite signs: true only when `a' is the negative operand and
         * the two encodings are not both zero (sign bit ignored).
         */
        return signA && ((uint32_t)((bitsA | bitsB) << 1) != 0);
    }
    /* Same sign: compare the raw encodings, inverting the order when both
     * operands are negative.
     */
    return (bitsA != bitsB) && (signA ^ (bitsA < bitsB));
}

2833

2834

/*----------------------------------------------------------------------------

2835

| Returns 1 if the single-precision floating-point values `a' and `b' cannot

2836

| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The

2837

| comparison is performed according to the IEC/IEEE Standard for Binary

2838

| Floating-Point Arithmetic.

2839

*----------------------------------------------------------------------------*/

2840

2841

int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
{
    int aIsNaN, bIsNaN;

    a = float32_squash_input_denormal(a STATUS_VAR);
    b = float32_squash_input_denormal(b STATUS_VAR);

    /* A NaN has an all-ones exponent field and a non-zero fraction. */
    aIsNaN = (extractFloat32Exp(a) == 0xFF) && extractFloat32Frac(a);
    bIsNaN = (extractFloat32Exp(b) == 0xFF) && extractFloat32Frac(b);
    if (!(aIsNaN || bIsNaN)) {
        /* Neither operand is a NaN, so the pair is ordered. */
        return 0;
    }

    /* Unordered; the invalid exception is raised only for signaling NaNs. */
    if (float32_is_signaling_nan(a) || float32_is_signaling_nan(b)) {
        float_raise(float_flag_invalid STATUS_VAR);
    }
    return 1;
}

2856

2857

/*----------------------------------------------------------------------------

2858

| Returns the result of converting the double-precision floating-point value

2859

| `a' to the 32-bit two's complement integer format. The conversion is

2860

| performed according to the IEC/IEEE Standard for Binary Floating-Point

2861

| Arithmetic---which means in particular that the conversion is rounded

2862

| according to the current rounding mode. If `a' is a NaN, the largest

2863

| positive integer is returned. Otherwise, if the conversion overflows, the

2864

| largest integer with the same sign as `a' is returned.

2865

*----------------------------------------------------------------------------*/

2866

2867

int32 float64_to_int32( float64 a STATUS_PARAM )
{
    flag sign;
    int_fast16_t biasedExp, rshift;
    uint64_t sig;

    a = float64_squash_input_denormal(a STATUS_VAR);

    sig = extractFloat64Frac(a);
    biasedExp = extractFloat64Exp(a);
    sign = extractFloat64Sign(a);

    if (biasedExp == 0x7FF && sig) {
        /* NaN: treat it as positive (the documented NaN result is the
         * largest positive integer).
         */
        sign = 0;
    }
    if (biasedExp) {
        /* Non-zero exponent field: make the implicit leading one explicit. */
        sig |= LIT64( 0x0010000000000000 );
    }

    /* Right-align the significand for roundAndPackInt32(); bits shifted
     * out are jammed into the least significant bit.
     */
    rshift = 0x42C - biasedExp;
    if (rshift > 0) {
        shift64RightJamming(sig, rshift, &sig);
    }
    return roundAndPackInt32(sign, sig STATUS_VAR);
}

2884

2885

/*----------------------------------------------------------------------------

2886

| Returns the result of converting the double-precision floating-point value

2887

| `a' to the 32-bit two's complement integer format. The conversion is

2888

| performed according to the IEC/IEEE Standard for Binary Floating-Point

2889

| Arithmetic, except that the conversion is always rounded toward zero.

2890

| If `a' is a NaN, the largest positive integer is returned. Otherwise, if

2891

| the conversion overflows, the largest integer with the same sign as `a' is

2892

| returned.

2893

*----------------------------------------------------------------------------*/

2894

2895

int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
{
    flag sign;
    int_fast16_t biasedExp, rshift;
    uint64_t sig, fullSig;
    int32_t result;

    a = float64_squash_input_denormal(a STATUS_VAR);

    sig = extractFloat64Frac(a);
    biasedExp = extractFloat64Exp(a);
    sign = extractFloat64Sign(a);

    if (biasedExp > 0x41E) {
        /* Magnitude of at least 2^32, infinity, or NaN: cannot fit. */
        if (biasedExp == 0x7FF && sig) {
            sign = 0;   /* NaN saturates to the positive limit */
        }
        float_raise(float_flag_invalid STATUS_VAR);
        return sign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
    }
    if (biasedExp < 0x3FF) {
        /* Magnitude below 1 truncates to 0; inexact unless `a' is zero. */
        if (biasedExp || sig) {
            STATUS(float_exception_flags) |= float_flag_inexact;
        }
        return 0;
    }

    sig |= LIT64( 0x0010000000000000 );     /* implicit leading one */
    rshift = 0x433 - biasedExp;
    fullSig = sig;
    sig >>= rshift;                          /* discard the fraction bits */
    result = sig;
    if (sign) {
        result = - result;
    }
    if ((result < 0) ^ sign) {
        /* The sign of the result disagrees with the input: overflow. */
        float_raise(float_flag_invalid STATUS_VAR);
        return sign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
    }
    if ((sig << rshift) != fullSig) {
        /* Non-zero bits were shifted out. */
        STATUS(float_exception_flags) |= float_flag_inexact;
    }
    return result;
}

2931

2932

/*----------------------------------------------------------------------------

2933

| Returns the result of converting the double-precision floating-point value

2934

| `a' to the 16-bit two's complement integer format. The conversion is

2935

| performed according to the IEC/IEEE Standard for Binary Floating-Point

2936

| Arithmetic, except that the conversion is always rounded toward zero.

2937

| If `a' is a NaN, the largest positive integer is returned. Otherwise, if

2938

| the conversion overflows, the largest integer with the same sign as `a' is

2939

| returned.

2940

*----------------------------------------------------------------------------*/

2941

2942

int_fast16_t float64_to_int16_round_to_zero(float64 a STATUS_PARAM)
{
    flag aSign;
    int_fast16_t aExp, shiftCount;
    uint64_t aSig, savedASig;
    int32 z;

    /* Apply the input-denormal squashing step that every other float64
     * conversion in this file performs first, so denormal inputs are
     * handled the same way here.
     */
    a = float64_squash_input_denormal(a STATUS_VAR);

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    if ( 0x40E < aExp ) {
        /* Magnitude of at least 2^16, infinity, or NaN: cannot fit. */
        if ( ( aExp == 0x7FF ) && aSig ) {
            aSign = 0;  /* NaN saturates to the positive limit */
        }
        goto invalid;
    }
    else if ( aExp < 0x3FF ) {
        /* Magnitude below 1 truncates to 0; inexact unless `a' is zero. */
        if ( aExp || aSig ) {
            STATUS(float_exception_flags) |= float_flag_inexact;
        }
        return 0;
    }
    aSig |= LIT64( 0x0010000000000000 );    /* implicit leading one */
    shiftCount = 0x433 - aExp;
    savedASig = aSig;
    aSig >>= shiftCount;                     /* discard the fraction bits */
    z = aSig;
    if ( aSign ) {
        z = - z;
    }
    if ( ( (int16_t)z < 0 ) ^ aSign ) {
        /* The sign of the 16-bit result disagrees with the input. */
 invalid:
        float_raise( float_flag_invalid STATUS_VAR);
        return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
    }
    if ( ( aSig<<shiftCount ) != savedASig ) {
        /* Non-zero bits were shifted out. */
        STATUS(float_exception_flags) |= float_flag_inexact;
    }
    return z;
}

2982

2983

/*----------------------------------------------------------------------------

2984

| Returns the result of converting the double-precision floating-point value

2985

| `a' to the 64-bit two's complement integer format. The conversion is

2986

| performed according to the IEC/IEEE Standard for Binary Floating-Point

2987

| Arithmetic---which means in particular that the conversion is rounded

2988

| according to the current rounding mode. If `a' is a NaN, the largest

2989

| positive integer is returned. Otherwise, if the conversion overflows, the

2990

| largest integer with the same sign as `a' is returned.

2991

*----------------------------------------------------------------------------*/

2992

2993

int64 float64_to_int64( float64 a STATUS_PARAM )
{
    flag sign;
    int_fast16_t biasedExp, rshift;
    uint64_t sig, sigExtra;

    a = float64_squash_input_denormal(a STATUS_VAR);

    sig = extractFloat64Frac(a);
    biasedExp = extractFloat64Exp(a);
    sign = extractFloat64Sign(a);
    if (biasedExp) {
        /* Non-zero exponent field: make the implicit leading one explicit. */
        sig |= LIT64( 0x0010000000000000 );
    }

    rshift = 0x433 - biasedExp;
    if (rshift > 0) {
        /* Fraction bits present: shift them into `sigExtra' for rounding. */
        shift64ExtraRightJamming(sig, 0, rshift, &sig, &sigExtra);
    } else {
        if (biasedExp > 0x43E) {
            /* Too large for 64 bits, infinity, or NaN. */
            float_raise(float_flag_invalid STATUS_VAR);
            if (sign
                && !((biasedExp == 0x7FF)
                     && (sig != LIT64( 0x0010000000000000 )))) {
                /* Negative and not a NaN: most negative integer. */
                return (int64_t) LIT64( 0x8000000000000000 );
            }
            return LIT64( 0x7FFFFFFFFFFFFFFF );
        }
        /* The value is already an integer; scale the significand up. */
        sigExtra = 0;
        sig <<= - rshift;
    }
    return roundAndPackInt64(sign, sig, sigExtra STATUS_VAR);
}

3025

3026

/*----------------------------------------------------------------------------

3027

| Returns the result of converting the double-precision floating-point value

3028

| `a' to the 64-bit two's complement integer format. The conversion is

3029

| performed according to the IEC/IEEE Standard for Binary Floating-Point

3030

| Arithmetic, except that the conversion is always rounded toward zero.

3031

| If `a' is a NaN, the largest positive integer is returned. Otherwise, if

3032

| the conversion overflows, the largest integer with the same sign as `a' is

3033

| returned.

3034

*----------------------------------------------------------------------------*/

3035

3036

int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
{
    flag sign;
    int_fast16_t biasedExp, lshift;
    uint64_t sig;
    int64 result;

    a = float64_squash_input_denormal(a STATUS_VAR);

    sig = extractFloat64Frac(a);
    biasedExp = extractFloat64Exp(a);
    sign = extractFloat64Sign(a);
    if (biasedExp) {
        /* Non-zero exponent field: make the implicit leading one explicit. */
        sig |= LIT64( 0x0010000000000000 );
    }

    lshift = biasedExp - 0x433;
    if (lshift < 0) {
        /* Some significand bits lie below the binary point. */
        if (biasedExp < 0x3FE) {
            /* Truncates to 0; inexact unless `a' is zero. */
            if (biasedExp | sig) {
                STATUS(float_exception_flags) |= float_flag_inexact;
            }
            return 0;
        }
        result = sig >> ( - lshift );
        if ((uint64_t) (sig << (lshift & 63))) {
            /* Non-zero bits were discarded by the truncation. */
            STATUS(float_exception_flags) |= float_flag_inexact;
        }
    } else {
        if (biasedExp >= 0x43E) {
            /* 0xC3E0000000000000 encodes -2^63, the one value in this
             * exponent range that is returned without raising invalid.
             */
            if (float64_val(a) != LIT64( 0xC3E0000000000000 )) {
                float_raise(float_flag_invalid STATUS_VAR);
                if (!sign
                    || ((biasedExp == 0x7FF)
                        && (sig != LIT64( 0x0010000000000000 )))) {
                    /* Positive overflow or NaN. */
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
                }
            }
            return (int64_t) LIT64( 0x8000000000000000 );
        }
        result = sig << lshift;
    }

    if (sign) {
        result = - result;
    }
    return result;
}

3078

3079

/*----------------------------------------------------------------------------

3080

| Returns the result of converting the double-precision floating-point value

3081

| `a' to the single-precision floating-point format. The conversion is

3082

| performed according to the IEC/IEEE Standard for Binary Floating-Point

3083

| Arithmetic.

3084

*----------------------------------------------------------------------------*/

3085

3086

float32 float64_to_float32( float64 a STATUS_PARAM )
{
    flag sign;
    int_fast16_t biasedExp;
    uint64_t sig64;
    uint32_t sig32;

    a = float64_squash_input_denormal(a STATUS_VAR);

    sig64 = extractFloat64Frac(a);
    biasedExp = extractFloat64Exp(a);
    sign = extractFloat64Sign(a);

    if (biasedExp == 0x7FF) {
        /* NaN (non-zero fraction) or infinity. */
        if (sig64) {
            return commonNaNToFloat32(
                float64ToCommonNaN(a STATUS_VAR) STATUS_VAR);
        }
        return packFloat32(sign, 0xFF, 0);
    }

    /* Narrow the fraction to fit 32 bits, jamming the shifted-out bits
     * into the least significant bit.
     */
    shift64RightJamming(sig64, 22, &sig64);
    sig32 = sig64;
    if (biasedExp || sig32) {
        /* Non-zero value: set the leading bit and rebias the exponent
         * before handing over to roundAndPackFloat32().
         */
        sig32 |= 0x40000000;
        biasedExp -= 0x381;
    }
    return roundAndPackFloat32(sign, biasedExp, sig32 STATUS_VAR);
}

3110

3111

3112

/*----------------------------------------------------------------------------

3113

| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a

3114

| half-precision floating-point value, returning the result. After being

3115

| shifted into the proper positions, the three fields are simply added

3116

| together to form the result. This means that any integer portion of `zSig'

3117

| will be added into the exponent. Since a properly normalized significand

3118

| will have an integer portion equal to 1, the `zExp' input should be 1 less

3119

| than the desired result exponent whenever `zSig' is a complete, normalized

3120

| significand.

3121

*----------------------------------------------------------------------------*/

3122

static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
{
    /* Sign goes to bit 15 and the exponent to bits 10 and up; the fields
     * are added rather than OR-ed, so an integer bit in `zSig' carries
     * into the exponent.
     */
    uint32_t signField = ((uint32_t)zSign) << 15;
    uint32_t expField = ((uint32_t)zExp) << 10;

    return make_float16(signField + expField + zSig);
}

3127

3128

/*----------------------------------------------------------------------------

3129

| Takes an abstract floating-point value having sign `zSign', exponent `zExp',

3130

| and significand `zSig', and returns the proper half-precision floating-

3131

| point value corresponding to the abstract input. Ordinarily, the abstract

3132

| value is simply rounded and packed into the half-precision format, with

3133

| the inexact exception raised if the abstract input cannot be represented

3134

| exactly. However, if the abstract value is too large, the overflow and

3135

| inexact exceptions are raised and an infinity or maximal finite value is

3136

| returned. If the abstract value is too small, the input value is rounded to

3137

| a subnormal number, and the underflow and inexact exceptions are raised if

3138

| the abstract input cannot be represented exactly as a subnormal half-

3139

| precision floating-point number.

3140

| The `ieee' flag indicates whether to use IEEE standard half precision, or

3141

| ARM-style "alternative representation", which omits the NaN and Inf

3142

| encodings in order to raise the maximum representable exponent by one.

3143

| The input significand `zSig' has its binary point between bits 22

3144

| and 23, which is 13 bits to the left of the usual location. This shifted

3145

| significand must be normalized or smaller. If `zSig' is not normalized,

3146

| `zExp' must be 0; in that case, the result returned is a subnormal number,

3147

| and it must not require rounding. In the usual case that `zSig' is

3148

| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.

3149

| Note the slightly odd position of the binary point in zSig compared with the

3150

| other roundAndPackFloat functions. This should probably be fixed if we

3151

| need to implement more float16 routines than just conversion.

3152

| The handling of underflow and overflow follows the IEC/IEEE Standard for

3153

| Binary Floating-Point Arithmetic.

3154

*----------------------------------------------------------------------------*/

3155

3156

static float32 roundAndPackFloat16(flag zSign, int_fast16_t zExp,

3157

uint32_t zSig, flag ieee STATUS_PARAM)

3158

{

3159

int maxexp = ieee ? 29 : 30;

3160

uint32_t mask;

3161

uint32_t increment;

3162

bool rounding_bumps_exp;

3163

bool is_tiny = false;

3164

3165

/* Calculate the mask of bits of the mantissa which are not

3166

* representable in half-precision and will be lost.

3167

3168

if (zExp < 1) {

3169

/* Will be denormal in halfprec */

3170

mask = 0x00ffffff;

3171

if (zExp >= -11) {

3172

mask >>= 11 + zExp;

3173

}

3174

} else {

3175

/* Normal number in halfprec */

3176

mask = 0x00001fff;

3177

}

3178

3179

switch (STATUS(float_rounding_mode)) {

3180

case float_round_nearest_even:

3181

increment = (mask + 1) >> 1;

3182

if ((zSig & mask) == increment) {

3183

increment = zSig & (increment << 1);

3184

}

3185

break;

3186

case float_round_up:

3187

increment = zSign ? 0 : mask;

3188

break;

3189

case float_round_down:

3190

increment = zSign ? mask : 0;

3191

break;

3192

default: /* round_to_zero */

3193

increment = 0;

3194

break;

3195

}

3196

3197

rounding_bumps_exp = (zSig + increment >= 0x01000000);

3198

3199

if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {

3200

if (ieee) {

3201

float_raise(float_flag_overflow | float_flag_inexact STATUS_VAR);

3202

return packFloat16(zSign, 0x1f, 0);

3203

} else {

3204

float_raise(float_flag_invalid STATUS_VAR);

3205

return packFloat16(zSign, 0x1f, 0x3ff);

3206

}

3207

}

3208

3209

if (zExp < 0) {

3210

/* Note that flush-to-zero does not affect half-precision results */

3211

is_tiny =

3212

(STATUS(float_detect_tininess) == float_tininess_before_rounding)

3213

|| (zExp < -1)

3214

|| (!rounding_bumps_exp);

3215

}

3216

if (zSig & mask) {

3217

float_raise(float_flag_inexact STATUS_VAR);

3218

if (is_tiny) {

3219

float_raise(float_flag_underflow STATUS_VAR);

3220

}

3221

}

3222

3223

zSig += increment;

3224

if (rounding_bumps_exp) {

3225

zSig >>= 1;

3226

zExp++;

3227

}

3228

3229

if (zExp < -10) {

3230

return packFloat16(zSign, 0, 0);

3231

}

3232

if (zExp < 0) {

3233

zSig >>= -zExp;

3234

zExp = 0;

3235

}

3236

return packFloat16(zSign, zExp, zSig >> 13);

3237

}

3238

3239

/* Normalizes the non-zero half-precision subnormal significand `aSig':
 * the shifted significand is stored through `zSigPtr' and the matching
 * exponent through `zExpPtr'.
 */
static void normalizeFloat16Subnormal(uint32_t aSig, int_fast16_t *zExpPtr,
                                      uint32_t *zSigPtr)
{
    /* Left shift that moves the leading set bit of `aSig' up to bit 10. */
    int8_t lshift = countLeadingZeros32(aSig) - 21;

    *zExpPtr = 1 - lshift;
    *zSigPtr = aSig << lshift;
}

3246

3247

/* Half precision floats come in two formats: standard IEEE and "ARM" format.

3248

The latter gains extra exponent range by omitting the NaN/Inf encodings. */

3249

3250

float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
{
    flag sign = extractFloat16Sign(a);
    int_fast16_t biasedExp = extractFloat16Exp(a);
    uint32_t frac = extractFloat16Frac(a);

    /* The all-ones exponent encodes Inf/NaN only in the IEEE format. */
    if (ieee && biasedExp == 0x1f) {
        if (frac != 0) {
            return commonNaNToFloat32(
                float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
        }
        return packFloat32(sign, 0xff, 0);
    }

    if (biasedExp == 0) {
        if (frac == 0) {
            /* Signed zero. */
            return packFloat32(sign, 0, 0);
        }
        /* Half-precision subnormal: normalize it first. */
        normalizeFloat16Subnormal(frac, &biasedExp, &frac);
        biasedExp--;
    }

    /* Rebias the exponent and widen the 10-bit fraction to 23 bits. */
    return packFloat32(sign, biasedExp + 0x70, frac << 13);
}

3276

3277

float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
{
    flag aSign;
    int_fast16_t aExp;
    uint32_t aSig;

    a = float32_squash_input_denormal(a STATUS_VAR);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    if ( aExp == 0xFF ) {
        if (aSig) {
            /* Input is a NaN */
            if (!ieee) {
                /* No NaN encoding in the alternative format: raise
                 * invalid and return a signed zero.
                 */
                float_raise(float_flag_invalid STATUS_VAR);
                return packFloat16(aSign, 0, 0);
            }
            return commonNaNToFloat16(
                float32ToCommonNaN(a STATUS_VAR) STATUS_VAR);
        }
        /* Infinity */
        if (!ieee) {
            /* No infinity encoding in the alternative format: raise
             * invalid and return the all-ones exponent/fraction value.
             */
            float_raise(float_flag_invalid STATUS_VAR);
            return packFloat16(aSign, 0x1f, 0x3ff);
        }
        return packFloat16(aSign, 0x1f, 0);
    }
    if (aExp == 0 && aSig == 0) {
        /* Signed zero. */
        return packFloat16(aSign, 0, 0);
    }
    /* Decimal point between bits 22 and 23. Note that we add the 1 bit
     * even if the input is denormal; however this is harmless because
     * the largest possible single-precision denormal is still smaller
     * than the smallest representable half-precision denormal, and so we
     * will end up ignoring aSig and returning via the "always return zero"
     * codepath.
     */
    aSig |= 0x00800000;
    aExp -= 0x71;

    return roundAndPackFloat16(aSign, aExp, aSig, ieee STATUS_VAR);
}

3320

3321

float64 float16_to_float64(float16 a, flag ieee STATUS_PARAM)
{
    flag sign = extractFloat16Sign(a);
    int_fast16_t biasedExp = extractFloat16Exp(a);
    uint32_t frac = extractFloat16Frac(a);

    /* The all-ones exponent encodes Inf/NaN only in the IEEE format. */
    if (ieee && biasedExp == 0x1f) {
        if (frac != 0) {
            return commonNaNToFloat64(
                float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
        }
        return packFloat64(sign, 0x7ff, 0);
    }

    if (biasedExp == 0) {
        if (frac == 0) {
            /* Signed zero. */
            return packFloat64(sign, 0, 0);
        }
        /* Half-precision subnormal: normalize it first. */
        normalizeFloat16Subnormal(frac, &biasedExp, &frac);
        biasedExp--;
    }

    /* Rebias the exponent and widen the 10-bit fraction to 52 bits. */
    return packFloat64(sign, biasedExp + 0x3f0, ((uint64_t)frac) << 42);
}

3348

3349

float16 float64_to_float16(float64 a, flag ieee STATUS_PARAM)
{
    flag aSign;
    int_fast16_t aExp;
    uint64_t aSig;
    uint32_t zSig;

    a = float64_squash_input_denormal(a STATUS_VAR);

    aSig = extractFloat64Frac(a);
    aExp = extractFloat64Exp(a);
    aSign = extractFloat64Sign(a);
    if (aExp == 0x7FF) {
        if (aSig) {
            /* Input is a NaN */
            if (!ieee) {
                /* No NaN encoding in the alternative format: raise
                 * invalid and return a signed zero.
                 */
                float_raise(float_flag_invalid STATUS_VAR);
                return packFloat16(aSign, 0, 0);
            }
            return commonNaNToFloat16(
                float64ToCommonNaN(a STATUS_VAR) STATUS_VAR);
        }
        /* Infinity */
        if (!ieee) {
            /* No infinity encoding in the alternative format: raise
             * invalid and return the all-ones exponent/fraction value.
             */
            float_raise(float_flag_invalid STATUS_VAR);
            return packFloat16(aSign, 0x1f, 0x3ff);
        }
        return packFloat16(aSign, 0x1f, 0);
    }
    /* Narrow the 52-bit fraction to 23 bits, jamming the shifted-out bits
     * into the least significant bit.
     */
    shift64RightJamming(aSig, 29, &aSig);
    zSig = aSig;
    if (aExp == 0 && zSig == 0) {
        /* Signed zero. */
        return packFloat16(aSign, 0, 0);
    }
    /* Decimal point between bits 22 and 23. Note that we add the 1 bit
     * even if the input is denormal; however this is harmless because
     * the largest possible double-precision denormal is still smaller
     * than the smallest representable half-precision denormal, and so we
     * will end up ignoring aSig and returning via the "always return zero"
     * codepath.
     */
    zSig |= 0x00800000;
    aExp -= 0x3F1;

    return roundAndPackFloat16(aSign, aExp, zSig, ieee STATUS_VAR);
}

3395

3396

/*----------------------------------------------------------------------------

3397

| Returns the result of converting the double-precision floating-point value

3398

| `a' to the extended double-precision floating-point format. The conversion

3399

| is performed according to the IEC/IEEE Standard for Binary Floating-Point

3400

| Arithmetic.

3401

*----------------------------------------------------------------------------*/

3402

3403

floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
{
    flag sign;
    int_fast16_t biasedExp;
    uint64_t frac;

    a = float64_squash_input_denormal(a STATUS_VAR);
    frac = extractFloat64Frac(a);
    biasedExp = extractFloat64Exp(a);
    sign = extractFloat64Sign(a);

    if (biasedExp == 0x7FF) {
        /* NaN (non-zero fraction) or infinity. */
        if (frac) {
            return commonNaNToFloatx80(
                float64ToCommonNaN(a STATUS_VAR) STATUS_VAR);
        }
        return packFloatx80(sign, 0x7FFF, LIT64( 0x8000000000000000 ));
    }
    if (biasedExp == 0) {
        if (frac == 0) {
            /* Signed zero. */
            return packFloatx80(sign, 0, 0);
        }
        /* Subnormal input: normalize it first. */
        normalizeFloat64Subnormal(frac, &biasedExp, &frac);
    }

    /* Rebias the exponent and left-justify the significand with its
     * leading one made explicit.
     */
    return packFloatx80(sign, biasedExp + 0x3C00,
                        (frac | LIT64( 0x0010000000000000 )) << 11);
}

3426

3427

/*----------------------------------------------------------------------------

3428

| Returns the result of converting the double-precision floating-point value

3429

| `a' to the quadruple-precision floating-point format. The conversion is

3430

| performed according to the IEC/IEEE Standard for Binary Floating-Point

3431

| Arithmetic.

3432

*----------------------------------------------------------------------------*/

3433

3434

float128 float64_to_float128( float64 a STATUS_PARAM )
{
    flag sign;
    int_fast16_t biasedExp;
    uint64_t frac, sigHigh, sigLow;

    a = float64_squash_input_denormal(a STATUS_VAR);
    frac = extractFloat64Frac(a);
    biasedExp = extractFloat64Exp(a);
    sign = extractFloat64Sign(a);

    if (biasedExp == 0x7FF) {
        /* NaN (non-zero fraction) or infinity. */
        if (frac) {
            return commonNaNToFloat128(
                float64ToCommonNaN(a STATUS_VAR) STATUS_VAR);
        }
        return packFloat128(sign, 0x7FFF, 0, 0);
    }
    if (biasedExp == 0) {
        if (frac == 0) {
            /* Signed zero. */
            return packFloat128(sign, 0, 0, 0);
        }
        /* Subnormal input: normalize it first. */
        normalizeFloat64Subnormal(frac, &biasedExp, &frac);
        --biasedExp;
    }

    /* Spread the fraction across the 128-bit significand pair and rebias
     * the exponent.
     */
    shift128Right(frac, 0, 4, &sigHigh, &sigLow);
    return packFloat128(sign, biasedExp + 0x3C00, sigHigh, sigLow);
}

3457

3458

/*----------------------------------------------------------------------------

3459

| Rounds the double-precision floating-point value `a' to an integer, and

3460

| returns the result as a double-precision floating-point value. The

3461

| operation is performed according to the IEC/IEEE Standard for Binary

3462

| Floating-Point Arithmetic.

3463

*----------------------------------------------------------------------------*/

3464

3465

float64 float64_round_to_int( float64 a STATUS_PARAM )
{
    flag sign;
    int_fast16_t biasedExp;
    uint64_t unitBit, fracBits;
    uint64_t bits;

    a = float64_squash_input_denormal(a STATUS_VAR);
    biasedExp = extractFloat64Exp(a);

    if (biasedExp >= 0x433) {
        /* No fraction bits remain: already integral, infinity, or NaN. */
        if ((biasedExp == 0x7FF) && extractFloat64Frac(a)) {
            return propagateFloat64NaN(a, a STATUS_VAR);
        }
        return a;
    }

    if (biasedExp < 0x3FF) {
        /* Magnitude below 1: the result is a signed zero or +/-1. */
        if ((uint64_t) (float64_val(a) << 1) == 0) {
            return a;   /* +/-0 is returned unchanged */
        }
        STATUS(float_exception_flags) |= float_flag_inexact;
        sign = extractFloat64Sign(a);
        switch (STATUS(float_rounding_mode)) {
        case float_round_nearest_even:
            /* Strictly more than one half rounds away from zero. */
            if ((biasedExp == 0x3FE) && extractFloat64Frac(a)) {
                return packFloat64(sign, 0x3FF, 0);
            }
            break;
        case float_round_down:
            /* Negative inputs go to -1.0, positive inputs to +0. */
            return make_float64(sign ? LIT64( 0xBFF0000000000000 ) : 0);
        case float_round_up:
            /* Negative inputs go to -0, positive inputs to +1.0. */
            return make_float64(
                sign ? LIT64( 0x8000000000000000 )
                     : LIT64( 0x3FF0000000000000 ));
        }
        return packFloat64(sign, 0, 0);
    }

    /* `unitBit' is the encoding bit with weight one; `fracBits' covers
     * every bit below it.
     */
    unitBit = 1;
    unitBit <<= 0x433 - biasedExp;
    fracBits = unitBit - 1;
    bits = float64_val(a);

    switch (STATUS(float_rounding_mode)) {
    case float_round_nearest_even:
        bits += unitBit >> 1;
        if ((bits & fracBits) == 0) {
            /* Exact tie: clear the unit bit so the result is even. */
            bits &= ~unitBit;
        }
        break;
    case float_round_to_zero:
        break;
    case float_round_up:
        if (!extractFloat64Sign(make_float64(bits))) {
            bits += fracBits;
        }
        break;
    case float_round_down:
        if (extractFloat64Sign(make_float64(bits))) {
            bits += fracBits;
        }
        break;
    default:
        abort();
    }

    bits &= ~ fracBits;
    if (bits != float64_val(a)) {
        STATUS(float_exception_flags) |= float_flag_inexact;
    }
    return make_float64(bits);
}

3530

3531

/* Rounds `a' to an integral value with float64_round_to_int() while the
 * rounding mode is temporarily forced to round-to-zero; the previous
 * rounding mode is restored before returning.
 */
float64 float64_trunc_to_int( float64 a STATUS_PARAM)
{
    int savedMode = STATUS(float_rounding_mode);
    float64 truncated;

    STATUS(float_rounding_mode) = float_round_to_zero;
    truncated = float64_round_to_int(a STATUS_VAR);
    STATUS(float_rounding_mode) = savedMode;

    return truncated;
}

3541

3542

/*----------------------------------------------------------------------------

3543

| Returns the result of adding the absolute values of the double-precision

3544

| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated

3545

| before being returned. `zSign' is ignored if the result is a NaN.

3546

| The addition is performed according to the IEC/IEEE Standard for Binary

3547

| Floating-Point Arithmetic.

3548

*----------------------------------------------------------------------------*/

3549

3550

static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
{
    int_fast16_t expA, expB, expZ;
    uint64_t sigA, sigB, sigZ;
    int_fast16_t expDelta;

    /* Fractions are shifted left by 9 so that the implicit leading one
     * sits at bit 61 (0x2000000000000000).
     */
    sigA = extractFloat64Frac(a) << 9;
    expA = extractFloat64Exp(a);
    sigB = extractFloat64Frac(b) << 9;
    expB = extractFloat64Exp(b);
    expDelta = expA - expB;

    if (expDelta == 0) {
        /* Equal exponents: no alignment shift is needed. */
        if (expA == 0x7FF) {
            if (sigA | sigB) {
                return propagateFloat64NaN(a, b STATUS_VAR);
            }
            return a;   /* infinity + infinity, same sign */
        }
        if (expA == 0) {
            /* Both operands are zero or subnormal. */
            if (STATUS(flush_to_zero)) {
                if (sigA | sigB) {
                    float_raise(float_flag_output_denormal STATUS_VAR);
                }
                return packFloat64(zSign, 0, 0);
            }
            return packFloat64(zSign, 0, (sigA + sigB) >> 9);
        }
        /* Two implicit ones at bit 61 sum to bit 62. */
        sigZ = LIT64( 0x4000000000000000 ) + sigA + sigB;
        return roundAndPackFloat64(zSign, expA, sigZ STATUS_VAR);
    }

    if (expDelta > 0) {
        /* `a' has the larger exponent: align `b' to it. */
        if (expA == 0x7FF) {
            if (sigA) {
                return propagateFloat64NaN(a, b STATUS_VAR);
            }
            return a;
        }
        if (expB == 0) {
            /* Subnormal `b': no implicit one, one less shift position. */
            --expDelta;
        } else {
            sigB |= LIT64( 0x2000000000000000 );
        }
        shift64RightJamming(sigB, expDelta, &sigB);
        expZ = expA;
    } else {
        /* `b' has the larger exponent: align `a' to it. */
        if (expB == 0x7FF) {
            if (sigB) {
                return propagateFloat64NaN(a, b STATUS_VAR);
            }
            return packFloat64(zSign, 0x7FF, 0);
        }
        if (expA == 0) {
            /* Subnormal `a': no implicit one, one less shift position. */
            ++expDelta;
        } else {
            sigA |= LIT64( 0x2000000000000000 );
        }
        shift64RightJamming(sigA, - expDelta, &sigA);
        expZ = expB;
    }

    /* Set the remaining implicit one, then form the sum shifted left by
     * one; if that sets the top bit, use the unshifted sum and the larger
     * exponent instead.
     */
    sigA |= LIT64( 0x2000000000000000 );
    sigZ = (sigA + sigB) << 1;
    --expZ;
    if ((int64_t) sigZ < 0) {
        sigZ = sigA + sigB;
        ++expZ;
    }
    return roundAndPackFloat64(zSign, expZ, sigZ STATUS_VAR);
}

3620

3621

/*----------------------------------------------------------------------------

3622

| Returns the result of subtracting the absolute values of the double-

3623

| precision floating-point values `a' and `b'. If `zSign' is 1, the

3624

| difference is negated before being returned. `zSign' is ignored if the

3625

| result is a NaN. The subtraction is performed according to the IEC/IEEE

3626

| Standard for Binary Floating-Point Arithmetic.

3627

*----------------------------------------------------------------------------*/

3628

3629

static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )

3630

{

3631

int_fast16_t aExp, bExp, zExp;

3632

uint64_t aSig, bSig, zSig;

3633

int_fast16_t expDiff;

3634

3635

aSig = extractFloat64Frac( a );

3636

aExp = extractFloat64Exp( a );

3637

bSig = extractFloat64Frac( b );

3638

bExp = extractFloat64Exp( b );

3639

expDiff = aExp - bExp;

3640

aSig <<= 10;

3641

bSig <<= 10;

3642

if ( 0 < expDiff ) goto aExpBigger;

3643

if ( expDiff < 0 ) goto bExpBigger;

3644

if ( aExp == 0x7FF ) {

3645

if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );

3646

float_raise( float_flag_invalid STATUS_VAR);

3647

return float64_default_nan;

3648

}

3649

if ( aExp == 0 ) {

3650

aExp = 1;

3651

bExp = 1;

3652

}

3653

if ( bSig < aSig ) goto aBigger;

3654

if ( aSig < bSig ) goto bBigger;

3655

return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );

3656

bExpBigger:

3657

if ( bExp == 0x7FF ) {

3658

if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );

3659

return packFloat64( zSign ^ 1, 0x7FF, 0 );

3660

}

3661

if ( aExp == 0 ) {

3662

++expDiff;

3663

}

3664

else {

3665

aSig |= LIT64( 0x4000000000000000 );

3666

}

3667

shift64RightJamming( aSig, - expDiff, &aSig );

3668

bSig |= LIT64( 0x4000000000000000 );

3669

bBigger:

3670

zSig = bSig - aSig;

3671

zExp = bExp;

3672

zSign ^= 1;

3673

goto normalizeRoundAndPack;

3674

aExpBigger:

3675

if ( aExp == 0x7FF ) {

3676

if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );

3677

return a;

3678

}

3679

if ( bExp == 0 ) {

3680

--expDiff;

3681

}

3682

else {

3683

bSig |= LIT64( 0x4000000000000000 );

3684

}

3685

shift64RightJamming( bSig, expDiff, &bSig );

3686

aSig |= LIT64( 0x4000000000000000 );

3687

aBigger:

3688

zSig = aSig - bSig;

3689

zExp = aExp;

3690

normalizeRoundAndPack:

3691

--zExp;

3692

return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );

3693

3694

}

3695

3696

/*----------------------------------------------------------------------------

3697

| Returns the result of adding the double-precision floating-point values `a'

3698

| and `b'. The operation is performed according to the IEC/IEEE Standard for

3699

| Binary Floating-Point Arithmetic.

3700

*----------------------------------------------------------------------------*/

3701

3702

float64 float64_add( float64 a, float64 b STATUS_PARAM )

3703

{

3704

flag aSign, bSign;

3705

a = float64_squash_input_denormal(a STATUS_VAR);

3706

b = float64_squash_input_denormal(b STATUS_VAR);

3707

3708

aSign = extractFloat64Sign( a );

3709

bSign = extractFloat64Sign( b );

3710

if ( aSign == bSign ) {

3711

return addFloat64Sigs( a, b, aSign STATUS_VAR );

3712

}

3713

else {

3714

return subFloat64Sigs( a, b, aSign STATUS_VAR );

3715

}

3716

3717

}

3718

3719

/*----------------------------------------------------------------------------

3720

| Returns the result of subtracting the double-precision floating-point values

3721

| `a' and `b'. The operation is performed according to the IEC/IEEE Standard

3722

| for Binary Floating-Point Arithmetic.

3723

*----------------------------------------------------------------------------*/

3724

3725

float64 float64_sub( float64 a, float64 b STATUS_PARAM )

3726

{

3727

flag aSign, bSign;

3728

a = float64_squash_input_denormal(a STATUS_VAR);

3729

b = float64_squash_input_denormal(b STATUS_VAR);

3730

3731

aSign = extractFloat64Sign( a );

3732

bSign = extractFloat64Sign( b );

3733

if ( aSign == bSign ) {

3734

return subFloat64Sigs( a, b, aSign STATUS_VAR );

3735

}

3736

else {

3737

return addFloat64Sigs( a, b, aSign STATUS_VAR );

3738

}

3739

3740

}

3741

3742

/*----------------------------------------------------------------------------

3743

| Returns the result of multiplying the double-precision floating-point values

3744

| `a' and `b'. The operation is performed according to the IEC/IEEE Standard

3745

| for Binary Floating-Point Arithmetic.

3746

*----------------------------------------------------------------------------*/

3747

3748

float64 float64_mul( float64 a, float64 b STATUS_PARAM )

3749

{

3750

flag aSign, bSign, zSign;

3751

int_fast16_t aExp, bExp, zExp;

3752

uint64_t aSig, bSig, zSig0, zSig1;

3753

3754

a = float64_squash_input_denormal(a STATUS_VAR);

3755

b = float64_squash_input_denormal(b STATUS_VAR);

3756

3757

aSig = extractFloat64Frac( a );

3758

aExp = extractFloat64Exp( a );

3759

aSign = extractFloat64Sign( a );

3760

bSig = extractFloat64Frac( b );

3761

bExp = extractFloat64Exp( b );

3762

bSign = extractFloat64Sign( b );

3763

zSign = aSign ^ bSign;

3764

if ( aExp == 0x7FF ) {

3765

if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {

3766

return propagateFloat64NaN( a, b STATUS_VAR );

3767

}

3768

if ( ( bExp | bSig ) == 0 ) {

3769

float_raise( float_flag_invalid STATUS_VAR);

3770

return float64_default_nan;

3771

}

3772

return packFloat64( zSign, 0x7FF, 0 );

3773

}

3774

if ( bExp == 0x7FF ) {

3775

if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );

3776

if ( ( aExp | aSig ) == 0 ) {

3777

float_raise( float_flag_invalid STATUS_VAR);

3778

return float64_default_nan;

3779

}

3780

return packFloat64( zSign, 0x7FF, 0 );

3781

}

3782

if ( aExp == 0 ) {

3783

if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );

3784

normalizeFloat64Subnormal( aSig, &aExp, &aSig );

3785

}

3786

if ( bExp == 0 ) {

3787

if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );

3788

normalizeFloat64Subnormal( bSig, &bExp, &bSig );

3789

}

3790

zExp = aExp + bExp - 0x3FF;

3791

aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;

3792

bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;

3793

mul64To128( aSig, bSig, &zSig0, &zSig1 );

3794

zSig0 |= ( zSig1 != 0 );

3795

if ( 0 <= (int64_t) ( zSig0<<1 ) ) {

3796

zSig0 <<= 1;

3797

--zExp;

3798

}

3799

return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );

3800

3801

}

3802

3803

/*----------------------------------------------------------------------------

3804

| Returns the result of dividing the double-precision floating-point value `a'

3805

| by the corresponding value `b'. The operation is performed according to

3806

| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

3807

*----------------------------------------------------------------------------*/

3808

3809

float64 float64_div( float64 a, float64 b STATUS_PARAM )

3810

{

3811

flag aSign, bSign, zSign;

3812

int_fast16_t aExp, bExp, zExp;

3813

uint64_t aSig, bSig, zSig;

3814

uint64_t rem0, rem1;

3815

uint64_t term0, term1;

3816

a = float64_squash_input_denormal(a STATUS_VAR);

3817

b = float64_squash_input_denormal(b STATUS_VAR);

3818

3819

aSig = extractFloat64Frac( a );

3820

aExp = extractFloat64Exp( a );

3821

aSign = extractFloat64Sign( a );

3822

bSig = extractFloat64Frac( b );

3823

bExp = extractFloat64Exp( b );

3824

bSign = extractFloat64Sign( b );

3825

zSign = aSign ^ bSign;

3826

if ( aExp == 0x7FF ) {

3827

if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );

3828

if ( bExp == 0x7FF ) {

3829

if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );

3830

float_raise( float_flag_invalid STATUS_VAR);

3831

return float64_default_nan;

3832

}

3833

return packFloat64( zSign, 0x7FF, 0 );

3834

}

3835

if ( bExp == 0x7FF ) {

3836

if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );

3837

return packFloat64( zSign, 0, 0 );

3838

}

3839

if ( bExp == 0 ) {

3840

if ( bSig == 0 ) {

3841

if ( ( aExp | aSig ) == 0 ) {

3842

float_raise( float_flag_invalid STATUS_VAR);

3843

return float64_default_nan;

3844

}

3845

float_raise( float_flag_divbyzero STATUS_VAR);

3846

return packFloat64( zSign, 0x7FF, 0 );

3847

}

3848

normalizeFloat64Subnormal( bSig, &bExp, &bSig );

3849

}

3850

if ( aExp == 0 ) {

3851

if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );

3852

normalizeFloat64Subnormal( aSig, &aExp, &aSig );

3853

}

3854

zExp = aExp - bExp + 0x3FD;

3855

aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;

3856

bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;

3857

if ( bSig <= ( aSig + aSig ) ) {

3858

aSig >>= 1;

3859

++zExp;

3860

}

3861

zSig = estimateDiv128To64( aSig, 0, bSig );

3862

if ( ( zSig & 0x1FF ) <= 2 ) {

3863

mul64To128( bSig, zSig, &term0, &term1 );

3864

sub128( aSig, 0, term0, term1, &rem0, &rem1 );

3865

while ( (int64_t) rem0 < 0 ) {

3866

--zSig;

3867

add128( rem0, rem1, 0, bSig, &rem0, &rem1 );

3868

}

3869

zSig |= ( rem1 != 0 );

3870

}

3871

return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );

3872

3873

}

3874

3875

/*----------------------------------------------------------------------------

3876

| Returns the remainder of the double-precision floating-point value `a'

3877

| with respect to the corresponding value `b'. The operation is performed

3878

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

3879

*----------------------------------------------------------------------------*/

3880

3881

float64 float64_rem( float64 a, float64 b STATUS_PARAM )

3882

{

3883

flag aSign, zSign;

3884

int_fast16_t aExp, bExp, expDiff;

3885

uint64_t aSig, bSig;

3886

uint64_t q, alternateASig;

3887

int64_t sigMean;

3888

3889

a = float64_squash_input_denormal(a STATUS_VAR);

3890

b = float64_squash_input_denormal(b STATUS_VAR);

3891

aSig = extractFloat64Frac( a );

3892

aExp = extractFloat64Exp( a );

3893

aSign = extractFloat64Sign( a );

3894

bSig = extractFloat64Frac( b );

3895

bExp = extractFloat64Exp( b );

3896

if ( aExp == 0x7FF ) {

3897

if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {

3898

return propagateFloat64NaN( a, b STATUS_VAR );

3899

}

3900

float_raise( float_flag_invalid STATUS_VAR);

3901

return float64_default_nan;

3902

}

3903

if ( bExp == 0x7FF ) {

3904

if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );

3905

return a;

3906

}

3907

if ( bExp == 0 ) {

3908

if ( bSig == 0 ) {

3909

float_raise( float_flag_invalid STATUS_VAR);

3910

return float64_default_nan;

3911

}

3912

normalizeFloat64Subnormal( bSig, &bExp, &bSig );

3913

}

3914

if ( aExp == 0 ) {

3915

if ( aSig == 0 ) return a;

3916

normalizeFloat64Subnormal( aSig, &aExp, &aSig );

3917

}

3918

expDiff = aExp - bExp;

3919

aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;

3920

bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;

3921

if ( expDiff < 0 ) {

3922

if ( expDiff < -1 ) return a;

3923

aSig >>= 1;

3924

}

3925

q = ( bSig <= aSig );

3926

if ( q ) aSig -= bSig;

3927

expDiff -= 64;

3928

while ( 0 < expDiff ) {

3929

q = estimateDiv128To64( aSig, 0, bSig );

3930

q = ( 2 < q ) ? q - 2 : 0;

3931

aSig = - ( ( bSig>>2 ) * q );

3932

expDiff -= 62;

3933

}

3934

expDiff += 64;

3935

if ( 0 < expDiff ) {

3936

q = estimateDiv128To64( aSig, 0, bSig );

3937

q = ( 2 < q ) ? q - 2 : 0;

3938

q >>= 64 - expDiff;

3939

bSig >>= 2;

3940

aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;

3941

}

3942

else {

3943

aSig >>= 2;

3944

bSig >>= 2;

3945

}

3946

do {

3947

alternateASig = aSig;

3948

++q;

3949

aSig -= bSig;

3950

} while ( 0 <= (int64_t) aSig );

3951

sigMean = aSig + alternateASig;

3952

if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {

3953

aSig = alternateASig;

3954

}

3955

zSign = ( (int64_t) aSig < 0 );

3956

if ( zSign ) aSig = - aSig;

3957

return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );

3958

3959

}

3960

3961

/*----------------------------------------------------------------------------

3962

| Returns the result of multiplying the double-precision floating-point values

3963

| `a' and `b' then adding 'c', with no intermediate rounding step after the

3964

| multiplication. The operation is performed according to the IEC/IEEE

3965

| Standard for Binary Floating-Point Arithmetic 754-2008.

3966

| The flags argument allows the caller to select negation of the

3967

| addend, the intermediate product, or the final result. (The difference

3968

| between this and having the caller do a separate negation is that negating

3969

| externally will flip the sign bit on NaNs.)

3970

*----------------------------------------------------------------------------*/

3971

3972

float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM)

3973

{

3974

flag aSign, bSign, cSign, zSign;

3975

int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;

3976

uint64_t aSig, bSig, cSig;

3977

flag pInf, pZero, pSign;

3978

uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;

3979

int shiftcount;

3980

flag signflip, infzero;

3981

3982

a = float64_squash_input_denormal(a STATUS_VAR);

3983

b = float64_squash_input_denormal(b STATUS_VAR);

3984

c = float64_squash_input_denormal(c STATUS_VAR);

3985

aSig = extractFloat64Frac(a);

3986

aExp = extractFloat64Exp(a);

3987

aSign = extractFloat64Sign(a);

3988

bSig = extractFloat64Frac(b);

3989

bExp = extractFloat64Exp(b);

3990

bSign = extractFloat64Sign(b);

3991

cSig = extractFloat64Frac(c);

3992

cExp = extractFloat64Exp(c);

3993

cSign = extractFloat64Sign(c);

3994

3995

infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||

3996

(aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));

3997

3998

/* It is implementation-defined whether the cases of (0,inf,qnan)

3999

* and (inf,0,qnan) raise InvalidOperation or not (and what QNaN

4000

* they return if they do), so we have to hand this information

4001

* off to the target-specific pick-a-NaN routine.

4002

4003

if (((aExp == 0x7ff) && aSig) ||

4004

((bExp == 0x7ff) && bSig) ||

4005

((cExp == 0x7ff) && cSig)) {

4006

return propagateFloat64MulAddNaN(a, b, c, infzero STATUS_VAR);

4007

}

4008

4009

if (infzero) {

4010

float_raise(float_flag_invalid STATUS_VAR);

4011

return float64_default_nan;

4012

}

4013

4014

if (flags & float_muladd_negate_c) {

4015

cSign ^= 1;

4016

}

4017

4018

signflip = (flags & float_muladd_negate_result) ? 1 : 0;

4019

4020

/* Work out the sign and type of the product */

4021

pSign = aSign ^ bSign;

4022

if (flags & float_muladd_negate_product) {

4023

pSign ^= 1;

4024

}

4025

pInf = (aExp == 0x7ff) || (bExp == 0x7ff);

4026

pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);

4027

4028

if (cExp == 0x7ff) {

4029

if (pInf && (pSign ^ cSign)) {

4030

/* addition of opposite-signed infinities => InvalidOperation */

4031

float_raise(float_flag_invalid STATUS_VAR);

4032

return float64_default_nan;

4033

}

4034

/* Otherwise generate an infinity of the same sign */

4035

return packFloat64(cSign ^ signflip, 0x7ff, 0);

4036

}

4037

4038

if (pInf) {

4039

return packFloat64(pSign ^ signflip, 0x7ff, 0);

4040

}

4041

4042

if (pZero) {

4043

if (cExp == 0) {

4044

if (cSig == 0) {

4045

/* Adding two exact zeroes */

4046

if (pSign == cSign) {

4047

zSign = pSign;

4048

} else if (STATUS(float_rounding_mode) == float_round_down) {

4049

zSign = 1;

4050

} else {

4051

zSign = 0;

4052

}

4053

return packFloat64(zSign ^ signflip, 0, 0);

4054

}

4055

/* Exact zero plus a denorm */

4056

if (STATUS(flush_to_zero)) {

4057

float_raise(float_flag_output_denormal STATUS_VAR);

4058

return packFloat64(cSign ^ signflip, 0, 0);

4059

}

4060

}

4061

/* Zero plus something non-zero : just return the something */

4062

return packFloat64(cSign ^ signflip, cExp, cSig);

4063

}

4064

4065

if (aExp == 0) {

4066

normalizeFloat64Subnormal(aSig, &aExp, &aSig);

4067

}

4068

if (bExp == 0) {

4069

normalizeFloat64Subnormal(bSig, &bExp, &bSig);

4070

}

4071

4072

/* Calculate the actual result a * b + c */

4073

4074

/* Multiply first; this is easy. */

4075

/* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff

4076

* because we want the true exponent, not the "one-less-than"

4077

* flavour that roundAndPackFloat64() takes.

4078

4079

pExp = aExp + bExp - 0x3fe;

4080

aSig = (aSig | LIT64(0x0010000000000000))<<10;

4081

bSig = (bSig | LIT64(0x0010000000000000))<<11;

4082

mul64To128(aSig, bSig, &pSig0, &pSig1);

4083

if ((int64_t)(pSig0 << 1) >= 0) {

4084

shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);

4085

pExp--;

4086

}

4087

4088

zSign = pSign ^ signflip;

4089

4090

/* Now [pSig0:pSig1] is the significand of the multiply, with the explicit

4091

* bit in position 126.

4092

4093

if (cExp == 0) {

4094

if (!cSig) {

4095

/* Throw out the special case of c being an exact zero now */

4096

shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);

4097

return roundAndPackFloat64(zSign, pExp - 1,

4098

pSig1 STATUS_VAR);

4099

}

4100

normalizeFloat64Subnormal(cSig, &cExp, &cSig);

4101

}

4102

4103

/* Shift cSig and add the explicit bit so [cSig0:cSig1] is the

4104

* significand of the addend, with the explicit bit in position 126.

4105

4106

cSig0 = cSig << (126 - 64 - 52);

4107

cSig1 = 0;

4108

cSig0 |= LIT64(0x4000000000000000);

4109

expDiff = pExp - cExp;

4110

4111

if (pSign == cSign) {

4112

/* Addition */

4113

if (expDiff > 0) {

4114

/* scale c to match p */

4115

shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);

4116

zExp = pExp;

4117

} else if (expDiff < 0) {

4118

/* scale p to match c */

4119

shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);

4120

zExp = cExp;

4121

} else {

4122

/* no scaling needed */

4123

zExp = cExp;

4124

}

4125

/* Add significands and make sure explicit bit ends up in posn 126 */

4126

add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);

4127

if ((int64_t)zSig0 < 0) {

4128

shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);

4129

} else {

4130

zExp--;

4131

}

4132

shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);

4133

return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR);

4134

} else {

4135

/* Subtraction */

4136

if (expDiff > 0) {

4137

shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);

4138

sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);

4139

zExp = pExp;

4140

} else if (expDiff < 0) {

4141

shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);

4142

sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);

4143

zExp = cExp;

4144

zSign ^= 1;

4145

} else {

4146

zExp = pExp;

4147

if (lt128(cSig0, cSig1, pSig0, pSig1)) {

4148

sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);

4149

} else if (lt128(pSig0, pSig1, cSig0, cSig1)) {

4150

sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);

4151

zSign ^= 1;

4152

} else {

4153

/* Exact zero */

4154

zSign = signflip;

4155

if (STATUS(float_rounding_mode) == float_round_down) {

4156

zSign ^= 1;

4157

}

4158

return packFloat64(zSign, 0, 0);

4159

}

4160

}

4161

--zExp;

4162

/* Do the equivalent of normalizeRoundAndPackFloat64() but

4163

* starting with the significand in a pair of uint64_t.

4164

4165

if (zSig0) {

4166

shiftcount = countLeadingZeros64(zSig0) - 1;

4167

shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);

4168

if (zSig1) {

4169

zSig0 |= 1;

4170

}

4171

zExp -= shiftcount;

4172

} else {

4173

shiftcount = countLeadingZeros64(zSig1);

4174

if (shiftcount == 0) {

4175

zSig0 = (zSig1 >> 1) | (zSig1 & 1);

4176

zExp -= 63;

4177

} else {

4178

shiftcount--;

4179

zSig0 = zSig1 << shiftcount;

4180

zExp -= (shiftcount + 64);

4181

}

4182

}

4183

return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR);

4184

}

4185

}

4186

4187

/*----------------------------------------------------------------------------

4188

| Returns the square root of the double-precision floating-point value `a'.

4189

| The operation is performed according to the IEC/IEEE Standard for Binary

4190

| Floating-Point Arithmetic.

4191

*----------------------------------------------------------------------------*/

4192

4193

float64 float64_sqrt( float64 a STATUS_PARAM )

4194

{

4195

flag aSign;

4196

int_fast16_t aExp, zExp;

4197

uint64_t aSig, zSig, doubleZSig;

4198

uint64_t rem0, rem1, term0, term1;

4199

a = float64_squash_input_denormal(a STATUS_VAR);

4200

4201

aSig = extractFloat64Frac( a );

4202

aExp = extractFloat64Exp( a );

4203

aSign = extractFloat64Sign( a );

4204

if ( aExp == 0x7FF ) {

4205

if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );

4206

if ( ! aSign ) return a;

4207

float_raise( float_flag_invalid STATUS_VAR);

4208

return float64_default_nan;

4209

}

4210

if ( aSign ) {

4211

if ( ( aExp | aSig ) == 0 ) return a;

4212

float_raise( float_flag_invalid STATUS_VAR);

4213

return float64_default_nan;

4214

}

4215

if ( aExp == 0 ) {

4216

if ( aSig == 0 ) return float64_zero;

4217

normalizeFloat64Subnormal( aSig, &aExp, &aSig );

4218

}

4219

zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;

4220

aSig |= LIT64( 0x0010000000000000 );

4221

zSig = estimateSqrt32( aExp, aSig>>21 );

4222

aSig <<= 9 - ( aExp & 1 );

4223

zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );

4224

if ( ( zSig & 0x1FF ) <= 5 ) {

4225

doubleZSig = zSig<<1;

4226

mul64To128( zSig, zSig, &term0, &term1 );

4227

sub128( aSig, 0, term0, term1, &rem0, &rem1 );

4228

while ( (int64_t) rem0 < 0 ) {

4229

--zSig;

4230

doubleZSig -= 2;

4231

add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );

4232

}

4233

zSig |= ( ( rem0 | rem1 ) != 0 );

4234

}

4235

return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );

4236

4237

}

4238

4239

/*----------------------------------------------------------------------------

4240

| Returns the binary log of the double-precision floating-point value `a'.

4241

| The operation is performed according to the IEC/IEEE Standard for Binary

4242

| Floating-Point Arithmetic.

4243

*----------------------------------------------------------------------------*/

4244

float64 float64_log2( float64 a STATUS_PARAM )

4245

{

4246

flag aSign, zSign;

4247

int_fast16_t aExp;

4248

uint64_t aSig, aSig0, aSig1, zSig, i;

4249

a = float64_squash_input_denormal(a STATUS_VAR);

4250

4251

aSig = extractFloat64Frac( a );

4252

aExp = extractFloat64Exp( a );

4253

aSign = extractFloat64Sign( a );

4254

4255

if ( aExp == 0 ) {

4256

if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );

4257

normalizeFloat64Subnormal( aSig, &aExp, &aSig );

4258

}

4259

if ( aSign ) {

4260

float_raise( float_flag_invalid STATUS_VAR);

4261

return float64_default_nan;

4262

}

4263

if ( aExp == 0x7FF ) {

4264

if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );

4265

return a;

4266

}

4267

4268

aExp -= 0x3FF;

4269

aSig |= LIT64( 0x0010000000000000 );

4270

zSign = aExp < 0;

4271

zSig = (uint64_t)aExp << 52;

4272

for (i = 1LL << 51; i > 0; i >>= 1) {

4273

mul64To128( aSig, aSig, &aSig0, &aSig1 );

4274

aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );

4275

if ( aSig & LIT64( 0x0020000000000000 ) ) {

4276

aSig >>= 1;

4277

zSig |= i;

4278

}

4279

}

4280

4281

if ( zSign )

4282

zSig = -zSig;

4283

return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );

4284

}

4285

4286

/*----------------------------------------------------------------------------

4287

| Returns 1 if the double-precision floating-point value `a' is equal to the

4288

| corresponding value `b', and 0 otherwise. The invalid exception is raised

4289

| if either operand is a NaN. Otherwise, the comparison is performed

4290

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

4291

*----------------------------------------------------------------------------*/

4292

4293

int float64_eq( float64 a, float64 b STATUS_PARAM )

4294

{

4295

uint64_t av, bv;

4296

a = float64_squash_input_denormal(a STATUS_VAR);

4297

b = float64_squash_input_denormal(b STATUS_VAR);

4298

4299

if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )

4300

|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )

4301

) {

4302

float_raise( float_flag_invalid STATUS_VAR);

4303

return 0;

4304

}

4305

av = float64_val(a);

4306

bv = float64_val(b);

4307

return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );

4308

4309

}

4310

4311

/*----------------------------------------------------------------------------

4312

| Returns 1 if the double-precision floating-point value `a' is less than or

4313

| equal to the corresponding value `b', and 0 otherwise. The invalid

4314

| exception is raised if either operand is a NaN. The comparison is performed

4315

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

4316

*----------------------------------------------------------------------------*/

4317

4318

int float64_le( float64 a, float64 b STATUS_PARAM )

4319

{

4320

flag aSign, bSign;

4321

uint64_t av, bv;

4322

a = float64_squash_input_denormal(a STATUS_VAR);

4323

b = float64_squash_input_denormal(b STATUS_VAR);

4324

4325

if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )

4326

|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )

4327

) {

4328

float_raise( float_flag_invalid STATUS_VAR);

4329

return 0;

4330

}

4331

aSign = extractFloat64Sign( a );

4332

bSign = extractFloat64Sign( b );

4333

av = float64_val(a);

4334

bv = float64_val(b);

4335

if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );

4336

return ( av == bv ) || ( aSign ^ ( av < bv ) );

4337

4338

}

4339

4340

/*----------------------------------------------------------------------------

4341

| Returns 1 if the double-precision floating-point value `a' is less than

4342

| the corresponding value `b', and 0 otherwise. The invalid exception is

4343

| raised if either operand is a NaN. The comparison is performed according

4344

| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

4345

*----------------------------------------------------------------------------*/

4346

4347

int float64_lt( float64 a, float64 b STATUS_PARAM )

4348

{

4349

flag aSign, bSign;

4350

uint64_t av, bv;

4351

4352

a = float64_squash_input_denormal(a STATUS_VAR);

4353

b = float64_squash_input_denormal(b STATUS_VAR);

4354

if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )

4355

|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )

4356

) {

4357

float_raise( float_flag_invalid STATUS_VAR);

4358

return 0;

4359

}

4360

aSign = extractFloat64Sign( a );

4361

bSign = extractFloat64Sign( b );

4362

av = float64_val(a);

4363

bv = float64_val(b);

4364

if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );

4365

return ( av != bv ) && ( aSign ^ ( av < bv ) );

4366

4367

}

4368

4369

/*----------------------------------------------------------------------------

4370

| Returns 1 if the double-precision floating-point values `a' and `b' cannot

4371

| be compared, and 0 otherwise. The invalid exception is raised if either

4372

| operand is a NaN. The comparison is performed according to the IEC/IEEE

4373

| Standard for Binary Floating-Point Arithmetic.

4374

*----------------------------------------------------------------------------*/

4375

4376

int float64_unordered( float64 a, float64 b STATUS_PARAM )

4377

{

4378

a = float64_squash_input_denormal(a STATUS_VAR);

4379

b = float64_squash_input_denormal(b STATUS_VAR);

4380

4381

if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )

4382

|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )

4383

) {

4384

float_raise( float_flag_invalid STATUS_VAR);

4385

return 1;

4386

}

4387

return 0;

4388

}

4389

4390

/*----------------------------------------------------------------------------

4391

| Returns 1 if the double-precision floating-point value `a' is equal to the

4392

| corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an

4393

| exception.The comparison is performed according to the IEC/IEEE Standard

4394

| for Binary Floating-Point Arithmetic.

4395

*----------------------------------------------------------------------------*/

4396

4397

int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )

4398

{

4399

uint64_t av, bv;

4400

a = float64_squash_input_denormal(a STATUS_VAR);

4401

b = float64_squash_input_denormal(b STATUS_VAR);

4402

4403

if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )

4404

|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )

4405

) {

4406

if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {

4407

float_raise( float_flag_invalid STATUS_VAR);

4408

}

4409

return 0;

4410

}

4411

av = float64_val(a);

4412

bv = float64_val(b);

4413

return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );

4414

4415

}

4416

4417

/*----------------------------------------------------------------------------

4418

| Returns 1 if the double-precision floating-point value `a' is less than or

4419

| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not

4420

| cause an exception. Otherwise, the comparison is performed according to the

4421

| IEC/IEEE Standard for Binary Floating-Point Arithmetic.

4422

*----------------------------------------------------------------------------*/

4423

4424

int float64_le_quiet( float64 a, float64 b STATUS_PARAM )

4425

{

4426

flag aSign, bSign;

4427

uint64_t av, bv;

4428

a = float64_squash_input_denormal(a STATUS_VAR);

4429

b = float64_squash_input_denormal(b STATUS_VAR);

4430

4431

if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )

4432

|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )

4433

) {

4434

if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {

4435

float_raise( float_flag_invalid STATUS_VAR);

4436

}

4437

return 0;

4438

}

4439

aSign = extractFloat64Sign( a );

4440

bSign = extractFloat64Sign( b );

4441

av = float64_val(a);

4442

bv = float64_val(b);

4443

if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );

4444

return ( av == bv ) || ( aSign ^ ( av < bv ) );

4445

4446

}

4447

4448

/*----------------------------------------------------------------------------

4449

| Returns 1 if the double-precision floating-point value `a' is less than

4450

| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an

4451

| exception. Otherwise, the comparison is performed according to the IEC/IEEE

4452

| Standard for Binary Floating-Point Arithmetic.

4453

*----------------------------------------------------------------------------*/

4454

4455

int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )

4456

{

4457

flag aSign, bSign;

4458

uint64_t av, bv;

4459

a = float64_squash_input_denormal(a STATUS_VAR);

4460

b = float64_squash_input_denormal(b STATUS_VAR);

4461

4462

if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )

4463

|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )

4464

) {

4465

if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {

4466

float_raise( float_flag_invalid STATUS_VAR);

4467

}

4468

return 0;

4469

}

4470

aSign = extractFloat64Sign( a );

4471

bSign = extractFloat64Sign( b );

4472

av = float64_val(a);

4473

bv = float64_val(b);

4474

if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );

4475

return ( av != bv ) && ( aSign ^ ( av < bv ) );

4476

4477

}

4478

4479

/*----------------------------------------------------------------------------

4480

| Returns 1 if the double-precision floating-point values `a' and `b' cannot

4481

| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The

4482

| comparison is performed according to the IEC/IEEE Standard for Binary

4483

| Floating-Point Arithmetic.

4484

*----------------------------------------------------------------------------*/

4485

4486

int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )

4487

{

4488

a = float64_squash_input_denormal(a STATUS_VAR);

4489

b = float64_squash_input_denormal(b STATUS_VAR);

4490

4491

if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )

4492

|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )

4493

) {

4494

if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {

4495

float_raise( float_flag_invalid STATUS_VAR);

4496

}

4497

return 1;

4498

}

4499

return 0;

4500

}

4501

4502

/*----------------------------------------------------------------------------

4503

| Returns the result of converting the extended double-precision floating-

4504

| point value `a' to the 32-bit two's complement integer format. The

4505

| conversion is performed according to the IEC/IEEE Standard for Binary

4506

| Floating-Point Arithmetic---which means in particular that the conversion

4507

| is rounded according to the current rounding mode. If `a' is a NaN, the

4508

| largest positive integer is returned. Otherwise, if the conversion

4509

| overflows, the largest integer with the same sign as `a' is returned.

4510

*----------------------------------------------------------------------------*/

4511

4512

int32 floatx80_to_int32( floatx80 a STATUS_PARAM )

4513

{

4514

flag aSign;

4515

int32 aExp, shiftCount;

4516

uint64_t aSig;

4517

4518

aSig = extractFloatx80Frac( a );

4519

aExp = extractFloatx80Exp( a );

4520

aSign = extractFloatx80Sign( a );

4521

if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;

4522

shiftCount = 0x4037 - aExp;

4523

if ( shiftCount <= 0 ) shiftCount = 1;

4524

shift64RightJamming( aSig, shiftCount, &aSig );

4525

return roundAndPackInt32( aSign, aSig STATUS_VAR );

4526

4527

}

4528

4529

/*----------------------------------------------------------------------------

4530

| Returns the result of converting the extended double-precision floating-

4531

| point value `a' to the 32-bit two's complement integer format. The

4532

| conversion is performed according to the IEC/IEEE Standard for Binary

4533

| Floating-Point Arithmetic, except that the conversion is always rounded

4534

| toward zero. If `a' is a NaN, the largest positive integer is returned.

4535

| Otherwise, if the conversion overflows, the largest integer with the same

4536

| sign as `a' is returned.

4537

*----------------------------------------------------------------------------*/

4538

4539

int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )

4540

{

4541

flag aSign;

4542

int32 aExp, shiftCount;

4543

uint64_t aSig, savedASig;

4544

int32_t z;

4545

4546

aSig = extractFloatx80Frac( a );

4547

aExp = extractFloatx80Exp( a );

4548

aSign = extractFloatx80Sign( a );

4549

if ( 0x401E < aExp ) {

4550

if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;

4551

goto invalid;

4552

}

4553

else if ( aExp < 0x3FFF ) {

4554

if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;

4555

return 0;

4556

}

4557

shiftCount = 0x403E - aExp;

4558

savedASig = aSig;

4559

aSig >>= shiftCount;

4560

z = aSig;

4561

if ( aSign ) z = - z;

4562

if ( ( z < 0 ) ^ aSign ) {

4563

invalid:

4564

float_raise( float_flag_invalid STATUS_VAR);

4565

return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;

4566

}

4567

if ( ( aSig<<shiftCount ) != savedASig ) {

4568

STATUS(float_exception_flags) |= float_flag_inexact;

4569

}

4570

return z;

4571

4572

}

4573

4574

/*----------------------------------------------------------------------------

4575

| Returns the result of converting the extended double-precision floating-

4576

| point value `a' to the 64-bit two's complement integer format. The

4577

| conversion is performed according to the IEC/IEEE Standard for Binary

4578

| Floating-Point Arithmetic---which means in particular that the conversion

4579

| is rounded according to the current rounding mode. If `a' is a NaN,

4580

| the largest positive integer is returned. Otherwise, if the conversion

4581

| overflows, the largest integer with the same sign as `a' is returned.

4582

*----------------------------------------------------------------------------*/

4583

4584

int64 floatx80_to_int64( floatx80 a STATUS_PARAM )

4585

{

4586

flag aSign;

4587

int32 aExp, shiftCount;

4588

uint64_t aSig, aSigExtra;

4589

4590

aSig = extractFloatx80Frac( a );

4591

aExp = extractFloatx80Exp( a );

4592

aSign = extractFloatx80Sign( a );

4593

shiftCount = 0x403E - aExp;

4594

if ( shiftCount <= 0 ) {

4595

if ( shiftCount ) {

4596

float_raise( float_flag_invalid STATUS_VAR);

4597

if ( ! aSign

4598

|| ( ( aExp == 0x7FFF )

4599

&& ( aSig != LIT64( 0x8000000000000000 ) ) )

4600

) {

4601

return LIT64( 0x7FFFFFFFFFFFFFFF );

4602

}

4603

return (int64_t) LIT64( 0x8000000000000000 );

4604

}

4605

aSigExtra = 0;

4606

}

4607

else {

4608

shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );

4609

}

4610

return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );

4611

4612

}

4613

4614

/*----------------------------------------------------------------------------

4615

| Returns the result of converting the extended double-precision floating-

4616

| point value `a' to the 64-bit two's complement integer format. The

4617

| conversion is performed according to the IEC/IEEE Standard for Binary

4618

| Floating-Point Arithmetic, except that the conversion is always rounded

4619

| toward zero. If `a' is a NaN, the largest positive integer is returned.

4620

| Otherwise, if the conversion overflows, the largest integer with the same

4621

| sign as `a' is returned.

4622

*----------------------------------------------------------------------------*/

4623

4624

int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )

4625

{

4626

flag aSign;

4627

int32 aExp, shiftCount;

4628

uint64_t aSig;

4629

int64 z;

4630

4631

aSig = extractFloatx80Frac( a );

4632

aExp = extractFloatx80Exp( a );

4633

aSign = extractFloatx80Sign( a );

4634

shiftCount = aExp - 0x403E;

4635

if ( 0 <= shiftCount ) {

4636

aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );

4637

if ( ( a.high != 0xC03E ) || aSig ) {

4638

float_raise( float_flag_invalid STATUS_VAR);

4639

if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {

4640

return LIT64( 0x7FFFFFFFFFFFFFFF );

4641

}

4642

}

4643

return (int64_t) LIT64( 0x8000000000000000 );

4644

}

4645

else if ( aExp < 0x3FFF ) {

4646

if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;

4647

return 0;

4648

}

4649

z = aSig>>( - shiftCount );

4650

if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {

4651

STATUS(float_exception_flags) |= float_flag_inexact;

4652

}

4653

if ( aSign ) z = - z;

4654

return z;

4655

4656

}

4657

4658

/*----------------------------------------------------------------------------

4659

| Returns the result of converting the extended double-precision floating-

4660

| point value `a' to the single-precision floating-point format. The

4661

| conversion is performed according to the IEC/IEEE Standard for Binary

4662

| Floating-Point Arithmetic.

4663

*----------------------------------------------------------------------------*/

4664

4665

float32 floatx80_to_float32( floatx80 a STATUS_PARAM )

4666

{

4667

flag aSign;

4668

int32 aExp;

4669

uint64_t aSig;

4670

4671

aSig = extractFloatx80Frac( a );

4672

aExp = extractFloatx80Exp( a );

4673

aSign = extractFloatx80Sign( a );

4674

if ( aExp == 0x7FFF ) {

4675

if ( (uint64_t) ( aSig<<1 ) ) {

4676

return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );

4677

}

4678

return packFloat32( aSign, 0xFF, 0 );

4679

}

4680

shift64RightJamming( aSig, 33, &aSig );

4681

if ( aExp || aSig ) aExp -= 0x3F81;

4682

return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );

4683

4684

}

4685

4686

/*----------------------------------------------------------------------------

4687

| Returns the result of converting the extended double-precision floating-

4688

| point value `a' to the double-precision floating-point format. The

4689

| conversion is performed according to the IEC/IEEE Standard for Binary

4690

| Floating-Point Arithmetic.

4691

*----------------------------------------------------------------------------*/

4692

4693

float64 floatx80_to_float64( floatx80 a STATUS_PARAM )

4694

{

4695

flag aSign;

4696

int32 aExp;

4697

uint64_t aSig, zSig;

4698

4699

aSig = extractFloatx80Frac( a );

4700

aExp = extractFloatx80Exp( a );

4701

aSign = extractFloatx80Sign( a );

4702

if ( aExp == 0x7FFF ) {

4703

if ( (uint64_t) ( aSig<<1 ) ) {

4704

return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );

4705

}

4706

return packFloat64( aSign, 0x7FF, 0 );

4707

}

4708

shift64RightJamming( aSig, 1, &zSig );

4709

if ( aExp || aSig ) aExp -= 0x3C01;

4710

return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );

4711

4712

}

4713

4714

/*----------------------------------------------------------------------------

4715

| Returns the result of converting the extended double-precision floating-

4716

| point value `a' to the quadruple-precision floating-point format. The

4717

| conversion is performed according to the IEC/IEEE Standard for Binary

4718

| Floating-Point Arithmetic.

4719

*----------------------------------------------------------------------------*/

4720

4721

float128 floatx80_to_float128( floatx80 a STATUS_PARAM )

4722

{

4723

flag aSign;

4724

int_fast16_t aExp;

4725

uint64_t aSig, zSig0, zSig1;

4726

4727

aSig = extractFloatx80Frac( a );

4728

aExp = extractFloatx80Exp( a );

4729

aSign = extractFloatx80Sign( a );

4730

if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {

4731

return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );

4732

}

4733

shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );

4734

return packFloat128( aSign, aExp, zSig0, zSig1 );

4735

4736

}

4737

4738

/*----------------------------------------------------------------------------

4739

| Rounds the extended double-precision floating-point value `a' to an integer,

4740

| and returns the result as an extended quadruple-precision floating-point

4741

| value. The operation is performed according to the IEC/IEEE Standard for

4742

| Binary Floating-Point Arithmetic.

4743

*----------------------------------------------------------------------------*/

4744

4745

floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )

4746

{

4747

flag aSign;

4748

int32 aExp;

4749

uint64_t lastBitMask, roundBitsMask;

4750

floatx80 z;

4751

4752

aExp = extractFloatx80Exp( a );

4753

if ( 0x403E <= aExp ) {

4754

if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {

4755

return propagateFloatx80NaN( a, a STATUS_VAR );

4756

}

4757

return a;

4758

}

4759

if ( aExp < 0x3FFF ) {

4760

if ( ( aExp == 0 )

4761

&& ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {

4762

return a;

4763

}

4764

STATUS(float_exception_flags) |= float_flag_inexact;

4765

aSign = extractFloatx80Sign( a );

4766

switch ( STATUS(float_rounding_mode) ) {

4767

case float_round_nearest_even:

4768

if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )

4769

) {

4770

return

4771

packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );

4772

}

4773

break;

4774

case float_round_down:

4775

return

4776

aSign ?

4777

packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )

4778

: packFloatx80( 0, 0, 0 );

4779

case float_round_up:

4780

return

4781

aSign ? packFloatx80( 1, 0, 0 )

4782

: packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );

4783

}

4784

return packFloatx80( aSign, 0, 0 );

4785

}

4786

lastBitMask = 1;

4787

lastBitMask <<= 0x403E - aExp;

4788

roundBitsMask = lastBitMask - 1;

4789

z = a;

4790

switch (STATUS(float_rounding_mode)) {

4791

case float_round_nearest_even:

4792

z.low += lastBitMask>>1;

4793

if ((z.low & roundBitsMask) == 0) {

4794

z.low &= ~lastBitMask;

4795

}

4796

break;

4797

case float_round_to_zero:

4798

break;

4799

case float_round_up:

4800

if (!extractFloatx80Sign(z)) {

4801

z.low += roundBitsMask;

4802

}

4803

break;

4804

case float_round_down:

4805

if (extractFloatx80Sign(z)) {

4806

z.low += roundBitsMask;

4807

}

4808

break;

4809

default:

4810

abort();

4811

}

4812

z.low &= ~ roundBitsMask;

4813

if ( z.low == 0 ) {

4814

++z.high;

4815

z.low = LIT64( 0x8000000000000000 );

4816

}

4817

if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;

4818

return z;

4819

4820

}

4821

4822

/*----------------------------------------------------------------------------

4823

| Returns the result of adding the absolute values of the extended double-

4824

| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is

4825

| negated before being returned. `zSign' is ignored if the result is a NaN.

4826

| The addition is performed according to the IEC/IEEE Standard for Binary

4827

| Floating-Point Arithmetic.

4828

*----------------------------------------------------------------------------*/

4829

4830

static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)

4831

{

4832

int32 aExp, bExp, zExp;

4833

uint64_t aSig, bSig, zSig0, zSig1;

4834

int32 expDiff;

4835

4836

aSig = extractFloatx80Frac( a );

4837

aExp = extractFloatx80Exp( a );

4838

bSig = extractFloatx80Frac( b );

4839

bExp = extractFloatx80Exp( b );

4840

expDiff = aExp - bExp;

4841

if ( 0 < expDiff ) {

4842

if ( aExp == 0x7FFF ) {

4843

if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );

4844

return a;

4845

}

4846

if ( bExp == 0 ) --expDiff;

4847

shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );

4848

zExp = aExp;

4849

}

4850

else if ( expDiff < 0 ) {

4851

if ( bExp == 0x7FFF ) {

4852

if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );

4853

return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );

4854

}

4855

if ( aExp == 0 ) ++expDiff;

4856

shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );

4857

zExp = bExp;

4858

}

4859

else {

4860

if ( aExp == 0x7FFF ) {

4861

if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {

4862

return propagateFloatx80NaN( a, b STATUS_VAR );

4863

}

4864

return a;

4865

}

4866

zSig1 = 0;

4867

zSig0 = aSig + bSig;

4868

if ( aExp == 0 ) {

4869

normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );

4870

goto roundAndPack;

4871

}

4872

zExp = aExp;

4873

goto shiftRight1;

4874

}

4875

zSig0 = aSig + bSig;

4876

if ( (int64_t) zSig0 < 0 ) goto roundAndPack;

4877

shiftRight1:

4878

shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );

4879

zSig0 |= LIT64( 0x8000000000000000 );

4880

++zExp;

4881

roundAndPack:

4882

return

4883

roundAndPackFloatx80(

4884

STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );

4885

4886

}

4887

4888

/*----------------------------------------------------------------------------

4889

| Returns the result of subtracting the absolute values of the extended

4890

| double-precision floating-point values `a' and `b'. If `zSign' is 1, the

4891

| difference is negated before being returned. `zSign' is ignored if the

4892

| result is a NaN. The subtraction is performed according to the IEC/IEEE

4893

| Standard for Binary Floating-Point Arithmetic.

4894

*----------------------------------------------------------------------------*/

4895

4896

static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )

4897

{

4898

int32 aExp, bExp, zExp;

4899

uint64_t aSig, bSig, zSig0, zSig1;

4900

int32 expDiff;

4901

floatx80 z;

4902

4903

aSig = extractFloatx80Frac( a );

4904

aExp = extractFloatx80Exp( a );

4905

bSig = extractFloatx80Frac( b );

4906

bExp = extractFloatx80Exp( b );

4907

expDiff = aExp - bExp;

4908

if ( 0 < expDiff ) goto aExpBigger;

4909

if ( expDiff < 0 ) goto bExpBigger;

4910

if ( aExp == 0x7FFF ) {

4911

if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {

4912

return propagateFloatx80NaN( a, b STATUS_VAR );

4913

}

4914

float_raise( float_flag_invalid STATUS_VAR);

4915

z.low = floatx80_default_nan_low;

4916

z.high = floatx80_default_nan_high;

4917

return z;

4918

}

4919

if ( aExp == 0 ) {

4920

aExp = 1;

4921

bExp = 1;

4922

}

4923

zSig1 = 0;

4924

if ( bSig < aSig ) goto aBigger;

4925

if ( aSig < bSig ) goto bBigger;

4926

return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );

4927

bExpBigger:

4928

if ( bExp == 0x7FFF ) {

4929

if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );

4930

return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );

4931

}

4932

if ( aExp == 0 ) ++expDiff;

4933

shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );

4934

bBigger:

4935

sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );

4936

zExp = bExp;

4937

zSign ^= 1;

4938

goto normalizeRoundAndPack;

4939

aExpBigger:

4940

if ( aExp == 0x7FFF ) {

4941

if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );

4942

return a;

4943

}

4944

if ( bExp == 0 ) --expDiff;

4945

shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );

4946

aBigger:

4947

sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );

4948

zExp = aExp;

4949

normalizeRoundAndPack:

4950

return

4951

normalizeRoundAndPackFloatx80(

4952

STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );

4953

4954

}

4955

4956

/*----------------------------------------------------------------------------

4957

| Returns the result of adding the extended double-precision floating-point

4958

| values `a' and `b'. The operation is performed according to the IEC/IEEE

4959

| Standard for Binary Floating-Point Arithmetic.

4960

*----------------------------------------------------------------------------*/

4961

4962

floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )

4963

{

4964

flag aSign, bSign;

4965

4966

aSign = extractFloatx80Sign( a );

4967

bSign = extractFloatx80Sign( b );

4968

if ( aSign == bSign ) {

4969

return addFloatx80Sigs( a, b, aSign STATUS_VAR );

4970

}

4971

else {

4972

return subFloatx80Sigs( a, b, aSign STATUS_VAR );

4973

}

4974

4975

}

4976

4977

/*----------------------------------------------------------------------------

4978

| Returns the result of subtracting the extended double-precision floating-

4979

| point values `a' and `b'. The operation is performed according to the

4980

| IEC/IEEE Standard for Binary Floating-Point Arithmetic.

4981

*----------------------------------------------------------------------------*/

4982

4983

floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )

4984

{

4985

flag aSign, bSign;

4986

4987

aSign = extractFloatx80Sign( a );

4988

bSign = extractFloatx80Sign( b );

4989

if ( aSign == bSign ) {

4990

return subFloatx80Sigs( a, b, aSign STATUS_VAR );

4991

}

4992

else {

4993

return addFloatx80Sigs( a, b, aSign STATUS_VAR );

4994

}

4995

4996

}

4997

4998

/*----------------------------------------------------------------------------

4999

| Returns the result of multiplying the extended double-precision floating-

5000

| point values `a' and `b'. The operation is performed according to the

5001

| IEC/IEEE Standard for Binary Floating-Point Arithmetic.

5002

*----------------------------------------------------------------------------*/

5003

5004

floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )

5005

{

5006

flag aSign, bSign, zSign;

5007

int32 aExp, bExp, zExp;

5008

uint64_t aSig, bSig, zSig0, zSig1;

5009

floatx80 z;

5010

5011

aSig = extractFloatx80Frac( a );

5012

aExp = extractFloatx80Exp( a );

5013

aSign = extractFloatx80Sign( a );

5014

bSig = extractFloatx80Frac( b );

5015

bExp = extractFloatx80Exp( b );

5016

bSign = extractFloatx80Sign( b );

5017

zSign = aSign ^ bSign;

5018

if ( aExp == 0x7FFF ) {

5019

if ( (uint64_t) ( aSig<<1 )

5020

|| ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {

5021

return propagateFloatx80NaN( a, b STATUS_VAR );

5022

}

5023

if ( ( bExp | bSig ) == 0 ) goto invalid;

5024

return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );

5025

}

5026

if ( bExp == 0x7FFF ) {

5027

if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );

5028

if ( ( aExp | aSig ) == 0 ) {

5029

invalid:

5030

float_raise( float_flag_invalid STATUS_VAR);

5031

z.low = floatx80_default_nan_low;

5032

z.high = floatx80_default_nan_high;

5033

return z;

5034

}

5035

return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );

5036

}

5037

if ( aExp == 0 ) {

5038

if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );

5039

normalizeFloatx80Subnormal( aSig, &aExp, &aSig );

5040

}

5041

if ( bExp == 0 ) {

5042

if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );

5043

normalizeFloatx80Subnormal( bSig, &bExp, &bSig );

5044

}

5045

zExp = aExp + bExp - 0x3FFE;

5046

mul64To128( aSig, bSig, &zSig0, &zSig1 );

5047

if ( 0 < (int64_t) zSig0 ) {

5048

shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );

5049

--zExp;

5050

}

5051

return

5052

roundAndPackFloatx80(

5053

STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );

5054

5055

}

5056

5057

/*----------------------------------------------------------------------------

5058

| Returns the result of dividing the extended double-precision floating-point

5059

| value `a' by the corresponding value `b'. The operation is performed

5060

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

5061

*----------------------------------------------------------------------------*/

5062

5063

floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )

5064

{

5065

flag aSign, bSign, zSign;

5066

int32 aExp, bExp, zExp;

5067

uint64_t aSig, bSig, zSig0, zSig1;

5068

uint64_t rem0, rem1, rem2, term0, term1, term2;

5069

floatx80 z;

5070

5071

aSig = extractFloatx80Frac( a );

5072

aExp = extractFloatx80Exp( a );

5073

aSign = extractFloatx80Sign( a );

5074

bSig = extractFloatx80Frac( b );

5075

bExp = extractFloatx80Exp( b );

5076

bSign = extractFloatx80Sign( b );

5077

zSign = aSign ^ bSign;

5078

if ( aExp == 0x7FFF ) {

5079

if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );

5080

if ( bExp == 0x7FFF ) {

5081

if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );

5082

goto invalid;

5083

}

5084

return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );

5085

}

5086

if ( bExp == 0x7FFF ) {

5087

if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );

5088

return packFloatx80( zSign, 0, 0 );

5089

}

5090

if ( bExp == 0 ) {

5091

if ( bSig == 0 ) {

5092

if ( ( aExp | aSig ) == 0 ) {

5093

invalid:

5094

float_raise( float_flag_invalid STATUS_VAR);

5095

z.low = floatx80_default_nan_low;

5096

z.high = floatx80_default_nan_high;

5097

return z;

5098

}

5099

float_raise( float_flag_divbyzero STATUS_VAR);

5100

return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );

5101

}

5102

normalizeFloatx80Subnormal( bSig, &bExp, &bSig );

5103

}

5104

if ( aExp == 0 ) {

5105

if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );

5106

normalizeFloatx80Subnormal( aSig, &aExp, &aSig );

5107

}

5108

zExp = aExp - bExp + 0x3FFE;

5109

rem1 = 0;

5110

if ( bSig <= aSig ) {

5111

shift128Right( aSig, 0, 1, &aSig, &rem1 );

5112

++zExp;

5113

}

5114

zSig0 = estimateDiv128To64( aSig, rem1, bSig );

5115

mul64To128( bSig, zSig0, &term0, &term1 );

5116

sub128( aSig, rem1, term0, term1, &rem0, &rem1 );

5117

while ( (int64_t) rem0 < 0 ) {

5118

--zSig0;

5119

add128( rem0, rem1, 0, bSig, &rem0, &rem1 );

5120

}

5121

zSig1 = estimateDiv128To64( rem1, 0, bSig );

5122

if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {

5123

mul64To128( bSig, zSig1, &term1, &term2 );

5124

sub128( rem1, 0, term1, term2, &rem1, &rem2 );

5125

while ( (int64_t) rem1 < 0 ) {

5126

--zSig1;

5127

add128( rem1, rem2, 0, bSig, &rem1, &rem2 );

5128

}

5129

zSig1 |= ( ( rem1 | rem2 ) != 0 );

5130

}

5131

return

5132

roundAndPackFloatx80(

5133

STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );

5134

5135

}

5136

5137

/*----------------------------------------------------------------------------

5138

| Returns the remainder of the extended double-precision floating-point value

5139

| `a' with respect to the corresponding value `b'. The operation is performed

5140

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

5141

*----------------------------------------------------------------------------*/

5142

5143

floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )

5144

{

5145

flag aSign, zSign;

5146

int32 aExp, bExp, expDiff;

5147

uint64_t aSig0, aSig1, bSig;

5148

uint64_t q, term0, term1, alternateASig0, alternateASig1;

5149

floatx80 z;

5150

5151

aSig0 = extractFloatx80Frac( a );

5152

aExp = extractFloatx80Exp( a );

5153

aSign = extractFloatx80Sign( a );

5154

bSig = extractFloatx80Frac( b );

5155

bExp = extractFloatx80Exp( b );

5156

if ( aExp == 0x7FFF ) {

5157

if ( (uint64_t) ( aSig0<<1 )

5158

|| ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {

5159

return propagateFloatx80NaN( a, b STATUS_VAR );

5160

}

5161

goto invalid;

5162

}

5163

if ( bExp == 0x7FFF ) {

5164

if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );

5165

return a;

5166

}

5167

if ( bExp == 0 ) {

5168

if ( bSig == 0 ) {

5169

invalid:

5170

float_raise( float_flag_invalid STATUS_VAR);

5171

z.low = floatx80_default_nan_low;

5172

z.high = floatx80_default_nan_high;

5173

return z;

5174

}

5175

normalizeFloatx80Subnormal( bSig, &bExp, &bSig );

5176

}

5177

if ( aExp == 0 ) {

5178

if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;

5179

normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );

5180

}

5181

bSig |= LIT64( 0x8000000000000000 );

5182

zSign = aSign;

5183

expDiff = aExp - bExp;

5184

aSig1 = 0;

5185

if ( expDiff < 0 ) {

5186

if ( expDiff < -1 ) return a;

5187

shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );

5188

expDiff = 0;

5189

}

5190

q = ( bSig <= aSig0 );

5191

if ( q ) aSig0 -= bSig;

5192

expDiff -= 64;

5193

while ( 0 < expDiff ) {

5194

q = estimateDiv128To64( aSig0, aSig1, bSig );

5195

q = ( 2 < q ) ? q - 2 : 0;

5196

mul64To128( bSig, q, &term0, &term1 );

5197

sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );

5198

shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );

5199

expDiff -= 62;

5200

}

5201

expDiff += 64;

5202

if ( 0 < expDiff ) {

5203

q = estimateDiv128To64( aSig0, aSig1, bSig );

5204

q = ( 2 < q ) ? q - 2 : 0;

5205

q >>= 64 - expDiff;

5206

mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );

5207

sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );

5208

shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );

5209

while ( le128( term0, term1, aSig0, aSig1 ) ) {

5210

++q;

5211

sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );

5212

}

5213

}

5214

else {

5215

term1 = 0;

5216

term0 = bSig;

5217

}

5218

sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );

5219

if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )

5220

|| ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )

5221

&& ( q & 1 ) )

5222

) {

5223

aSig0 = alternateASig0;

5224

aSig1 = alternateASig1;

5225

zSign = ! zSign;

5226

}

5227

return

5228

normalizeRoundAndPackFloatx80(

5229

80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );

5230

5231

}

5232

5233

/*----------------------------------------------------------------------------

5234

| Returns the square root of the extended double-precision floating-point

5235

| value `a'. The operation is performed according to the IEC/IEEE Standard

5236

| for Binary Floating-Point Arithmetic.

5237

*----------------------------------------------------------------------------*/

5238

5239

floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )

5240

{

5241

flag aSign;

5242

int32 aExp, zExp;

5243

uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;

5244

uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

5245

floatx80 z;

5246

5247

aSig0 = extractFloatx80Frac( a );

5248

aExp = extractFloatx80Exp( a );

5249

aSign = extractFloatx80Sign( a );

5250

if ( aExp == 0x7FFF ) {

5251

if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );

5252

if ( ! aSign ) return a;

5253

goto invalid;

5254

}

5255

if ( aSign ) {

5256

if ( ( aExp | aSig0 ) == 0 ) return a;

5257

invalid:

5258

float_raise( float_flag_invalid STATUS_VAR);

5259

z.low = floatx80_default_nan_low;

5260

z.high = floatx80_default_nan_high;

5261

return z;

5262

}

5263

if ( aExp == 0 ) {

5264

if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );

5265

normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );

5266

}

5267

zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;

5268

zSig0 = estimateSqrt32( aExp, aSig0>>32 );

5269

shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );

5270

zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );

5271

doubleZSig0 = zSig0<<1;

5272

mul64To128( zSig0, zSig0, &term0, &term1 );

5273

sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );

5274

while ( (int64_t) rem0 < 0 ) {

5275

--zSig0;

5276

doubleZSig0 -= 2;

5277

add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );

5278

}

5279

zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );

5280

if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {

5281

if ( zSig1 == 0 ) zSig1 = 1;

5282

mul64To128( doubleZSig0, zSig1, &term1, &term2 );

5283

sub128( rem1, 0, term1, term2, &rem1, &rem2 );

5284

mul64To128( zSig1, zSig1, &term2, &term3 );

5285

sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );

5286

while ( (int64_t) rem1 < 0 ) {

5287

--zSig1;

5288

shortShift128Left( 0, zSig1, 1, &term2, &term3 );

5289

term3 |= 1;

5290

term2 |= doubleZSig0;

5291

add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );

5292

}

5293

zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );

5294

}

5295

shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );

5296

zSig0 |= doubleZSig0;

5297

return

5298

roundAndPackFloatx80(

5299

STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );

5300

5301

}

5302

5303

/*----------------------------------------------------------------------------

5304

| Returns 1 if the extended double-precision floating-point value `a' is equal

5305

| to the corresponding value `b', and 0 otherwise. The invalid exception is

5306

| raised if either operand is a NaN. Otherwise, the comparison is performed

5307

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

5308

*----------------------------------------------------------------------------*/

5309

5310

int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )

5311

{

5312

5313

if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )

5314

&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )

5315

|| ( ( extractFloatx80Exp( b ) == 0x7FFF )

5316

&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )

5317

) {

5318

float_raise( float_flag_invalid STATUS_VAR);

5319

return 0;

5320

}

5321

return

5322

( a.low == b.low )

5323

&& ( ( a.high == b.high )

5324

|| ( ( a.low == 0 )

5325

&& ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )

5326

);

5327

5328

}

5329

5330

/*----------------------------------------------------------------------------

5331

| Returns 1 if the extended double-precision floating-point value `a' is

5332

| less than or equal to the corresponding value `b', and 0 otherwise. The

5333

| invalid exception is raised if either operand is a NaN. The comparison is

5334

| performed according to the IEC/IEEE Standard for Binary Floating-Point

5335

| Arithmetic.

5336

*----------------------------------------------------------------------------*/

5337

5338

int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )

5339

{

5340

flag aSign, bSign;

5341

5342

if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )

5343

&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )

5344

|| ( ( extractFloatx80Exp( b ) == 0x7FFF )

5345

&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )

5346

) {

5347

float_raise( float_flag_invalid STATUS_VAR);

5348

return 0;

5349

}

5350

aSign = extractFloatx80Sign( a );

5351

bSign = extractFloatx80Sign( b );

5352

if ( aSign != bSign ) {

5353

return

5354

aSign

5355

|| ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )

5356

== 0 );

5357

}

5358

return

5359

aSign ? le128( b.high, b.low, a.high, a.low )

5360

: le128( a.high, a.low, b.high, b.low );

5361

5362

}

5363

5364

/*----------------------------------------------------------------------------

5365

| Returns 1 if the extended double-precision floating-point value `a' is

5366

| less than the corresponding value `b', and 0 otherwise. The invalid

5367

| exception is raised if either operand is a NaN. The comparison is performed

5368

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

5369

*----------------------------------------------------------------------------*/

5370

5371

int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )

5372

{

5373

flag aSign, bSign;

5374

5375

if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )

5376

&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )

5377

|| ( ( extractFloatx80Exp( b ) == 0x7FFF )

5378

&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )

5379

) {

5380

float_raise( float_flag_invalid STATUS_VAR);

5381

return 0;

5382

}

5383

aSign = extractFloatx80Sign( a );

5384

bSign = extractFloatx80Sign( b );

5385

if ( aSign != bSign ) {

5386

return

5387

aSign

5388

&& ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )

5389

!= 0 );

5390

}

5391

return

5392

aSign ? lt128( b.high, b.low, a.high, a.low )

5393

: lt128( a.high, a.low, b.high, b.low );

5394

5395

}

5396

5397

/*----------------------------------------------------------------------------

5398

| Returns 1 if the extended double-precision floating-point values `a' and `b'

5399

| cannot be compared, and 0 otherwise. The invalid exception is raised if

5400

| either operand is a NaN. The comparison is performed according to the

5401

| IEC/IEEE Standard for Binary Floating-Point Arithmetic.

5402

*----------------------------------------------------------------------------*/

5403

int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )

5404

{

5405

if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )

5406

&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )

5407

|| ( ( extractFloatx80Exp( b ) == 0x7FFF )

5408

&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )

5409

) {

5410

float_raise( float_flag_invalid STATUS_VAR);

5411

return 1;

5412

}

5413

return 0;

5414

}

5415

5416

/*----------------------------------------------------------------------------

5417

| Returns 1 if the extended double-precision floating-point value `a' is

5418

| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not

5419

| cause an exception. The comparison is performed according to the IEC/IEEE

5420

| Standard for Binary Floating-Point Arithmetic.

5421

*----------------------------------------------------------------------------*/

5422

5423

int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )

5424

{

5425

5426

if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )

5427

&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )

5428

|| ( ( extractFloatx80Exp( b ) == 0x7FFF )

5429

&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )

5430

) {

5431

if ( floatx80_is_signaling_nan( a )

5432

|| floatx80_is_signaling_nan( b ) ) {

5433

float_raise( float_flag_invalid STATUS_VAR);

5434

}

5435

return 0;

5436

}

5437

return

5438

( a.low == b.low )

5439

&& ( ( a.high == b.high )

5440

|| ( ( a.low == 0 )

5441

&& ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )

5442

);

5443

5444

}

5445

5446

/*----------------------------------------------------------------------------

5447

| Returns 1 if the extended double-precision floating-point value `a' is less

5448

| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs

5449

| do not cause an exception. Otherwise, the comparison is performed according

5450

| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

5451

*----------------------------------------------------------------------------*/

5452

5453

int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )

5454

{

5455

flag aSign, bSign;

5456

5457

if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )

5458

&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )

5459

|| ( ( extractFloatx80Exp( b ) == 0x7FFF )

5460

&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )

5461

) {

5462

if ( floatx80_is_signaling_nan( a )

5463

|| floatx80_is_signaling_nan( b ) ) {

5464

float_raise( float_flag_invalid STATUS_VAR);

5465

}

5466

return 0;

5467

}

5468

aSign = extractFloatx80Sign( a );

5469

bSign = extractFloatx80Sign( b );

5470

if ( aSign != bSign ) {

5471

return

5472

aSign

5473

|| ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )

5474

== 0 );

5475

}

5476

return

5477

aSign ? le128( b.high, b.low, a.high, a.low )

5478

: le128( a.high, a.low, b.high, b.low );

5479

5480

}

5481

5482

/*----------------------------------------------------------------------------

5483

| Returns 1 if the extended double-precision floating-point value `a' is less

5484

| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause

5485

| an exception. Otherwise, the comparison is performed according to the

5486

| IEC/IEEE Standard for Binary Floating-Point Arithmetic.

5487

*----------------------------------------------------------------------------*/

5488

5489

int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )

5490

{

5491

flag aSign, bSign;

5492

5493

if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )

5494

&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )

5495

|| ( ( extractFloatx80Exp( b ) == 0x7FFF )

5496

&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )

5497

) {

5498

if ( floatx80_is_signaling_nan( a )

5499

|| floatx80_is_signaling_nan( b ) ) {

5500

float_raise( float_flag_invalid STATUS_VAR);

5501

}

5502

return 0;

5503

}

5504

aSign = extractFloatx80Sign( a );

5505

bSign = extractFloatx80Sign( b );

5506

if ( aSign != bSign ) {

5507

return

5508

aSign

5509

&& ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )

5510

!= 0 );

5511

}

5512

return

5513

aSign ? lt128( b.high, b.low, a.high, a.low )

5514

: lt128( a.high, a.low, b.high, b.low );

5515

5516

}

5517

5518

/*----------------------------------------------------------------------------

5519

| Returns 1 if the extended double-precision floating-point values `a' and `b'

5520

| cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.

5521

| The comparison is performed according to the IEC/IEEE Standard for Binary

5522

| Floating-Point Arithmetic.

5523

*----------------------------------------------------------------------------*/

5524

int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )

5525

{

5526

if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )

5527

&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )

5528

|| ( ( extractFloatx80Exp( b ) == 0x7FFF )

5529

&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )

5530

) {

5531

if ( floatx80_is_signaling_nan( a )

5532

|| floatx80_is_signaling_nan( b ) ) {

5533

float_raise( float_flag_invalid STATUS_VAR);

5534

}

5535

return 1;

5536

}

5537

return 0;

5538

}

5539

5540

/*----------------------------------------------------------------------------

5541

| Returns the result of converting the quadruple-precision floating-point

5542

| value `a' to the 32-bit two's complement integer format. The conversion

5543

| is performed according to the IEC/IEEE Standard for Binary Floating-Point

5544

| Arithmetic---which means in particular that the conversion is rounded

5545

| according to the current rounding mode. If `a' is a NaN, the largest

5546

| positive integer is returned. Otherwise, if the conversion overflows, the

5547

| largest integer with the same sign as `a' is returned.

5548

*----------------------------------------------------------------------------*/

5549

5550

int32 float128_to_int32( float128 a STATUS_PARAM )

5551

{

5552

flag aSign;

5553

int32 aExp, shiftCount;

5554

uint64_t aSig0, aSig1;

5555

5556

aSig1 = extractFloat128Frac1( a );

5557

aSig0 = extractFloat128Frac0( a );

5558

aExp = extractFloat128Exp( a );

5559

aSign = extractFloat128Sign( a );

5560

if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;

5561

if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );

5562

aSig0 |= ( aSig1 != 0 );

5563

shiftCount = 0x4028 - aExp;

5564

if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );

5565

return roundAndPackInt32( aSign, aSig0 STATUS_VAR );

5566

5567

}

5568

5569

/*----------------------------------------------------------------------------

5570

| Returns the result of converting the quadruple-precision floating-point

5571

| value `a' to the 32-bit two's complement integer format. The conversion

5572

| is performed according to the IEC/IEEE Standard for Binary Floating-Point

5573

| Arithmetic, except that the conversion is always rounded toward zero. If

5574

| `a' is a NaN, the largest positive integer is returned. Otherwise, if the

5575

| conversion overflows, the largest integer with the same sign as `a' is

5576

| returned.

5577

*----------------------------------------------------------------------------*/

5578

5579

int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )

5580

{

5581

flag aSign;

5582

int32 aExp, shiftCount;

5583

uint64_t aSig0, aSig1, savedASig;

5584

int32_t z;

5585

5586

aSig1 = extractFloat128Frac1( a );

5587

aSig0 = extractFloat128Frac0( a );

5588

aExp = extractFloat128Exp( a );

5589

aSign = extractFloat128Sign( a );

5590

aSig0 |= ( aSig1 != 0 );

5591

if ( 0x401E < aExp ) {

5592

if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;

5593

goto invalid;

5594

}

5595

else if ( aExp < 0x3FFF ) {

5596

if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;

5597

return 0;

5598

}

5599

aSig0 |= LIT64( 0x0001000000000000 );

5600

shiftCount = 0x402F - aExp;

5601

savedASig = aSig0;

5602

aSig0 >>= shiftCount;

5603

z = aSig0;

5604

if ( aSign ) z = - z;

5605

if ( ( z < 0 ) ^ aSign ) {

5606

invalid:

5607

float_raise( float_flag_invalid STATUS_VAR);

5608

return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;

5609

}

5610

if ( ( aSig0<<shiftCount ) != savedASig ) {

5611

STATUS(float_exception_flags) |= float_flag_inexact;

5612

}

5613

return z;

5614

5615

}

5616

5617

/*----------------------------------------------------------------------------

5618

| Returns the result of converting the quadruple-precision floating-point

5619

| value `a' to the 64-bit two's complement integer format. The conversion

5620

| is performed according to the IEC/IEEE Standard for Binary Floating-Point

5621

| Arithmetic---which means in particular that the conversion is rounded

5622

| according to the current rounding mode. If `a' is a NaN, the largest

5623

| positive integer is returned. Otherwise, if the conversion overflows, the

5624

| largest integer with the same sign as `a' is returned.

5625

*----------------------------------------------------------------------------*/

5626

5627

int64 float128_to_int64( float128 a STATUS_PARAM )

5628

{

5629

flag aSign;

5630

int32 aExp, shiftCount;

5631

uint64_t aSig0, aSig1;

5632

5633

aSig1 = extractFloat128Frac1( a );

5634

aSig0 = extractFloat128Frac0( a );

5635

aExp = extractFloat128Exp( a );

5636

aSign = extractFloat128Sign( a );

5637

if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );

5638

shiftCount = 0x402F - aExp;

5639

if ( shiftCount <= 0 ) {

5640

if ( 0x403E < aExp ) {

5641

float_raise( float_flag_invalid STATUS_VAR);

5642

if ( ! aSign

5643

|| ( ( aExp == 0x7FFF )

5644

&& ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )

5645

)

5646

) {

5647

return LIT64( 0x7FFFFFFFFFFFFFFF );

5648

}

5649

return (int64_t) LIT64( 0x8000000000000000 );

5650

}

5651

shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );

5652

}

5653

else {

5654

shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );

5655

}

5656

return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );

5657

5658

}

5659

5660

/*----------------------------------------------------------------------------

5661

| Returns the result of converting the quadruple-precision floating-point

5662

| value `a' to the 64-bit two's complement integer format. The conversion

5663

| is performed according to the IEC/IEEE Standard for Binary Floating-Point

5664

| Arithmetic, except that the conversion is always rounded toward zero.

5665

| If `a' is a NaN, the largest positive integer is returned. Otherwise, if

5666

| the conversion overflows, the largest integer with the same sign as `a' is

5667

| returned.

5668

*----------------------------------------------------------------------------*/

5669

5670

int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )

5671

{

5672

flag aSign;

5673

int32 aExp, shiftCount;

5674

uint64_t aSig0, aSig1;

5675

int64 z;

5676

5677

aSig1 = extractFloat128Frac1( a );

5678

aSig0 = extractFloat128Frac0( a );

5679

aExp = extractFloat128Exp( a );

5680

aSign = extractFloat128Sign( a );

5681

if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );

5682

shiftCount = aExp - 0x402F;

5683

if ( 0 < shiftCount ) {

5684

if ( 0x403E <= aExp ) {

5685

aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );

5686

if ( ( a.high == LIT64( 0xC03E000000000000 ) )

5687

&& ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {

5688

if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;

5689

}

5690

else {

5691

float_raise( float_flag_invalid STATUS_VAR);

5692

if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {

5693

return LIT64( 0x7FFFFFFFFFFFFFFF );

5694

}

5695

}

5696

return (int64_t) LIT64( 0x8000000000000000 );

5697

}

5698

z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );

5699

if ( (uint64_t) ( aSig1<<shiftCount ) ) {

5700

STATUS(float_exception_flags) |= float_flag_inexact;

5701

}

5702

}

5703

else {

5704

if ( aExp < 0x3FFF ) {

5705

if ( aExp | aSig0 | aSig1 ) {

5706

STATUS(float_exception_flags) |= float_flag_inexact;

5707

}

5708

return 0;

5709

}

5710

z = aSig0>>( - shiftCount );

5711

if ( aSig1

5712

|| ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {

5713

STATUS(float_exception_flags) |= float_flag_inexact;

5714

}

5715

}

5716

if ( aSign ) z = - z;

5717

return z;

5718

5719

}

5720

5721

/*----------------------------------------------------------------------------

5722

| Returns the result of converting the quadruple-precision floating-point

5723

| value `a' to the single-precision floating-point format. The conversion

5724

| is performed according to the IEC/IEEE Standard for Binary Floating-Point

5725

| Arithmetic.

5726

*----------------------------------------------------------------------------*/

5727

5728

float32 float128_to_float32( float128 a STATUS_PARAM )

5729

{

5730

flag aSign;

5731

int32 aExp;

5732

uint64_t aSig0, aSig1;

5733

uint32_t zSig;

5734

5735

aSig1 = extractFloat128Frac1( a );

5736

aSig0 = extractFloat128Frac0( a );

5737

aExp = extractFloat128Exp( a );

5738

aSign = extractFloat128Sign( a );

5739

if ( aExp == 0x7FFF ) {

5740

if ( aSig0 | aSig1 ) {

5741

return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );

5742

}

5743

return packFloat32( aSign, 0xFF, 0 );

5744

}

5745

aSig0 |= ( aSig1 != 0 );

5746

shift64RightJamming( aSig0, 18, &aSig0 );

5747

zSig = aSig0;

5748

if ( aExp || zSig ) {

5749

zSig |= 0x40000000;

5750

aExp -= 0x3F81;

5751

}

5752

return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );

5753

5754

}

5755

5756

/*----------------------------------------------------------------------------

5757

| Returns the result of converting the quadruple-precision floating-point

5758

| value `a' to the double-precision floating-point format. The conversion

5759

| is performed according to the IEC/IEEE Standard for Binary Floating-Point

5760

| Arithmetic.

5761

*----------------------------------------------------------------------------*/

5762

5763

float64 float128_to_float64( float128 a STATUS_PARAM )

5764

{

5765

flag aSign;

5766

int32 aExp;

5767

uint64_t aSig0, aSig1;

5768

5769

aSig1 = extractFloat128Frac1( a );

5770

aSig0 = extractFloat128Frac0( a );

5771

aExp = extractFloat128Exp( a );

5772

aSign = extractFloat128Sign( a );

5773

if ( aExp == 0x7FFF ) {

5774

if ( aSig0 | aSig1 ) {

5775

return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );

5776

}

5777

return packFloat64( aSign, 0x7FF, 0 );

5778

}

5779

shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );

5780

aSig0 |= ( aSig1 != 0 );

5781

if ( aExp || aSig0 ) {

5782

aSig0 |= LIT64( 0x4000000000000000 );

5783

aExp -= 0x3C01;

5784

}

5785

return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );

5786

5787

}

5788

5789

/*----------------------------------------------------------------------------

5790

| Returns the result of converting the quadruple-precision floating-point

5791

| value `a' to the extended double-precision floating-point format. The

5792

| conversion is performed according to the IEC/IEEE Standard for Binary

5793

| Floating-Point Arithmetic.

5794

*----------------------------------------------------------------------------*/

5795

5796

floatx80 float128_to_floatx80( float128 a STATUS_PARAM )

5797

{

5798

flag aSign;

5799

int32 aExp;

5800

uint64_t aSig0, aSig1;

5801

5802

aSig1 = extractFloat128Frac1( a );

5803

aSig0 = extractFloat128Frac0( a );

5804

aExp = extractFloat128Exp( a );

5805

aSign = extractFloat128Sign( a );

5806

if ( aExp == 0x7FFF ) {

5807

if ( aSig0 | aSig1 ) {

5808

return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );

5809

}

5810

return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );

5811

}

5812

if ( aExp == 0 ) {

5813

if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );

5814

normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );

5815

}

5816

else {

5817

aSig0 |= LIT64( 0x0001000000000000 );

5818

}

5819

shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );

5820

return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );

5821

5822

}

5823

5824

/*----------------------------------------------------------------------------

5825

| Rounds the quadruple-precision floating-point value `a' to an integer, and

5826

| returns the result as a quadruple-precision floating-point value. The

5827

| operation is performed according to the IEC/IEEE Standard for Binary

5828

| Floating-Point Arithmetic.

5829

*----------------------------------------------------------------------------*/

5830

5831

float128 float128_round_to_int( float128 a STATUS_PARAM )

5832

{

5833

flag aSign;

5834

int32 aExp;

5835

uint64_t lastBitMask, roundBitsMask;

5836

float128 z;

5837

5838

aExp = extractFloat128Exp( a );

5839

if ( 0x402F <= aExp ) {

5840

if ( 0x406F <= aExp ) {

5841

if ( ( aExp == 0x7FFF )

5842

&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )

5843

) {

5844

return propagateFloat128NaN( a, a STATUS_VAR );

5845

}

5846

return a;

5847

}

5848

lastBitMask = 1;

5849

lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;

5850

roundBitsMask = lastBitMask - 1;

5851

z = a;

5852

switch (STATUS(float_rounding_mode)) {

5853

case float_round_nearest_even:

5854

if ( lastBitMask ) {

5855

add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );

5856

if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;

5857

}

5858

else {

5859

if ( (int64_t) z.low < 0 ) {

5860

++z.high;

5861

if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;

5862

}

5863

}

5864

break;

5865

case float_round_to_zero:

5866

break;

5867

case float_round_up:

5868

if (!extractFloat128Sign(z)) {

5869

add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);

5870

}

5871

break;

5872

case float_round_down:

5873

if (extractFloat128Sign(z)) {

5874

add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);

5875

}

5876

break;

5877

default:

5878

abort();

5879

}

5880

z.low &= ~ roundBitsMask;

5881

}

5882

else {

5883

if ( aExp < 0x3FFF ) {

5884

if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;

5885

STATUS(float_exception_flags) |= float_flag_inexact;

5886

aSign = extractFloat128Sign( a );

5887

switch ( STATUS(float_rounding_mode) ) {

5888

case float_round_nearest_even:

5889

if ( ( aExp == 0x3FFE )

5890

&& ( extractFloat128Frac0( a )

5891

| extractFloat128Frac1( a ) )

5892

) {

5893

return packFloat128( aSign, 0x3FFF, 0, 0 );

5894

}

5895

break;

5896

case float_round_down:

5897

return

5898

aSign ? packFloat128( 1, 0x3FFF, 0, 0 )

5899

: packFloat128( 0, 0, 0, 0 );

5900

case float_round_up:

5901

return

5902

aSign ? packFloat128( 1, 0, 0, 0 )

5903

: packFloat128( 0, 0x3FFF, 0, 0 );

5904

}

5905

return packFloat128( aSign, 0, 0, 0 );

5906

}

5907

lastBitMask = 1;

5908

lastBitMask <<= 0x402F - aExp;

5909

roundBitsMask = lastBitMask - 1;

5910

z.low = 0;

5911

z.high = a.high;

5912

switch (STATUS(float_rounding_mode)) {

5913

case float_round_nearest_even:

5914

z.high += lastBitMask>>1;

5915

if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {

5916

z.high &= ~ lastBitMask;

5917

}

5918

break;

5919

case float_round_to_zero:

5920

break;

5921

case float_round_up:

5922

if (!extractFloat128Sign(z)) {

5923

z.high |= ( a.low != 0 );

5924

z.high += roundBitsMask;

5925

}

5926

break;

5927

case float_round_down:

5928

if (extractFloat128Sign(z)) {

5929

z.high |= (a.low != 0);

5930

z.high += roundBitsMask;

5931

}

5932

break;

5933

default:

5934

abort();

5935

}

5936

z.high &= ~ roundBitsMask;

5937

}

5938

if ( ( z.low != a.low ) || ( z.high != a.high ) ) {

5939

STATUS(float_exception_flags) |= float_flag_inexact;

5940

}

5941

return z;

5942

5943

}

5944

5945

/*----------------------------------------------------------------------------

5946

| Returns the result of adding the absolute values of the quadruple-precision

5947

| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated

5948

| before being returned. `zSign' is ignored if the result is a NaN.

5949

| The addition is performed according to the IEC/IEEE Standard for Binary

5950

| Floating-Point Arithmetic.

5951

*----------------------------------------------------------------------------*/

5952

5953

static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)

5954

{

5955

int32 aExp, bExp, zExp;

5956

uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;

5957

int32 expDiff;

5958

5959

aSig1 = extractFloat128Frac1( a );

5960

aSig0 = extractFloat128Frac0( a );

5961

aExp = extractFloat128Exp( a );

5962

bSig1 = extractFloat128Frac1( b );

5963

bSig0 = extractFloat128Frac0( b );

5964

bExp = extractFloat128Exp( b );

5965

expDiff = aExp - bExp;

5966

if ( 0 < expDiff ) {

5967

if ( aExp == 0x7FFF ) {

5968

if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );

5969

return a;

5970

}

5971

if ( bExp == 0 ) {

5972

--expDiff;

5973

}

5974

else {

5975

bSig0 |= LIT64( 0x0001000000000000 );

5976

}

5977

shift128ExtraRightJamming(

5978

bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );

5979

zExp = aExp;

5980

}

5981

else if ( expDiff < 0 ) {

5982

if ( bExp == 0x7FFF ) {

5983

if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );

5984

return packFloat128( zSign, 0x7FFF, 0, 0 );

5985

}

5986

if ( aExp == 0 ) {

5987

++expDiff;

5988

}

5989

else {

5990

aSig0 |= LIT64( 0x0001000000000000 );

5991

}

5992

shift128ExtraRightJamming(

5993

aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );

5994

zExp = bExp;

5995

}

5996

else {

5997

if ( aExp == 0x7FFF ) {

5998

if ( aSig0 | aSig1 | bSig0 | bSig1 ) {

5999

return propagateFloat128NaN( a, b STATUS_VAR );

6000

}

6001

return a;

6002

}

6003

add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );

6004

if ( aExp == 0 ) {

6005

if (STATUS(flush_to_zero)) {

6006

if (zSig0 | zSig1) {

6007

float_raise(float_flag_output_denormal STATUS_VAR);

6008

}

6009

return packFloat128(zSign, 0, 0, 0);

6010

}

6011

return packFloat128( zSign, 0, zSig0, zSig1 );

6012

}

6013

zSig2 = 0;

6014

zSig0 |= LIT64( 0x0002000000000000 );

6015

zExp = aExp;

6016

goto shiftRight1;

6017

}

6018

aSig0 |= LIT64( 0x0001000000000000 );

6019

add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );

6020

--zExp;

6021

if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;

6022

++zExp;

6023

shiftRight1:

6024

shift128ExtraRightJamming(

6025

zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );

6026

roundAndPack:

6027

return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );

6028

6029

}

6030

6031

/*----------------------------------------------------------------------------

6032

| Returns the result of subtracting the absolute values of the quadruple-

6033

| precision floating-point values `a' and `b'. If `zSign' is 1, the

6034

| difference is negated before being returned. `zSign' is ignored if the

6035

| result is a NaN. The subtraction is performed according to the IEC/IEEE

6036

| Standard for Binary Floating-Point Arithmetic.

6037

*----------------------------------------------------------------------------*/

6038

6039

static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)

6040

{

6041

int32 aExp, bExp, zExp;

6042

uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;

6043

int32 expDiff;

6044

float128 z;

6045

6046

aSig1 = extractFloat128Frac1( a );

6047

aSig0 = extractFloat128Frac0( a );

6048

aExp = extractFloat128Exp( a );

6049

bSig1 = extractFloat128Frac1( b );

6050

bSig0 = extractFloat128Frac0( b );

6051

bExp = extractFloat128Exp( b );

6052

expDiff = aExp - bExp;

6053

shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );

6054

shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );

6055

if ( 0 < expDiff ) goto aExpBigger;

6056

if ( expDiff < 0 ) goto bExpBigger;

6057

if ( aExp == 0x7FFF ) {

6058

if ( aSig0 | aSig1 | bSig0 | bSig1 ) {

6059

return propagateFloat128NaN( a, b STATUS_VAR );

6060

}

6061

float_raise( float_flag_invalid STATUS_VAR);

6062

z.low = float128_default_nan_low;

6063

z.high = float128_default_nan_high;

6064

return z;

6065

}

6066

if ( aExp == 0 ) {

6067

aExp = 1;

6068

bExp = 1;

6069

}

6070

if ( bSig0 < aSig0 ) goto aBigger;

6071

if ( aSig0 < bSig0 ) goto bBigger;

6072

if ( bSig1 < aSig1 ) goto aBigger;

6073

if ( aSig1 < bSig1 ) goto bBigger;

6074

return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );

6075

bExpBigger:

6076

if ( bExp == 0x7FFF ) {

6077

if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );

6078

return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );

6079

}

6080

if ( aExp == 0 ) {

6081

++expDiff;

6082

}

6083

else {

6084

aSig0 |= LIT64( 0x4000000000000000 );

6085

}

6086

shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );

6087

bSig0 |= LIT64( 0x4000000000000000 );

6088

bBigger:

6089

sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );

6090

zExp = bExp;

6091

zSign ^= 1;

6092

goto normalizeRoundAndPack;

6093

aExpBigger:

6094

if ( aExp == 0x7FFF ) {

6095

if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );

6096

return a;

6097

}

6098

if ( bExp == 0 ) {

6099

--expDiff;

6100

}

6101

else {

6102

bSig0 |= LIT64( 0x4000000000000000 );

6103

}

6104

shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );

6105

aSig0 |= LIT64( 0x4000000000000000 );

6106

aBigger:

6107

sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );

6108

zExp = aExp;

6109

normalizeRoundAndPack:

6110

--zExp;

6111

return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );

6112

6113

}

6114

6115

/*----------------------------------------------------------------------------

6116

| Returns the result of adding the quadruple-precision floating-point values

6117

| `a' and `b'. The operation is performed according to the IEC/IEEE Standard

6118

| for Binary Floating-Point Arithmetic.

6119

*----------------------------------------------------------------------------*/

6120

6121

float128 float128_add( float128 a, float128 b STATUS_PARAM )

6122

{

6123

flag aSign, bSign;

6124

6125

aSign = extractFloat128Sign( a );

6126

bSign = extractFloat128Sign( b );

6127

if ( aSign == bSign ) {

6128

return addFloat128Sigs( a, b, aSign STATUS_VAR );

6129

}

6130

else {

6131

return subFloat128Sigs( a, b, aSign STATUS_VAR );

6132

}

6133

6134

}

6135

6136

/*----------------------------------------------------------------------------

6137

| Returns the result of subtracting the quadruple-precision floating-point

6138

| values `a' and `b'. The operation is performed according to the IEC/IEEE

6139

| Standard for Binary Floating-Point Arithmetic.

6140

*----------------------------------------------------------------------------*/

6141

6142

float128 float128_sub( float128 a, float128 b STATUS_PARAM )

6143

{

6144

flag aSign, bSign;

6145

6146

aSign = extractFloat128Sign( a );

6147

bSign = extractFloat128Sign( b );

6148

if ( aSign == bSign ) {

6149

return subFloat128Sigs( a, b, aSign STATUS_VAR );

6150

}

6151

else {

6152

return addFloat128Sigs( a, b, aSign STATUS_VAR );

6153

}

6154

6155

}

6156

6157

/*----------------------------------------------------------------------------

6158

| Returns the result of multiplying the quadruple-precision floating-point

6159

| values `a' and `b'. The operation is performed according to the IEC/IEEE

6160

| Standard for Binary Floating-Point Arithmetic.

6161

*----------------------------------------------------------------------------*/

6162

6163

float128 float128_mul( float128 a, float128 b STATUS_PARAM )
{
    flag aSign, bSign, zSign;
    int32 aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1;
    uint64_t zSig0, zSig1, zSig2, zSig3;
    float128 z;

    /* Unpack sign, exponent and the two fraction words of each operand. */
    aSign = extractFloat128Sign( a );
    aExp = extractFloat128Exp( a );
    aSig0 = extractFloat128Frac0( a );
    aSig1 = extractFloat128Frac1( a );
    bSign = extractFloat128Sign( b );
    bExp = extractFloat128Exp( b );
    bSig0 = extractFloat128Frac0( b );
    bSig1 = extractFloat128Frac1( b );
    zSign = aSign ^ bSign;

    /* `a' has the maximum exponent: NaN or infinity. */
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            return propagateFloat128NaN( a, b STATUS_VAR );
        }
        if ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) {
            return propagateFloat128NaN( a, b STATUS_VAR );
        }
        /* infinity times zero */
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) {
            goto invalid;
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }

    /* `b' has the maximum exponent: NaN or infinity. */
    if ( bExp == 0x7FFF ) {
        if ( bSig0 | bSig1 ) {
            return propagateFloat128NaN( a, b STATUS_VAR );
        }
        /* zero times infinity */
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
            goto invalid;
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }

    /* Zero exponent: either a true zero (signed zero result) or a
     * subnormal that must be normalized first. */
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) {
            return packFloat128( zSign, 0, 0, 0 );
        }
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            return packFloat128( zSign, 0, 0, 0 );
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }

    zExp = aExp + bExp - 0x4000;
    /* Only `a' gets its leading bit set explicitly.  `b' is shifted left
     * by 16 without it, and `a' is then added into the upper 128 bits of
     * the 256-bit product (presumably standing in for b's leading bit). */
    aSig0 |= LIT64( 0x0001000000000000 );
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
    /* Fold the lowest product word into a sticky bit. */
    if ( zSig3 != 0 ) {
        zSig2 |= 1;
    }
    if ( zSig0 >= LIT64( 0x0002000000000000 ) ) {
        shift128ExtraRightJamming(
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
        zExp++;
    }
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );

 invalid:
    /* 0 * infinity: raise invalid and return the default NaN. */
    float_raise( float_flag_invalid STATUS_VAR);
    z.low = float128_default_nan_low;
    z.high = float128_default_nan_high;
    return z;
}

6220

6221

/*----------------------------------------------------------------------------

6222

| Returns the result of dividing the quadruple-precision floating-point value

6223

| `a' by the corresponding value `b'. The operation is performed according to

6224

| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

6225

*----------------------------------------------------------------------------*/

6226

6227

float128 float128_div( float128 a, float128 b STATUS_PARAM )
{
    flag aSign, bSign, zSign;
    int32 aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1;
    uint64_t zSig0, zSig1, zSig2;
    uint64_t rem0, rem1, rem2, rem3;
    uint64_t term0, term1, term2, term3;
    float128 z;

    /* Unpack sign, exponent and the two fraction words of each operand. */
    aSign = extractFloat128Sign( a );
    aExp = extractFloat128Exp( a );
    aSig0 = extractFloat128Frac0( a );
    aSig1 = extractFloat128Frac1( a );
    bSign = extractFloat128Sign( b );
    bExp = extractFloat128Exp( b );
    bSig0 = extractFloat128Frac0( b );
    bSig1 = extractFloat128Frac1( b );
    zSign = aSign ^ bSign;

    /* Dividend is NaN or infinity. */
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            return propagateFloat128NaN( a, b STATUS_VAR );
        }
        if ( bExp == 0x7FFF ) {
            if ( bSig0 | bSig1 ) {
                return propagateFloat128NaN( a, b STATUS_VAR );
            }
            /* infinity / infinity */
            goto invalid;
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }

    /* Divisor is NaN or infinity (dividend is finite here). */
    if ( bExp == 0x7FFF ) {
        if ( bSig0 | bSig1 ) {
            return propagateFloat128NaN( a, b STATUS_VAR );
        }
        return packFloat128( zSign, 0, 0, 0 );
    }

    /* Divisor is zero or subnormal. */
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* 0 / 0 */
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
                goto invalid;
            }
            float_raise( float_flag_divbyzero STATUS_VAR);
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }

    /* Dividend is zero or subnormal. */
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) {
            return packFloat128( zSign, 0, 0, 0 );
        }
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }

    zExp = aExp - bExp + 0x3FFD;
    /* Set the leading bit of both significands and shift them left by 15. */
    shortShift128Left(
        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
    shortShift128Left(
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
    /* When b <= a, halve a and compensate in the exponent. */
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
        zExp++;
    }

    /* Upper 64 quotient bits: estimate, then step down while the
     * remainder is negative. */
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
    while ( (int64_t) rem0 < 0 ) {
        zSig0--;
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
    }

    /* Lower 64 quotient bits: the estimate is only refined (and the
     * sticky bit computed) when its low 14 bits are at most 4. */
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            zSig1--;
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
        }
        if ( ( rem1 | rem2 | rem3 ) != 0 ) {
            zSig1 |= 1;
        }
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );

 invalid:
    /* inf/inf or 0/0: raise invalid and return the default NaN. */
    float_raise( float_flag_invalid STATUS_VAR);
    z.low = float128_default_nan_low;
    z.high = float128_default_nan_high;
    return z;
}

6304

6305

/*----------------------------------------------------------------------------

6306

| Returns the remainder of the quadruple-precision floating-point value `a'

6307

| with respect to the corresponding value `b'. The operation is performed

6308

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

6309

*----------------------------------------------------------------------------*/

6310

6311

float128 float128_rem( float128 a, float128 b STATUS_PARAM )
{
    flag aSign, zSign;
    int32 aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1;
    uint64_t q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;
    float128 z;

    /* Unpack the operands; the sign of `b' is never needed. */
    aSign = extractFloat128Sign( a );
    aExp = extractFloat128Exp( a );
    aSig0 = extractFloat128Frac0( a );
    aSig1 = extractFloat128Frac1( a );
    bExp = extractFloat128Exp( b );
    bSig0 = extractFloat128Frac0( b );
    bSig1 = extractFloat128Frac1( b );

    /* `a' is NaN or infinity. */
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            return propagateFloat128NaN( a, b STATUS_VAR );
        }
        if ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) {
            return propagateFloat128NaN( a, b STATUS_VAR );
        }
        /* remainder of an infinity */
        goto invalid;
    }

    /* `b' is NaN or infinity; a finite `a' is returned unchanged for
     * an infinite `b'. */
    if ( bExp == 0x7FFF ) {
        if ( bSig0 | bSig1 ) {
            return propagateFloat128NaN( a, b STATUS_VAR );
        }
        return a;
    }

    /* `b' is zero (invalid) or subnormal. */
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            goto invalid;
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }

    /* `a' is zero (returned as is) or subnormal. */
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) {
            return a;
        }
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }

    expDiff = aExp - bExp;
    if ( expDiff < -1 ) {
        return a;
    }

    /* `a' is shifted one bit less than `b' when expDiff is -1. */
    shortShift128Left(
        aSig0 | LIT64( 0x0001000000000000 ),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) {
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    }

    /* Consume the exponent difference 61 bits per iteration, using a
     * quotient estimate lowered by 4 (clamped at zero). */
    expDiff -= 64;
    while ( expDiff > 0 ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( q > 4 ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }

    if ( expDiff > -64 ) {
        /* Final partial step for the remaining bits. */
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( q > 4 ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        } else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    } else {
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }

    /* Keep subtracting `b' until the running value goes negative,
     * remembering the last value from before each subtraction. */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        q++;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( (int64_t) aSig0 >= 0 );

    /* Choose between the two candidates from the sign of their sum; an
     * exactly zero sum is resolved by the parity of q. */
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1,
        (uint64_t *)&sigMean0, &sigMean1 );
    if ( ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }

    /* Negate a negative remainder and carry that into the result sign. */
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) {
        sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    }
    return normalizeRoundAndPackFloat128(
        aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );

 invalid:
    /* rem(inf, x) or rem(x, 0): raise invalid, return the default NaN. */
    float_raise( float_flag_invalid STATUS_VAR);
    z.low = float128_default_nan_low;
    z.high = float128_default_nan_high;
    return z;
}

6413

6414

/*----------------------------------------------------------------------------

6415

| Returns the square root of the quadruple-precision floating-point value `a'.

6416

| The operation is performed according to the IEC/IEEE Standard for Binary

6417

| Floating-Point Arithmetic.

6418

*----------------------------------------------------------------------------*/

6419

6420

float128 float128_sqrt( float128 a STATUS_PARAM )
{
    flag aSign;
    int32 aExp, zExp;
    uint64_t aSig0, aSig1;
    uint64_t zSig0, zSig1, zSig2, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3;
    uint64_t term0, term1, term2, term3;
    float128 z;

    aSign = extractFloat128Sign( a );
    aExp = extractFloat128Exp( a );
    aSig0 = extractFloat128Frac0( a );
    aSig1 = extractFloat128Frac1( a );

    /* NaN propagates, +infinity is returned as is, -infinity is invalid. */
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            return propagateFloat128NaN( a, a STATUS_VAR );
        }
        if ( ! aSign ) {
            return a;
        }
        goto invalid;
    }

    /* Negative input: only negative zero is accepted (returned unchanged). */
    if ( aSign ) {
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
            return a;
        }
        goto invalid;
    }

    /* Positive zero or subnormal. */
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) {
            return packFloat128( 0, 0, 0, 0 );
        }
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }

    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
    aSig0 |= LIT64( 0x0001000000000000 );

    /* Upper root word: 32-bit estimate, widened by one division, then
     * stepped down while the remainder is negative. */
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        zSig0--;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }

    /* Lower root word: refined only when its low 13 bits are at most 5. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
        if ( zSig1 == 0 ) {
            zSig1 = 1;
        }
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            zSig1--;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        /* Any remainder left over becomes a sticky bit. */
        if ( ( rem1 | rem2 | rem3 ) != 0 ) {
            zSig1 |= 1;
        }
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );

 invalid:
    /* Square root of a negative non-zero value or of -infinity. */
    float_raise( float_flag_invalid STATUS_VAR);
    z.low = float128_default_nan_low;
    z.high = float128_default_nan_high;
    return z;
}

6482

6483

/*----------------------------------------------------------------------------

6484

| Returns 1 if the quadruple-precision floating-point value `a' is equal to

6485

| the corresponding value `b', and 0 otherwise. The invalid exception is

6486

| raised if either operand is a NaN. Otherwise, the comparison is performed

6487

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

6488

*----------------------------------------------------------------------------*/

6489

6490

int float128_eq( float128 a, float128 b STATUS_PARAM )
{
    /* A NaN has the maximum exponent and a non-zero fraction. */
    int aIsNaN = ( extractFloat128Exp( a ) == 0x7FFF )
        && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) );
    int bIsNaN = ( extractFloat128Exp( b ) == 0x7FFF )
        && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) );

    if ( aIsNaN || bIsNaN ) {
        /* Signaling comparison: any NaN raises invalid. */
        float_raise( float_flag_invalid STATUS_VAR);
        return 0;
    }
    if ( a.low != b.low ) {
        return 0;
    }
    if ( a.high == b.high ) {
        return 1;
    }
    /* Different high words can still be equal when both operands are
     * zeros that differ only in the sign bit. */
    return ( a.low == 0 )
        && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 );
}

6509

6510

/*----------------------------------------------------------------------------

6511

| Returns 1 if the quadruple-precision floating-point value `a' is less than

6512

| or equal to the corresponding value `b', and 0 otherwise. The invalid

6513

| exception is raised if either operand is a NaN. The comparison is performed

6514

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

6515

*----------------------------------------------------------------------------*/

6516

6517

int float128_le( float128 a, float128 b STATUS_PARAM )
{
    flag aSign, bSign;
    int aIsNaN = ( extractFloat128Exp( a ) == 0x7FFF )
        && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) );
    int bIsNaN = ( extractFloat128Exp( b ) == 0x7FFF )
        && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) );

    if ( aIsNaN || bIsNaN ) {
        /* Signaling comparison: any NaN raises invalid. */
        float_raise( float_flag_invalid STATUS_VAR);
        return 0;
    }

    aSign = extractFloat128Sign( a );
    bSign = extractFloat128Sign( b );
    if ( aSign != bSign ) {
        /* Opposite signs: true when `a' is the negative one, or when
         * both operands are zeros. */
        if ( aSign ) {
            return 1;
        }
        return ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
                 == 0 );
    }
    /* Same sign: compare raw bits, with the operands swapped for negatives. */
    if ( aSign ) {
        return le128( b.high, b.low, a.high, a.low );
    }
    return le128( a.high, a.low, b.high, b.low );
}

6542

6543

/*----------------------------------------------------------------------------

6544

| Returns 1 if the quadruple-precision floating-point value `a' is less than

6545

| the corresponding value `b', and 0 otherwise. The invalid exception is

6546

| raised if either operand is a NaN. The comparison is performed according

6547

| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

6548

*----------------------------------------------------------------------------*/

6549

6550

int float128_lt( float128 a, float128 b STATUS_PARAM )
{
    flag aSign, bSign;
    int aIsNaN = ( extractFloat128Exp( a ) == 0x7FFF )
        && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) );
    int bIsNaN = ( extractFloat128Exp( b ) == 0x7FFF )
        && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) );

    if ( aIsNaN || bIsNaN ) {
        /* Signaling comparison: any NaN raises invalid. */
        float_raise( float_flag_invalid STATUS_VAR);
        return 0;
    }

    aSign = extractFloat128Sign( a );
    bSign = extractFloat128Sign( b );
    if ( aSign != bSign ) {
        /* Opposite signs: true only when `a' is the negative one and
         * the operands are not both zeros. */
        if ( ! aSign ) {
            return 0;
        }
        return ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
                 != 0 );
    }
    /* Same sign: compare raw bits, with the operands swapped for negatives. */
    if ( aSign ) {
        return lt128( b.high, b.low, a.high, a.low );
    }
    return lt128( a.high, a.low, b.high, b.low );
}

6575

6576

/*----------------------------------------------------------------------------

6577

| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot

6578

| be compared, and 0 otherwise. The invalid exception is raised if either

6579

| operand is a NaN. The comparison is performed according to the IEC/IEEE

6580

| Standard for Binary Floating-Point Arithmetic.

6581

*----------------------------------------------------------------------------*/

6582

6583

int float128_unordered( float128 a, float128 b STATUS_PARAM )
{
    /* A NaN has the maximum exponent and a non-zero fraction. */
    int aIsNaN = ( extractFloat128Exp( a ) == 0x7FFF )
        && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) );
    int bIsNaN = ( extractFloat128Exp( b ) == 0x7FFF )
        && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) );

    if ( ! ( aIsNaN || bIsNaN ) ) {
        return 0;
    }
    /* Signaling variant: any NaN raises invalid. */
    float_raise( float_flag_invalid STATUS_VAR);
    return 1;
}

6595

6596

/*----------------------------------------------------------------------------

6597

| Returns 1 if the quadruple-precision floating-point value `a' is equal to

6598

| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an

6599

| exception. The comparison is performed according to the IEC/IEEE Standard

6600

| for Binary Floating-Point Arithmetic.

6601

*----------------------------------------------------------------------------*/

6602

6603

int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
{
    /* A NaN has the maximum exponent and a non-zero fraction. */
    int aIsNaN = ( extractFloat128Exp( a ) == 0x7FFF )
        && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) );
    int bIsNaN = ( extractFloat128Exp( b ) == 0x7FFF )
        && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) );

    if ( aIsNaN || bIsNaN ) {
        /* Quiet comparison: only a signaling NaN raises invalid. */
        if ( float128_is_signaling_nan( a )
             || float128_is_signaling_nan( b ) ) {
            float_raise( float_flag_invalid STATUS_VAR);
        }
        return 0;
    }
    if ( a.low != b.low ) {
        return 0;
    }
    if ( a.high == b.high ) {
        return 1;
    }
    /* Zeros that differ only in the sign bit still compare equal. */
    return ( a.low == 0 )
        && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 );
}

6625

6626

/*----------------------------------------------------------------------------

6627

| Returns 1 if the quadruple-precision floating-point value `a' is less than

6628

| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not

6629

| cause an exception. Otherwise, the comparison is performed according to the

6630

| IEC/IEEE Standard for Binary Floating-Point Arithmetic.

6631

*----------------------------------------------------------------------------*/

6632

6633

int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
{
    flag aSign, bSign;
    int aIsNaN = ( extractFloat128Exp( a ) == 0x7FFF )
        && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) );
    int bIsNaN = ( extractFloat128Exp( b ) == 0x7FFF )
        && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) );

    if ( aIsNaN || bIsNaN ) {
        /* Quiet comparison: only a signaling NaN raises invalid. */
        if ( float128_is_signaling_nan( a )
             || float128_is_signaling_nan( b ) ) {
            float_raise( float_flag_invalid STATUS_VAR);
        }
        return 0;
    }

    aSign = extractFloat128Sign( a );
    bSign = extractFloat128Sign( b );
    if ( aSign != bSign ) {
        /* Opposite signs: true when `a' is the negative one, or when
         * both operands are zeros. */
        if ( aSign ) {
            return 1;
        }
        return ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
                 == 0 );
    }
    /* Same sign: compare raw bits, with the operands swapped for negatives. */
    if ( aSign ) {
        return le128( b.high, b.low, a.high, a.low );
    }
    return le128( a.high, a.low, b.high, b.low );
}

6661

6662

/*----------------------------------------------------------------------------

6663

| Returns 1 if the quadruple-precision floating-point value `a' is less than

6664

| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an

6665

| exception. Otherwise, the comparison is performed according to the IEC/IEEE

6666

| Standard for Binary Floating-Point Arithmetic.

6667

*----------------------------------------------------------------------------*/

6668

6669

int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
{
    flag aSign, bSign;
    int aIsNaN = ( extractFloat128Exp( a ) == 0x7FFF )
        && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) );
    int bIsNaN = ( extractFloat128Exp( b ) == 0x7FFF )
        && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) );

    if ( aIsNaN || bIsNaN ) {
        /* Quiet comparison: only a signaling NaN raises invalid. */
        if ( float128_is_signaling_nan( a )
             || float128_is_signaling_nan( b ) ) {
            float_raise( float_flag_invalid STATUS_VAR);
        }
        return 0;
    }

    aSign = extractFloat128Sign( a );
    bSign = extractFloat128Sign( b );
    if ( aSign != bSign ) {
        /* Opposite signs: true only when `a' is the negative one and
         * the operands are not both zeros. */
        if ( ! aSign ) {
            return 0;
        }
        return ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
                 != 0 );
    }
    /* Same sign: compare raw bits, with the operands swapped for negatives. */
    if ( aSign ) {
        return lt128( b.high, b.low, a.high, a.low );
    }
    return lt128( a.high, a.low, b.high, b.low );
}

6697

6698

/*----------------------------------------------------------------------------

6699

| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot

6700

| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The

6701

| comparison is performed according to the IEC/IEEE Standard for Binary

6702

| Floating-Point Arithmetic.

6703

*----------------------------------------------------------------------------*/

6704

6705

int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
{
    /* A NaN has the maximum exponent and a non-zero fraction. */
    int aIsNaN = ( extractFloat128Exp( a ) == 0x7FFF )
        && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) );
    int bIsNaN = ( extractFloat128Exp( b ) == 0x7FFF )
        && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) );

    if ( ! ( aIsNaN || bIsNaN ) ) {
        return 0;
    }
    /* Quiet variant: only a signaling NaN raises invalid. */
    if ( float128_is_signaling_nan( a )
         || float128_is_signaling_nan( b ) ) {
        float_raise( float_flag_invalid STATUS_VAR);
    }
    return 1;
}

6720

6721

/* misc functions */

6722

/* Converts the 32-bit unsigned integer `a' to single precision by
 * delegating to int64_to_float32(). */
float32 uint32_to_float32(uint32_t a STATUS_PARAM)
{
    float32 result = int64_to_float32(a STATUS_VAR);

    return result;
}

6726

6727

/* Converts the 32-bit unsigned integer `a' to double precision by
 * delegating to int64_to_float64(). */
float64 uint32_to_float64(uint32_t a STATUS_PARAM)
{
    float64 result = int64_to_float64(a STATUS_VAR);

    return result;
}

6731

6732

/* Converts `a' to a 32-bit unsigned integer via float32_to_int64().
 * Results outside [0, 0xffffffff] saturate to the nearer bound; in that
 * case the exception flags are reset to their value on entry and only
 * the invalid flag is raised. */
uint32 float32_to_uint32( float32 a STATUS_PARAM )
{
    int old_exc_flags = get_float_exception_flags(status);
    int64_t v = float32_to_int64(a STATUS_VAR);
    uint32 res;

    if (v >= 0 && v <= 0xffffffff) {
        return v;
    }
    res = (v < 0) ? 0 : 0xffffffff;
    set_float_exception_flags(old_exc_flags, status);
    float_raise(float_flag_invalid STATUS_VAR);
    return res;
}

6750

6751

/* Same as float32_to_uint32() but built on
 * float32_to_int64_round_to_zero().  Out-of-range results saturate, the
 * exception flags are reset to their value on entry and invalid is
 * raised. */
uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
{
    int old_exc_flags = get_float_exception_flags(status);
    int64_t v = float32_to_int64_round_to_zero(a STATUS_VAR);
    uint32 res;

    if (v >= 0 && v <= 0xffffffff) {
        return v;
    }
    res = (v < 0) ? 0 : 0xffffffff;
    set_float_exception_flags(old_exc_flags, status);
    float_raise(float_flag_invalid STATUS_VAR);
    return res;
}

6769

6770

/* Converts `a' to a 16-bit signed integer via float32_to_int32().
 * Results outside [-0x8000, 0x7fff] saturate to the nearer bound; in
 * that case the exception flags are reset to their value on entry and
 * only the invalid flag is raised. */
int_fast16_t float32_to_int16(float32 a STATUS_PARAM)
{
    int old_exc_flags = get_float_exception_flags(status);
    int32_t v = float32_to_int32(a STATUS_VAR);
    int_fast16_t res;

    if (v >= -0x8000 && v <= 0x7fff) {
        return v;
    }
    res = (v < -0x8000) ? -0x8000 : 0x7fff;

    set_float_exception_flags(old_exc_flags, status);
    float_raise(float_flag_invalid STATUS_VAR);
    return res;
}

6789

6790

/* Converts `a' to a 16-bit unsigned integer via float32_to_int32().
 * Results outside [0, 0xffff] saturate to the nearer bound; in that case
 * the exception flags are reset to their value on entry and only the
 * invalid flag is raised. */
uint_fast16_t float32_to_uint16(float32 a STATUS_PARAM)
{
    int old_exc_flags = get_float_exception_flags(status);
    int32_t v = float32_to_int32(a STATUS_VAR);
    uint_fast16_t res;

    if (v >= 0 && v <= 0xffff) {
        return v;
    }
    res = (v < 0) ? 0 : 0xffff;

    set_float_exception_flags(old_exc_flags, status);
    float_raise(float_flag_invalid STATUS_VAR);
    return res;
}

6809

6810

/* Converts `a' to a 16-bit unsigned integer via
 * float32_to_int64_round_to_zero().  Results outside [0, 0xffff]
 * saturate, the exception flags are reset to their value on entry and
 * invalid is raised. */
uint_fast16_t float32_to_uint16_round_to_zero(float32 a STATUS_PARAM)
{
    int old_exc_flags = get_float_exception_flags(status);
    int64_t v = float32_to_int64_round_to_zero(a STATUS_VAR);
    uint_fast16_t res;

    if (v >= 0 && v <= 0xffff) {
        return v;
    }
    res = (v < 0) ? 0 : 0xffff;
    set_float_exception_flags(old_exc_flags, status);
    float_raise(float_flag_invalid STATUS_VAR);
    return res;
}

6828

6829

/* Converts `a' to a 32-bit unsigned integer via float64_to_uint64().
 * Results above 0xffffffff saturate; in that case the exception flags
 * are reset to their value on entry and only the invalid flag is
 * raised. */
uint32 float64_to_uint32( float64 a STATUS_PARAM )
{
    int old_exc_flags = get_float_exception_flags(status);
    uint64_t v = float64_to_uint64(a STATUS_VAR);
    uint32 res;

    if (v <= 0xffffffff) {
        return v;
    }
    res = 0xffffffff;
    set_float_exception_flags(old_exc_flags, status);
    float_raise(float_flag_invalid STATUS_VAR);
    return res;
}

6845

6846

/* Converts `a' to a 32-bit unsigned integer via
 * float64_to_uint64_round_to_zero().  Results above 0xffffffff saturate,
 * the exception flags are reset to their value on entry and invalid is
 * raised. */
uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
{
    int old_exc_flags = get_float_exception_flags(status);
    uint64_t v = float64_to_uint64_round_to_zero(a STATUS_VAR);
    uint32 res;

    if (v <= 0xffffffff) {
        return v;
    }
    res = 0xffffffff;
    set_float_exception_flags(old_exc_flags, status);
    float_raise(float_flag_invalid STATUS_VAR);
    return res;
}

6862

6863

/* Converts `a' to a 16-bit signed integer via float64_to_int32().
 * Results outside [-0x8000, 0x7fff] saturate to the nearer bound; in
 * that case the exception flags are reset to their value on entry and
 * only the invalid flag is raised. */
int_fast16_t float64_to_int16(float64 a STATUS_PARAM)
{
    int old_exc_flags = get_float_exception_flags(status);
    int64_t v = float64_to_int32(a STATUS_VAR);
    int_fast16_t res;

    if (v >= -0x8000 && v <= 0x7fff) {
        return v;
    }
    res = (v < -0x8000) ? -0x8000 : 0x7fff;

    set_float_exception_flags(old_exc_flags, status);
    float_raise(float_flag_invalid STATUS_VAR);
    return res;
}

6882

6883

/* Converts `a' to a 16-bit unsigned integer via float64_to_int32().
 * Results outside [0, 0xffff] saturate to the nearer bound; in that case
 * the exception flags are reset to their value on entry and only the
 * invalid flag is raised. */
uint_fast16_t float64_to_uint16(float64 a STATUS_PARAM)
{
    int old_exc_flags = get_float_exception_flags(status);
    int64_t v = float64_to_int32(a STATUS_VAR);
    uint_fast16_t res;

    if (v >= 0 && v <= 0xffff) {
        return v;
    }
    res = (v < 0) ? 0 : 0xffff;

    set_float_exception_flags(old_exc_flags, status);
    float_raise(float_flag_invalid STATUS_VAR);
    return res;
}

6902

6903

/* Converts `a' to a 16-bit unsigned integer via
 * float64_to_int64_round_to_zero().  Results outside [0, 0xffff]
 * saturate, the exception flags are reset to their value on entry and
 * invalid is raised. */
uint_fast16_t float64_to_uint16_round_to_zero(float64 a STATUS_PARAM)
{
    int old_exc_flags = get_float_exception_flags(status);
    int64_t v = float64_to_int64_round_to_zero(a STATUS_VAR);
    uint_fast16_t res;

    if (v >= 0 && v <= 0xffff) {
        return v;
    }
    res = (v < 0) ? 0 : 0xffff;
    set_float_exception_flags(old_exc_flags, status);
    float_raise(float_flag_invalid STATUS_VAR);
    return res;
}

6921

6922

/*----------------------------------------------------------------------------

6923

| Returns the result of converting the double-precision floating-point value

6924

| `a' to the 64-bit unsigned integer format. The conversion is

6925

| performed according to the IEC/IEEE Standard for Binary Floating-Point

6926

| Arithmetic---which means in particular that the conversion is rounded

6927

| according to the current rounding mode. If `a' is a NaN, the largest

6928

| positive integer is returned. If the conversion overflows, the

6929

| largest unsigned integer is returned. If 'a' is negative, the value is

6930

| rounded and zero is returned; negative values that do not round to zero

6931

| will raise the inexact exception.

6932

*----------------------------------------------------------------------------*/

6933

6934

uint64_t float64_to_uint64(float64 a STATUS_PARAM)
{
    flag aSign;
    int_fast16_t aExp, shiftCount;
    uint64_t aSig, aSigExtra;

    a = float64_squash_input_denormal(a STATUS_VAR);
    aSign = extractFloat64Sign(a);
    aExp = extractFloat64Exp(a);
    aSig = extractFloat64Frac(a);

    /* Sign bit set and biased exponent above 1022: raise invalid.
     * A NaN yields all ones, anything else yields zero. */
    if (aSign && (aExp > 1022)) {
        float_raise(float_flag_invalid STATUS_VAR);
        if (float64_is_any_nan(a)) {
            return LIT64(0xFFFFFFFFFFFFFFFF);
        }
        return 0;
    }

    /* Non-zero exponent: make the leading significand bit explicit. */
    if (aExp != 0) {
        aSig |= LIT64(0x0010000000000000);
    }

    shiftCount = 0x433 - aExp;
    if (shiftCount > 0) {
        /* Shift right, keeping the shifted-out bits for rounding. */
        shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
    } else {
        /* Exponent above 0x43E: raise invalid and saturate. */
        if (aExp > 0x43E) {
            float_raise(float_flag_invalid STATUS_VAR);
            return LIT64(0xFFFFFFFFFFFFFFFF);
        }
        aSigExtra = 0;
        aSig <<= -shiftCount;
    }
    return roundAndPackUint64(aSign, aSig, aSigExtra STATUS_VAR);
}

6968

6969

/* Converts `a' to a 64-bit unsigned integer with rounding forced to
 * round-to-zero: the current rounding mode is saved, float64_to_uint64()
 * is called under float_round_to_zero, and the saved mode is restored
 * before returning. */
uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
{
    signed char current_rounding_mode = STATUS(float_rounding_mode);
    /* Hold the result in the same unsigned type the callee returns.  A
     * signed temporary would put values >= 2^63 through an
     * implementation-defined unsigned-to-signed conversion. */
    uint64_t v;

    set_float_rounding_mode(float_round_to_zero STATUS_VAR);
    v = float64_to_uint64(a STATUS_VAR);
    set_float_rounding_mode(current_rounding_mode STATUS_VAR);
    return v;
}

6977

6978

#define COMPARE(s, nan_exp) \

6979

INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \

6980

int is_quiet STATUS_PARAM ) \

6981

{ \

6982

flag aSign, bSign; \

6983

uint ## s ## _t av, bv; \

6984

a = float ## s ## _squash_input_denormal(a STATUS_VAR); \

6985

b = float ## s ## _squash_input_denormal(b STATUS_VAR); \

6986

6987

if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \

6988

extractFloat ## s ## Frac( a ) ) || \

6989

( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \

6990

extractFloat ## s ## Frac( b ) )) { \

6991

if (!is_quiet || \

6992

float ## s ## _is_signaling_nan( a ) || \

6993

float ## s ## _is_signaling_nan( b ) ) { \

6994

float_raise( float_flag_invalid STATUS_VAR); \

6995

} \

6996

return float_relation_unordered; \

6997

} \

6998

aSign = extractFloat ## s ## Sign( a ); \

6999

bSign = extractFloat ## s ## Sign( b ); \

7000

av = float ## s ## _val(a); \

7001

bv = float ## s ## _val(b); \

7002

if ( aSign != bSign ) { \

7003

if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \

7004

/* zero case */ \

7005

return float_relation_equal; \

7006

} else { \

7007

return 1 - (2 * aSign); \

7008

} \

7009

} else { \

7010

if (av == bv) { \

7011

return float_relation_equal; \

7012

} else { \

7013

return 1 - 2 * (aSign ^ ( av < bv )); \

7014

} \

7015

} \

7016

} \

7017

7018

int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \

7019

{ \

7020

return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \

7021

} \

7022

7023

int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \

7024

{ \

7025

return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \

7026

}

7027

7028

COMPARE(32, 0xff)

7029

COMPARE(64, 0x7ff)

7030

7031

INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,

7032

int is_quiet STATUS_PARAM )

7033

{

7034

flag aSign, bSign;

7035

7036

if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&

7037

( extractFloatx80Frac( a )<<1 ) ) ||

7038

( ( extractFloatx80Exp( b ) == 0x7fff ) &&

7039

( extractFloatx80Frac( b )<<1 ) )) {

7040

if (!is_quiet ||

7041

floatx80_is_signaling_nan( a ) ||

7042

floatx80_is_signaling_nan( b ) ) {

7043

float_raise( float_flag_invalid STATUS_VAR);

7044

}

7045

return float_relation_unordered;

7046

}

7047

aSign = extractFloatx80Sign( a );

7048

bSign = extractFloatx80Sign( b );

7049

if ( aSign != bSign ) {

7050

7051

if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&

7052

( ( a.low | b.low ) == 0 ) ) {

7053

/* zero case */

7054

return float_relation_equal;

7055

} else {

7056

return 1 - (2 * aSign);

7057

}

7058

} else {

7059

if (a.low == b.low && a.high == b.high) {

7060

return float_relation_equal;

7061

} else {

7062

return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));

7063

}

7064

}

7065

}

7066

7067

/* Comparison with is_quiet == 0: any NaN operand raises invalid. */
int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
{
    int relation = floatx80_compare_internal(a, b, 0 STATUS_VAR);

    return relation;
}

7071

7072

/*
 * Compare two floatx80 values with is_quiet == 1, so float_flag_invalid
 * is raised only when an operand is a signaling NaN.
 */
int floatx80_compare_quiet(floatx80 a, floatx80 b STATUS_PARAM)
{
    int relation;

    relation = floatx80_compare_internal(a, b, 1 STATUS_VAR);
    return relation;
}

7076

7077

INLINE int float128_compare_internal( float128 a, float128 b,

7078

int is_quiet STATUS_PARAM )

7079

{

7080

flag aSign, bSign;

7081

7082

if (( ( extractFloat128Exp( a ) == 0x7fff ) &&

7083

( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||

7084

( ( extractFloat128Exp( b ) == 0x7fff ) &&

7085

( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {

7086

if (!is_quiet ||

7087

float128_is_signaling_nan( a ) ||

7088

float128_is_signaling_nan( b ) ) {

7089

float_raise( float_flag_invalid STATUS_VAR);

7090

}

7091

return float_relation_unordered;

7092

}

7093

aSign = extractFloat128Sign( a );

7094

bSign = extractFloat128Sign( b );

7095

if ( aSign != bSign ) {

7096

if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {

7097

/* zero case */

7098

return float_relation_equal;

7099

} else {

7100

return 1 - (2 * aSign);

7101

}

7102

} else {

7103

if (a.low == b.low && a.high == b.high) {

7104

return float_relation_equal;

7105

} else {

7106

return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));

7107

}

7108

}

7109

}

7110

7111

/*
 * Compare two float128 values with is_quiet == 0, so float_flag_invalid
 * is raised whenever either operand is a NaN.
 */
int float128_compare(float128 a, float128 b STATUS_PARAM)
{
    int relation;

    relation = float128_compare_internal(a, b, 0 STATUS_VAR);
    return relation;
}

7115

7116

/*
 * Compare two float128 values with is_quiet == 1, so float_flag_invalid
 * is raised only when an operand is a signaling NaN.
 */
int float128_compare_quiet(float128 a, float128 b STATUS_PARAM)
{
    int relation;

    relation = float128_compare_internal(a, b, 1 STATUS_VAR);
    return relation;
}

7120

7121

/* min() and max() functions. These can't be implemented as
 * 'compare and pick one input' because that would mishandle
 * NaNs and +0 vs -0.
 *
 * minnum() and maxnum() functions. These are similar to the min()
 * and max() functions but if one of the arguments is a QNaN and
 * the other is numerical then the numerical argument is returned.
 * minnum() and maxnum() correspond to the IEEE 754-2008 minNum()
 * and maxNum() operations. min() and max() are the typical min/max
 * semantics provided by many CPUs which predate that specification.
 *
 * MINMAX(s, nan_exp) generates, for the s-bit format:
 *   float<s>_minmax()  - shared worker; ismin selects min (non-zero) or
 *                        max (zero), isieee selects the minNum/maxNum
 *                        treatment of a single quiet NaN operand
 *   float<s>_min(), float<s>_max()       - worker with isieee == 0
 *   float<s>_minnum(), float<s>_maxnum() - worker with isieee == 1
 * The nan_exp argument is accepted but not referenced by the expansion.
 */
#define MINMAX(s, nan_exp)                                              \
INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
                                        int ismin, int isieee STATUS_PARAM) \
{                                                                       \
    flag aSign, bSign;                                                  \
    uint ## s ## _t av, bv;                                             \
    a = float ## s ## _squash_input_denormal(a STATUS_VAR);             \
    b = float ## s ## _squash_input_denormal(b STATUS_VAR);             \
    if (float ## s ## _is_any_nan(a) ||                                 \
        float ## s ## _is_any_nan(b)) {                                 \
        if (isieee) {                                                   \
            /* exactly one quiet NaN: return the other operand */       \
            if (float ## s ## _is_quiet_nan(a) &&                       \
                !float ## s ## _is_any_nan(b)) {                        \
                return b;                                               \
            } else if (float ## s ## _is_quiet_nan(b) &&                \
                       !float ## s ## _is_any_nan(a)) {                 \
                return a;                                               \
            }                                                           \
        }                                                               \
        return propagateFloat ## s ## NaN(a, b STATUS_VAR);             \
    }                                                                   \
    aSign = extractFloat ## s ## Sign(a);                               \
    bSign = extractFloat ## s ## Sign(b);                               \
    av = float ## s ## _val(a);                                         \
    bv = float ## s ## _val(b);                                         \
    if (aSign != bSign) {                                               \
        /* signs differ: the operand with the sign set is the min */    \
        if (ismin) {                                                    \
            return aSign ? a : b;                                       \
        } else {                                                        \
            return aSign ? b : a;                                       \
        }                                                               \
    } else {                                                            \
        /* same sign: order by raw value, flipped when sign is set */   \
        if (ismin) {                                                    \
            return (aSign ^ (av < bv)) ? a : b;                         \
        } else {                                                        \
            return (aSign ^ (av < bv)) ? b : a;                         \
        }                                                               \
    }                                                                   \
}                                                                       \
                                                                        \
float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM)  \
{                                                                       \
    return float ## s ## _minmax(a, b, 1, 0 STATUS_VAR);                \
}                                                                       \
                                                                        \
float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM)  \
{                                                                       \
    return float ## s ## _minmax(a, b, 0, 0 STATUS_VAR);                \
}                                                                       \
                                                                        \
float ## s float ## s ## _minnum(float ## s a, float ## s b STATUS_PARAM) \
{                                                                       \
    return float ## s ## _minmax(a, b, 1, 1 STATUS_VAR);                \
}                                                                       \
                                                                        \
float ## s float ## s ## _maxnum(float ## s a, float ## s b STATUS_PARAM) \
{                                                                       \
    return float ## s ## _minmax(a, b, 0, 1 STATUS_VAR);                \
}

7191

7192

/* Generate float32_minmax() plus float32_min/_max/_minnum/_maxnum().
 * The second argument is not referenced by the MINMAX expansion. */
MINMAX(32, 0xff)

7193

/* Generate float64_minmax() plus float64_min/_max/_minnum/_maxnum().
 * The second argument is not referenced by the MINMAX expansion. */
MINMAX(64, 0x7ff)

7194

7195

7196

/* Multiply A by 2 raised to the power N. */

7197

/*
 * Scale the float32 value a by 2**n.
 *
 * a is first passed through float32_squash_input_denormal().  An operand
 * with exponent field 0xFF is returned through propagateFloat32NaN() when
 * its fraction is non-zero and unchanged otherwise; a zero operand is
 * returned unchanged.  n is clamped to [-0x200, 0x200] before use.
 */
float32 float32_scalbn(float32 a, int n STATUS_PARAM)
{
    flag sign;
    int16_t expo;
    uint32_t frac;

    a = float32_squash_input_denormal(a STATUS_VAR);
    frac = extractFloat32Frac(a);
    expo = extractFloat32Exp(a);
    sign = extractFloat32Sign(a);

    if (expo == 0xFF) {
        return frac ? propagateFloat32NaN(a, a STATUS_VAR) : a;
    }

    if (expo == 0) {
        if (frac == 0) {
            return a;
        }
        /* zero exponent field with non-zero fraction: bump the exponent
         * instead of setting bit 23 */
        expo++;
    } else {
        frac |= 0x00800000;
    }

    /* Clamp n so the narrow exponent variable cannot wrap. */
    if (n < -0x200) {
        n = -0x200;
    } else if (n > 0x200) {
        n = 0x200;
    }

    /* The -1 and the 7-bit shift presumably match the significand layout
     * that normalizeRoundAndPackFloat32() expects -- confirm there. */
    expo += n - 1;
    frac <<= 7;
    return normalizeRoundAndPackFloat32(sign, expo, frac STATUS_VAR);
}

7232

7233

/*
 * Scale the float64 value a by 2**n.
 *
 * a is first passed through float64_squash_input_denormal().  An operand
 * with exponent field 0x7FF is returned through propagateFloat64NaN() when
 * its fraction is non-zero and unchanged otherwise; a zero operand is
 * returned unchanged.  n is clamped to [-0x1000, 0x1000] before use.
 */
float64 float64_scalbn(float64 a, int n STATUS_PARAM)
{
    flag sign;
    int16_t expo;
    uint64_t frac;

    a = float64_squash_input_denormal(a STATUS_VAR);
    frac = extractFloat64Frac(a);
    expo = extractFloat64Exp(a);
    sign = extractFloat64Sign(a);

    if (expo == 0x7FF) {
        return frac ? propagateFloat64NaN(a, a STATUS_VAR) : a;
    }

    if (expo == 0) {
        if (frac == 0) {
            return a;
        }
        /* zero exponent field with non-zero fraction: bump the exponent
         * instead of setting bit 52 */
        expo++;
    } else {
        frac |= LIT64(0x0010000000000000);
    }

    /* Clamp n so the narrow exponent variable cannot wrap. */
    if (n < -0x1000) {
        n = -0x1000;
    } else if (n > 0x1000) {
        n = 0x1000;
    }

    /* The -1 and the 10-bit shift presumably match the significand layout
     * that normalizeRoundAndPackFloat64() expects -- confirm there. */
    expo += n - 1;
    frac <<= 10;
    return normalizeRoundAndPackFloat64(sign, expo, frac STATUS_VAR);
}

7268

7269

/*
 * Scale the floatx80 value a by 2**n.
 *
 * An operand with exponent field 0x7FFF is returned through
 * propagateFloatx80NaN() when any significand bit other than the topmost
 * is set and unchanged otherwise; a zero operand is returned unchanged.
 * n is clamped to [-0x10000, 0x10000] before use.  Rounding follows the
 * status field floatx80_rounding_precision.
 */
floatx80 floatx80_scalbn(floatx80 a, int n STATUS_PARAM)
{
    uint64_t frac = extractFloatx80Frac(a);
    int32_t expo = extractFloatx80Exp(a);
    flag sign = extractFloatx80Sign(a);

    if (expo == 0x7FFF) {
        return (frac << 1) ? propagateFloatx80NaN(a, a STATUS_VAR) : a;
    }

    if (expo == 0) {
        if (frac == 0) {
            return a;
        }
        /* zero exponent field with non-zero significand */
        expo++;
    }

    if (n < -0x10000) {
        n = -0x10000;
    } else if (n > 0x10000) {
        n = 0x10000;
    }

    expo += n;
    return normalizeRoundAndPackFloatx80(STATUS(floatx80_rounding_precision),
                                         sign, expo, frac, 0 STATUS_VAR);
}

7303

7304

/*
 * Scale the float128 value a by 2**n.
 *
 * An operand with exponent field 0x7FFF is returned through
 * propagateFloat128NaN() when either fraction word is non-zero and
 * unchanged otherwise; a zero operand is returned unchanged.
 * n is clamped to [-0x10000, 0x10000] before use.
 */
float128 float128_scalbn(float128 a, int n STATUS_PARAM)
{
    uint64_t fracLo = extractFloat128Frac1(a);
    uint64_t fracHi = extractFloat128Frac0(a);
    int32_t expo = extractFloat128Exp(a);
    flag sign = extractFloat128Sign(a);

    if (expo == 0x7FFF) {
        return (fracHi | fracLo) ? propagateFloat128NaN(a, a STATUS_VAR) : a;
    }

    if (expo == 0) {
        if ((fracHi | fracLo) == 0) {
            return a;
        }
        /* zero exponent field with non-zero fraction: bump the exponent
         * instead of setting bit 48 of the high word */
        expo++;
    } else {
        fracHi |= LIT64(0x0001000000000000);
    }

    if (n < -0x10000) {
        n = -0x10000;
    } else if (n > 0x10000) {
        n = 0x10000;
    }

    /* The -1 presumably matches the exponent convention that
     * normalizeRoundAndPackFloat128() expects -- confirm there. */
    expo += n - 1;
    return normalizeRoundAndPackFloat128(sign, expo, fracHi, fracLo
                                         STATUS_VAR);
}

Older »