~ubuntu-branches/ubuntu/vivid/qemu/vivid

Viewing changes to .pc/ubuntu/arm64/0093-softfloat-Add-float32_to_uint64.patch/fpu/softfloat.c

Committer: Package Import Robot
Author(s): dann frazier
Date: 2014-02-11 15:41:53 UTC
Revision ID: package-import@ubuntu.com-20140211154153-2d001tf0ium08u81

Tags: 1.7.0+dfsg-3ubuntu2

* Backport changes to enable qemu-user-static support for aarch64
* debian/control: add ppc64el to Architectures
* debian/rules: only install qemu-system-aarch64 on arm64.
Fixes an FTBFS (failure to build from source) when built twice in a row on non-arm64, caused by a stale
debian/qemu-system-aarch64 directory.

files added:
.pc/ubuntu/arm64/0050-target-arm-fix-build-with-gcc-4.8.2.patch

.pc/ubuntu/arm64/0050-target-arm-fix-build-with-gcc-4.8.2.patch/target-arm

.pc/ubuntu/arm64/0050-target-arm-fix-build-with-gcc-4.8.2.patch/target-arm/cpu64.c

.pc/ubuntu/arm64/0051-target-arm-A64-add-support-for-ld-st-pair.patch

.pc/ubuntu/arm64/0051-target-arm-A64-add-support-for-ld-st-pair.patch/target-arm

.pc/ubuntu/arm64/0051-target-arm-A64-add-support-for-ld-st-pair.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0052-target-arm-A64-add-support-for-ld-st-unsigned-imm.patch

.pc/ubuntu/arm64/0052-target-arm-A64-add-support-for-ld-st-unsigned-imm.patch/target-arm

.pc/ubuntu/arm64/0052-target-arm-A64-add-support-for-ld-st-unsigned-imm.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0053-target-arm-A64-add-support-for-ld-st-with-reg-offset.patch

.pc/ubuntu/arm64/0053-target-arm-A64-add-support-for-ld-st-with-reg-offset.patch/target-arm

.pc/ubuntu/arm64/0053-target-arm-A64-add-support-for-ld-st-with-reg-offset.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0054-target-arm-A64-add-support-for-ld-st-with-index.patch

.pc/ubuntu/arm64/0054-target-arm-A64-add-support-for-ld-st-with-index.patch/target-arm

.pc/ubuntu/arm64/0054-target-arm-A64-add-support-for-ld-st-with-index.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0055-target-arm-A64-add-support-for-add-addi-sub-subi.patch

.pc/ubuntu/arm64/0055-target-arm-A64-add-support-for-add-addi-sub-subi.patch/target-arm

.pc/ubuntu/arm64/0055-target-arm-A64-add-support-for-add-addi-sub-subi.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0056-target-arm-A64-add-support-for-move-wide-instruction.patch

.pc/ubuntu/arm64/0056-target-arm-A64-add-support-for-move-wide-instruction.patch/target-arm

.pc/ubuntu/arm64/0056-target-arm-A64-add-support-for-move-wide-instruction.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0057-target-arm-A64-add-support-for-3-src-data-proc-insns.patch

.pc/ubuntu/arm64/0057-target-arm-A64-add-support-for-3-src-data-proc-insns.patch/target-arm

.pc/ubuntu/arm64/0057-target-arm-A64-add-support-for-3-src-data-proc-insns.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0058-target-arm-A64-implement-SVC-BRK.patch

.pc/ubuntu/arm64/0058-target-arm-A64-implement-SVC-BRK.patch/target-arm

.pc/ubuntu/arm64/0058-target-arm-A64-implement-SVC-BRK.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0059-target-arm-A64-Add-decoder-skeleton-for-FP-instructi.patch

.pc/ubuntu/arm64/0059-target-arm-A64-Add-decoder-skeleton-for-FP-instructi.patch/target-arm

.pc/ubuntu/arm64/0059-target-arm-A64-Add-decoder-skeleton-for-FP-instructi.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0060-target-arm-A64-implement-FMOV.patch

.pc/ubuntu/arm64/0060-target-arm-A64-implement-FMOV.patch/target-arm

.pc/ubuntu/arm64/0060-target-arm-A64-implement-FMOV.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0061-target-arm-Pull-add-one-cpreg-to-hashtable-into-its-.patch

.pc/ubuntu/arm64/0061-target-arm-Pull-add-one-cpreg-to-hashtable-into-its-.patch/target-arm

.pc/ubuntu/arm64/0061-target-arm-Pull-add-one-cpreg-to-hashtable-into-its-.patch/target-arm/helper.c

.pc/ubuntu/arm64/0062-target-arm-Update-generic-cpreg-code-for-AArch64.patch

.pc/ubuntu/arm64/0062-target-arm-Update-generic-cpreg-code-for-AArch64.patch/target-arm

.pc/ubuntu/arm64/0062-target-arm-Update-generic-cpreg-code-for-AArch64.patch/target-arm/cpu.h

.pc/ubuntu/arm64/0062-target-arm-Update-generic-cpreg-code-for-AArch64.patch/target-arm/helper.c

.pc/ubuntu/arm64/0062-target-arm-Update-generic-cpreg-code-for-AArch64.patch/target-arm/kvm-consts.h

.pc/ubuntu/arm64/0063-target-arm-Remove-ARMCPU-CPUARMState-from-cpregs-API.patch

.pc/ubuntu/arm64/0063-target-arm-Remove-ARMCPU-CPUARMState-from-cpregs-API.patch/target-arm

.pc/ubuntu/arm64/0063-target-arm-Remove-ARMCPU-CPUARMState-from-cpregs-API.patch/target-arm/cpu.h

.pc/ubuntu/arm64/0063-target-arm-Remove-ARMCPU-CPUARMState-from-cpregs-API.patch/target-arm/helper.c

.pc/ubuntu/arm64/0063-target-arm-Remove-ARMCPU-CPUARMState-from-cpregs-API.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0063-target-arm-Remove-ARMCPU-CPUARMState-from-cpregs-API.patch/target-arm/translate.c

.pc/ubuntu/arm64/0063-target-arm-Remove-ARMCPU-CPUARMState-from-cpregs-API.patch/target-arm/translate.h

.pc/ubuntu/arm64/0064-target-arm-A64-Implement-MRS-MSR-SYS-SYSL.patch

.pc/ubuntu/arm64/0064-target-arm-A64-Implement-MRS-MSR-SYS-SYSL.patch/target-arm

.pc/ubuntu/arm64/0064-target-arm-A64-Implement-MRS-MSR-SYS-SYSL.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0065-target-arm-A64-Implement-minimal-set-of-EL0-visible-.patch

.pc/ubuntu/arm64/0065-target-arm-A64-Implement-minimal-set-of-EL0-visible-.patch/target-arm

.pc/ubuntu/arm64/0065-target-arm-A64-Implement-minimal-set-of-EL0-visible-.patch/target-arm/cpu.h

.pc/ubuntu/arm64/0065-target-arm-A64-Implement-minimal-set-of-EL0-visible-.patch/target-arm/helper.c

.pc/ubuntu/arm64/0065-target-arm-A64-Implement-minimal-set-of-EL0-visible-.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0066-target-arm-Widen-thread-local-register-state-fields-.patch

.pc/ubuntu/arm64/0066-target-arm-Widen-thread-local-register-state-fields-.patch/linux-user

.pc/ubuntu/arm64/0066-target-arm-Widen-thread-local-register-state-fields-.patch/linux-user/aarch64

.pc/ubuntu/arm64/0066-target-arm-Widen-thread-local-register-state-fields-.patch/linux-user/aarch64/target_cpu.h

.pc/ubuntu/arm64/0066-target-arm-Widen-thread-local-register-state-fields-.patch/linux-user/arm

.pc/ubuntu/arm64/0066-target-arm-Widen-thread-local-register-state-fields-.patch/linux-user/arm/target_cpu.h

.pc/ubuntu/arm64/0066-target-arm-Widen-thread-local-register-state-fields-.patch/linux-user/main.c

.pc/ubuntu/arm64/0066-target-arm-Widen-thread-local-register-state-fields-.patch/target-arm

.pc/ubuntu/arm64/0066-target-arm-Widen-thread-local-register-state-fields-.patch/target-arm/cpu.h

.pc/ubuntu/arm64/0066-target-arm-Widen-thread-local-register-state-fields-.patch/target-arm/helper.c

.pc/ubuntu/arm64/0067-target-arm-A64-add-support-for-add-sub-with-carry.patch

.pc/ubuntu/arm64/0067-target-arm-A64-add-support-for-add-sub-with-carry.patch/target-arm

.pc/ubuntu/arm64/0067-target-arm-A64-add-support-for-add-sub-with-carry.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0068-target-arm-A64-add-support-for-conditional-compare-i.patch

.pc/ubuntu/arm64/0068-target-arm-A64-add-support-for-conditional-compare-i.patch/target-arm

.pc/ubuntu/arm64/0068-target-arm-A64-add-support-for-conditional-compare-i.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0069-target-arm-aarch64-add-support-for-ld-lit.patch

.pc/ubuntu/arm64/0069-target-arm-aarch64-add-support-for-ld-lit.patch/target-arm

.pc/ubuntu/arm64/0069-target-arm-aarch64-add-support-for-ld-lit.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0070-target-arm-Widen-exclusive-access-support-struct-fie.patch

.pc/ubuntu/arm64/0070-target-arm-Widen-exclusive-access-support-struct-fie.patch/linux-user

.pc/ubuntu/arm64/0070-target-arm-Widen-exclusive-access-support-struct-fie.patch/linux-user/main.c

.pc/ubuntu/arm64/0070-target-arm-Widen-exclusive-access-support-struct-fie.patch/target-arm

.pc/ubuntu/arm64/0070-target-arm-Widen-exclusive-access-support-struct-fie.patch/target-arm/cpu.h

.pc/ubuntu/arm64/0070-target-arm-Widen-exclusive-access-support-struct-fie.patch/target-arm/machine.c

.pc/ubuntu/arm64/0070-target-arm-Widen-exclusive-access-support-struct-fie.patch/target-arm/translate.c

.pc/ubuntu/arm64/0071-target-arm-A64-support-for-ld-st-cl-exclusive.patch

.pc/ubuntu/arm64/0071-target-arm-A64-support-for-ld-st-cl-exclusive.patch/linux-user

.pc/ubuntu/arm64/0071-target-arm-A64-support-for-ld-st-cl-exclusive.patch/linux-user/main.c

.pc/ubuntu/arm64/0071-target-arm-A64-support-for-ld-st-cl-exclusive.patch/target-arm

.pc/ubuntu/arm64/0071-target-arm-A64-support-for-ld-st-cl-exclusive.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0072-linux-user-AArch64-define-TARGET_CLONE_BACKWARDS.patch

.pc/ubuntu/arm64/0072-linux-user-AArch64-define-TARGET_CLONE_BACKWARDS.patch/linux-user

.pc/ubuntu/arm64/0072-linux-user-AArch64-define-TARGET_CLONE_BACKWARDS.patch/linux-user/aarch64

.pc/ubuntu/arm64/0072-linux-user-AArch64-define-TARGET_CLONE_BACKWARDS.patch/linux-user/aarch64/syscall.h

.pc/ubuntu/arm64/0073-linux-user-AArch64-Use-correct-values-for-FPSR-FPCR-.patch

.pc/ubuntu/arm64/0073-linux-user-AArch64-Use-correct-values-for-FPSR-FPCR-.patch/linux-user

.pc/ubuntu/arm64/0073-linux-user-AArch64-Use-correct-values-for-FPSR-FPCR-.patch/linux-user/signal.c

.pc/ubuntu/arm64/0074-.travis.yml-Add-aarch64-targets.patch

.pc/ubuntu/arm64/0074-.travis.yml-Add-aarch64-targets.patch/.travis.yml

.pc/ubuntu/arm64/0075-default-configs-Add-config-for-aarch64-linux-user.patch

.pc/ubuntu/arm64/0075-default-configs-Add-config-for-aarch64-linux-user.patch/default-configs

.pc/ubuntu/arm64/0075-default-configs-Add-config-for-aarch64-linux-user.patch/default-configs/aarch64-linux-user.mak

.pc/ubuntu/arm64/0076-target-arm-A64-Add-support-for-dumping-AArch64-VFP-r.patch

.pc/ubuntu/arm64/0076-target-arm-A64-Add-support-for-dumping-AArch64-VFP-r.patch/target-arm

.pc/ubuntu/arm64/0076-target-arm-A64-Add-support-for-dumping-AArch64-VFP-r.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0077-target-arm-A64-Fix-vector-register-access-on-bigendi.patch

.pc/ubuntu/arm64/0077-target-arm-A64-Fix-vector-register-access-on-bigendi.patch/target-arm

.pc/ubuntu/arm64/0077-target-arm-A64-Fix-vector-register-access-on-bigendi.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0078-target-arm-Use-VFP_BINOP-macro-for-min-max-minnum-ma.patch

.pc/ubuntu/arm64/0078-target-arm-Use-VFP_BINOP-macro-for-min-max-minnum-ma.patch/target-arm

.pc/ubuntu/arm64/0078-target-arm-Use-VFP_BINOP-macro-for-min-max-minnum-ma.patch/target-arm/helper.c

.pc/ubuntu/arm64/0078-target-arm-Use-VFP_BINOP-macro-for-min-max-minnum-ma.patch/target-arm/helper.h

.pc/ubuntu/arm64/0078-target-arm-Use-VFP_BINOP-macro-for-min-max-minnum-ma.patch/target-arm/neon_helper.c

.pc/ubuntu/arm64/0078-target-arm-Use-VFP_BINOP-macro-for-min-max-minnum-ma.patch/target-arm/translate.c

.pc/ubuntu/arm64/0079-target-arm-A64-Add-Floating-point-data-processing-2-.patch

.pc/ubuntu/arm64/0079-target-arm-A64-Add-Floating-point-data-processing-2-.patch/target-arm

.pc/ubuntu/arm64/0079-target-arm-A64-Add-Floating-point-data-processing-2-.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0080-target-arm-A64-Add-Floating-point-data-processing-3-.patch

.pc/ubuntu/arm64/0080-target-arm-A64-Add-Floating-point-data-processing-3-.patch/target-arm

.pc/ubuntu/arm64/0080-target-arm-A64-Add-Floating-point-data-processing-3-.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0081-target-arm-A64-Add-fmov-scalar-immediate-instruction.patch

.pc/ubuntu/arm64/0081-target-arm-A64-Add-fmov-scalar-immediate-instruction.patch/target-arm

.pc/ubuntu/arm64/0081-target-arm-A64-Add-fmov-scalar-immediate-instruction.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0082-target-arm-A64-Add-support-for-floating-point-compar.patch

.pc/ubuntu/arm64/0082-target-arm-A64-Add-support-for-floating-point-compar.patch/target-arm

.pc/ubuntu/arm64/0082-target-arm-A64-Add-support-for-floating-point-compar.patch/target-arm/helper-a64.c

.pc/ubuntu/arm64/0082-target-arm-A64-Add-support-for-floating-point-compar.patch/target-arm/helper-a64.h

.pc/ubuntu/arm64/0082-target-arm-A64-Add-support-for-floating-point-compar.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0083-target-arm-A64-Add-support-for-floating-point-condit.patch

.pc/ubuntu/arm64/0083-target-arm-A64-Add-support-for-floating-point-condit.patch/target-arm

.pc/ubuntu/arm64/0083-target-arm-A64-Add-support-for-floating-point-condit.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0084-target-arm-A64-Add-support-for-floating-point-cond-s.patch

.pc/ubuntu/arm64/0084-target-arm-A64-Add-support-for-floating-point-cond-s.patch/target-arm

.pc/ubuntu/arm64/0084-target-arm-A64-Add-support-for-floating-point-cond-s.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0085-target-arm-Give-the-FPSCR-rounding-modes-names.patch

.pc/ubuntu/arm64/0085-target-arm-Give-the-FPSCR-rounding-modes-names.patch/target-arm

.pc/ubuntu/arm64/0085-target-arm-Give-the-FPSCR-rounding-modes-names.patch/target-arm/cpu.h

.pc/ubuntu/arm64/0085-target-arm-Give-the-FPSCR-rounding-modes-names.patch/target-arm/helper.c

.pc/ubuntu/arm64/0086-softfloat-Fix-exception-flag-handling-for-float32_to.patch

.pc/ubuntu/arm64/0086-softfloat-Fix-exception-flag-handling-for-float32_to.patch/fpu

.pc/ubuntu/arm64/0086-softfloat-Fix-exception-flag-handling-for-float32_to.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0087-softfloat-Add-float-to-16bit-integer-conversions.patch

.pc/ubuntu/arm64/0087-softfloat-Add-float-to-16bit-integer-conversions.patch/fpu

.pc/ubuntu/arm64/0087-softfloat-Add-float-to-16bit-integer-conversions.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0087-softfloat-Add-float-to-16bit-integer-conversions.patch/include

.pc/ubuntu/arm64/0087-softfloat-Add-float-to-16bit-integer-conversions.patch/include/fpu

.pc/ubuntu/arm64/0087-softfloat-Add-float-to-16bit-integer-conversions.patch/include/fpu/softfloat.h

.pc/ubuntu/arm64/0088-softfloat-Add-16-bit-integer-to-float-conversions.patch

.pc/ubuntu/arm64/0088-softfloat-Add-16-bit-integer-to-float-conversions.patch/include

.pc/ubuntu/arm64/0088-softfloat-Add-16-bit-integer-to-float-conversions.patch/include/fpu

.pc/ubuntu/arm64/0088-softfloat-Add-16-bit-integer-to-float-conversions.patch/include/fpu/softfloat.h

.pc/ubuntu/arm64/0089-softfloat-Make-the-int-to-float-functions-take-exact.patch

.pc/ubuntu/arm64/0089-softfloat-Make-the-int-to-float-functions-take-exact.patch/fpu

.pc/ubuntu/arm64/0089-softfloat-Make-the-int-to-float-functions-take-exact.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0089-softfloat-Make-the-int-to-float-functions-take-exact.patch/include

.pc/ubuntu/arm64/0089-softfloat-Make-the-int-to-float-functions-take-exact.patch/include/fpu

.pc/ubuntu/arm64/0089-softfloat-Make-the-int-to-float-functions-take-exact.patch/include/fpu/softfloat.h

.pc/ubuntu/arm64/0090-softfloat-Fix-float64_to_uint64.patch

.pc/ubuntu/arm64/0090-softfloat-Fix-float64_to_uint64.patch/fpu

.pc/ubuntu/arm64/0090-softfloat-Fix-float64_to_uint64.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0091-softfloat-Only-raise-Invalid-when-conversions-to-int.patch

.pc/ubuntu/arm64/0091-softfloat-Only-raise-Invalid-when-conversions-to-int.patch/fpu

.pc/ubuntu/arm64/0091-softfloat-Only-raise-Invalid-when-conversions-to-int.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0092-softfloat-Fix-factor-2-error-for-scalbn-on-denormal-.patch

.pc/ubuntu/arm64/0092-softfloat-Fix-factor-2-error-for-scalbn-on-denormal-.patch/fpu

.pc/ubuntu/arm64/0092-softfloat-Fix-factor-2-error-for-scalbn-on-denormal-.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0093-softfloat-Add-float32_to_uint64.patch

.pc/ubuntu/arm64/0093-softfloat-Add-float32_to_uint64.patch/fpu

.pc/ubuntu/arm64/0093-softfloat-Add-float32_to_uint64.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0093-softfloat-Add-float32_to_uint64.patch/include

.pc/ubuntu/arm64/0093-softfloat-Add-float32_to_uint64.patch/include/fpu

.pc/ubuntu/arm64/0093-softfloat-Add-float32_to_uint64.patch/include/fpu/softfloat.h

.pc/ubuntu/arm64/0094-softfloat-Fix-float64_to_uint64_round_to_zero.patch

.pc/ubuntu/arm64/0094-softfloat-Fix-float64_to_uint64_round_to_zero.patch/fpu

.pc/ubuntu/arm64/0094-softfloat-Fix-float64_to_uint64_round_to_zero.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0095-softfloat-Fix-float64_to_uint32.patch

.pc/ubuntu/arm64/0095-softfloat-Fix-float64_to_uint32.patch/fpu

.pc/ubuntu/arm64/0095-softfloat-Fix-float64_to_uint32.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0096-softfloat-Fix-float64_to_uint32_round_to_zero.patch

.pc/ubuntu/arm64/0096-softfloat-Fix-float64_to_uint32_round_to_zero.patch/fpu

.pc/ubuntu/arm64/0096-softfloat-Fix-float64_to_uint32_round_to_zero.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0097-softfloat-Provide-complete-set-of-accessors-for-fp-s.patch

.pc/ubuntu/arm64/0097-softfloat-Provide-complete-set-of-accessors-for-fp-s.patch/fpu

.pc/ubuntu/arm64/0097-softfloat-Provide-complete-set-of-accessors-for-fp-s.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0097-softfloat-Provide-complete-set-of-accessors-for-fp-s.patch/include

.pc/ubuntu/arm64/0097-softfloat-Provide-complete-set-of-accessors-for-fp-s.patch/include/fpu

.pc/ubuntu/arm64/0097-softfloat-Provide-complete-set-of-accessors-for-fp-s.patch/include/fpu/softfloat.h

.pc/ubuntu/arm64/0098-softfloat-Factor-out-RoundAndPackFloat16-and-Normali.patch

.pc/ubuntu/arm64/0098-softfloat-Factor-out-RoundAndPackFloat16-and-Normali.patch/fpu

.pc/ubuntu/arm64/0098-softfloat-Factor-out-RoundAndPackFloat16-and-Normali.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0099-softfloat-Add-float16-float64-conversion-functions.patch

.pc/ubuntu/arm64/0099-softfloat-Add-float16-float64-conversion-functions.patch/fpu

.pc/ubuntu/arm64/0099-softfloat-Add-float16-float64-conversion-functions.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0099-softfloat-Add-float16-float64-conversion-functions.patch/include

.pc/ubuntu/arm64/0099-softfloat-Add-float16-float64-conversion-functions.patch/include/fpu

.pc/ubuntu/arm64/0099-softfloat-Add-float16-float64-conversion-functions.patch/include/fpu/softfloat.h

.pc/ubuntu/arm64/0100-softfloat-Refactor-code-handling-various-rounding-mo.patch

.pc/ubuntu/arm64/0100-softfloat-Refactor-code-handling-various-rounding-mo.patch/fpu

.pc/ubuntu/arm64/0100-softfloat-Refactor-code-handling-various-rounding-mo.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0101-softfloat-Add-support-for-ties-away-rounding.patch

.pc/ubuntu/arm64/0101-softfloat-Add-support-for-ties-away-rounding.patch/fpu

.pc/ubuntu/arm64/0101-softfloat-Add-support-for-ties-away-rounding.patch/fpu/softfloat.c

.pc/ubuntu/arm64/0101-softfloat-Add-support-for-ties-away-rounding.patch/include

.pc/ubuntu/arm64/0101-softfloat-Add-support-for-ties-away-rounding.patch/include/fpu

.pc/ubuntu/arm64/0101-softfloat-Add-support-for-ties-away-rounding.patch/include/fpu/softfloat.h

.pc/ubuntu/arm64/0102-target-arm-Prepare-VFP_CONV_FIX-helpers-for-A64-uses.patch

.pc/ubuntu/arm64/0102-target-arm-Prepare-VFP_CONV_FIX-helpers-for-A64-uses.patch/target-arm

.pc/ubuntu/arm64/0102-target-arm-Prepare-VFP_CONV_FIX-helpers-for-A64-uses.patch/target-arm/helper.c

.pc/ubuntu/arm64/0103-target-arm-Rename-A32-VFP-conversion-helpers.patch

.pc/ubuntu/arm64/0103-target-arm-Rename-A32-VFP-conversion-helpers.patch/target-arm

.pc/ubuntu/arm64/0103-target-arm-Rename-A32-VFP-conversion-helpers.patch/target-arm/helper.c

.pc/ubuntu/arm64/0103-target-arm-Rename-A32-VFP-conversion-helpers.patch/target-arm/helper.h

.pc/ubuntu/arm64/0103-target-arm-Rename-A32-VFP-conversion-helpers.patch/target-arm/translate.c

.pc/ubuntu/arm64/0104-target-arm-Ignore-most-exceptions-from-scalbn-when-d.patch

.pc/ubuntu/arm64/0104-target-arm-Ignore-most-exceptions-from-scalbn-when-d.patch/target-arm

.pc/ubuntu/arm64/0104-target-arm-Ignore-most-exceptions-from-scalbn-when-d.patch/target-arm/helper.c

.pc/ubuntu/arm64/0105-target-arm-A64-Add-extra-VFP-fixed-point-conversion-.patch

.pc/ubuntu/arm64/0105-target-arm-A64-Add-extra-VFP-fixed-point-conversion-.patch/target-arm

.pc/ubuntu/arm64/0105-target-arm-A64-Add-extra-VFP-fixed-point-conversion-.patch/target-arm/helper.c

.pc/ubuntu/arm64/0105-target-arm-A64-Add-extra-VFP-fixed-point-conversion-.patch/target-arm/helper.h

.pc/ubuntu/arm64/0106-target-arm-A64-Add-floating-point-fixed-point-instru.patch

.pc/ubuntu/arm64/0106-target-arm-A64-Add-floating-point-fixed-point-instru.patch/target-arm

.pc/ubuntu/arm64/0106-target-arm-A64-Add-floating-point-fixed-point-instru.patch/target-arm/helper.c

.pc/ubuntu/arm64/0106-target-arm-A64-Add-floating-point-fixed-point-instru.patch/target-arm/helper.h

.pc/ubuntu/arm64/0106-target-arm-A64-Add-floating-point-fixed-point-instru.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0107-target-arm-A64-Add-floating-point-integer-conversion.patch

.pc/ubuntu/arm64/0107-target-arm-A64-Add-floating-point-integer-conversion.patch/target-arm

.pc/ubuntu/arm64/0107-target-arm-A64-Add-floating-point-integer-conversion.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0108-target-arm-A64-Add-1-source-32-to-32-and-64-to-64-FP.patch

.pc/ubuntu/arm64/0108-target-arm-A64-Add-1-source-32-to-32-and-64-to-64-FP.patch/target-arm

.pc/ubuntu/arm64/0108-target-arm-A64-Add-1-source-32-to-32-and-64-to-64-FP.patch/target-arm/helper.c

.pc/ubuntu/arm64/0108-target-arm-A64-Add-1-source-32-to-32-and-64-to-64-FP.patch/target-arm/helper.h

.pc/ubuntu/arm64/0108-target-arm-A64-Add-1-source-32-to-32-and-64-to-64-FP.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0109-target-arm-A64-Add-support-for-FCVT-between-half-sin.patch

.pc/ubuntu/arm64/0109-target-arm-A64-Add-support-for-FCVT-between-half-sin.patch/target-arm

.pc/ubuntu/arm64/0109-target-arm-A64-Add-support-for-FCVT-between-half-sin.patch/target-arm/helper.c

.pc/ubuntu/arm64/0109-target-arm-A64-Add-support-for-FCVT-between-half-sin.patch/target-arm/helper.h

.pc/ubuntu/arm64/0109-target-arm-A64-Add-support-for-FCVT-between-half-sin.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0110-target-arm-remove-raw_read-write-duplication.patch

.pc/ubuntu/arm64/0110-target-arm-remove-raw_read-write-duplication.patch/target-arm

.pc/ubuntu/arm64/0110-target-arm-remove-raw_read-write-duplication.patch/target-arm/helper.c

.pc/ubuntu/arm64/0111-arm-fix-compile-on-bigendian-host.patch

.pc/ubuntu/arm64/0111-arm-fix-compile-on-bigendian-host.patch/target-arm

.pc/ubuntu/arm64/0111-arm-fix-compile-on-bigendian-host.patch/target-arm/cpu.h

.pc/ubuntu/arm64/0112-target-arm-A64-Add-SIMD-ld-st-multiple.patch

.pc/ubuntu/arm64/0112-target-arm-A64-Add-SIMD-ld-st-multiple.patch/target-arm

.pc/ubuntu/arm64/0112-target-arm-A64-Add-SIMD-ld-st-multiple.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0113-target-arm-A64-Add-SIMD-ld-st-single.patch

.pc/ubuntu/arm64/0113-target-arm-A64-Add-SIMD-ld-st-single.patch/target-arm

.pc/ubuntu/arm64/0113-target-arm-A64-Add-SIMD-ld-st-single.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0114-target-arm-A64-Add-decode-skeleton-for-SIMD-data-pro.patch

.pc/ubuntu/arm64/0114-target-arm-A64-Add-decode-skeleton-for-SIMD-data-pro.patch/target-arm

.pc/ubuntu/arm64/0114-target-arm-A64-Add-decode-skeleton-for-SIMD-data-pro.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0115-target-arm-A64-Add-SIMD-EXT.patch

.pc/ubuntu/arm64/0115-target-arm-A64-Add-SIMD-EXT.patch/target-arm

.pc/ubuntu/arm64/0115-target-arm-A64-Add-SIMD-EXT.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0116-target-arm-A64-Add-SIMD-TBL-TBLX.patch

.pc/ubuntu/arm64/0116-target-arm-A64-Add-SIMD-TBL-TBLX.patch/target-arm

.pc/ubuntu/arm64/0116-target-arm-A64-Add-SIMD-TBL-TBLX.patch/target-arm/helper-a64.c

.pc/ubuntu/arm64/0116-target-arm-A64-Add-SIMD-TBL-TBLX.patch/target-arm/helper-a64.h

.pc/ubuntu/arm64/0116-target-arm-A64-Add-SIMD-TBL-TBLX.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0117-target-arm-A64-Add-SIMD-ZIP-UZP-TRN.patch

.pc/ubuntu/arm64/0117-target-arm-A64-Add-SIMD-ZIP-UZP-TRN.patch/target-arm

.pc/ubuntu/arm64/0117-target-arm-A64-Add-SIMD-ZIP-UZP-TRN.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0118-target-arm-A64-Add-SIMD-across-lanes-instructions.patch

.pc/ubuntu/arm64/0118-target-arm-A64-Add-SIMD-across-lanes-instructions.patch/target-arm

.pc/ubuntu/arm64/0118-target-arm-A64-Add-SIMD-across-lanes-instructions.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0119-target-arm-A64-Add-SIMD-copy-operations.patch

.pc/ubuntu/arm64/0119-target-arm-A64-Add-SIMD-copy-operations.patch/target-arm

.pc/ubuntu/arm64/0119-target-arm-A64-Add-SIMD-copy-operations.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0120-target-arm-A64-Add-SIMD-modified-immediate-group.patch

.pc/ubuntu/arm64/0120-target-arm-A64-Add-SIMD-modified-immediate-group.patch/target-arm

.pc/ubuntu/arm64/0120-target-arm-A64-Add-SIMD-modified-immediate-group.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0121-target-arm-A64-Add-SIMD-scalar-copy-instructions.patch

.pc/ubuntu/arm64/0121-target-arm-A64-Add-SIMD-scalar-copy-instructions.patch/target-arm

.pc/ubuntu/arm64/0121-target-arm-A64-Add-SIMD-scalar-copy-instructions.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0122-target-arm-Move-arm_rmode_to_sf-to-a-shared-location.patch

.pc/ubuntu/arm64/0122-target-arm-Move-arm_rmode_to_sf-to-a-shared-location.patch/target-arm

.pc/ubuntu/arm64/0122-target-arm-Move-arm_rmode_to_sf-to-a-shared-location.patch/target-arm/cpu.h

.pc/ubuntu/arm64/0122-target-arm-Move-arm_rmode_to_sf-to-a-shared-location.patch/target-arm/helper.c

.pc/ubuntu/arm64/0122-target-arm-Move-arm_rmode_to_sf-to-a-shared-location.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0123-target-arm-Add-AArch32-FP-VRINTA-VRINTN-VRINTP-and-V.patch

.pc/ubuntu/arm64/0123-target-arm-Add-AArch32-FP-VRINTA-VRINTN-VRINTP-and-V.patch/target-arm

.pc/ubuntu/arm64/0123-target-arm-Add-AArch32-FP-VRINTA-VRINTN-VRINTP-and-V.patch/target-arm/translate.c

.pc/ubuntu/arm64/0124-target-arm-Add-support-for-AArch32-FP-VRINTR.patch

.pc/ubuntu/arm64/0124-target-arm-Add-support-for-AArch32-FP-VRINTR.patch/target-arm

.pc/ubuntu/arm64/0124-target-arm-Add-support-for-AArch32-FP-VRINTR.patch/target-arm/translate.c

.pc/ubuntu/arm64/0125-target-arm-Add-support-for-AArch32-FP-VRINTZ.patch

.pc/ubuntu/arm64/0125-target-arm-Add-support-for-AArch32-FP-VRINTZ.patch/target-arm

.pc/ubuntu/arm64/0125-target-arm-Add-support-for-AArch32-FP-VRINTZ.patch/target-arm/translate.c

.pc/ubuntu/arm64/0126-target-arm-Add-support-for-AArch32-FP-VRINTX.patch

.pc/ubuntu/arm64/0126-target-arm-Add-support-for-AArch32-FP-VRINTX.patch/target-arm

.pc/ubuntu/arm64/0126-target-arm-Add-support-for-AArch32-FP-VRINTX.patch/target-arm/translate.c

.pc/ubuntu/arm64/0127-target-arm-Add-support-for-AArch32-SIMD-VRINTX.patch

.pc/ubuntu/arm64/0127-target-arm-Add-support-for-AArch32-SIMD-VRINTX.patch/target-arm

.pc/ubuntu/arm64/0127-target-arm-Add-support-for-AArch32-SIMD-VRINTX.patch/target-arm/translate.c

.pc/ubuntu/arm64/0128-target-arm-Add-set_neon_rmode-helper.patch

.pc/ubuntu/arm64/0128-target-arm-Add-set_neon_rmode-helper.patch/target-arm

.pc/ubuntu/arm64/0128-target-arm-Add-set_neon_rmode-helper.patch/target-arm/helper.c

.pc/ubuntu/arm64/0128-target-arm-Add-set_neon_rmode-helper.patch/target-arm/helper.h

.pc/ubuntu/arm64/0129-target-arm-Add-AArch32-SIMD-VRINTA-VRINTN-VRINTP-VRI.patch

.pc/ubuntu/arm64/0129-target-arm-Add-AArch32-SIMD-VRINTA-VRINTN-VRINTP-VRI.patch/target-arm

.pc/ubuntu/arm64/0129-target-arm-Add-AArch32-SIMD-VRINTA-VRINTN-VRINTP-VRI.patch/target-arm/translate.c

.pc/ubuntu/arm64/0130-target-arm-Add-AArch32-FP-VCVTA-VCVTN-VCVTP-and-VCVT.patch

.pc/ubuntu/arm64/0130-target-arm-Add-AArch32-FP-VCVTA-VCVTN-VCVTP-and-VCVT.patch/target-arm

.pc/ubuntu/arm64/0130-target-arm-Add-AArch32-FP-VCVTA-VCVTN-VCVTP-and-VCVT.patch/target-arm/translate.c

.pc/ubuntu/arm64/0131-target-arm-Add-AArch32-SIMD-VCVTA-VCVTN-VCVTP-and-VC.patch

.pc/ubuntu/arm64/0131-target-arm-Add-AArch32-SIMD-VCVTA-VCVTN-VCVTP-and-VC.patch/target-arm

.pc/ubuntu/arm64/0131-target-arm-Add-AArch32-SIMD-VCVTA-VCVTN-VCVTP-and-VC.patch/target-arm/translate.c

.pc/ubuntu/arm64/0132-target-arm-A64-Add-SIMD-three-different-multiply-acc.patch

.pc/ubuntu/arm64/0132-target-arm-A64-Add-SIMD-three-different-multiply-acc.patch/target-arm

.pc/ubuntu/arm64/0132-target-arm-A64-Add-SIMD-three-different-multiply-acc.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0133-target-arm-A64-Add-SIMD-three-different-ABDL-instruc.patch

.pc/ubuntu/arm64/0133-target-arm-A64-Add-SIMD-three-different-ABDL-instruc.patch/target-arm

.pc/ubuntu/arm64/0133-target-arm-A64-Add-SIMD-three-different-ABDL-instruc.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0134-target-arm-A64-Add-SIMD-scalar-3-same-add-sub-and-co.patch

.pc/ubuntu/arm64/0134-target-arm-A64-Add-SIMD-scalar-3-same-add-sub-and-co.patch/target-arm

.pc/ubuntu/arm64/0134-target-arm-A64-Add-SIMD-scalar-3-same-add-sub-and-co.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0135-target-arm-A64-Add-top-level-decode-for-SIMD-3-same-.patch

.pc/ubuntu/arm64/0135-target-arm-A64-Add-top-level-decode-for-SIMD-3-same-.patch/target-arm

.pc/ubuntu/arm64/0135-target-arm-A64-Add-top-level-decode-for-SIMD-3-same-.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0136-target-arm-A64-Add-logic-ops-from-SIMD-3-same-group.patch

.pc/ubuntu/arm64/0136-target-arm-A64-Add-logic-ops-from-SIMD-3-same-group.patch/target-arm

.pc/ubuntu/arm64/0136-target-arm-A64-Add-logic-ops-from-SIMD-3-same-group.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0137-target-arm-A64-Add-integer-ops-from-SIMD-3-same-grou.patch

.pc/ubuntu/arm64/0137-target-arm-A64-Add-integer-ops-from-SIMD-3-same-grou.patch/target-arm

.pc/ubuntu/arm64/0137-target-arm-A64-Add-integer-ops-from-SIMD-3-same-grou.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0138-target-arm-A64-Add-simple-SIMD-3-same-floating-point.patch

.pc/ubuntu/arm64/0138-target-arm-A64-Add-simple-SIMD-3-same-floating-point.patch/target-arm

.pc/ubuntu/arm64/0138-target-arm-A64-Add-simple-SIMD-3-same-floating-point.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0139-target-arm-A64-Add-SIMD-shift-by-immediate.patch

.pc/ubuntu/arm64/0139-target-arm-A64-Add-SIMD-shift-by-immediate.patch/target-arm

.pc/ubuntu/arm64/0139-target-arm-A64-Add-SIMD-shift-by-immediate.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0140-target-arm-A64-Implement-SIMD-3-reg-same-shift-and-s.patch

.pc/ubuntu/arm64/0140-target-arm-A64-Implement-SIMD-3-reg-same-shift-and-s.patch/target-arm

.pc/ubuntu/arm64/0140-target-arm-A64-Implement-SIMD-3-reg-same-shift-and-s.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0141-target-arm-A64-Implement-remaining-non-pairwise-int-.patch

.pc/ubuntu/arm64/0141-target-arm-A64-Implement-remaining-non-pairwise-int-.patch/target-arm

.pc/ubuntu/arm64/0141-target-arm-A64-Implement-remaining-non-pairwise-int-.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0142-target-arm-A64-Implement-pairwise-integer-ops-from-3.patch

.pc/ubuntu/arm64/0142-target-arm-A64-Implement-pairwise-integer-ops-from-3.patch/target-arm

.pc/ubuntu/arm64/0142-target-arm-A64-Implement-pairwise-integer-ops-from-3.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0143-tcg-Add-TCGV_UNUSED_PTR-TCGV_IS_UNUSED_PTR-TCGV_EQUA.patch

.pc/ubuntu/arm64/0143-tcg-Add-TCGV_UNUSED_PTR-TCGV_IS_UNUSED_PTR-TCGV_EQUA.patch/tcg

.pc/ubuntu/arm64/0143-tcg-Add-TCGV_UNUSED_PTR-TCGV_IS_UNUSED_PTR-TCGV_EQUA.patch/tcg/tcg.h

.pc/ubuntu/arm64/0144-target-arm-A64-Implement-scalar-pairwise-ops.patch

.pc/ubuntu/arm64/0144-target-arm-A64-Implement-scalar-pairwise-ops.patch/target-arm

.pc/ubuntu/arm64/0144-target-arm-A64-Implement-scalar-pairwise-ops.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0145-target-arm-A64-Implement-remaining-integer-scalar-3-.patch

.pc/ubuntu/arm64/0145-target-arm-A64-Implement-remaining-integer-scalar-3-.patch/target-arm

.pc/ubuntu/arm64/0145-target-arm-A64-Implement-remaining-integer-scalar-3-.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0146-target-arm-A64-Add-SIMD-simple-64-bit-insns-from-sca.patch

.pc/ubuntu/arm64/0146-target-arm-A64-Add-SIMD-simple-64-bit-insns-from-sca.patch/target-arm

.pc/ubuntu/arm64/0146-target-arm-A64-Add-SIMD-simple-64-bit-insns-from-sca.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0147-target-arm-A64-Add-skeleton-decode-for-SIMD-2-reg-mi.patch

.pc/ubuntu/arm64/0147-target-arm-A64-Add-skeleton-decode-for-SIMD-2-reg-mi.patch/target-arm

.pc/ubuntu/arm64/0147-target-arm-A64-Add-skeleton-decode-for-SIMD-2-reg-mi.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0148-target-arm-A64-Implement-2-register-misc-compares-AB.patch

.pc/ubuntu/arm64/0148-target-arm-A64-Implement-2-register-misc-compares-AB.patch/target-arm

.pc/ubuntu/arm64/0148-target-arm-A64-Implement-2-register-misc-compares-AB.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0149-target-arm-A64-Implement-2-reg-misc-CNT-NOT-and-RBIT.patch

.pc/ubuntu/arm64/0149-target-arm-A64-Implement-2-reg-misc-CNT-NOT-and-RBIT.patch/target-arm

.pc/ubuntu/arm64/0149-target-arm-A64-Implement-2-reg-misc-CNT-NOT-and-RBIT.patch/target-arm/helper.h

.pc/ubuntu/arm64/0149-target-arm-A64-Implement-2-reg-misc-CNT-NOT-and-RBIT.patch/target-arm/neon_helper.c

.pc/ubuntu/arm64/0149-target-arm-A64-Implement-2-reg-misc-CNT-NOT-and-RBIT.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0150-target-arm-A64-Add-narrowing-2-reg-misc-instructions.patch

.pc/ubuntu/arm64/0150-target-arm-A64-Add-narrowing-2-reg-misc-instructions.patch/target-arm

.pc/ubuntu/arm64/0150-target-arm-A64-Add-narrowing-2-reg-misc-instructions.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0151-target-arm-A64-Add-2-reg-misc-REV-instructions.patch

.pc/ubuntu/arm64/0151-target-arm-A64-Add-2-reg-misc-REV-instructions.patch/target-arm

.pc/ubuntu/arm64/0151-target-arm-A64-Add-2-reg-misc-REV-instructions.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0152-target-arm-A64-Add-FNEG-and-FABS-to-the-SIMD-2-reg-m.patch

.pc/ubuntu/arm64/0152-target-arm-A64-Add-FNEG-and-FABS-to-the-SIMD-2-reg-m.patch/target-arm

.pc/ubuntu/arm64/0152-target-arm-A64-Add-FNEG-and-FABS-to-the-SIMD-2-reg-m.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/0153-target-arm-Add-support-for-AArch32-64bit-VCVTB-and-V.patch

.pc/ubuntu/arm64/0153-target-arm-Add-support-for-AArch32-64bit-VCVTB-and-V.patch/target-arm

.pc/ubuntu/arm64/0153-target-arm-Add-support-for-AArch32-64bit-VCVTB-and-V.patch/target-arm/translate.c

.pc/ubuntu/arm64/0154-rules.mak-Support-.cc-as-a-C-source-file-suffix.patch

.pc/ubuntu/arm64/0154-rules.mak-Support-.cc-as-a-C-source-file-suffix.patch/rules.mak

.pc/ubuntu/arm64/0155-rules.mak-Link-with-C-if-we-have-a-C-compiler.patch

.pc/ubuntu/arm64/0155-rules.mak-Link-with-C-if-we-have-a-C-compiler.patch/rules.mak

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/LICENCE

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/README

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/a64

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/a64/assembler-a64.h

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/a64/constants-a64.h

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/a64/cpu-a64.h

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/a64/decoder-a64.cc

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/a64/decoder-a64.h

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/a64/disasm-a64.cc

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/a64/disasm-a64.h

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/a64/instructions-a64.cc

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/a64/instructions-a64.h

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/globals.h

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/platform.h

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/utils.cc

.pc/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch/disas/libvixl/utils.h

.pc/ubuntu/arm64/0157-disas-libvixl-Fix-upstream-libvixl-compilation-issue.patch

.pc/ubuntu/arm64/0157-disas-libvixl-Fix-upstream-libvixl-compilation-issue.patch/disas

.pc/ubuntu/arm64/0157-disas-libvixl-Fix-upstream-libvixl-compilation-issue.patch/disas/libvixl

.pc/ubuntu/arm64/0157-disas-libvixl-Fix-upstream-libvixl-compilation-issue.patch/disas/libvixl/a64

.pc/ubuntu/arm64/0157-disas-libvixl-Fix-upstream-libvixl-compilation-issue.patch/disas/libvixl/a64/instructions-a64.h

.pc/ubuntu/arm64/0157-disas-libvixl-Fix-upstream-libvixl-compilation-issue.patch/disas/libvixl/globals.h

.pc/ubuntu/arm64/0157-disas-libvixl-Fix-upstream-libvixl-compilation-issue.patch/disas/libvixl/utils.h

.pc/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch

.pc/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch/configure

.pc/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch/disas

.pc/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch/disas.c

.pc/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch/disas/Makefile.objs

.pc/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch/disas/arm-a64.cc

.pc/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch/disas/libvixl

.pc/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch/disas/libvixl/Makefile.objs

.pc/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch/include

.pc/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch/include/disas

.pc/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch/include/disas/bfd.h

.pc/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch/target-arm

.pc/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch/target-arm/translate-a64.c

.pc/ubuntu/arm64/force-aarch64-uname-to-3.7.0-to-appease-glibc

.pc/ubuntu/arm64/force-aarch64-uname-to-3.7.0-to-appease-glibc/linux-user

.pc/ubuntu/arm64/force-aarch64-uname-to-3.7.0-to-appease-glibc/linux-user/main.c

debian/binfmts/qemu-aarch64

debian/patches/ubuntu/arm64/0050-target-arm-fix-build-with-gcc-4.8.2.patch

debian/patches/ubuntu/arm64/0051-target-arm-A64-add-support-for-ld-st-pair.patch

debian/patches/ubuntu/arm64/0052-target-arm-A64-add-support-for-ld-st-unsigned-imm.patch

debian/patches/ubuntu/arm64/0053-target-arm-A64-add-support-for-ld-st-with-reg-offset.patch

debian/patches/ubuntu/arm64/0054-target-arm-A64-add-support-for-ld-st-with-index.patch

debian/patches/ubuntu/arm64/0055-target-arm-A64-add-support-for-add-addi-sub-subi.patch

debian/patches/ubuntu/arm64/0056-target-arm-A64-add-support-for-move-wide-instruction.patch

debian/patches/ubuntu/arm64/0057-target-arm-A64-add-support-for-3-src-data-proc-insns.patch

debian/patches/ubuntu/arm64/0058-target-arm-A64-implement-SVC-BRK.patch

debian/patches/ubuntu/arm64/0059-target-arm-A64-Add-decoder-skeleton-for-FP-instructi.patch

debian/patches/ubuntu/arm64/0060-target-arm-A64-implement-FMOV.patch

debian/patches/ubuntu/arm64/0061-target-arm-Pull-add-one-cpreg-to-hashtable-into-its-.patch

debian/patches/ubuntu/arm64/0062-target-arm-Update-generic-cpreg-code-for-AArch64.patch

debian/patches/ubuntu/arm64/0063-target-arm-Remove-ARMCPU-CPUARMState-from-cpregs-API.patch

debian/patches/ubuntu/arm64/0064-target-arm-A64-Implement-MRS-MSR-SYS-SYSL.patch

debian/patches/ubuntu/arm64/0065-target-arm-A64-Implement-minimal-set-of-EL0-visible-.patch

debian/patches/ubuntu/arm64/0066-target-arm-Widen-thread-local-register-state-fields-.patch

debian/patches/ubuntu/arm64/0067-target-arm-A64-add-support-for-add-sub-with-carry.patch

debian/patches/ubuntu/arm64/0068-target-arm-A64-add-support-for-conditional-compare-i.patch

debian/patches/ubuntu/arm64/0069-target-arm-aarch64-add-support-for-ld-lit.patch

debian/patches/ubuntu/arm64/0070-target-arm-Widen-exclusive-access-support-struct-fie.patch

debian/patches/ubuntu/arm64/0071-target-arm-A64-support-for-ld-st-cl-exclusive.patch

debian/patches/ubuntu/arm64/0072-linux-user-AArch64-define-TARGET_CLONE_BACKWARDS.patch

debian/patches/ubuntu/arm64/0073-linux-user-AArch64-Use-correct-values-for-FPSR-FPCR-.patch

debian/patches/ubuntu/arm64/0074-.travis.yml-Add-aarch64-targets.patch

debian/patches/ubuntu/arm64/0075-default-configs-Add-config-for-aarch64-linux-user.patch

debian/patches/ubuntu/arm64/0076-target-arm-A64-Add-support-for-dumping-AArch64-VFP-r.patch

debian/patches/ubuntu/arm64/0077-target-arm-A64-Fix-vector-register-access-on-bigendi.patch

debian/patches/ubuntu/arm64/0078-target-arm-Use-VFP_BINOP-macro-for-min-max-minnum-ma.patch

debian/patches/ubuntu/arm64/0079-target-arm-A64-Add-Floating-point-data-processing-2-.patch

debian/patches/ubuntu/arm64/0080-target-arm-A64-Add-Floating-point-data-processing-3-.patch

debian/patches/ubuntu/arm64/0081-target-arm-A64-Add-fmov-scalar-immediate-instruction.patch

debian/patches/ubuntu/arm64/0082-target-arm-A64-Add-support-for-floating-point-compar.patch

debian/patches/ubuntu/arm64/0083-target-arm-A64-Add-support-for-floating-point-condit.patch

debian/patches/ubuntu/arm64/0084-target-arm-A64-Add-support-for-floating-point-cond-s.patch

debian/patches/ubuntu/arm64/0085-target-arm-Give-the-FPSCR-rounding-modes-names.patch

debian/patches/ubuntu/arm64/0086-softfloat-Fix-exception-flag-handling-for-float32_to.patch

debian/patches/ubuntu/arm64/0087-softfloat-Add-float-to-16bit-integer-conversions.patch

debian/patches/ubuntu/arm64/0088-softfloat-Add-16-bit-integer-to-float-conversions.patch

debian/patches/ubuntu/arm64/0089-softfloat-Make-the-int-to-float-functions-take-exact.patch

debian/patches/ubuntu/arm64/0090-softfloat-Fix-float64_to_uint64.patch

debian/patches/ubuntu/arm64/0091-softfloat-Only-raise-Invalid-when-conversions-to-int.patch

debian/patches/ubuntu/arm64/0092-softfloat-Fix-factor-2-error-for-scalbn-on-denormal-.patch

debian/patches/ubuntu/arm64/0093-softfloat-Add-float32_to_uint64.patch

debian/patches/ubuntu/arm64/0094-softfloat-Fix-float64_to_uint64_round_to_zero.patch

debian/patches/ubuntu/arm64/0095-softfloat-Fix-float64_to_uint32.patch

debian/patches/ubuntu/arm64/0096-softfloat-Fix-float64_to_uint32_round_to_zero.patch

debian/patches/ubuntu/arm64/0097-softfloat-Provide-complete-set-of-accessors-for-fp-s.patch

debian/patches/ubuntu/arm64/0098-softfloat-Factor-out-RoundAndPackFloat16-and-Normali.patch

debian/patches/ubuntu/arm64/0099-softfloat-Add-float16-float64-conversion-functions.patch

debian/patches/ubuntu/arm64/0100-softfloat-Refactor-code-handling-various-rounding-mo.patch

debian/patches/ubuntu/arm64/0101-softfloat-Add-support-for-ties-away-rounding.patch

debian/patches/ubuntu/arm64/0102-target-arm-Prepare-VFP_CONV_FIX-helpers-for-A64-uses.patch

debian/patches/ubuntu/arm64/0103-target-arm-Rename-A32-VFP-conversion-helpers.patch

debian/patches/ubuntu/arm64/0104-target-arm-Ignore-most-exceptions-from-scalbn-when-d.patch

debian/patches/ubuntu/arm64/0105-target-arm-A64-Add-extra-VFP-fixed-point-conversion-.patch

debian/patches/ubuntu/arm64/0106-target-arm-A64-Add-floating-point-fixed-point-instru.patch

debian/patches/ubuntu/arm64/0107-target-arm-A64-Add-floating-point-integer-conversion.patch

debian/patches/ubuntu/arm64/0108-target-arm-A64-Add-1-source-32-to-32-and-64-to-64-FP.patch

debian/patches/ubuntu/arm64/0109-target-arm-A64-Add-support-for-FCVT-between-half-sin.patch

debian/patches/ubuntu/arm64/0110-target-arm-remove-raw_read-write-duplication.patch

debian/patches/ubuntu/arm64/0111-arm-fix-compile-on-bigendian-host.patch

debian/patches/ubuntu/arm64/0112-target-arm-A64-Add-SIMD-ld-st-multiple.patch

debian/patches/ubuntu/arm64/0113-target-arm-A64-Add-SIMD-ld-st-single.patch

debian/patches/ubuntu/arm64/0114-target-arm-A64-Add-decode-skeleton-for-SIMD-data-pro.patch

debian/patches/ubuntu/arm64/0115-target-arm-A64-Add-SIMD-EXT.patch

debian/patches/ubuntu/arm64/0116-target-arm-A64-Add-SIMD-TBL-TBLX.patch

debian/patches/ubuntu/arm64/0117-target-arm-A64-Add-SIMD-ZIP-UZP-TRN.patch

debian/patches/ubuntu/arm64/0118-target-arm-A64-Add-SIMD-across-lanes-instructions.patch

debian/patches/ubuntu/arm64/0119-target-arm-A64-Add-SIMD-copy-operations.patch

debian/patches/ubuntu/arm64/0120-target-arm-A64-Add-SIMD-modified-immediate-group.patch

debian/patches/ubuntu/arm64/0121-target-arm-A64-Add-SIMD-scalar-copy-instructions.patch

debian/patches/ubuntu/arm64/0122-target-arm-Move-arm_rmode_to_sf-to-a-shared-location.patch

debian/patches/ubuntu/arm64/0123-target-arm-Add-AArch32-FP-VRINTA-VRINTN-VRINTP-and-V.patch

debian/patches/ubuntu/arm64/0124-target-arm-Add-support-for-AArch32-FP-VRINTR.patch

debian/patches/ubuntu/arm64/0125-target-arm-Add-support-for-AArch32-FP-VRINTZ.patch

debian/patches/ubuntu/arm64/0126-target-arm-Add-support-for-AArch32-FP-VRINTX.patch

debian/patches/ubuntu/arm64/0127-target-arm-Add-support-for-AArch32-SIMD-VRINTX.patch

debian/patches/ubuntu/arm64/0128-target-arm-Add-set_neon_rmode-helper.patch

debian/patches/ubuntu/arm64/0129-target-arm-Add-AArch32-SIMD-VRINTA-VRINTN-VRINTP-VRI.patch

debian/patches/ubuntu/arm64/0130-target-arm-Add-AArch32-FP-VCVTA-VCVTN-VCVTP-and-VCVT.patch

debian/patches/ubuntu/arm64/0131-target-arm-Add-AArch32-SIMD-VCVTA-VCVTN-VCVTP-and-VC.patch

debian/patches/ubuntu/arm64/0132-target-arm-A64-Add-SIMD-three-different-multiply-acc.patch

debian/patches/ubuntu/arm64/0133-target-arm-A64-Add-SIMD-three-different-ABDL-instruc.patch

debian/patches/ubuntu/arm64/0134-target-arm-A64-Add-SIMD-scalar-3-same-add-sub-and-co.patch

debian/patches/ubuntu/arm64/0135-target-arm-A64-Add-top-level-decode-for-SIMD-3-same-.patch

debian/patches/ubuntu/arm64/0136-target-arm-A64-Add-logic-ops-from-SIMD-3-same-group.patch

debian/patches/ubuntu/arm64/0137-target-arm-A64-Add-integer-ops-from-SIMD-3-same-grou.patch

debian/patches/ubuntu/arm64/0138-target-arm-A64-Add-simple-SIMD-3-same-floating-point.patch

debian/patches/ubuntu/arm64/0139-target-arm-A64-Add-SIMD-shift-by-immediate.patch

debian/patches/ubuntu/arm64/0140-target-arm-A64-Implement-SIMD-3-reg-same-shift-and-s.patch

debian/patches/ubuntu/arm64/0141-target-arm-A64-Implement-remaining-non-pairwise-int-.patch

debian/patches/ubuntu/arm64/0142-target-arm-A64-Implement-pairwise-integer-ops-from-3.patch

debian/patches/ubuntu/arm64/0143-tcg-Add-TCGV_UNUSED_PTR-TCGV_IS_UNUSED_PTR-TCGV_EQUA.patch

debian/patches/ubuntu/arm64/0144-target-arm-A64-Implement-scalar-pairwise-ops.patch

debian/patches/ubuntu/arm64/0145-target-arm-A64-Implement-remaining-integer-scalar-3-.patch

debian/patches/ubuntu/arm64/0146-target-arm-A64-Add-SIMD-simple-64-bit-insns-from-sca.patch

debian/patches/ubuntu/arm64/0147-target-arm-A64-Add-skeleton-decode-for-SIMD-2-reg-mi.patch

debian/patches/ubuntu/arm64/0148-target-arm-A64-Implement-2-register-misc-compares-AB.patch

debian/patches/ubuntu/arm64/0149-target-arm-A64-Implement-2-reg-misc-CNT-NOT-and-RBIT.patch

debian/patches/ubuntu/arm64/0150-target-arm-A64-Add-narrowing-2-reg-misc-instructions.patch

debian/patches/ubuntu/arm64/0151-target-arm-A64-Add-2-reg-misc-REV-instructions.patch

debian/patches/ubuntu/arm64/0152-target-arm-A64-Add-FNEG-and-FABS-to-the-SIMD-2-reg-m.patch

debian/patches/ubuntu/arm64/0153-target-arm-Add-support-for-AArch32-64bit-VCVTB-and-V.patch

debian/patches/ubuntu/arm64/0154-rules.mak-Support-.cc-as-a-C-source-file-suffix.patch

debian/patches/ubuntu/arm64/0155-rules.mak-Link-with-C-if-we-have-a-C-compiler.patch

debian/patches/ubuntu/arm64/0156-disas-Add-subset-of-libvixl-sources-for-A64-disassem.patch

debian/patches/ubuntu/arm64/0157-disas-libvixl-Fix-upstream-libvixl-compilation-issue.patch

debian/patches/ubuntu/arm64/0158-disas-Implement-disassembly-output-for-A64.patch

debian/patches/ubuntu/arm64/force-aarch64-uname-to-3.7.0-to-appease-glibc

default-configs/aarch64-linux-user.mak

disas/arm-a64.cc

disas/libvixl

disas/libvixl/LICENCE

disas/libvixl/Makefile.objs

disas/libvixl/README

disas/libvixl/a64

disas/libvixl/a64/assembler-a64.h

disas/libvixl/a64/constants-a64.h

disas/libvixl/a64/cpu-a64.h

disas/libvixl/a64/decoder-a64.cc

disas/libvixl/a64/decoder-a64.h

disas/libvixl/a64/disasm-a64.cc

disas/libvixl/a64/disasm-a64.h

disas/libvixl/a64/instructions-a64.cc

disas/libvixl/a64/instructions-a64.h

disas/libvixl/globals.h

disas/libvixl/platform.h

disas/libvixl/utils.cc

disas/libvixl/utils.h

files modified:
.pc/applied-patches

.travis.yml

configure

debian/changelog

debian/patches/series

debian/rules

disas.c

disas/Makefile.objs

fpu/softfloat.c

include/disas/bfd.h

include/fpu/softfloat.h

linux-user/aarch64/syscall.h

linux-user/aarch64/target_cpu.h

linux-user/arm/target_cpu.h

linux-user/main.c

linux-user/signal.c

rules.mak

target-arm/cpu.h

target-arm/cpu64.c

target-arm/helper-a64.c

target-arm/helper-a64.h

target-arm/helper.c

target-arm/helper.h

target-arm/kvm-consts.h

target-arm/machine.c

target-arm/neon_helper.c

target-arm/translate-a64.c

target-arm/translate.c

target-arm/translate.h

tcg/tcg.h

Show diffs side-by-side

added added

removed removed

.pc/ubuntu/arm64/0093-softfloat-Add-float32_to_uint64.patch/fpu/softfloat.c

/*
 * QEMU float support
 *
 * Derived from SoftFloat.
 */

/*============================================================================

This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic

Package, Release 2b.

Written by John R. Hauser. This work was made possible in part by the

International Computer Science Institute, located at Suite 600, 1947 Center

Street, Berkeley, California 94704. Funding was partially provided by the

National Science Foundation under grant MIP-9311980. The original version

of this code was written as part of a project to build a fixed-point vector

processor in collaboration with the University of California at Berkeley,

overseen by Profs. Nelson Morgan and John Wawrzynek. More information

is available through the Web page `http://www.cs.berkeley.edu/~jhauser/

arithmetic/SoftFloat.html'.

THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has

been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES

RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS

AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,

COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE

EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE

INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR

OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.

Derivative works are acceptable, even for commercial purposes, so long as

(1) the source code for the derivative work includes prominent notice that

the work is derivative, and (2) the source code includes prominent notice with

these four paragraphs for those parts of this code that are retained.

=============================================================================*/

/* softfloat (and in particular the code in softfloat-specialize.h) is
 * target-dependent and needs the TARGET_* macros.
 */

#include "config.h"

#include "fpu/softfloat.h"

/*----------------------------------------------------------------------------

| Primitive arithmetic functions, including multi-word arithmetic, and

| division and square root approximations. (Can be specialized to target if

| desired.)

*----------------------------------------------------------------------------*/

#include "softfloat-macros.h"

/*----------------------------------------------------------------------------

| Functions and definitions to determine: (1) whether tininess for underflow

| is detected before or after rounding by default, (2) what (if anything)

| happens when exceptions are raised, (3) how signaling NaNs are distinguished

| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs

| are propagated from function inputs to output. These details are target-

| specific.

*----------------------------------------------------------------------------*/

#include "softfloat-specialize.h"

/*----------------------------------------------------------------------------
| Stores `val' in the `float_rounding_mode' field reached through STATUS().
| The previous value is overwritten; nothing is returned.
*----------------------------------------------------------------------------*/
void set_float_rounding_mode(int val STATUS_PARAM)
{
    STATUS(float_rounding_mode) = val;
}

/*----------------------------------------------------------------------------
| Stores `val' in the `float_exception_flags' field reached through STATUS().
| This is a plain assignment: flags already set are replaced, not OR-ed with
| `val'.
*----------------------------------------------------------------------------*/
void set_float_exception_flags(int val STATUS_PARAM)
{
    STATUS(float_exception_flags) = val;
}

/*----------------------------------------------------------------------------
| Stores `val' in the `floatx80_rounding_precision' field reached through
| STATUS().  The previous value is overwritten; nothing is returned.
*----------------------------------------------------------------------------*/
void set_floatx80_rounding_precision(int val STATUS_PARAM)
{
    STATUS(floatx80_rounding_precision) = val;
}

/*----------------------------------------------------------------------------
| Extracts the fraction field of the half-precision floating-point value `a',
| i.e. the bits of float16_val(a) selected by the mask 0x3ff (bits 9..0).
*----------------------------------------------------------------------------*/

INLINE uint32_t extractFloat16Frac(float16 a)
{
    const uint32_t fracMask = 0x3ff;    /* bits 9..0 */

    return float16_val(a) & fracMask;
}

/*----------------------------------------------------------------------------
| Extracts the exponent field of the half-precision floating-point value `a',
| i.e. bits 14..10 of float16_val(a), returned as a value in the range 0..0x1f.
*----------------------------------------------------------------------------*/

INLINE int_fast16_t extractFloat16Exp(float16 a)
{
    /* 0x7c00 == 0x1f << 10: isolate bits 14..10, then move them to bit 0. */
    return (float16_val(a) & 0x7c00) >> 10;
}

/*----------------------------------------------------------------------------
| Returns the sign bit of the half-precision floating-point value `a', i.e.
| float16_val(a) shifted right by 15 bit positions.
*----------------------------------------------------------------------------*/

INLINE flag extractFloat16Sign(float16 a)
{
    return float16_val(a)>>15;
}

/*----------------------------------------------------------------------------

105

| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6

106

| and 7, and returns the properly rounded 32-bit integer corresponding to the

107

| input. If `zSign' is 1, the input is negated before being converted to an

108

| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input

109

| is simply rounded to an integer, with the inexact exception raised if the

110

| input cannot be represented exactly as an integer. However, if the fixed-

111

| point input is too large, the invalid exception is raised and the largest

112

| positive or negative integer is returned.

113

*----------------------------------------------------------------------------*/

114

115

static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)

116

{

117

int8 roundingMode;

118

flag roundNearestEven;

119

int8 roundIncrement, roundBits;

120

int32_t z;

121

122

roundingMode = STATUS(float_rounding_mode);

123

roundNearestEven = ( roundingMode == float_round_nearest_even );

124

roundIncrement = 0x40;

125

if ( ! roundNearestEven ) {

126

if ( roundingMode == float_round_to_zero ) {

127

roundIncrement = 0;

128

}

129

else {

130

roundIncrement = 0x7F;

131

if ( zSign ) {

132

if ( roundingMode == float_round_up ) roundIncrement = 0;

133

}

134

else {

135

if ( roundingMode == float_round_down ) roundIncrement = 0;

136

}

137

}

138

}

139

roundBits = absZ & 0x7F;

140

absZ = ( absZ + roundIncrement )>>7;

141

absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );

142

z = absZ;

143

if ( zSign ) z = - z;

144

if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {

145

float_raise( float_flag_invalid STATUS_VAR);

146

return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;

147

}

148

if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;

149

return z;

150

151

}

152

153

/*----------------------------------------------------------------------------

154

| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and

155

| `absZ1', with binary point between bits 63 and 64 (between the input words),

156

| and returns the properly rounded 64-bit integer corresponding to the input.

157

| If `zSign' is 1, the input is negated before being converted to an integer.

158

| Ordinarily, the fixed-point input is simply rounded to an integer, with

159

| the inexact exception raised if the input cannot be represented exactly as

160

| an integer. However, if the fixed-point input is too large, the invalid

161

| exception is raised and the largest positive or negative integer is

162

| returned.

163

*----------------------------------------------------------------------------*/

164

165

static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)

166

{

167

int8 roundingMode;

168

flag roundNearestEven, increment;

169

int64_t z;

170

171

roundingMode = STATUS(float_rounding_mode);

172

roundNearestEven = ( roundingMode == float_round_nearest_even );

173

increment = ( (int64_t) absZ1 < 0 );

174

if ( ! roundNearestEven ) {

175

if ( roundingMode == float_round_to_zero ) {

176

increment = 0;

177

}

178

else {

179

if ( zSign ) {

180

increment = ( roundingMode == float_round_down ) && absZ1;

181

}

182

else {

183

increment = ( roundingMode == float_round_up ) && absZ1;

184

}

185

}

186

}

187

if ( increment ) {

188

++absZ0;

189

if ( absZ0 == 0 ) goto overflow;

190

absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );

191

}

192

z = absZ0;

193

if ( zSign ) z = - z;

194

if ( z && ( ( z < 0 ) ^ zSign ) ) {

195

overflow:

196

float_raise( float_flag_invalid STATUS_VAR);

197

return

198

zSign ? (int64_t) LIT64( 0x8000000000000000 )

199

: LIT64( 0x7FFFFFFFFFFFFFFF );

200

}

201

if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;

202

return z;

203

204

}

205

206

/*----------------------------------------------------------------------------

207

| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and

208

| `absZ1', with binary point between bits 63 and 64 (between the input words),

209

| and returns the properly rounded 64-bit unsigned integer corresponding to the

210

| input. Ordinarily, the fixed-point input is simply rounded to an integer,

211

| with the inexact exception raised if the input cannot be represented exactly

212

| as an integer. However, if the fixed-point input is too large, the invalid

213

| exception is raised and the largest unsigned integer is returned.

214

*----------------------------------------------------------------------------*/

215

216

static int64 roundAndPackUint64(flag zSign, uint64_t absZ0,

217

uint64_t absZ1 STATUS_PARAM)

218

{

219

int8 roundingMode;

220

flag roundNearestEven, increment;

221

222

roundingMode = STATUS(float_rounding_mode);

223

roundNearestEven = (roundingMode == float_round_nearest_even);

224

increment = ((int64_t)absZ1 < 0);

225

if (!roundNearestEven) {

226

if (roundingMode == float_round_to_zero) {

227

increment = 0;

228

} else if (absZ1) {

229

if (zSign) {

230

increment = (roundingMode == float_round_down) && absZ1;

231

} else {

232

increment = (roundingMode == float_round_up) && absZ1;

233

}

234

}

235

}

236

if (increment) {

237

++absZ0;

238

if (absZ0 == 0) {

239

float_raise(float_flag_invalid STATUS_VAR);

240

return LIT64(0xFFFFFFFFFFFFFFFF);

241

}

242

absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);

243

}

244

245

if (zSign && absZ0) {

246

float_raise(float_flag_invalid STATUS_VAR);

247

return 0;

248

}

249

250

if (absZ1) {

251

STATUS(float_exception_flags) |= float_flag_inexact;

252

}

253

return absZ0;

254

}

255

256

/*----------------------------------------------------------------------------

257

| Returns the fraction bits of the single-precision floating-point value `a'.

258

*----------------------------------------------------------------------------*/

259

260

INLINE uint32_t extractFloat32Frac( float32 a )

261

{

262

263

return float32_val(a) & 0x007FFFFF;

264

265

}

266

267

/*----------------------------------------------------------------------------

268

| Returns the exponent bits of the single-precision floating-point value `a'.

269

*----------------------------------------------------------------------------*/

270

271

INLINE int_fast16_t extractFloat32Exp(float32 a)

272

{

273

274

return ( float32_val(a)>>23 ) & 0xFF;

275

276

}

277

278

/*----------------------------------------------------------------------------

279

| Returns the sign bit of the single-precision floating-point value `a'.

280

*----------------------------------------------------------------------------*/

281

282

INLINE flag extractFloat32Sign( float32 a )

283

{

284

285

return float32_val(a)>>31;

286

287

}

288

289

/*----------------------------------------------------------------------------

290

| If `a' is denormal and we are in flush-to-zero mode then set the

291

| input-denormal exception and return zero. Otherwise just return the value.

292

*----------------------------------------------------------------------------*/

293

static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)

294

{

295

if (STATUS(flush_inputs_to_zero)) {

296

if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {

297

float_raise(float_flag_input_denormal STATUS_VAR);

298

return make_float32(float32_val(a) & 0x80000000);

299

}

300

}

301

return a;

302

}

303

304

/*----------------------------------------------------------------------------

305

| Normalizes the subnormal single-precision floating-point value represented

306

| by the denormalized significand `aSig'. The normalized exponent and

307

| significand are stored at the locations pointed to by `zExpPtr' and

308

| `zSigPtr', respectively.

309

*----------------------------------------------------------------------------*/

310

311

static void

312

normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)

313

{

314

int8 shiftCount;

315

316

shiftCount = countLeadingZeros32( aSig ) - 8;

317

*zSigPtr = aSig<<shiftCount;

318

*zExpPtr = 1 - shiftCount;

319

320

}

321

322

/*----------------------------------------------------------------------------

323

| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a

324

| single-precision floating-point value, returning the result. After being

325

| shifted into the proper positions, the three fields are simply added

326

| together to form the result. This means that any integer portion of `zSig'

327

| will be added into the exponent. Since a properly normalized significand

328

| will have an integer portion equal to 1, the `zExp' input should be 1 less

329

| than the desired result exponent whenever `zSig' is a complete, normalized

330

| significand.

331

*----------------------------------------------------------------------------*/

332

333

INLINE float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)

334

{

335

336

return make_float32(

337

( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);

338

339

}

340

341

/*----------------------------------------------------------------------------

342

| Takes an abstract floating-point value having sign `zSign', exponent `zExp',

343

| and significand `zSig', and returns the proper single-precision floating-

344

| point value corresponding to the abstract input. Ordinarily, the abstract

345

| value is simply rounded and packed into the single-precision format, with

346

| the inexact exception raised if the abstract input cannot be represented

347

| exactly. However, if the abstract value is too large, the overflow and

348

| inexact exceptions are raised and an infinity or maximal finite value is

349

| returned. If the abstract value is too small, the input value is rounded to

350

| a subnormal number, and the underflow and inexact exceptions are raised if

351

| the abstract input cannot be represented exactly as a subnormal single-

352

| precision floating-point number.

353

| The input significand `zSig' has its binary point between bits 30

354

| and 29, which is 7 bits to the left of the usual location. This shifted

355

| significand must be normalized or smaller. If `zSig' is not normalized,

356

| `zExp' must be 0; in that case, the result returned is a subnormal number,

357

| and it must not require rounding. In the usual case that `zSig' is

358

| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.

359

| The handling of underflow and overflow follows the IEC/IEEE Standard for

360

| Binary Floating-Point Arithmetic.

361

*----------------------------------------------------------------------------*/

362

363

static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)

364

{

365

int8 roundingMode;

366

flag roundNearestEven;

367

int8 roundIncrement, roundBits;

368

flag isTiny;

369

370

roundingMode = STATUS(float_rounding_mode);

371

roundNearestEven = ( roundingMode == float_round_nearest_even );

372

roundIncrement = 0x40;

373

if ( ! roundNearestEven ) {

374

if ( roundingMode == float_round_to_zero ) {

375

roundIncrement = 0;

376

}

377

else {

378

roundIncrement = 0x7F;

379

if ( zSign ) {

380

if ( roundingMode == float_round_up ) roundIncrement = 0;

381

}

382

else {

383

if ( roundingMode == float_round_down ) roundIncrement = 0;

384

}

385

}

386

}

387

roundBits = zSig & 0x7F;

388

if ( 0xFD <= (uint16_t) zExp ) {

389

if ( ( 0xFD < zExp )

390

|| ( ( zExp == 0xFD )

391

&& ( (int32_t) ( zSig + roundIncrement ) < 0 ) )

392

) {

393

float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);

394

return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));

395

}

396

if ( zExp < 0 ) {

397

if (STATUS(flush_to_zero)) {

398

float_raise(float_flag_output_denormal STATUS_VAR);

399

return packFloat32(zSign, 0, 0);

400

}

401

isTiny =

402

( STATUS(float_detect_tininess) == float_tininess_before_rounding )

403

|| ( zExp < -1 )

404

|| ( zSig + roundIncrement < 0x80000000 );

405

shift32RightJamming( zSig, - zExp, &zSig );

406

zExp = 0;

407

roundBits = zSig & 0x7F;

408

if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);

409

}

410

}

411

if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;

412

zSig = ( zSig + roundIncrement )>>7;

413

zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );

414

if ( zSig == 0 ) zExp = 0;

415

return packFloat32( zSign, zExp, zSig );

416

417

}

418

419

/*----------------------------------------------------------------------------

420

| Takes an abstract floating-point value having sign `zSign', exponent `zExp',

421

| and significand `zSig', and returns the proper single-precision floating-

422

| point value corresponding to the abstract input. This routine is just like

423

| `roundAndPackFloat32' except that `zSig' does not have to be normalized.

424

| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''

425

| floating-point exponent.

426

*----------------------------------------------------------------------------*/

427

428

static float32

429

normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)

430

{

431

int8 shiftCount;

432

433

shiftCount = countLeadingZeros32( zSig ) - 1;

434

return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);

435

436

}

437

438

/*----------------------------------------------------------------------------

439

| Returns the fraction bits of the double-precision floating-point value `a'.

440

*----------------------------------------------------------------------------*/

441

442

INLINE uint64_t extractFloat64Frac( float64 a )

443

{

444

445

return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );

446

447

}

448

449

/*----------------------------------------------------------------------------

450

| Returns the exponent bits of the double-precision floating-point value `a'.

451

*----------------------------------------------------------------------------*/

452

453

INLINE int_fast16_t extractFloat64Exp(float64 a)

454

{

455

456

return ( float64_val(a)>>52 ) & 0x7FF;

457

458

}

459

460

/*----------------------------------------------------------------------------

461

| Returns the sign bit of the double-precision floating-point value `a'.

462

*----------------------------------------------------------------------------*/

463

464

INLINE flag extractFloat64Sign( float64 a )

465

{

466

467

return float64_val(a)>>63;

468

469

}

470

471

/*----------------------------------------------------------------------------

472

| If `a' is denormal and we are in flush-to-zero mode then set the

473

| input-denormal exception and return zero. Otherwise just return the value.

474

*----------------------------------------------------------------------------*/

475

static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)

476

{

477

if (STATUS(flush_inputs_to_zero)) {

478

if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {

479

float_raise(float_flag_input_denormal STATUS_VAR);

480

return make_float64(float64_val(a) & (1ULL << 63));

481

}

482

}

483

return a;

484

}

485

486

/*----------------------------------------------------------------------------

487

| Normalizes the subnormal double-precision floating-point value represented

488

| by the denormalized significand `aSig'. The normalized exponent and

489

| significand are stored at the locations pointed to by `zExpPtr' and

490

| `zSigPtr', respectively.

491

*----------------------------------------------------------------------------*/

492

493

static void

494

normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)

495

{

496

int8 shiftCount;

497

498

shiftCount = countLeadingZeros64( aSig ) - 11;

499

*zSigPtr = aSig<<shiftCount;

500

*zExpPtr = 1 - shiftCount;

501

502

}

503

504

/*----------------------------------------------------------------------------

505

| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a

506

| double-precision floating-point value, returning the result. After being

507

| shifted into the proper positions, the three fields are simply added

508

| together to form the result. This means that any integer portion of `zSig'

509

| will be added into the exponent. Since a properly normalized significand

510

| will have an integer portion equal to 1, the `zExp' input should be 1 less

511

| than the desired result exponent whenever `zSig' is a complete, normalized

512

| significand.

513

*----------------------------------------------------------------------------*/

514

515

INLINE float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)

516

{

517

518

return make_float64(

519

( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);

520

521

}

522

523

/*----------------------------------------------------------------------------

524

| Takes an abstract floating-point value having sign `zSign', exponent `zExp',

525

| and significand `zSig', and returns the proper double-precision floating-

526

| point value corresponding to the abstract input. Ordinarily, the abstract

527

| value is simply rounded and packed into the double-precision format, with

528

| the inexact exception raised if the abstract input cannot be represented

529

| exactly. However, if the abstract value is too large, the overflow and

530

| inexact exceptions are raised and an infinity or maximal finite value is

531

| returned. If the abstract value is too small, the input value is rounded

532

| to a subnormal number, and the underflow and inexact exceptions are raised

533

| if the abstract input cannot be represented exactly as a subnormal double-

534

| precision floating-point number.

535

| The input significand `zSig' has its binary point between bits 62

536

| and 61, which is 10 bits to the left of the usual location. This shifted

537

| significand must be normalized or smaller. If `zSig' is not normalized,

538

| `zExp' must be 0; in that case, the result returned is a subnormal number,

539

| and it must not require rounding. In the usual case that `zSig' is

540

| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.

541

| The handling of underflow and overflow follows the IEC/IEEE Standard for

542

| Binary Floating-Point Arithmetic.

543

*----------------------------------------------------------------------------*/

544

545

static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)

546

{

547

int8 roundingMode;

548

flag roundNearestEven;

549

int_fast16_t roundIncrement, roundBits;

550

flag isTiny;

551

552

roundingMode = STATUS(float_rounding_mode);

553

roundNearestEven = ( roundingMode == float_round_nearest_even );

554

roundIncrement = 0x200;

555

if ( ! roundNearestEven ) {

556

if ( roundingMode == float_round_to_zero ) {

557

roundIncrement = 0;

558

}

559

else {

560

roundIncrement = 0x3FF;

561

if ( zSign ) {

562

if ( roundingMode == float_round_up ) roundIncrement = 0;

563

}

564

else {

565

if ( roundingMode == float_round_down ) roundIncrement = 0;

566

}

567

}

568

}

569

roundBits = zSig & 0x3FF;

570

if ( 0x7FD <= (uint16_t) zExp ) {

571

if ( ( 0x7FD < zExp )

572

|| ( ( zExp == 0x7FD )

573

&& ( (int64_t) ( zSig + roundIncrement ) < 0 ) )

574

) {

575

float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);

576

return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));

577

}

578

if ( zExp < 0 ) {

579

if (STATUS(flush_to_zero)) {

580

float_raise(float_flag_output_denormal STATUS_VAR);

581

return packFloat64(zSign, 0, 0);

582

}

583

isTiny =

584

( STATUS(float_detect_tininess) == float_tininess_before_rounding )

585

|| ( zExp < -1 )

586

|| ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );

587

shift64RightJamming( zSig, - zExp, &zSig );

588

zExp = 0;

589

roundBits = zSig & 0x3FF;

590

if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);

591

}

592

}

593

if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;

594

zSig = ( zSig + roundIncrement )>>10;

595

zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );

596

if ( zSig == 0 ) zExp = 0;

597

return packFloat64( zSign, zExp, zSig );

598

599

}

600

601

/*----------------------------------------------------------------------------

602

| Takes an abstract floating-point value having sign `zSign', exponent `zExp',

603

| and significand `zSig', and returns the proper double-precision floating-

604

| point value corresponding to the abstract input. This routine is just like

605

| `roundAndPackFloat64' except that `zSig' does not have to be normalized.

606

| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''

607

| floating-point exponent.

608

*----------------------------------------------------------------------------*/

609

610

static float64

611

normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)

612

{

613

int8 shiftCount;

614

615

shiftCount = countLeadingZeros64( zSig ) - 1;

616

return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);

617

618

}

619

620

/*----------------------------------------------------------------------------

621

| Returns the fraction bits of the extended double-precision floating-point

622

| value `a'.

623

*----------------------------------------------------------------------------*/

624

625

INLINE uint64_t extractFloatx80Frac( floatx80 a )

626

{

627

628

return a.low;

629

630

}

631

632

/*----------------------------------------------------------------------------

633

| Returns the exponent bits of the extended double-precision floating-point

634

| value `a'.

635

*----------------------------------------------------------------------------*/

636

637

INLINE int32 extractFloatx80Exp( floatx80 a )

638

{

639

640

return a.high & 0x7FFF;

641

642

}

643

644

/*----------------------------------------------------------------------------

645

| Returns the sign bit of the extended double-precision floating-point value

646

| `a'.

647

*----------------------------------------------------------------------------*/

648

649

INLINE flag extractFloatx80Sign( floatx80 a )

650

{

651

652

return a.high>>15;

653

654

}

655

656

/*----------------------------------------------------------------------------

657

| Normalizes the subnormal extended double-precision floating-point value

658

| represented by the denormalized significand `aSig'. The normalized exponent

659

| and significand are stored at the locations pointed to by `zExpPtr' and

660

| `zSigPtr', respectively.

661

*----------------------------------------------------------------------------*/

662

663

static void

664

normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )

665

{

666

int8 shiftCount;

667

668

shiftCount = countLeadingZeros64( aSig );

669

*zSigPtr = aSig<<shiftCount;

670

*zExpPtr = 1 - shiftCount;

671

672

}

673

674

/*----------------------------------------------------------------------------

675

| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an

676

| extended double-precision floating-point value, returning the result.

677

*----------------------------------------------------------------------------*/

678

679

INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )

680

{

681

floatx80 z;

682

683

z.low = zSig;

684

z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;

685

return z;

686

687

}

688

689

/*----------------------------------------------------------------------------

690

| Takes an abstract floating-point value having sign `zSign', exponent `zExp',

691

| and extended significand formed by the concatenation of `zSig0' and `zSig1',

692

| and returns the proper extended double-precision floating-point value

693

| corresponding to the abstract input. Ordinarily, the abstract value is

694

| rounded and packed into the extended double-precision format, with the

695

| inexact exception raised if the abstract input cannot be represented

696

| exactly. However, if the abstract value is too large, the overflow and

697

| inexact exceptions are raised and an infinity or maximal finite value is

698

| returned. If the abstract value is too small, the input value is rounded to

699

| a subnormal number, and the underflow and inexact exceptions are raised if

700

| the abstract input cannot be represented exactly as a subnormal extended

701

| double-precision floating-point number.

702

| If `roundingPrecision' is 32 or 64, the result is rounded to the same

703

| number of bits as single or double precision, respectively. Otherwise, the

704

| result is rounded to the full precision of the extended double-precision

705

| format.

706

| The input significand must be normalized or smaller. If the input

707

| significand is not normalized, `zExp' must be 0; in that case, the result

708

| returned is a subnormal number, and it must not require rounding. The

709

| handling of underflow and overflow follows the IEC/IEEE Standard for Binary

710

| Floating-Point Arithmetic.

711

*----------------------------------------------------------------------------*/

712

713

static floatx80

714

roundAndPackFloatx80(

715

int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1

716

STATUS_PARAM)

717

{

718

int8 roundingMode;

719

flag roundNearestEven, increment, isTiny;

720

int64 roundIncrement, roundMask, roundBits;

721

722

roundingMode = STATUS(float_rounding_mode);

723

roundNearestEven = ( roundingMode == float_round_nearest_even );

724

if ( roundingPrecision == 80 ) goto precision80;

725

if ( roundingPrecision == 64 ) {

726

roundIncrement = LIT64( 0x0000000000000400 );

727

roundMask = LIT64( 0x00000000000007FF );

728

}

729

else if ( roundingPrecision == 32 ) {

730

roundIncrement = LIT64( 0x0000008000000000 );

731

roundMask = LIT64( 0x000000FFFFFFFFFF );

732

}

733

else {

734

goto precision80;

735

}

736

zSig0 |= ( zSig1 != 0 );

737

if ( ! roundNearestEven ) {

738

if ( roundingMode == float_round_to_zero ) {

739

roundIncrement = 0;

740

}

741

else {

742

roundIncrement = roundMask;

743

if ( zSign ) {

744

if ( roundingMode == float_round_up ) roundIncrement = 0;

745

}

746

else {

747

if ( roundingMode == float_round_down ) roundIncrement = 0;

748

}

749

}

750

}

751

roundBits = zSig0 & roundMask;

752

if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {

753

if ( ( 0x7FFE < zExp )

754

|| ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )

755

) {

756

goto overflow;

757

}

758

if ( zExp <= 0 ) {

759

if (STATUS(flush_to_zero)) {

760

float_raise(float_flag_output_denormal STATUS_VAR);

761

return packFloatx80(zSign, 0, 0);

762

}

763

isTiny =

764

( STATUS(float_detect_tininess) == float_tininess_before_rounding )

765

|| ( zExp < 0 )

766

|| ( zSig0 <= zSig0 + roundIncrement );

767

shift64RightJamming( zSig0, 1 - zExp, &zSig0 );

768

zExp = 0;

769

roundBits = zSig0 & roundMask;

770

if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);

771

if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;

772

zSig0 += roundIncrement;

773

if ( (int64_t) zSig0 < 0 ) zExp = 1;

774

roundIncrement = roundMask + 1;

775

if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {

776

roundMask |= roundIncrement;

777

}

778

zSig0 &= ~ roundMask;

779

return packFloatx80( zSign, zExp, zSig0 );

780

}

781

}

782

if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;

783

zSig0 += roundIncrement;

784

if ( zSig0 < roundIncrement ) {

785

++zExp;

786

zSig0 = LIT64( 0x8000000000000000 );

787

}

788

roundIncrement = roundMask + 1;

789

if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {

790

roundMask |= roundIncrement;

791

}

792

zSig0 &= ~ roundMask;

793

if ( zSig0 == 0 ) zExp = 0;

794

return packFloatx80( zSign, zExp, zSig0 );

795

precision80:

796

increment = ( (int64_t) zSig1 < 0 );

797

if ( ! roundNearestEven ) {

798

if ( roundingMode == float_round_to_zero ) {

799

increment = 0;

800

}

801

else {

802

if ( zSign ) {

803

increment = ( roundingMode == float_round_down ) && zSig1;

804

}

805

else {

806

increment = ( roundingMode == float_round_up ) && zSig1;

807

}

808

}

809

}

810

if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {

811

if ( ( 0x7FFE < zExp )

812

|| ( ( zExp == 0x7FFE )

813

&& ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )

814

&& increment

815

)

816

) {

817

roundMask = 0;

818

overflow:

819

float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);

820

if ( ( roundingMode == float_round_to_zero )

821

|| ( zSign && ( roundingMode == float_round_up ) )

822

|| ( ! zSign && ( roundingMode == float_round_down ) )

823

) {

824

return packFloatx80( zSign, 0x7FFE, ~ roundMask );

825

}

826

return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );

827

}

828

if ( zExp <= 0 ) {

829

isTiny =

830

( STATUS(float_detect_tininess) == float_tininess_before_rounding )

831

|| ( zExp < 0 )

832

|| ! increment

833

|| ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );

834

shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );

835

zExp = 0;

836

if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);

837

if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;

838

if ( roundNearestEven ) {

839

increment = ( (int64_t) zSig1 < 0 );

840

}

841

else {

842

if ( zSign ) {

843

increment = ( roundingMode == float_round_down ) && zSig1;

844

}

845

else {

846

increment = ( roundingMode == float_round_up ) && zSig1;

847

}

848

}

849

if ( increment ) {

850

++zSig0;

851

zSig0 &=

852

~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );

853

if ( (int64_t) zSig0 < 0 ) zExp = 1;

854

}

855

return packFloatx80( zSign, zExp, zSig0 );

856

}

857

}

858

if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;

859

if ( increment ) {

860

++zSig0;

861

if ( zSig0 == 0 ) {

862

++zExp;

863

zSig0 = LIT64( 0x8000000000000000 );

864

}

865

else {

866

zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );

867

}

868

}

869

else {

870

if ( zSig0 == 0 ) zExp = 0;

871

}

872

return packFloatx80( zSign, zExp, zSig0 );

873

874

}

875

876

/*----------------------------------------------------------------------------

877

| Takes an abstract floating-point value having sign `zSign', exponent

878

| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',

879

| and returns the proper extended double-precision floating-point value

880

| corresponding to the abstract input. This routine is just like

881

| `roundAndPackFloatx80' except that the input significand does not have to be

882

| normalized.

883

*----------------------------------------------------------------------------*/

884

885

static floatx80

886

normalizeRoundAndPackFloatx80(

887

int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1

888

STATUS_PARAM)

889

{

890

int8 shiftCount;

891

892

if ( zSig0 == 0 ) {

893

zSig0 = zSig1;

894

zSig1 = 0;

895

zExp -= 64;

896

}

897

shiftCount = countLeadingZeros64( zSig0 );

898

shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );

899

zExp -= shiftCount;

900

return

901

roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);

902

903

}

904

905

/*----------------------------------------------------------------------------

906

| Returns the least-significant 64 fraction bits of the quadruple-precision

907

| floating-point value `a'.

908

*----------------------------------------------------------------------------*/

909

910

INLINE uint64_t extractFloat128Frac1( float128 a )

911

{

912

913

return a.low;

914

915

}

916

917

/*----------------------------------------------------------------------------

918

| Returns the most-significant 48 fraction bits of the quadruple-precision

919

| floating-point value `a'.

920

*----------------------------------------------------------------------------*/

921

922

INLINE uint64_t extractFloat128Frac0( float128 a )

923

{

924

925

return a.high & LIT64( 0x0000FFFFFFFFFFFF );

926

927

}

928

929

/*----------------------------------------------------------------------------

930

| Returns the exponent bits of the quadruple-precision floating-point value

931

| `a'.

932

*----------------------------------------------------------------------------*/

933

934

INLINE int32 extractFloat128Exp( float128 a )

935

{

936

937

return ( a.high>>48 ) & 0x7FFF;

938

939

}

940

941

/*----------------------------------------------------------------------------

942

| Returns the sign bit of the quadruple-precision floating-point value `a'.

943

*----------------------------------------------------------------------------*/

944

945

INLINE flag extractFloat128Sign( float128 a )

946

{

947

948

return a.high>>63;

949

950

}

951

952

/*----------------------------------------------------------------------------

953

| Normalizes the subnormal quadruple-precision floating-point value

954

| represented by the denormalized significand formed by the concatenation of

955

| `aSig0' and `aSig1'. The normalized exponent is stored at the location

956

| pointed to by `zExpPtr'. The most significant 49 bits of the normalized

957

| significand are stored at the location pointed to by `zSig0Ptr', and the

958

| least significant 64 bits of the normalized significand are stored at the

959

| location pointed to by `zSig1Ptr'.

960

*----------------------------------------------------------------------------*/

961

962

static void

963

normalizeFloat128Subnormal(

964

uint64_t aSig0,

965

uint64_t aSig1,

966

int32 *zExpPtr,

967

uint64_t *zSig0Ptr,

968

uint64_t *zSig1Ptr

969

)

970

{

971

int8 shiftCount;

972

973

if ( aSig0 == 0 ) {

974

shiftCount = countLeadingZeros64( aSig1 ) - 15;

975

if ( shiftCount < 0 ) {

976

*zSig0Ptr = aSig1>>( - shiftCount );

977

*zSig1Ptr = aSig1<<( shiftCount & 63 );

978

}

979

else {

980

*zSig0Ptr = aSig1<<shiftCount;

981

*zSig1Ptr = 0;

982

}

983

*zExpPtr = - shiftCount - 63;

984

}

985

else {

986

shiftCount = countLeadingZeros64( aSig0 ) - 15;

987

shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );

988

*zExpPtr = 1 - shiftCount;

989

}

990

991

}

992

993

/*----------------------------------------------------------------------------

994

| Packs the sign `zSign', the exponent `zExp', and the significand formed

995

| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision

996

| floating-point value, returning the result. After being shifted into the

997

| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply

998

| added together to form the most significant 32 bits of the result. This

999

| means that any integer portion of `zSig0' will be added into the exponent.

1000

| Since a properly normalized significand will have an integer portion equal

1001

| to 1, the `zExp' input should be 1 less than the desired result exponent

1002

| whenever `zSig0' and `zSig1' concatenated form a complete, normalized

1003

| significand.

1004

*----------------------------------------------------------------------------*/

1005

1006

INLINE float128

1007

packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )

1008

{

1009

float128 z;

1010

1011

z.low = zSig1;

1012

z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;

1013

return z;

1014

1015

}

1016

1017

/*----------------------------------------------------------------------------

1018

| Takes an abstract floating-point value having sign `zSign', exponent `zExp',

1019

| and extended significand formed by the concatenation of `zSig0', `zSig1',

1020

| and `zSig2', and returns the proper quadruple-precision floating-point value

1021

| corresponding to the abstract input. Ordinarily, the abstract value is

1022

| simply rounded and packed into the quadruple-precision format, with the

1023

| inexact exception raised if the abstract input cannot be represented

1024

| exactly. However, if the abstract value is too large, the overflow and

1025

| inexact exceptions are raised and an infinity or maximal finite value is

1026

| returned. If the abstract value is too small, the input value is rounded to

1027

| a subnormal number, and the underflow and inexact exceptions are raised if

1028

| the abstract input cannot be represented exactly as a subnormal quadruple-

1029

| precision floating-point number.

1030

| The input significand must be normalized or smaller. If the input

1031

| significand is not normalized, `zExp' must be 0; in that case, the result

1032

| returned is a subnormal number, and it must not require rounding. In the

1033

| usual case that the input significand is normalized, `zExp' must be 1 less

1034

| than the ``true'' floating-point exponent. The handling of underflow and

1035

| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

1036

*----------------------------------------------------------------------------*/

1037

1038

static float128

1039

roundAndPackFloat128(

1040

flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)

1041

{

1042

int8 roundingMode;

1043

flag roundNearestEven, increment, isTiny;

1044

1045

roundingMode = STATUS(float_rounding_mode);

1046

roundNearestEven = ( roundingMode == float_round_nearest_even );

1047

increment = ( (int64_t) zSig2 < 0 );

1048

if ( ! roundNearestEven ) {

1049

if ( roundingMode == float_round_to_zero ) {

1050

increment = 0;

1051

}

1052

else {

1053

if ( zSign ) {

1054

increment = ( roundingMode == float_round_down ) && zSig2;

1055

}

1056

else {

1057

increment = ( roundingMode == float_round_up ) && zSig2;

1058

}

1059

}

1060

}

1061

if ( 0x7FFD <= (uint32_t) zExp ) {

1062

if ( ( 0x7FFD < zExp )

1063

|| ( ( zExp == 0x7FFD )

1064

&& eq128(

1065

LIT64( 0x0001FFFFFFFFFFFF ),

1066

LIT64( 0xFFFFFFFFFFFFFFFF ),

1067

zSig0,

1068

zSig1

1069

)

1070

&& increment

1071

)

1072

) {

1073

float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);

1074

if ( ( roundingMode == float_round_to_zero )

1075

|| ( zSign && ( roundingMode == float_round_up ) )

1076

|| ( ! zSign && ( roundingMode == float_round_down ) )

1077

) {

1078

return

1079

packFloat128(

1080

zSign,

1081

0x7FFE,

1082

LIT64( 0x0000FFFFFFFFFFFF ),

1083

LIT64( 0xFFFFFFFFFFFFFFFF )

1084

);

1085

}

1086

return packFloat128( zSign, 0x7FFF, 0, 0 );

1087

}

1088

if ( zExp < 0 ) {

1089

if (STATUS(flush_to_zero)) {

1090

float_raise(float_flag_output_denormal STATUS_VAR);

1091

return packFloat128(zSign, 0, 0, 0);

1092

}

1093

isTiny =

1094

( STATUS(float_detect_tininess) == float_tininess_before_rounding )

1095

|| ( zExp < -1 )

1096

|| ! increment

1097

|| lt128(

1098

zSig0,

1099

zSig1,

1100

LIT64( 0x0001FFFFFFFFFFFF ),

1101

LIT64( 0xFFFFFFFFFFFFFFFF )

1102

);

1103

shift128ExtraRightJamming(

1104

zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );

1105

zExp = 0;

1106

if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);

1107

if ( roundNearestEven ) {

1108

increment = ( (int64_t) zSig2 < 0 );

1109

}

1110

else {

1111

if ( zSign ) {

1112

increment = ( roundingMode == float_round_down ) && zSig2;

1113

}

1114

else {

1115

increment = ( roundingMode == float_round_up ) && zSig2;

1116

}

1117

}

1118

}

1119

}

1120

if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;

1121

if ( increment ) {

1122

add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );

1123

zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );

1124

}

1125

else {

1126

if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;

1127

}

1128

return packFloat128( zSign, zExp, zSig0, zSig1 );

1129

1130

}

1131

1132

/*----------------------------------------------------------------------------

1133

| Takes an abstract floating-point value having sign `zSign', exponent `zExp',

1134

| and significand formed by the concatenation of `zSig0' and `zSig1', and

1135

| returns the proper quadruple-precision floating-point value corresponding

1136

| to the abstract input. This routine is just like `roundAndPackFloat128'

1137

| except that the input significand has fewer bits and does not have to be

1138

| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-

1139

| point exponent.

1140

*----------------------------------------------------------------------------*/

1141

1142

static float128

1143

normalizeRoundAndPackFloat128(

1144

flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)

1145

{

1146

int8 shiftCount;

1147

uint64_t zSig2;

1148

1149

if ( zSig0 == 0 ) {

1150

zSig0 = zSig1;

1151

zSig1 = 0;

1152

zExp -= 64;

1153

}

1154

shiftCount = countLeadingZeros64( zSig0 ) - 15;

1155

if ( 0 <= shiftCount ) {

1156

zSig2 = 0;

1157

shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );

1158

}

1159

else {

1160

shift128ExtraRightJamming(

1161

zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );

1162

}

1163

zExp -= shiftCount;

1164

return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);

1165

1166

}

1167

1168

/*----------------------------------------------------------------------------

1169

| Returns the result of converting the 32-bit two's complement integer `a'

1170

| to the single-precision floating-point format. The conversion is performed

1171

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

1172

*----------------------------------------------------------------------------*/

1173

1174

float32 int32_to_float32(int32_t a STATUS_PARAM)

1175

{

1176

flag zSign;

1177

1178

if ( a == 0 ) return float32_zero;

1179

if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );

1180

zSign = ( a < 0 );

1181

return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );

1182

1183

}

1184

1185

/*----------------------------------------------------------------------------

1186

| Returns the result of converting the 32-bit two's complement integer `a'

1187

| to the double-precision floating-point format. The conversion is performed

1188

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

1189

*----------------------------------------------------------------------------*/

1190

1191

float64 int32_to_float64(int32_t a STATUS_PARAM)

1192

{

1193

flag zSign;

1194

uint32 absA;

1195

int8 shiftCount;

1196

uint64_t zSig;

1197

1198

if ( a == 0 ) return float64_zero;

1199

zSign = ( a < 0 );

1200

absA = zSign ? - a : a;

1201

shiftCount = countLeadingZeros32( absA ) + 21;

1202

zSig = absA;

1203

return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );

1204

1205

}

1206

1207

/*----------------------------------------------------------------------------

1208

| Returns the result of converting the 32-bit two's complement integer `a'

1209

| to the extended double-precision floating-point format. The conversion

1210

| is performed according to the IEC/IEEE Standard for Binary Floating-Point

1211

| Arithmetic.

1212

*----------------------------------------------------------------------------*/

1213

1214

floatx80 int32_to_floatx80(int32_t a STATUS_PARAM)

1215

{

1216

flag zSign;

1217

uint32 absA;

1218

int8 shiftCount;

1219

uint64_t zSig;

1220

1221

if ( a == 0 ) return packFloatx80( 0, 0, 0 );

1222

zSign = ( a < 0 );

1223

absA = zSign ? - a : a;

1224

shiftCount = countLeadingZeros32( absA ) + 32;

1225

zSig = absA;

1226

return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );

1227

1228

}

1229

1230

/*----------------------------------------------------------------------------

1231

| Returns the result of converting the 32-bit two's complement integer `a' to

1232

| the quadruple-precision floating-point format. The conversion is performed

1233

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

1234

*----------------------------------------------------------------------------*/

1235

1236

float128 int32_to_float128(int32_t a STATUS_PARAM)

1237

{

1238

flag zSign;

1239

uint32 absA;

1240

int8 shiftCount;

1241

uint64_t zSig0;

1242

1243

if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );

1244

zSign = ( a < 0 );

1245

absA = zSign ? - a : a;

1246

shiftCount = countLeadingZeros32( absA ) + 17;

1247

zSig0 = absA;

1248

return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );

1249

1250

}

1251

1252

/*----------------------------------------------------------------------------

1253

| Returns the result of converting the 64-bit two's complement integer `a'

1254

| to the single-precision floating-point format. The conversion is performed

1255

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

1256

*----------------------------------------------------------------------------*/

1257

1258

float32 int64_to_float32(int64_t a STATUS_PARAM)

1259

{

1260

flag zSign;

1261

uint64 absA;

1262

int8 shiftCount;

1263

1264

if ( a == 0 ) return float32_zero;

1265

zSign = ( a < 0 );

1266

absA = zSign ? - a : a;

1267

shiftCount = countLeadingZeros64( absA ) - 40;

1268

if ( 0 <= shiftCount ) {

1269

return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );

1270

}

1271

else {

1272

shiftCount += 7;

1273

if ( shiftCount < 0 ) {

1274

shift64RightJamming( absA, - shiftCount, &absA );

1275

}

1276

else {

1277

absA <<= shiftCount;

1278

}

1279

return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );

1280

}

1281

1282

}

1283

1284

/*----------------------------------------------------------------------------
| Converts the 64-bit unsigned integer `a' to the single-precision
| floating-point format and returns the result.  Values with at least 40
| leading zero bits are packed directly; wider values are aligned and handed
| to `roundAndPackFloat32'.  The result always has a clear sign bit.
*----------------------------------------------------------------------------*/

float32 uint64_to_float32(uint64_t a STATUS_PARAM)
{
    int8 shift;

    if (!a) {
        return float32_zero;
    }

    shift = countLeadingZeros64(a) - 40;
    if (shift >= 0) {
        return packFloat32(0, 0x95 - shift, a << shift);
    }

    /* Align for the rounding routine; bits shifted out on the right are
     * jammed into the LSB rather than discarded. */
    shift += 7;
    if (shift >= 0) {
        a <<= shift;
    } else {
        shift64RightJamming(a, -shift, &a);
    }
    return roundAndPackFloat32(0, 0x9C - shift, a STATUS_VAR);
}

1304

1305

/*----------------------------------------------------------------------------

1306

| Returns the result of converting the 64-bit two's complement integer `a'

1307

| to the double-precision floating-point format. The conversion is performed

1308

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

1309

*----------------------------------------------------------------------------*/

1310

1311

float64 int64_to_float64(int64_t a STATUS_PARAM)

1312

{

1313

flag zSign;

1314

1315

if ( a == 0 ) return float64_zero;

1316

if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {

1317

return packFloat64( 1, 0x43E, 0 );

1318

}

1319

zSign = ( a < 0 );

1320

return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );

1321

1322

}

1323

1324

/*----------------------------------------------------------------------------
| Converts the 64-bit unsigned integer `a' to the double-precision
| floating-point format and returns the result.  When the top bit of `a' is
| set, the value is first halved (the bit shifted out is jammed into the LSB)
| and the exponent is raised by one to compensate, before the common
| normalize/round/pack routine is applied.
*----------------------------------------------------------------------------*/

float64 uint64_to_float64(uint64_t a STATUS_PARAM)
{
    int biasedExp;

    if (!a) {
        return float64_zero;
    }

    if ((int64_t)a >= 0) {
        biasedExp = 0x43C;
    } else {
        shift64RightJamming(a, 1, &a);
        biasedExp = 0x43D;
    }
    return normalizeRoundAndPackFloat64(0, biasedExp, a STATUS_VAR);
}

1337

1338

/*----------------------------------------------------------------------------

1339

| Returns the result of converting the 64-bit two's complement integer `a'

1340

| to the extended double-precision floating-point format. The conversion

1341

| is performed according to the IEC/IEEE Standard for Binary Floating-Point

1342

| Arithmetic.

1343

*----------------------------------------------------------------------------*/

1344

1345

floatx80 int64_to_floatx80(int64_t a STATUS_PARAM)

1346

{

1347

flag zSign;

1348

uint64 absA;

1349

int8 shiftCount;

1350

1351

if ( a == 0 ) return packFloatx80( 0, 0, 0 );

1352

zSign = ( a < 0 );

1353

absA = zSign ? - a : a;

1354

shiftCount = countLeadingZeros64( absA );

1355

return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );

1356

1357

}

1358

1359

/*----------------------------------------------------------------------------

1360

| Returns the result of converting the 64-bit two's complement integer `a' to

1361

| the quadruple-precision floating-point format. The conversion is performed

1362

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

1363

*----------------------------------------------------------------------------*/

1364

1365

float128 int64_to_float128(int64_t a STATUS_PARAM)

1366

{

1367

flag zSign;

1368

uint64 absA;

1369

int8 shiftCount;

1370

int32 zExp;

1371

uint64_t zSig0, zSig1;

1372

1373

if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );

1374

zSign = ( a < 0 );

1375

absA = zSign ? - a : a;

1376

shiftCount = countLeadingZeros64( absA ) + 49;

1377

zExp = 0x406E - shiftCount;

1378

if ( 64 <= shiftCount ) {

1379

zSig1 = 0;

1380

zSig0 = absA;

1381

shiftCount -= 64;

1382

}

1383

else {

1384

zSig1 = absA;

1385

zSig0 = 0;

1386

}

1387

shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );

1388

return packFloat128( zSign, zExp, zSig0, zSig1 );

1389

1390

}

1391

1392

/*----------------------------------------------------------------------------
| Returns the result of converting the 64-bit unsigned integer `a' to the
| quadruple-precision floating-point format.
|
| The integer is supplied to `normalizeRoundAndPackFloat128' as the LOW word
| of the 128-bit significand together with the base exponent 0x406E.  This is
| the same convention `int64_to_float128' uses (unshifted magnitude in the
| low word, exponent 0x406E minus the shift applied).  Supplying `a' as the
| high word instead, as this routine previously did, places it 64 bit
| positions too high and yields a result scaled by 2^64.
*----------------------------------------------------------------------------*/

float128 uint64_to_float128(uint64_t a STATUS_PARAM)
{
    if (a == 0) {
        return float128_zero;
    }
    /* High word 0, low word `a': the normalizer moves the low word up and
     * lowers the exponent by 64 itself. */
    return normalizeRoundAndPackFloat128(0, 0x406E, 0, a STATUS_VAR);
}

1399

1400

/*----------------------------------------------------------------------------

1401

| Returns the result of converting the single-precision floating-point value

1402

| `a' to the 32-bit two's complement integer format. The conversion is

1403

| performed according to the IEC/IEEE Standard for Binary Floating-Point

1404

| Arithmetic---which means in particular that the conversion is rounded

1405

| according to the current rounding mode. If `a' is a NaN, the largest

1406

| positive integer is returned. Otherwise, if the conversion overflows, the

1407

| largest integer with the same sign as `a' is returned.

1408

*----------------------------------------------------------------------------*/

1409

1410

int32 float32_to_int32( float32 a STATUS_PARAM )

1411

{

1412

flag aSign;

1413

int_fast16_t aExp, shiftCount;

1414

uint32_t aSig;

1415

uint64_t aSig64;

1416

1417

a = float32_squash_input_denormal(a STATUS_VAR);

1418

aSig = extractFloat32Frac( a );

1419

aExp = extractFloat32Exp( a );

1420

aSign = extractFloat32Sign( a );

1421

if ( ( aExp == 0xFF ) && aSig ) aSign = 0;

1422

if ( aExp ) aSig |= 0x00800000;

1423

shiftCount = 0xAF - aExp;

1424

aSig64 = aSig;

1425

aSig64 <<= 32;

1426

if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );

1427

return roundAndPackInt32( aSign, aSig64 STATUS_VAR );

1428

1429

}

1430

1431

/*----------------------------------------------------------------------------

1432

| Returns the result of converting the single-precision floating-point value

1433

| `a' to the 32-bit two's complement integer format. The conversion is

1434

| performed according to the IEC/IEEE Standard for Binary Floating-Point

1435

| Arithmetic, except that the conversion is always rounded toward zero.

1436

| If `a' is a NaN, the largest positive integer is returned. Otherwise, if

1437

| the conversion overflows, the largest integer with the same sign as `a' is

1438

| returned.

1439

*----------------------------------------------------------------------------*/

1440

1441

int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )

1442

{

1443

flag aSign;

1444

int_fast16_t aExp, shiftCount;

1445

uint32_t aSig;

1446

int32_t z;

1447

a = float32_squash_input_denormal(a STATUS_VAR);

1448

1449

aSig = extractFloat32Frac( a );

1450

aExp = extractFloat32Exp( a );

1451

aSign = extractFloat32Sign( a );

1452

shiftCount = aExp - 0x9E;

1453

if ( 0 <= shiftCount ) {

1454

if ( float32_val(a) != 0xCF000000 ) {

1455

float_raise( float_flag_invalid STATUS_VAR);

1456

if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;

1457

}

1458

return (int32_t) 0x80000000;

1459

}

1460

else if ( aExp <= 0x7E ) {

1461

if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;

1462

return 0;

1463

}

1464

aSig = ( aSig | 0x00800000 )<<8;

1465

z = aSig>>( - shiftCount );

1466

if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {

1467

STATUS(float_exception_flags) |= float_flag_inexact;

1468

}

1469

if ( aSign ) z = - z;

1470

return z;

1471

1472

}

1473

1474

/*----------------------------------------------------------------------------

1475

| Returns the result of converting the single-precision floating-point value

1476

| `a' to the 16-bit two's complement integer format. The conversion is

1477

| performed according to the IEC/IEEE Standard for Binary Floating-Point

1478

| Arithmetic, except that the conversion is always rounded toward zero.

1479

| If `a' is a NaN, the largest positive integer is returned. Otherwise, if

1480

| the conversion overflows, the largest integer with the same sign as `a' is

1481

| returned.

1482

*----------------------------------------------------------------------------*/

1483

1484

int_fast16_t float32_to_int16_round_to_zero(float32 a STATUS_PARAM)

1485

{

1486

flag aSign;

1487

int_fast16_t aExp, shiftCount;

1488

uint32_t aSig;

1489

int32 z;

1490

1491

aSig = extractFloat32Frac( a );

1492

aExp = extractFloat32Exp( a );

1493

aSign = extractFloat32Sign( a );

1494

shiftCount = aExp - 0x8E;

1495

if ( 0 <= shiftCount ) {

1496

if ( float32_val(a) != 0xC7000000 ) {

1497

float_raise( float_flag_invalid STATUS_VAR);

1498

if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {

1499

return 0x7FFF;

1500

}

1501

}

1502

return (int32_t) 0xffff8000;

1503

}

1504

else if ( aExp <= 0x7E ) {

1505

if ( aExp | aSig ) {

1506

STATUS(float_exception_flags) |= float_flag_inexact;

1507

}

1508

return 0;

1509

}

1510

shiftCount -= 0x10;

1511

aSig = ( aSig | 0x00800000 )<<8;

1512

z = aSig>>( - shiftCount );

1513

if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {

1514

STATUS(float_exception_flags) |= float_flag_inexact;

1515

}

1516

if ( aSign ) {

1517

z = - z;

1518

}

1519

return z;

1520

1521

}

1522

1523

/*----------------------------------------------------------------------------

1524

| Returns the result of converting the single-precision floating-point value

1525

| `a' to the 64-bit two's complement integer format. The conversion is

1526

| performed according to the IEC/IEEE Standard for Binary Floating-Point

1527

| Arithmetic---which means in particular that the conversion is rounded

1528

| according to the current rounding mode. If `a' is a NaN, the largest

1529

| positive integer is returned. Otherwise, if the conversion overflows, the

1530

| largest integer with the same sign as `a' is returned.

1531

*----------------------------------------------------------------------------*/

1532

1533

int64 float32_to_int64( float32 a STATUS_PARAM )

1534

{

1535

flag aSign;

1536

int_fast16_t aExp, shiftCount;

1537

uint32_t aSig;

1538

uint64_t aSig64, aSigExtra;

1539

a = float32_squash_input_denormal(a STATUS_VAR);

1540

1541

aSig = extractFloat32Frac( a );

1542

aExp = extractFloat32Exp( a );

1543

aSign = extractFloat32Sign( a );

1544

shiftCount = 0xBE - aExp;

1545

if ( shiftCount < 0 ) {

1546

float_raise( float_flag_invalid STATUS_VAR);

1547

if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {

1548

return LIT64( 0x7FFFFFFFFFFFFFFF );

1549

}

1550

return (int64_t) LIT64( 0x8000000000000000 );

1551

}

1552

if ( aExp ) aSig |= 0x00800000;

1553

aSig64 = aSig;

1554

aSig64 <<= 40;

1555

shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );

1556

return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );

1557

1558

}

1559

1560

/*----------------------------------------------------------------------------

1561

| Returns the result of converting the single-precision floating-point value

1562

| `a' to the 64-bit two's complement integer format. The conversion is

1563

| performed according to the IEC/IEEE Standard for Binary Floating-Point

1564

| Arithmetic, except that the conversion is always rounded toward zero. If

1565

| `a' is a NaN, the largest positive integer is returned. Otherwise, if the

1566

| conversion overflows, the largest integer with the same sign as `a' is

1567

| returned.

1568

*----------------------------------------------------------------------------*/

1569

1570

int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )

1571

{

1572

flag aSign;

1573

int_fast16_t aExp, shiftCount;

1574

uint32_t aSig;

1575

uint64_t aSig64;

1576

int64 z;

1577

a = float32_squash_input_denormal(a STATUS_VAR);

1578

1579

aSig = extractFloat32Frac( a );

1580

aExp = extractFloat32Exp( a );

1581

aSign = extractFloat32Sign( a );

1582

shiftCount = aExp - 0xBE;

1583

if ( 0 <= shiftCount ) {

1584

if ( float32_val(a) != 0xDF000000 ) {

1585

float_raise( float_flag_invalid STATUS_VAR);

1586

if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {

1587

return LIT64( 0x7FFFFFFFFFFFFFFF );

1588

}

1589

}

1590

return (int64_t) LIT64( 0x8000000000000000 );

1591

}

1592

else if ( aExp <= 0x7E ) {

1593

if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;

1594

return 0;

1595

}

1596

aSig64 = aSig | 0x00800000;

1597

aSig64 <<= 40;

1598

z = aSig64>>( - shiftCount );

1599

if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {

1600

STATUS(float_exception_flags) |= float_flag_inexact;

1601

}

1602

if ( aSign ) z = - z;

1603

return z;

1604

1605

}

1606

1607

/*----------------------------------------------------------------------------

1608

| Returns the result of converting the single-precision floating-point value

1609

| `a' to the double-precision floating-point format. The conversion is

1610

| performed according to the IEC/IEEE Standard for Binary Floating-Point

1611

| Arithmetic.

1612

*----------------------------------------------------------------------------*/

1613

1614

float64 float32_to_float64( float32 a STATUS_PARAM )

1615

{

1616

flag aSign;

1617

int_fast16_t aExp;

1618

uint32_t aSig;

1619

a = float32_squash_input_denormal(a STATUS_VAR);

1620

1621

aSig = extractFloat32Frac( a );

1622

aExp = extractFloat32Exp( a );

1623

aSign = extractFloat32Sign( a );

1624

if ( aExp == 0xFF ) {

1625

if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );

1626

return packFloat64( aSign, 0x7FF, 0 );

1627

}

1628

if ( aExp == 0 ) {

1629

if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );

1630

normalizeFloat32Subnormal( aSig, &aExp, &aSig );

1631

--aExp;

1632

}

1633

return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );

1634

1635

}

1636

1637

/*----------------------------------------------------------------------------

1638

| Returns the result of converting the single-precision floating-point value

1639

| `a' to the extended double-precision floating-point format. The conversion

1640

| is performed according to the IEC/IEEE Standard for Binary Floating-Point

1641

| Arithmetic.

1642

*----------------------------------------------------------------------------*/

1643

1644

floatx80 float32_to_floatx80( float32 a STATUS_PARAM )

1645

{

1646

flag aSign;

1647

int_fast16_t aExp;

1648

uint32_t aSig;

1649

1650

a = float32_squash_input_denormal(a STATUS_VAR);

1651

aSig = extractFloat32Frac( a );

1652

aExp = extractFloat32Exp( a );

1653

aSign = extractFloat32Sign( a );

1654

if ( aExp == 0xFF ) {

1655

if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );

1656

return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );

1657

}

1658

if ( aExp == 0 ) {

1659

if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );

1660

normalizeFloat32Subnormal( aSig, &aExp, &aSig );

1661

}

1662

aSig |= 0x00800000;

1663

return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );

1664

1665

}

1666

1667

/*----------------------------------------------------------------------------

1668

| Returns the result of converting the single-precision floating-point value

1669

| `a' to the double-precision floating-point format. The conversion is

1670

| performed according to the IEC/IEEE Standard for Binary Floating-Point

1671

| Arithmetic.

1672

*----------------------------------------------------------------------------*/

1673

1674

float128 float32_to_float128( float32 a STATUS_PARAM )

1675

{

1676

flag aSign;

1677

int_fast16_t aExp;

1678

uint32_t aSig;

1679

1680

a = float32_squash_input_denormal(a STATUS_VAR);

1681

aSig = extractFloat32Frac( a );

1682

aExp = extractFloat32Exp( a );

1683

aSign = extractFloat32Sign( a );

1684

if ( aExp == 0xFF ) {

1685

if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );

1686

return packFloat128( aSign, 0x7FFF, 0, 0 );

1687

}

1688

if ( aExp == 0 ) {

1689

if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );

1690

normalizeFloat32Subnormal( aSig, &aExp, &aSig );

1691

--aExp;

1692

}

1693

return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );

1694

1695

}

1696

1697

/*----------------------------------------------------------------------------

1698

| Rounds the single-precision floating-point value `a' to an integer, and

1699

| returns the result as a single-precision floating-point value. The

1700

| operation is performed according to the IEC/IEEE Standard for Binary

1701

| Floating-Point Arithmetic.

1702

*----------------------------------------------------------------------------*/

1703

1704

float32 float32_round_to_int( float32 a STATUS_PARAM)

1705

{

1706

flag aSign;

1707

int_fast16_t aExp;

1708

uint32_t lastBitMask, roundBitsMask;

1709

int8 roundingMode;

1710

uint32_t z;

1711

a = float32_squash_input_denormal(a STATUS_VAR);

1712

1713

aExp = extractFloat32Exp( a );

1714

if ( 0x96 <= aExp ) {

1715

if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {

1716

return propagateFloat32NaN( a, a STATUS_VAR );

1717

}

1718

return a;

1719

}

1720

if ( aExp <= 0x7E ) {

1721

if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;

1722

STATUS(float_exception_flags) |= float_flag_inexact;

1723

aSign = extractFloat32Sign( a );

1724

switch ( STATUS(float_rounding_mode) ) {

1725

case float_round_nearest_even:

1726

if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {

1727

return packFloat32( aSign, 0x7F, 0 );

1728

}

1729

break;

1730

case float_round_down:

1731

return make_float32(aSign ? 0xBF800000 : 0);

1732

case float_round_up:

1733

return make_float32(aSign ? 0x80000000 : 0x3F800000);

1734

}

1735

return packFloat32( aSign, 0, 0 );

1736

}

1737

lastBitMask = 1;

1738

lastBitMask <<= 0x96 - aExp;

1739

roundBitsMask = lastBitMask - 1;

1740

z = float32_val(a);

1741

roundingMode = STATUS(float_rounding_mode);

1742

if ( roundingMode == float_round_nearest_even ) {

1743

z += lastBitMask>>1;

1744

if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;

1745

}

1746

else if ( roundingMode != float_round_to_zero ) {

1747

if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {

1748

z += roundBitsMask;

1749

}

1750

}

1751

z &= ~ roundBitsMask;

1752

if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;

1753

return make_float32(z);

1754

1755

}

1756

1757

/*----------------------------------------------------------------------------

1758

| Returns the result of adding the absolute values of the single-precision

1759

| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated

1760

| before being returned. `zSign' is ignored if the result is a NaN.

1761

| The addition is performed according to the IEC/IEEE Standard for Binary

1762

| Floating-Point Arithmetic.

1763

*----------------------------------------------------------------------------*/

1764

1765

static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)

1766

{

1767

int_fast16_t aExp, bExp, zExp;

1768

uint32_t aSig, bSig, zSig;

1769

int_fast16_t expDiff;

1770

1771

aSig = extractFloat32Frac( a );

1772

aExp = extractFloat32Exp( a );

1773

bSig = extractFloat32Frac( b );

1774

bExp = extractFloat32Exp( b );

1775

expDiff = aExp - bExp;

1776

aSig <<= 6;

1777

bSig <<= 6;

1778

if ( 0 < expDiff ) {

1779

if ( aExp == 0xFF ) {

1780

if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );

1781

return a;

1782

}

1783

if ( bExp == 0 ) {

1784

--expDiff;

1785

}

1786

else {

1787

bSig |= 0x20000000;

1788

}

1789

shift32RightJamming( bSig, expDiff, &bSig );

1790

zExp = aExp;

1791

}

1792

else if ( expDiff < 0 ) {

1793

if ( bExp == 0xFF ) {

1794

if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );

1795

return packFloat32( zSign, 0xFF, 0 );

1796

}

1797

if ( aExp == 0 ) {

1798

++expDiff;

1799

}

1800

else {

1801

aSig |= 0x20000000;

1802

}

1803

shift32RightJamming( aSig, - expDiff, &aSig );

1804

zExp = bExp;

1805

}

1806

else {

1807

if ( aExp == 0xFF ) {

1808

if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );

1809

return a;

1810

}

1811

if ( aExp == 0 ) {

1812

if (STATUS(flush_to_zero)) {

1813

if (aSig | bSig) {

1814

float_raise(float_flag_output_denormal STATUS_VAR);

1815

}

1816

return packFloat32(zSign, 0, 0);

1817

}

1818

return packFloat32( zSign, 0, ( aSig + bSig )>>6 );

1819

}

1820

zSig = 0x40000000 + aSig + bSig;

1821

zExp = aExp;

1822

goto roundAndPack;

1823

}

1824

aSig |= 0x20000000;

1825

zSig = ( aSig + bSig )<<1;

1826

--zExp;

1827

if ( (int32_t) zSig < 0 ) {

1828

zSig = aSig + bSig;

1829

++zExp;

1830

}

1831

roundAndPack:

1832

return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );

1833

1834

}

1835

1836

/*----------------------------------------------------------------------------

1837

| Returns the result of subtracting the absolute values of the single-

1838

| precision floating-point values `a' and `b'. If `zSign' is 1, the

1839

| difference is negated before being returned. `zSign' is ignored if the

1840

| result is a NaN. The subtraction is performed according to the IEC/IEEE

1841

| Standard for Binary Floating-Point Arithmetic.

1842

*----------------------------------------------------------------------------*/

1843

1844

static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)

1845

{

1846

int_fast16_t aExp, bExp, zExp;

1847

uint32_t aSig, bSig, zSig;

1848

int_fast16_t expDiff;

1849

1850

aSig = extractFloat32Frac( a );

1851

aExp = extractFloat32Exp( a );

1852

bSig = extractFloat32Frac( b );

1853

bExp = extractFloat32Exp( b );

1854

expDiff = aExp - bExp;

1855

aSig <<= 7;

1856

bSig <<= 7;

1857

if ( 0 < expDiff ) goto aExpBigger;

1858

if ( expDiff < 0 ) goto bExpBigger;

1859

if ( aExp == 0xFF ) {

1860

if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );

1861

float_raise( float_flag_invalid STATUS_VAR);

1862

return float32_default_nan;

1863

}

1864

if ( aExp == 0 ) {

1865

aExp = 1;

1866

bExp = 1;

1867

}

1868

if ( bSig < aSig ) goto aBigger;

1869

if ( aSig < bSig ) goto bBigger;

1870

return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );

1871

bExpBigger:

1872

if ( bExp == 0xFF ) {

1873

if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );

1874

return packFloat32( zSign ^ 1, 0xFF, 0 );

1875

}

1876

if ( aExp == 0 ) {

1877

++expDiff;

1878

}

1879

else {

1880

aSig |= 0x40000000;

1881

}

1882

shift32RightJamming( aSig, - expDiff, &aSig );

1883

bSig |= 0x40000000;

1884

bBigger:

1885

zSig = bSig - aSig;

1886

zExp = bExp;

1887

zSign ^= 1;

1888

goto normalizeRoundAndPack;

1889

aExpBigger:

1890

if ( aExp == 0xFF ) {

1891

if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );

1892

return a;

1893

}

1894

if ( bExp == 0 ) {

1895

--expDiff;

1896

}

1897

else {

1898

bSig |= 0x40000000;

1899

}

1900

shift32RightJamming( bSig, expDiff, &bSig );

1901

aSig |= 0x40000000;

1902

aBigger:

1903

zSig = aSig - bSig;

1904

zExp = aExp;

1905

normalizeRoundAndPack:

1906

--zExp;

1907

return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );

1908

1909

}

1910

1911

/*----------------------------------------------------------------------------

1912

| Returns the result of adding the single-precision floating-point values `a'

1913

| and `b'. The operation is performed according to the IEC/IEEE Standard for

1914

| Binary Floating-Point Arithmetic.

1915

*----------------------------------------------------------------------------*/

1916

1917

float32 float32_add( float32 a, float32 b STATUS_PARAM )

1918

{

1919

flag aSign, bSign;

1920

a = float32_squash_input_denormal(a STATUS_VAR);

1921

b = float32_squash_input_denormal(b STATUS_VAR);

1922

1923

aSign = extractFloat32Sign( a );

1924

bSign = extractFloat32Sign( b );

1925

if ( aSign == bSign ) {

1926

return addFloat32Sigs( a, b, aSign STATUS_VAR);

1927

}

1928

else {

1929

return subFloat32Sigs( a, b, aSign STATUS_VAR );

1930

}

1931

1932

}

1933

1934

/*----------------------------------------------------------------------------

1935

| Returns the result of subtracting the single-precision floating-point values

1936

| `a' and `b'. The operation is performed according to the IEC/IEEE Standard

1937

| for Binary Floating-Point Arithmetic.

1938

*----------------------------------------------------------------------------*/

1939

1940

float32 float32_sub( float32 a, float32 b STATUS_PARAM )

1941

{

1942

flag aSign, bSign;

1943

a = float32_squash_input_denormal(a STATUS_VAR);

1944

b = float32_squash_input_denormal(b STATUS_VAR);

1945

1946

aSign = extractFloat32Sign( a );

1947

bSign = extractFloat32Sign( b );

1948

if ( aSign == bSign ) {

1949

return subFloat32Sigs( a, b, aSign STATUS_VAR );

1950

}

1951

else {

1952

return addFloat32Sigs( a, b, aSign STATUS_VAR );

1953

}

1954

1955

}

1956

1957

/*----------------------------------------------------------------------------

1958

| Returns the result of multiplying the single-precision floating-point values

1959

| `a' and `b'. The operation is performed according to the IEC/IEEE Standard

1960

| for Binary Floating-Point Arithmetic.

1961

*----------------------------------------------------------------------------*/

1962

1963

float32 float32_mul( float32 a, float32 b STATUS_PARAM )

1964

{

1965

flag aSign, bSign, zSign;

1966

int_fast16_t aExp, bExp, zExp;

1967

uint32_t aSig, bSig;

1968

uint64_t zSig64;

1969

uint32_t zSig;

1970

1971

a = float32_squash_input_denormal(a STATUS_VAR);

1972

b = float32_squash_input_denormal(b STATUS_VAR);

1973

1974

aSig = extractFloat32Frac( a );

1975

aExp = extractFloat32Exp( a );

1976

aSign = extractFloat32Sign( a );

1977

bSig = extractFloat32Frac( b );

1978

bExp = extractFloat32Exp( b );

1979

bSign = extractFloat32Sign( b );

1980

zSign = aSign ^ bSign;

1981

if ( aExp == 0xFF ) {

1982

if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {

1983

return propagateFloat32NaN( a, b STATUS_VAR );

1984

}

1985

if ( ( bExp | bSig ) == 0 ) {

1986

float_raise( float_flag_invalid STATUS_VAR);

1987

return float32_default_nan;

1988

}

1989

return packFloat32( zSign, 0xFF, 0 );

1990

}

1991

if ( bExp == 0xFF ) {

1992

if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );

1993

if ( ( aExp | aSig ) == 0 ) {

1994

float_raise( float_flag_invalid STATUS_VAR);

1995

return float32_default_nan;

1996

}

1997

return packFloat32( zSign, 0xFF, 0 );

1998

}

1999

if ( aExp == 0 ) {

2000

if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );

2001

normalizeFloat32Subnormal( aSig, &aExp, &aSig );

2002

}

2003

if ( bExp == 0 ) {

2004

if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );

2005

normalizeFloat32Subnormal( bSig, &bExp, &bSig );

2006

}

2007

zExp = aExp + bExp - 0x7F;

2008

aSig = ( aSig | 0x00800000 )<<7;

2009

bSig = ( bSig | 0x00800000 )<<8;

2010

shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );

2011

zSig = zSig64;

2012

if ( 0 <= (int32_t) ( zSig<<1 ) ) {

2013

zSig <<= 1;

2014

--zExp;

2015

}

2016

return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );

2017

2018

}

2019

2020

/*----------------------------------------------------------------------------

2021

| Returns the result of dividing the single-precision floating-point value `a'

2022

| by the corresponding value `b'. The operation is performed according to the

2023

| IEC/IEEE Standard for Binary Floating-Point Arithmetic.

2024

*----------------------------------------------------------------------------*/

2025

2026

float32 float32_div( float32 a, float32 b STATUS_PARAM )

2027

{

2028

flag aSign, bSign, zSign;

2029

int_fast16_t aExp, bExp, zExp;

2030

uint32_t aSig, bSig, zSig;

2031

a = float32_squash_input_denormal(a STATUS_VAR);

2032

b = float32_squash_input_denormal(b STATUS_VAR);

2033

2034

aSig = extractFloat32Frac( a );

2035

aExp = extractFloat32Exp( a );

2036

aSign = extractFloat32Sign( a );

2037

bSig = extractFloat32Frac( b );

2038

bExp = extractFloat32Exp( b );

2039

bSign = extractFloat32Sign( b );

2040

zSign = aSign ^ bSign;

2041

if ( aExp == 0xFF ) {

2042

if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );

2043

if ( bExp == 0xFF ) {

2044

if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );

2045

float_raise( float_flag_invalid STATUS_VAR);

2046

return float32_default_nan;

2047

}

2048

return packFloat32( zSign, 0xFF, 0 );

2049

}

2050

if ( bExp == 0xFF ) {

2051

if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );

2052

return packFloat32( zSign, 0, 0 );

2053

}

2054

if ( bExp == 0 ) {

2055

if ( bSig == 0 ) {

2056

if ( ( aExp | aSig ) == 0 ) {

2057

float_raise( float_flag_invalid STATUS_VAR);

2058

return float32_default_nan;

2059

}

2060

float_raise( float_flag_divbyzero STATUS_VAR);

2061

return packFloat32( zSign, 0xFF, 0 );

2062

}

2063

normalizeFloat32Subnormal( bSig, &bExp, &bSig );

2064

}

2065

if ( aExp == 0 ) {

2066

if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );

2067

normalizeFloat32Subnormal( aSig, &aExp, &aSig );

2068

}

2069

zExp = aExp - bExp + 0x7D;

2070

aSig = ( aSig | 0x00800000 )<<7;

2071

bSig = ( bSig | 0x00800000 )<<8;

2072

if ( bSig <= ( aSig + aSig ) ) {

2073

aSig >>= 1;

2074

++zExp;

2075

}

2076

zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;

2077

if ( ( zSig & 0x3F ) == 0 ) {

2078

zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );

2079

}

2080

return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );

2081

2082

}

2083

2084

/*----------------------------------------------------------------------------

2085

| Returns the remainder of the single-precision floating-point value `a'

2086

| with respect to the corresponding value `b'. The operation is performed

2087

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

2088

*----------------------------------------------------------------------------*/

2089

2090

float32 float32_rem( float32 a, float32 b STATUS_PARAM )

2091

{

2092

flag aSign, zSign;

2093

int_fast16_t aExp, bExp, expDiff;

2094

uint32_t aSig, bSig;

2095

uint32_t q;

2096

uint64_t aSig64, bSig64, q64;

2097

uint32_t alternateASig;

2098

int32_t sigMean;

2099

a = float32_squash_input_denormal(a STATUS_VAR);

2100

b = float32_squash_input_denormal(b STATUS_VAR);

2101

2102

aSig = extractFloat32Frac( a );

2103

aExp = extractFloat32Exp( a );

2104

aSign = extractFloat32Sign( a );

2105

bSig = extractFloat32Frac( b );

2106

bExp = extractFloat32Exp( b );

2107

if ( aExp == 0xFF ) {

2108

if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {

2109

return propagateFloat32NaN( a, b STATUS_VAR );

2110

}

2111

float_raise( float_flag_invalid STATUS_VAR);

2112

return float32_default_nan;

2113

}

2114

if ( bExp == 0xFF ) {

2115

if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );

2116

return a;

2117

}

2118

if ( bExp == 0 ) {

2119

if ( bSig == 0 ) {

2120

float_raise( float_flag_invalid STATUS_VAR);

2121

return float32_default_nan;

2122

}

2123

normalizeFloat32Subnormal( bSig, &bExp, &bSig );

2124

}

2125

if ( aExp == 0 ) {

2126

if ( aSig == 0 ) return a;

2127

normalizeFloat32Subnormal( aSig, &aExp, &aSig );

2128

}

2129

expDiff = aExp - bExp;

2130

aSig |= 0x00800000;

2131

bSig |= 0x00800000;

2132

if ( expDiff < 32 ) {

2133

aSig <<= 8;

2134

bSig <<= 8;

2135

if ( expDiff < 0 ) {

2136

if ( expDiff < -1 ) return a;

2137

aSig >>= 1;

2138

}

2139

q = ( bSig <= aSig );

2140

if ( q ) aSig -= bSig;

2141

if ( 0 < expDiff ) {

2142

q = ( ( (uint64_t) aSig )<<32 ) / bSig;

2143

q >>= 32 - expDiff;

2144

bSig >>= 2;

2145

aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;

2146

}

2147

else {

2148

aSig >>= 2;

2149

bSig >>= 2;

2150

}

2151

}

2152

else {

2153

if ( bSig <= aSig ) aSig -= bSig;

2154

aSig64 = ( (uint64_t) aSig )<<40;

2155

bSig64 = ( (uint64_t) bSig )<<40;

2156

expDiff -= 64;

2157

while ( 0 < expDiff ) {

2158

q64 = estimateDiv128To64( aSig64, 0, bSig64 );

2159

q64 = ( 2 < q64 ) ? q64 - 2 : 0;

2160

aSig64 = - ( ( bSig * q64 )<<38 );

2161

expDiff -= 62;

2162

}

2163

expDiff += 64;

2164

q64 = estimateDiv128To64( aSig64, 0, bSig64 );

2165

q64 = ( 2 < q64 ) ? q64 - 2 : 0;

2166

q = q64>>( 64 - expDiff );

2167

bSig <<= 6;

2168

aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;

2169

}

2170

do {

2171

alternateASig = aSig;

2172

++q;

2173

aSig -= bSig;

2174

} while ( 0 <= (int32_t) aSig );

2175

sigMean = aSig + alternateASig;

2176

if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {

2177

aSig = alternateASig;

2178

}

2179

zSign = ( (int32_t) aSig < 0 );

2180

if ( zSign ) aSig = - aSig;

2181

return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );

2182

2183

}

2184

2185

/*----------------------------------------------------------------------------

2186

| Returns the result of multiplying the single-precision floating-point values

2187

| `a' and `b' then adding 'c', with no intermediate rounding step after the

2188

| multiplication. The operation is performed according to the IEC/IEEE

2189

| Standard for Binary Floating-Point Arithmetic 754-2008.

2190

| The flags argument allows the caller to select negation of the

2191

| addend, the intermediate product, or the final result. (The difference

2192

| between this and having the caller do a separate negation is that negating

2193

| externally will flip the sign bit on NaNs.)

2194

*----------------------------------------------------------------------------*/

2195

2196

float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)

2197

{

2198

flag aSign, bSign, cSign, zSign;

2199

int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;

2200

uint32_t aSig, bSig, cSig;

2201

flag pInf, pZero, pSign;

2202

uint64_t pSig64, cSig64, zSig64;

2203

uint32_t pSig;

2204

int shiftcount;

2205

flag signflip, infzero;

2206

2207

a = float32_squash_input_denormal(a STATUS_VAR);

2208

b = float32_squash_input_denormal(b STATUS_VAR);

2209

c = float32_squash_input_denormal(c STATUS_VAR);

2210

aSig = extractFloat32Frac(a);

2211

aExp = extractFloat32Exp(a);

2212

aSign = extractFloat32Sign(a);

2213

bSig = extractFloat32Frac(b);

2214

bExp = extractFloat32Exp(b);

2215

bSign = extractFloat32Sign(b);

2216

cSig = extractFloat32Frac(c);

2217

cExp = extractFloat32Exp(c);

2218

cSign = extractFloat32Sign(c);

2219

2220

infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||

2221

(aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));

2222

2223

/* It is implementation-defined whether the cases of (0,inf,qnan)

2224

* and (inf,0,qnan) raise InvalidOperation or not (and what QNaN

2225

* they return if they do), so we have to hand this information

2226

* off to the target-specific pick-a-NaN routine.

2227

2228

if (((aExp == 0xff) && aSig) ||

2229

((bExp == 0xff) && bSig) ||

2230

((cExp == 0xff) && cSig)) {

2231

return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR);

2232

}

2233

2234

if (infzero) {

2235

float_raise(float_flag_invalid STATUS_VAR);

2236

return float32_default_nan;

2237

}

2238

2239

if (flags & float_muladd_negate_c) {

2240

cSign ^= 1;

2241

}

2242

2243

signflip = (flags & float_muladd_negate_result) ? 1 : 0;

2244

2245

/* Work out the sign and type of the product */

2246

pSign = aSign ^ bSign;

2247

if (flags & float_muladd_negate_product) {

2248

pSign ^= 1;

2249

}

2250

pInf = (aExp == 0xff) || (bExp == 0xff);

2251

pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);

2252

2253

if (cExp == 0xff) {

2254

if (pInf && (pSign ^ cSign)) {

2255

/* addition of opposite-signed infinities => InvalidOperation */

2256

float_raise(float_flag_invalid STATUS_VAR);

2257

return float32_default_nan;

2258

}

2259

/* Otherwise generate an infinity of the same sign */

2260

return packFloat32(cSign ^ signflip, 0xff, 0);

2261

}

2262

2263

if (pInf) {

2264

return packFloat32(pSign ^ signflip, 0xff, 0);

2265

}

2266

2267

if (pZero) {

2268

if (cExp == 0) {

2269

if (cSig == 0) {

2270

/* Adding two exact zeroes */

2271

if (pSign == cSign) {

2272

zSign = pSign;

2273

} else if (STATUS(float_rounding_mode) == float_round_down) {

2274

zSign = 1;

2275

} else {

2276

zSign = 0;

2277

}

2278

return packFloat32(zSign ^ signflip, 0, 0);

2279

}

2280

/* Exact zero plus a denorm */

2281

if (STATUS(flush_to_zero)) {

2282

float_raise(float_flag_output_denormal STATUS_VAR);

2283

return packFloat32(cSign ^ signflip, 0, 0);

2284

}

2285

}

2286

/* Zero plus something non-zero : just return the something */

2287

return packFloat32(cSign ^ signflip, cExp, cSig);

2288

}

2289

2290

if (aExp == 0) {

2291

normalizeFloat32Subnormal(aSig, &aExp, &aSig);

2292

}

2293

if (bExp == 0) {

2294

normalizeFloat32Subnormal(bSig, &bExp, &bSig);

2295

}

2296

2297

/* Calculate the actual result a * b + c */

2298

2299

/* Multiply first; this is easy. */

2300

/* NB: we subtract 0x7e where float32_mul() subtracts 0x7f

2301

* because we want the true exponent, not the "one-less-than"

2302

* flavour that roundAndPackFloat32() takes.

2303

2304

pExp = aExp + bExp - 0x7e;

2305

aSig = (aSig | 0x00800000) << 7;

2306

bSig = (bSig | 0x00800000) << 8;

2307

pSig64 = (uint64_t)aSig * bSig;

2308

if ((int64_t)(pSig64 << 1) >= 0) {

2309

pSig64 <<= 1;

2310

pExp--;

2311

}

2312

2313

zSign = pSign ^ signflip;

2314

2315

/* Now pSig64 is the significand of the multiply, with the explicit bit in

2316

* position 62.

2317

2318

if (cExp == 0) {

2319

if (!cSig) {

2320

/* Throw out the special case of c being an exact zero now */

2321

shift64RightJamming(pSig64, 32, &pSig64);

2322

pSig = pSig64;

2323

return roundAndPackFloat32(zSign, pExp - 1,

2324

pSig STATUS_VAR);

2325

}

2326

normalizeFloat32Subnormal(cSig, &cExp, &cSig);

2327

}

2328

2329

cSig64 = (uint64_t)cSig << (62 - 23);

2330

cSig64 |= LIT64(0x4000000000000000);

2331

expDiff = pExp - cExp;

2332

2333

if (pSign == cSign) {

2334

/* Addition */

2335

if (expDiff > 0) {

2336

/* scale c to match p */

2337

shift64RightJamming(cSig64, expDiff, &cSig64);

2338

zExp = pExp;

2339

} else if (expDiff < 0) {

2340

/* scale p to match c */

2341

shift64RightJamming(pSig64, -expDiff, &pSig64);

2342

zExp = cExp;

2343

} else {

2344

/* no scaling needed */

2345

zExp = cExp;

2346

}

2347

/* Add significands and make sure explicit bit ends up in posn 62 */

2348

zSig64 = pSig64 + cSig64;

2349

if ((int64_t)zSig64 < 0) {

2350

shift64RightJamming(zSig64, 1, &zSig64);

2351

} else {

2352

zExp--;

2353

}

2354

} else {

2355

/* Subtraction */

2356

if (expDiff > 0) {

2357

shift64RightJamming(cSig64, expDiff, &cSig64);

2358

zSig64 = pSig64 - cSig64;

2359

zExp = pExp;

2360

} else if (expDiff < 0) {

2361

shift64RightJamming(pSig64, -expDiff, &pSig64);

2362

zSig64 = cSig64 - pSig64;

2363

zExp = cExp;

2364

zSign ^= 1;

2365

} else {

2366

zExp = pExp;

2367

if (cSig64 < pSig64) {

2368

zSig64 = pSig64 - cSig64;

2369

} else if (pSig64 < cSig64) {

2370

zSig64 = cSig64 - pSig64;

2371

zSign ^= 1;

2372

} else {

2373

/* Exact zero */

2374

zSign = signflip;

2375

if (STATUS(float_rounding_mode) == float_round_down) {

2376

zSign ^= 1;

2377

}

2378

return packFloat32(zSign, 0, 0);

2379

}

2380

}

2381

--zExp;

2382

/* Normalize to put the explicit bit back into bit 62. */

2383

shiftcount = countLeadingZeros64(zSig64) - 1;

2384

zSig64 <<= shiftcount;

2385

zExp -= shiftcount;

2386

}

2387

shift64RightJamming(zSig64, 32, &zSig64);

2388

return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);

2389

}

2390

2391

2392

/*----------------------------------------------------------------------------

2393

| Returns the square root of the single-precision floating-point value `a'.

2394

| The operation is performed according to the IEC/IEEE Standard for Binary

2395

| Floating-Point Arithmetic.

2396

*----------------------------------------------------------------------------*/

2397

2398

float32 float32_sqrt( float32 a STATUS_PARAM )

2399

{

2400

flag aSign;

2401

int_fast16_t aExp, zExp;

2402

uint32_t aSig, zSig;

2403

uint64_t rem, term;

2404

a = float32_squash_input_denormal(a STATUS_VAR);

2405

2406

aSig = extractFloat32Frac( a );

2407

aExp = extractFloat32Exp( a );

2408

aSign = extractFloat32Sign( a );

2409

if ( aExp == 0xFF ) {

2410

if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );

2411

if ( ! aSign ) return a;

2412

float_raise( float_flag_invalid STATUS_VAR);

2413

return float32_default_nan;

2414

}

2415

if ( aSign ) {

2416

if ( ( aExp | aSig ) == 0 ) return a;

2417

float_raise( float_flag_invalid STATUS_VAR);

2418

return float32_default_nan;

2419

}

2420

if ( aExp == 0 ) {

2421

if ( aSig == 0 ) return float32_zero;

2422

normalizeFloat32Subnormal( aSig, &aExp, &aSig );

2423

}

2424

zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;

2425

aSig = ( aSig | 0x00800000 )<<8;

2426

zSig = estimateSqrt32( aExp, aSig ) + 2;

2427

if ( ( zSig & 0x7F ) <= 5 ) {

2428

if ( zSig < 2 ) {

2429

zSig = 0x7FFFFFFF;

2430

goto roundAndPack;

2431

}

2432

aSig >>= aExp & 1;

2433

term = ( (uint64_t) zSig ) * zSig;

2434

rem = ( ( (uint64_t) aSig )<<32 ) - term;

2435

while ( (int64_t) rem < 0 ) {

2436

--zSig;

2437

rem += ( ( (uint64_t) zSig )<<1 ) | 1;

2438

}

2439

zSig |= ( rem != 0 );

2440

}

2441

shift32RightJamming( zSig, 1, &zSig );

2442

roundAndPack:

2443

return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );

2444

2445

}

2446

2447

/*----------------------------------------------------------------------------
| Returns the binary exponential of the single-precision floating-point value
| `a'. The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
|
| Uses the following identities:
|
| 1. -------------------------------------------------------------------------
|      x    x*ln(2)
|     2  = e
|
| 2. -------------------------------------------------------------------------
|                      2     3     4     5           n
|      x        x     x     x     x     x           x
|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
|               1!    2!    3!    4!    5!          n!
*----------------------------------------------------------------------------*/

2464

2465

static const float64 float32_exp2_coefficients[15] =

2466

{

2467

const_float64( 0x3ff0000000000000ll ), /* 1 */

2468

const_float64( 0x3fe0000000000000ll ), /* 2 */

2469

const_float64( 0x3fc5555555555555ll ), /* 3 */

2470

const_float64( 0x3fa5555555555555ll ), /* 4 */

2471

const_float64( 0x3f81111111111111ll ), /* 5 */

2472

const_float64( 0x3f56c16c16c16c17ll ), /* 6 */

2473

const_float64( 0x3f2a01a01a01a01all ), /* 7 */

2474

const_float64( 0x3efa01a01a01a01all ), /* 8 */

2475

const_float64( 0x3ec71de3a556c734ll ), /* 9 */

2476

const_float64( 0x3e927e4fb7789f5cll ), /* 10 */

2477

const_float64( 0x3e5ae64567f544e4ll ), /* 11 */

2478

const_float64( 0x3e21eed8eff8d898ll ), /* 12 */

2479

const_float64( 0x3de6124613a86d09ll ), /* 13 */

2480

const_float64( 0x3da93974a8c07c9dll ), /* 14 */

2481

const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */

2482

};

2483

2484

/* Approximates 2**a for the single-precision value `a' by evaluating the
 * first 15 terms of the series for e**(a * ln 2) in double precision and
 * rounding the sum back to single precision.  NaNs are propagated,
 * -infinity yields zero, +infinity is returned unchanged and a zero input
 * yields exactly one.  Every other input raises the inexact flag. */
float32 float32_exp2( float32 a STATUS_PARAM )
{
    flag aSign;
    int_fast16_t aExp;
    uint32_t aSig;
    float64 r, x, xn;
    int i;

    a = float32_squash_input_denormal(a STATUS_VAR);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );

    if ( aExp == 0xFF ) {
        if ( aSig ) {
            return propagateFloat32NaN( a, float32_zero STATUS_VAR );
        }
        return (aSign) ? float32_zero : a;
    }
    if ( aExp == 0 && aSig == 0 ) {
        return float32_one;
    }

    float_raise( float_flag_inexact STATUS_VAR);

    /* ******************************* */
    /* using float64 for approximation */
    /* ******************************* */
    x = float32_to_float64(a STATUS_VAR);
    x = float64_mul(x, float64_ln2 STATUS_VAR);

    /* r accumulates 1 + sum of x**(i+1) * coefficient[i]; xn holds the
     * current power of x. */
    xn = x;
    r = float64_one;
    for (i = 0 ; i < 15 ; i++) {
        float64 f;

        f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
        r = float64_add(r, f STATUS_VAR);

        xn = float64_mul(xn, x STATUS_VAR);
    }

    /* Pass the status through STATUS_VAR like every other call here,
     * rather than naming the `status' parameter directly. */
    return float64_to_float32(r STATUS_VAR);
}

2526

2527

/*----------------------------------------------------------------------------

2528

| Returns the binary log of the single-precision floating-point value `a'.

2529

| The operation is performed according to the IEC/IEEE Standard for Binary

2530

| Floating-Point Arithmetic.

2531

*----------------------------------------------------------------------------*/

2532

float32 float32_log2( float32 a STATUS_PARAM )

2533

{

2534

flag aSign, zSign;

2535

int_fast16_t aExp;

2536

uint32_t aSig, zSig, i;

2537

2538

a = float32_squash_input_denormal(a STATUS_VAR);

2539

aSig = extractFloat32Frac( a );

2540

aExp = extractFloat32Exp( a );

2541

aSign = extractFloat32Sign( a );

2542

2543

if ( aExp == 0 ) {

2544

if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );

2545

normalizeFloat32Subnormal( aSig, &aExp, &aSig );

2546

}

2547

if ( aSign ) {

2548

float_raise( float_flag_invalid STATUS_VAR);

2549

return float32_default_nan;

2550

}

2551

if ( aExp == 0xFF ) {

2552

if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );

2553

return a;

2554

}

2555

2556

aExp -= 0x7F;

2557

aSig |= 0x00800000;

2558

zSign = aExp < 0;

2559

zSig = aExp << 23;

2560

2561

for (i = 1 << 22; i > 0; i >>= 1) {

2562

aSig = ( (uint64_t)aSig * aSig ) >> 23;

2563

if ( aSig & 0x01000000 ) {

2564

aSig >>= 1;

2565

zSig |= i;

2566

}

2567

}

2568

2569

if ( zSign )

2570

zSig = -zSig;

2571

2572

return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );

2573

}

2574

2575

/*----------------------------------------------------------------------------

2576

| Returns 1 if the single-precision floating-point value `a' is equal to

2577

| the corresponding value `b', and 0 otherwise. The invalid exception is

2578

| raised if either operand is a NaN. Otherwise, the comparison is performed

2579

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

2580

*----------------------------------------------------------------------------*/

2581

2582

int float32_eq( float32 a, float32 b STATUS_PARAM )

2583

{

2584

uint32_t av, bv;

2585

a = float32_squash_input_denormal(a STATUS_VAR);

2586

b = float32_squash_input_denormal(b STATUS_VAR);

2587

2588

if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )

2589

|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )

2590

) {

2591

float_raise( float_flag_invalid STATUS_VAR);

2592

return 0;

2593

}

2594

av = float32_val(a);

2595

bv = float32_val(b);

2596

return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );

2597

}

2598

2599

/*----------------------------------------------------------------------------

2600

| Returns 1 if the single-precision floating-point value `a' is less than

2601

| or equal to the corresponding value `b', and 0 otherwise. The invalid

2602

| exception is raised if either operand is a NaN. The comparison is performed

2603

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

2604

*----------------------------------------------------------------------------*/

2605

2606

int float32_le( float32 a, float32 b STATUS_PARAM )

2607

{

2608

flag aSign, bSign;

2609

uint32_t av, bv;

2610

a = float32_squash_input_denormal(a STATUS_VAR);

2611

b = float32_squash_input_denormal(b STATUS_VAR);

2612

2613

if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )

2614

|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )

2615

) {

2616

float_raise( float_flag_invalid STATUS_VAR);

2617

return 0;

2618

}

2619

aSign = extractFloat32Sign( a );

2620

bSign = extractFloat32Sign( b );

2621

av = float32_val(a);

2622

bv = float32_val(b);

2623

if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );

2624

return ( av == bv ) || ( aSign ^ ( av < bv ) );

2625

2626

}

2627

2628

/*----------------------------------------------------------------------------

2629

| Returns 1 if the single-precision floating-point value `a' is less than

2630

| the corresponding value `b', and 0 otherwise. The invalid exception is

2631

| raised if either operand is a NaN. The comparison is performed according

2632

| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

2633

*----------------------------------------------------------------------------*/

2634

2635

int float32_lt( float32 a, float32 b STATUS_PARAM )

2636

{

2637

flag aSign, bSign;

2638

uint32_t av, bv;

2639

a = float32_squash_input_denormal(a STATUS_VAR);

2640

b = float32_squash_input_denormal(b STATUS_VAR);

2641

2642

if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )

2643

|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )

2644

) {

2645

float_raise( float_flag_invalid STATUS_VAR);

2646

return 0;

2647

}

2648

aSign = extractFloat32Sign( a );

2649

bSign = extractFloat32Sign( b );

2650

av = float32_val(a);

2651

bv = float32_val(b);

2652

if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );

2653

return ( av != bv ) && ( aSign ^ ( av < bv ) );

2654

2655

}

2656

2657

/*----------------------------------------------------------------------------

2658

| Returns 1 if the single-precision floating-point values `a' and `b' cannot

2659

| be compared, and 0 otherwise. The invalid exception is raised if either

2660

| operand is a NaN. The comparison is performed according to the IEC/IEEE

2661

| Standard for Binary Floating-Point Arithmetic.

2662

*----------------------------------------------------------------------------*/

2663

2664

int float32_unordered( float32 a, float32 b STATUS_PARAM )

2665

{

2666

a = float32_squash_input_denormal(a STATUS_VAR);

2667

b = float32_squash_input_denormal(b STATUS_VAR);

2668

2669

if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )

2670

|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )

2671

) {

2672

float_raise( float_flag_invalid STATUS_VAR);

2673

return 1;

2674

}

2675

return 0;

2676

}

2677

2678

/*----------------------------------------------------------------------------

2679

| Returns 1 if the single-precision floating-point value `a' is equal to

2680

| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an

2681

| exception. The comparison is performed according to the IEC/IEEE Standard

2682

| for Binary Floating-Point Arithmetic.

2683

*----------------------------------------------------------------------------*/

2684

2685

int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )

2686

{

2687

a = float32_squash_input_denormal(a STATUS_VAR);

2688

b = float32_squash_input_denormal(b STATUS_VAR);

2689

2690

if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )

2691

|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )

2692

) {

2693

if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {

2694

float_raise( float_flag_invalid STATUS_VAR);

2695

}

2696

return 0;

2697

}

2698

return ( float32_val(a) == float32_val(b) ) ||

2699

( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );

2700

}

2701

2702

/*----------------------------------------------------------------------------

2703

| Returns 1 if the single-precision floating-point value `a' is less than or

2704

| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not

2705

| cause an exception. Otherwise, the comparison is performed according to the

2706

| IEC/IEEE Standard for Binary Floating-Point Arithmetic.

2707

*----------------------------------------------------------------------------*/

2708

2709

int float32_le_quiet( float32 a, float32 b STATUS_PARAM )

2710

{

2711

flag aSign, bSign;

2712

uint32_t av, bv;

2713

a = float32_squash_input_denormal(a STATUS_VAR);

2714

b = float32_squash_input_denormal(b STATUS_VAR);

2715

2716

if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )

2717

|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )

2718

) {

2719

if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {

2720

float_raise( float_flag_invalid STATUS_VAR);

2721

}

2722

return 0;

2723

}

2724

aSign = extractFloat32Sign( a );

2725

bSign = extractFloat32Sign( b );

2726

av = float32_val(a);

2727

bv = float32_val(b);

2728

if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );

2729

return ( av == bv ) || ( aSign ^ ( av < bv ) );

2730

2731

}

2732

2733

/*----------------------------------------------------------------------------

2734

| Returns 1 if the single-precision floating-point value `a' is less than

2735

| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an

2736

| exception. Otherwise, the comparison is performed according to the IEC/IEEE

2737

| Standard for Binary Floating-Point Arithmetic.

2738

*----------------------------------------------------------------------------*/

2739

2740

int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )

2741

{

2742

flag aSign, bSign;

2743

uint32_t av, bv;

2744

a = float32_squash_input_denormal(a STATUS_VAR);

2745

b = float32_squash_input_denormal(b STATUS_VAR);

2746

2747

if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )

2748

|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )

2749

) {

2750

if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {

2751

float_raise( float_flag_invalid STATUS_VAR);

2752

}

2753

return 0;

2754

}

2755

aSign = extractFloat32Sign( a );

2756

bSign = extractFloat32Sign( b );

2757

av = float32_val(a);

2758

bv = float32_val(b);

2759

if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );

2760

return ( av != bv ) && ( aSign ^ ( av < bv ) );

2761

2762

}

2763

2764

/*----------------------------------------------------------------------------

2765

| Returns 1 if the single-precision floating-point values `a' and `b' cannot

2766

| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The

2767

| comparison is performed according to the IEC/IEEE Standard for Binary

2768

| Floating-Point Arithmetic.

2769

*----------------------------------------------------------------------------*/

2770

2771

int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )

2772

{

2773

a = float32_squash_input_denormal(a STATUS_VAR);

2774

b = float32_squash_input_denormal(b STATUS_VAR);

2775

2776

if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )

2777

|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )

2778

) {

2779

if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {

2780

float_raise( float_flag_invalid STATUS_VAR);

2781

}

2782

return 1;

2783

}

2784

return 0;

2785

}

2786

2787

/*----------------------------------------------------------------------------

2788

| Returns the result of converting the double-precision floating-point value

2789

| `a' to the 32-bit two's complement integer format. The conversion is

2790

| performed according to the IEC/IEEE Standard for Binary Floating-Point

2791

| Arithmetic---which means in particular that the conversion is rounded

2792

| according to the current rounding mode. If `a' is a NaN, the largest

2793

| positive integer is returned. Otherwise, if the conversion overflows, the

2794

| largest integer with the same sign as `a' is returned.

2795

*----------------------------------------------------------------------------*/

2796

2797

int32 float64_to_int32( float64 a STATUS_PARAM )

2798

{

2799

flag aSign;

2800

int_fast16_t aExp, shiftCount;

2801

uint64_t aSig;

2802

a = float64_squash_input_denormal(a STATUS_VAR);

2803

2804

aSig = extractFloat64Frac( a );

2805

aExp = extractFloat64Exp( a );

2806

aSign = extractFloat64Sign( a );

2807

if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;

2808

if ( aExp ) aSig |= LIT64( 0x0010000000000000 );

2809

shiftCount = 0x42C - aExp;

2810

if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );

2811

return roundAndPackInt32( aSign, aSig STATUS_VAR );

2812

2813

}

2814

2815

/*----------------------------------------------------------------------------

2816

| Returns the result of converting the double-precision floating-point value

2817

| `a' to the 32-bit two's complement integer format. The conversion is

2818

| performed according to the IEC/IEEE Standard for Binary Floating-Point

2819

| Arithmetic, except that the conversion is always rounded toward zero.

2820

| If `a' is a NaN, the largest positive integer is returned. Otherwise, if

2821

| the conversion overflows, the largest integer with the same sign as `a' is

2822

| returned.

2823

*----------------------------------------------------------------------------*/

2824

2825

int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )

2826

{

2827

flag aSign;

2828

int_fast16_t aExp, shiftCount;

2829

uint64_t aSig, savedASig;

2830

int32_t z;

2831

a = float64_squash_input_denormal(a STATUS_VAR);

2832

2833

aSig = extractFloat64Frac( a );

2834

aExp = extractFloat64Exp( a );

2835

aSign = extractFloat64Sign( a );

2836

if ( 0x41E < aExp ) {

2837

if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;

2838

goto invalid;

2839

}

2840

else if ( aExp < 0x3FF ) {

2841

if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;

2842

return 0;

2843

}

2844

aSig |= LIT64( 0x0010000000000000 );

2845

shiftCount = 0x433 - aExp;

2846

savedASig = aSig;

2847

aSig >>= shiftCount;

2848

z = aSig;

2849

if ( aSign ) z = - z;

2850

if ( ( z < 0 ) ^ aSign ) {

2851

invalid:

2852

float_raise( float_flag_invalid STATUS_VAR);

2853

return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;

2854

}

2855

if ( ( aSig<<shiftCount ) != savedASig ) {

2856

STATUS(float_exception_flags) |= float_flag_inexact;

2857

}

2858

return z;

2859

2860

}

2861

2862

/*----------------------------------------------------------------------------

2863

| Returns the result of converting the double-precision floating-point value

2864

| `a' to the 16-bit two's complement integer format. The conversion is

2865

| performed according to the IEC/IEEE Standard for Binary Floating-Point

2866

| Arithmetic, except that the conversion is always rounded toward zero.

2867

| If `a' is a NaN, the largest positive integer is returned. Otherwise, if

2868

| the conversion overflows, the largest integer with the same sign as `a' is

2869

| returned.

2870

*----------------------------------------------------------------------------*/

2871

2872

int_fast16_t float64_to_int16_round_to_zero(float64 a STATUS_PARAM)

2873

{

2874

flag aSign;

2875

int_fast16_t aExp, shiftCount;

2876

uint64_t aSig, savedASig;

2877

int32 z;

2878

2879

aSig = extractFloat64Frac( a );

2880

aExp = extractFloat64Exp( a );

2881

aSign = extractFloat64Sign( a );

2882

if ( 0x40E < aExp ) {

2883

if ( ( aExp == 0x7FF ) && aSig ) {

2884

aSign = 0;

2885

}

2886

goto invalid;

2887

}

2888

else if ( aExp < 0x3FF ) {

2889

if ( aExp || aSig ) {

2890

STATUS(float_exception_flags) |= float_flag_inexact;

2891

}

2892

return 0;

2893

}

2894

aSig |= LIT64( 0x0010000000000000 );

2895

shiftCount = 0x433 - aExp;

2896

savedASig = aSig;

2897

aSig >>= shiftCount;

2898

z = aSig;

2899

if ( aSign ) {

2900

z = - z;

2901

}

2902

if ( ( (int16_t)z < 0 ) ^ aSign ) {

2903

invalid:

2904

float_raise( float_flag_invalid STATUS_VAR);

2905

return aSign ? (int32_t) 0xffff8000 : 0x7FFF;

2906

}

2907

if ( ( aSig<<shiftCount ) != savedASig ) {

2908

STATUS(float_exception_flags) |= float_flag_inexact;

2909

}

2910

return z;

2911

}

2912

2913

/*----------------------------------------------------------------------------

2914

| Returns the result of converting the double-precision floating-point value

2915

| `a' to the 64-bit two's complement integer format. The conversion is

2916

| performed according to the IEC/IEEE Standard for Binary Floating-Point

2917

| Arithmetic---which means in particular that the conversion is rounded

2918

| according to the current rounding mode. If `a' is a NaN, the largest

2919

| positive integer is returned. Otherwise, if the conversion overflows, the

2920

| largest integer with the same sign as `a' is returned.

2921

*----------------------------------------------------------------------------*/

2922

2923

int64 float64_to_int64( float64 a STATUS_PARAM )

2924

{

2925

flag aSign;

2926

int_fast16_t aExp, shiftCount;

2927

uint64_t aSig, aSigExtra;

2928

a = float64_squash_input_denormal(a STATUS_VAR);

2929

2930

aSig = extractFloat64Frac( a );

2931

aExp = extractFloat64Exp( a );

2932

aSign = extractFloat64Sign( a );

2933

if ( aExp ) aSig |= LIT64( 0x0010000000000000 );

2934

shiftCount = 0x433 - aExp;

2935

if ( shiftCount <= 0 ) {

2936

if ( 0x43E < aExp ) {

2937

float_raise( float_flag_invalid STATUS_VAR);

2938

if ( ! aSign

2939

|| ( ( aExp == 0x7FF )

2940

&& ( aSig != LIT64( 0x0010000000000000 ) ) )

2941

) {

2942

return LIT64( 0x7FFFFFFFFFFFFFFF );

2943

}

2944

return (int64_t) LIT64( 0x8000000000000000 );

2945

}

2946

aSigExtra = 0;

2947

aSig <<= - shiftCount;

2948

}

2949

else {

2950

shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );

2951

}

2952

return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );

2953

2954

}

2955

2956

/*----------------------------------------------------------------------------

2957

| Returns the result of converting the double-precision floating-point value

2958

| `a' to the 64-bit two's complement integer format. The conversion is

2959

| performed according to the IEC/IEEE Standard for Binary Floating-Point

2960

| Arithmetic, except that the conversion is always rounded toward zero.

2961

| If `a' is a NaN, the largest positive integer is returned. Otherwise, if

2962

| the conversion overflows, the largest integer with the same sign as `a' is

2963

| returned.

2964

*----------------------------------------------------------------------------*/

2965

2966

int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )

2967

{

2968

flag aSign;

2969

int_fast16_t aExp, shiftCount;

2970

uint64_t aSig;

2971

int64 z;

2972

a = float64_squash_input_denormal(a STATUS_VAR);

2973

2974

aSig = extractFloat64Frac( a );

2975

aExp = extractFloat64Exp( a );

2976

aSign = extractFloat64Sign( a );

2977

if ( aExp ) aSig |= LIT64( 0x0010000000000000 );

2978

shiftCount = aExp - 0x433;

2979

if ( 0 <= shiftCount ) {

2980

if ( 0x43E <= aExp ) {

2981

if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {

2982

float_raise( float_flag_invalid STATUS_VAR);

2983

if ( ! aSign

2984

|| ( ( aExp == 0x7FF )

2985

&& ( aSig != LIT64( 0x0010000000000000 ) ) )

2986

) {

2987

return LIT64( 0x7FFFFFFFFFFFFFFF );

2988

}

2989

}

2990

return (int64_t) LIT64( 0x8000000000000000 );

2991

}

2992

z = aSig<<shiftCount;

2993

}

2994

else {

2995

if ( aExp < 0x3FE ) {

2996

if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;

2997

return 0;

2998

}

2999

z = aSig>>( - shiftCount );

3000

if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {

3001

STATUS(float_exception_flags) |= float_flag_inexact;

3002

}

3003

}

3004

if ( aSign ) z = - z;

3005

return z;

3006

3007

}

3008

3009

/*----------------------------------------------------------------------------

3010

| Returns the result of converting the double-precision floating-point value

3011

| `a' to the single-precision floating-point format. The conversion is

3012

| performed according to the IEC/IEEE Standard for Binary Floating-Point

3013

| Arithmetic.

3014

*----------------------------------------------------------------------------*/

3015

3016

float32 float64_to_float32( float64 a STATUS_PARAM )

3017

{

3018

flag aSign;

3019

int_fast16_t aExp;

3020

uint64_t aSig;

3021

uint32_t zSig;

3022

a = float64_squash_input_denormal(a STATUS_VAR);

3023

3024

aSig = extractFloat64Frac( a );

3025

aExp = extractFloat64Exp( a );

3026

aSign = extractFloat64Sign( a );

3027

if ( aExp == 0x7FF ) {

3028

if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );

3029

return packFloat32( aSign, 0xFF, 0 );

3030

}

3031

shift64RightJamming( aSig, 22, &aSig );

3032

zSig = aSig;

3033

if ( aExp || zSig ) {

3034

zSig |= 0x40000000;

3035

aExp -= 0x381;

3036

}

3037

return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );

3038

3039

}

3040

3041

3042

/*----------------------------------------------------------------------------

3043

| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a

3044

| half-precision floating-point value, returning the result. After being

3045

| shifted into the proper positions, the three fields are simply added

3046

| together to form the result. This means that any integer portion of `zSig'

3047

| will be added into the exponent. Since a properly normalized significand

3048

| will have an integer portion equal to 1, the `zExp' input should be 1 less

3049

| than the desired result exponent whenever `zSig' is a complete, normalized

3050

| significand.

3051

*----------------------------------------------------------------------------*/

3052

static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)

3053

{

3054

return make_float16(

3055

(((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);

3056

}

3057

3058

/* Half precision floats come in two formats: standard IEEE and "ARM" format.

3059

The latter gains extra exponent range by omitting the NaN/Inf encodings. */

3060

3061

float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)

3062

{

3063

flag aSign;

3064

int_fast16_t aExp;

3065

uint32_t aSig;

3066

3067

aSign = extractFloat16Sign(a);

3068

aExp = extractFloat16Exp(a);

3069

aSig = extractFloat16Frac(a);

3070

3071

if (aExp == 0x1f && ieee) {

3072

if (aSig) {

3073

return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);

3074

}

3075

return packFloat32(aSign, 0xff, 0);

3076

}

3077

if (aExp == 0) {

3078

int8 shiftCount;

3079

3080

if (aSig == 0) {

3081

return packFloat32(aSign, 0, 0);

3082

}

3083

3084

shiftCount = countLeadingZeros32( aSig ) - 21;

3085

aSig = aSig << shiftCount;

3086

aExp = -shiftCount;

3087

}

3088

return packFloat32( aSign, aExp + 0x70, aSig << 13);

3089

}

3090

3091

/*
 * Converts the single-precision value `a' to half precision, rounding
 * according to the current rounding mode.  A nonzero `ieee' selects the IEEE
 * half-precision format; zero selects the alternative format, which has no
 * NaN/Inf encodings and therefore one extra exponent value.
 *
 * In the alternative format NaN inputs return a signed zero, while infinities
 * and overflowing values return the maximum-magnitude encoding; all of these
 * raise the invalid exception.  In IEEE format an overflow returns infinity
 * and raises overflow and inexact.
 */
float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
{
    flag aSign;
    int_fast16_t aExp;
    uint32_t aSig;
    uint32_t mask;
    uint32_t increment;
    int8 roundingMode;
    int maxexp = ieee ? 15 : 16;
    bool rounding_bumps_exp;
    bool is_tiny = false;

    a = float32_squash_input_denormal(a STATUS_VAR);

    aSig = extractFloat32Frac(a);
    aExp = extractFloat32Exp(a);
    aSign = extractFloat32Sign(a);
    if (aExp == 0xFF) {
        if (aSig) {
            /* Input is a NaN */
            if (!ieee) {
                float_raise(float_flag_invalid STATUS_VAR);
                return packFloat16(aSign, 0, 0);
            }
            return commonNaNToFloat16(
                float32ToCommonNaN(a STATUS_VAR) STATUS_VAR);
        }
        /* Infinity */
        if (!ieee) {
            float_raise(float_flag_invalid STATUS_VAR);
            return packFloat16(aSign, 0x1f, 0x3ff);
        }
        return packFloat16(aSign, 0x1f, 0);
    }
    if (aExp == 0 && aSig == 0) {
        return packFloat16(aSign, 0, 0);
    }
    /* Decimal point between bits 22 and 23. Note that we add the 1 bit
     * even if the input is denormal; however this is harmless because
     * the largest possible single-precision denormal is still smaller
     * than the smallest representable half-precision denormal, and so we
     * will end up ignoring aSig and returning via the "always return zero"
     * codepath.
     */
    aSig |= 0x00800000;
    aExp -= 0x7f;
    /* Calculate the mask of bits of the mantissa which are not
     * representable in half-precision and will be lost.
     */
    if (aExp < -14) {
        /* Will be denormal in halfprec */
        mask = 0x00ffffff;
        if (aExp >= -24) {
            mask >>= 25 + aExp;
        }
    } else {
        /* Normal number in halfprec */
        mask = 0x00001fff;
    }

    /* Choose the amount added before truncation for each rounding mode. */
    roundingMode = STATUS(float_rounding_mode);
    switch (roundingMode) {
    case float_round_nearest_even:
        increment = (mask + 1) >> 1;
        if ((aSig & mask) == increment) {
            /* Exact tie: round up only if the retained LSB is set. */
            increment = aSig & (increment << 1);
        }
        break;
    case float_round_up:
        increment = aSign ? 0 : mask;
        break;
    case float_round_down:
        increment = aSign ? mask : 0;
        break;
    default: /* round_to_zero */
        increment = 0;
        break;
    }

    /* True when rounding carries out of the 24-bit significand. */
    rounding_bumps_exp = (aSig + increment >= 0x01000000);

    if (aExp > maxexp || (aExp == maxexp && rounding_bumps_exp)) {
        if (ieee) {
            float_raise(float_flag_overflow | float_flag_inexact STATUS_VAR);
            return packFloat16(aSign, 0x1f, 0);
        } else {
            float_raise(float_flag_invalid STATUS_VAR);
            return packFloat16(aSign, 0x1f, 0x3ff);
        }
    }

    if (aExp < -14) {
        /* Note that flush-to-zero does not affect half-precision results */
        is_tiny =
            (STATUS(float_detect_tininess) == float_tininess_before_rounding)
            || (aExp < -15)
            || (!rounding_bumps_exp);
    }
    if (aSig & mask) {
        /* Some bits will be discarded: inexact, and underflow if tiny. */
        float_raise(float_flag_inexact STATUS_VAR);
        if (is_tiny) {
            float_raise(float_flag_underflow STATUS_VAR);
        }
    }

    aSig += increment;
    if (rounding_bumps_exp) {
        aSig >>= 1;
        aExp++;
    }

    if (aExp < -24) {
        return packFloat16(aSign, 0, 0);
    }
    if (aExp < -14) {
        /* Denormal result: shift the significand down to the minimum
         * exponent. */
        aSig >>= -14 - aExp;
        aExp = -14;
    }
    return packFloat16(aSign, aExp + 14, aSig >> 13);
}

3211

3212

/*----------------------------------------------------------------------------

3213

| Returns the result of converting the double-precision floating-point value

3214

| `a' to the extended double-precision floating-point format. The conversion

3215

| is performed according to the IEC/IEEE Standard for Binary Floating-Point

3216

| Arithmetic.

3217

*----------------------------------------------------------------------------*/

3218

3219

floatx80 float64_to_floatx80( float64 a STATUS_PARAM )

3220

{

3221

flag aSign;

3222

int_fast16_t aExp;

3223

uint64_t aSig;

3224

3225

a = float64_squash_input_denormal(a STATUS_VAR);

3226

aSig = extractFloat64Frac( a );

3227

aExp = extractFloat64Exp( a );

3228

aSign = extractFloat64Sign( a );

3229

if ( aExp == 0x7FF ) {

3230

if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );

3231

return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );

3232

}

3233

if ( aExp == 0 ) {

3234

if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );

3235

normalizeFloat64Subnormal( aSig, &aExp, &aSig );

3236

}

3237

return

3238

packFloatx80(

3239

aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );

3240

3241

}

3242

3243

/*----------------------------------------------------------------------------

3244

| Returns the result of converting the double-precision floating-point value

3245

| `a' to the quadruple-precision floating-point format. The conversion is

3246

| performed according to the IEC/IEEE Standard for Binary Floating-Point

3247

| Arithmetic.

3248

*----------------------------------------------------------------------------*/

3249

3250

float128 float64_to_float128( float64 a STATUS_PARAM )

3251

{

3252

flag aSign;

3253

int_fast16_t aExp;

3254

uint64_t aSig, zSig0, zSig1;

3255

3256

a = float64_squash_input_denormal(a STATUS_VAR);

3257

aSig = extractFloat64Frac( a );

3258

aExp = extractFloat64Exp( a );

3259

aSign = extractFloat64Sign( a );

3260

if ( aExp == 0x7FF ) {

3261

if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );

3262

return packFloat128( aSign, 0x7FFF, 0, 0 );

3263

}

3264

if ( aExp == 0 ) {

3265

if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );

3266

normalizeFloat64Subnormal( aSig, &aExp, &aSig );

3267

--aExp;

3268

}

3269

shift128Right( aSig, 0, 4, &zSig0, &zSig1 );

3270

return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );

3271

3272

}

3273

3274

/*----------------------------------------------------------------------------

3275

| Rounds the double-precision floating-point value `a' to an integer, and

3276

| returns the result as a double-precision floating-point value. The

3277

| operation is performed according to the IEC/IEEE Standard for Binary

3278

| Floating-Point Arithmetic.

3279

*----------------------------------------------------------------------------*/

3280

3281

float64 float64_round_to_int( float64 a STATUS_PARAM )

3282

{

3283

flag aSign;

3284

int_fast16_t aExp;

3285

uint64_t lastBitMask, roundBitsMask;

3286

int8 roundingMode;

3287

uint64_t z;

3288

a = float64_squash_input_denormal(a STATUS_VAR);

3289

3290

aExp = extractFloat64Exp( a );

3291

if ( 0x433 <= aExp ) {

3292

if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {

3293

return propagateFloat64NaN( a, a STATUS_VAR );

3294

}

3295

return a;

3296

}

3297

if ( aExp < 0x3FF ) {

3298

if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;

3299

STATUS(float_exception_flags) |= float_flag_inexact;

3300

aSign = extractFloat64Sign( a );

3301

switch ( STATUS(float_rounding_mode) ) {

3302

case float_round_nearest_even:

3303

if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {

3304

return packFloat64( aSign, 0x3FF, 0 );

3305

}

3306

break;

3307

case float_round_down:

3308

return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);

3309

case float_round_up:

3310

return make_float64(

3311

aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));

3312

}

3313

return packFloat64( aSign, 0, 0 );

3314

}

3315

lastBitMask = 1;

3316

lastBitMask <<= 0x433 - aExp;

3317

roundBitsMask = lastBitMask - 1;

3318

z = float64_val(a);

3319

roundingMode = STATUS(float_rounding_mode);

3320

if ( roundingMode == float_round_nearest_even ) {

3321

z += lastBitMask>>1;

3322

if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;

3323

}

3324

else if ( roundingMode != float_round_to_zero ) {

3325

if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {

3326

z += roundBitsMask;

3327

}

3328

}

3329

z &= ~ roundBitsMask;

3330

if ( z != float64_val(a) )

3331

STATUS(float_exception_flags) |= float_flag_inexact;

3332

return make_float64(z);

3333

3334

}

3335

3336

/* Rounds `a' to an integral double-precision value by truncating toward
 * zero.  The rounding mode is switched to round-to-zero around the call to
 * float64_round_to_int() and restored before returning. */
float64 float64_trunc_to_int(float64 a STATUS_PARAM)
{
    int savedMode = STATUS(float_rounding_mode);
    float64 truncated;

    STATUS(float_rounding_mode) = float_round_to_zero;
    truncated = float64_round_to_int(a STATUS_VAR);
    STATUS(float_rounding_mode) = savedMode;

    return truncated;
}

3346

3347

/*----------------------------------------------------------------------------

3348

| Returns the result of adding the absolute values of the double-precision

3349

| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated

3350

| before being returned. `zSign' is ignored if the result is a NaN.

3351

| The addition is performed according to the IEC/IEEE Standard for Binary

3352

| Floating-Point Arithmetic.

3353

*----------------------------------------------------------------------------*/

3354

3355

static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )

3356

{

3357

int_fast16_t aExp, bExp, zExp;

3358

uint64_t aSig, bSig, zSig;

3359

int_fast16_t expDiff;

3360

3361

aSig = extractFloat64Frac( a );

3362

aExp = extractFloat64Exp( a );

3363

bSig = extractFloat64Frac( b );

3364

bExp = extractFloat64Exp( b );

3365

expDiff = aExp - bExp;

3366

aSig <<= 9;

3367

bSig <<= 9;

3368

if ( 0 < expDiff ) {

3369

if ( aExp == 0x7FF ) {

3370

if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );

3371

return a;

3372

}

3373

if ( bExp == 0 ) {

3374

--expDiff;

3375

}

3376

else {

3377

bSig |= LIT64( 0x2000000000000000 );

3378

}

3379

shift64RightJamming( bSig, expDiff, &bSig );

3380

zExp = aExp;

3381

}

3382

else if ( expDiff < 0 ) {

3383

if ( bExp == 0x7FF ) {

3384

if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );

3385

return packFloat64( zSign, 0x7FF, 0 );

3386

}

3387

if ( aExp == 0 ) {

3388

++expDiff;

3389

}

3390

else {

3391

aSig |= LIT64( 0x2000000000000000 );

3392

}

3393

shift64RightJamming( aSig, - expDiff, &aSig );

3394

zExp = bExp;

3395

}

3396

else {

3397

if ( aExp == 0x7FF ) {

3398

if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );

3399

return a;

3400

}

3401

if ( aExp == 0 ) {

3402

if (STATUS(flush_to_zero)) {

3403

if (aSig | bSig) {

3404

float_raise(float_flag_output_denormal STATUS_VAR);

3405

}

3406

return packFloat64(zSign, 0, 0);

3407

}

3408

return packFloat64( zSign, 0, ( aSig + bSig )>>9 );

3409

}

3410

zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;

3411

zExp = aExp;

3412

goto roundAndPack;

3413

}

3414

aSig |= LIT64( 0x2000000000000000 );

3415

zSig = ( aSig + bSig )<<1;

3416

--zExp;

3417

if ( (int64_t) zSig < 0 ) {

3418

zSig = aSig + bSig;

3419

++zExp;

3420

}

3421

roundAndPack:

3422

return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );

3423

3424

}

3425

3426

/*----------------------------------------------------------------------------

3427

| Returns the result of subtracting the absolute values of the double-

3428

| precision floating-point values `a' and `b'. If `zSign' is 1, the

3429

| difference is negated before being returned. `zSign' is ignored if the

3430

| result is a NaN. The subtraction is performed according to the IEC/IEEE

3431

| Standard for Binary Floating-Point Arithmetic.

3432

*----------------------------------------------------------------------------*/

3433

3434

static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )

3435

{

3436

int_fast16_t aExp, bExp, zExp;

3437

uint64_t aSig, bSig, zSig;

3438

int_fast16_t expDiff;

3439

3440

aSig = extractFloat64Frac( a );

3441

aExp = extractFloat64Exp( a );

3442

bSig = extractFloat64Frac( b );

3443

bExp = extractFloat64Exp( b );

3444

expDiff = aExp - bExp;

3445

aSig <<= 10;

3446

bSig <<= 10;

3447

if ( 0 < expDiff ) goto aExpBigger;

3448

if ( expDiff < 0 ) goto bExpBigger;

3449

if ( aExp == 0x7FF ) {

3450

if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );

3451

float_raise( float_flag_invalid STATUS_VAR);

3452

return float64_default_nan;

3453

}

3454

if ( aExp == 0 ) {

3455

aExp = 1;

3456

bExp = 1;

3457

}

3458

if ( bSig < aSig ) goto aBigger;

3459

if ( aSig < bSig ) goto bBigger;

3460

return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );

3461

bExpBigger:

3462

if ( bExp == 0x7FF ) {

3463

if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );

3464

return packFloat64( zSign ^ 1, 0x7FF, 0 );

3465

}

3466

if ( aExp == 0 ) {

3467

++expDiff;

3468

}

3469

else {

3470

aSig |= LIT64( 0x4000000000000000 );

3471

}

3472

shift64RightJamming( aSig, - expDiff, &aSig );

3473

bSig |= LIT64( 0x4000000000000000 );

3474

bBigger:

3475

zSig = bSig - aSig;

3476

zExp = bExp;

3477

zSign ^= 1;

3478

goto normalizeRoundAndPack;

3479

aExpBigger:

3480

if ( aExp == 0x7FF ) {

3481

if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );

3482

return a;

3483

}

3484

if ( bExp == 0 ) {

3485

--expDiff;

3486

}

3487

else {

3488

bSig |= LIT64( 0x4000000000000000 );

3489

}

3490

shift64RightJamming( bSig, expDiff, &bSig );

3491

aSig |= LIT64( 0x4000000000000000 );

3492

aBigger:

3493

zSig = aSig - bSig;

3494

zExp = aExp;

3495

normalizeRoundAndPack:

3496

--zExp;

3497

return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );

3498

3499

}

3500

3501

/*----------------------------------------------------------------------------

3502

| Returns the result of adding the double-precision floating-point values `a'

3503

| and `b'. The operation is performed according to the IEC/IEEE Standard for

3504

| Binary Floating-Point Arithmetic.

3505

*----------------------------------------------------------------------------*/

3506

3507

float64 float64_add( float64 a, float64 b STATUS_PARAM )

3508

{

3509

flag aSign, bSign;

3510

a = float64_squash_input_denormal(a STATUS_VAR);

3511

b = float64_squash_input_denormal(b STATUS_VAR);

3512

3513

aSign = extractFloat64Sign( a );

3514

bSign = extractFloat64Sign( b );

3515

if ( aSign == bSign ) {

3516

return addFloat64Sigs( a, b, aSign STATUS_VAR );

3517

}

3518

else {

3519

return subFloat64Sigs( a, b, aSign STATUS_VAR );

3520

}

3521

3522

}

3523

3524

/*----------------------------------------------------------------------------

3525

| Returns the result of subtracting the double-precision floating-point values

3526

| `a' and `b'. The operation is performed according to the IEC/IEEE Standard

3527

| for Binary Floating-Point Arithmetic.

3528

*----------------------------------------------------------------------------*/

3529

3530

float64 float64_sub( float64 a, float64 b STATUS_PARAM )

3531

{

3532

flag aSign, bSign;

3533

a = float64_squash_input_denormal(a STATUS_VAR);

3534

b = float64_squash_input_denormal(b STATUS_VAR);

3535

3536

aSign = extractFloat64Sign( a );

3537

bSign = extractFloat64Sign( b );

3538

if ( aSign == bSign ) {

3539

return subFloat64Sigs( a, b, aSign STATUS_VAR );

3540

}

3541

else {

3542

return addFloat64Sigs( a, b, aSign STATUS_VAR );

3543

}

3544

3545

}

3546

3547

/*----------------------------------------------------------------------------

3548

| Returns the result of multiplying the double-precision floating-point values

3549

| `a' and `b'. The operation is performed according to the IEC/IEEE Standard

3550

| for Binary Floating-Point Arithmetic.

3551

*----------------------------------------------------------------------------*/

3552

3553

float64 float64_mul( float64 a, float64 b STATUS_PARAM )

3554

{

3555

flag aSign, bSign, zSign;

3556

int_fast16_t aExp, bExp, zExp;

3557

uint64_t aSig, bSig, zSig0, zSig1;

3558

3559

a = float64_squash_input_denormal(a STATUS_VAR);

3560

b = float64_squash_input_denormal(b STATUS_VAR);

3561

3562

aSig = extractFloat64Frac( a );

3563

aExp = extractFloat64Exp( a );

3564

aSign = extractFloat64Sign( a );

3565

bSig = extractFloat64Frac( b );

3566

bExp = extractFloat64Exp( b );

3567

bSign = extractFloat64Sign( b );

3568

zSign = aSign ^ bSign;

3569

if ( aExp == 0x7FF ) {

3570

if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {

3571

return propagateFloat64NaN( a, b STATUS_VAR );

3572

}

3573

if ( ( bExp | bSig ) == 0 ) {

3574

float_raise( float_flag_invalid STATUS_VAR);

3575

return float64_default_nan;

3576

}

3577

return packFloat64( zSign, 0x7FF, 0 );

3578

}

3579

if ( bExp == 0x7FF ) {

3580

if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );

3581

if ( ( aExp | aSig ) == 0 ) {

3582

float_raise( float_flag_invalid STATUS_VAR);

3583

return float64_default_nan;

3584

}

3585

return packFloat64( zSign, 0x7FF, 0 );

3586

}

3587

if ( aExp == 0 ) {

3588

if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );

3589

normalizeFloat64Subnormal( aSig, &aExp, &aSig );

3590

}

3591

if ( bExp == 0 ) {

3592

if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );

3593

normalizeFloat64Subnormal( bSig, &bExp, &bSig );

3594

}

3595

zExp = aExp + bExp - 0x3FF;

3596

aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;

3597

bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;

3598

mul64To128( aSig, bSig, &zSig0, &zSig1 );

3599

zSig0 |= ( zSig1 != 0 );

3600

if ( 0 <= (int64_t) ( zSig0<<1 ) ) {

3601

zSig0 <<= 1;

3602

--zExp;

3603

}

3604

return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );

3605

3606

}

3607

3608

/*----------------------------------------------------------------------------

3609

| Returns the result of dividing the double-precision floating-point value `a'

3610

| by the corresponding value `b'. The operation is performed according to

3611

| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

3612

*----------------------------------------------------------------------------*/

3613

3614

float64 float64_div( float64 a, float64 b STATUS_PARAM )

3615

{

3616

flag aSign, bSign, zSign;

3617

int_fast16_t aExp, bExp, zExp;

3618

uint64_t aSig, bSig, zSig;

3619

uint64_t rem0, rem1;

3620

uint64_t term0, term1;

3621

a = float64_squash_input_denormal(a STATUS_VAR);

3622

b = float64_squash_input_denormal(b STATUS_VAR);

3623

3624

aSig = extractFloat64Frac( a );

3625

aExp = extractFloat64Exp( a );

3626

aSign = extractFloat64Sign( a );

3627

bSig = extractFloat64Frac( b );

3628

bExp = extractFloat64Exp( b );

3629

bSign = extractFloat64Sign( b );

3630

zSign = aSign ^ bSign;

3631

if ( aExp == 0x7FF ) {

3632

if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );

3633

if ( bExp == 0x7FF ) {

3634

if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );

3635

float_raise( float_flag_invalid STATUS_VAR);

3636

return float64_default_nan;

3637

}

3638

return packFloat64( zSign, 0x7FF, 0 );

3639

}

3640

if ( bExp == 0x7FF ) {

3641

if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );

3642

return packFloat64( zSign, 0, 0 );

3643

}

3644

if ( bExp == 0 ) {

3645

if ( bSig == 0 ) {

3646

if ( ( aExp | aSig ) == 0 ) {

3647

float_raise( float_flag_invalid STATUS_VAR);

3648

return float64_default_nan;

3649

}

3650

float_raise( float_flag_divbyzero STATUS_VAR);

3651

return packFloat64( zSign, 0x7FF, 0 );

3652

}

3653

normalizeFloat64Subnormal( bSig, &bExp, &bSig );

3654

}

3655

if ( aExp == 0 ) {

3656

if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );

3657

normalizeFloat64Subnormal( aSig, &aExp, &aSig );

3658

}

3659

zExp = aExp - bExp + 0x3FD;

3660

aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;

3661

bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;

3662

if ( bSig <= ( aSig + aSig ) ) {

3663

aSig >>= 1;

3664

++zExp;

3665

}

3666

zSig = estimateDiv128To64( aSig, 0, bSig );

3667

if ( ( zSig & 0x1FF ) <= 2 ) {

3668

mul64To128( bSig, zSig, &term0, &term1 );

3669

sub128( aSig, 0, term0, term1, &rem0, &rem1 );

3670

while ( (int64_t) rem0 < 0 ) {

3671

--zSig;

3672

add128( rem0, rem1, 0, bSig, &rem0, &rem1 );

3673

}

3674

zSig |= ( rem1 != 0 );

3675

}

3676

return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );

3677

3678

}

3679

3680

/*----------------------------------------------------------------------------

3681

| Returns the remainder of the double-precision floating-point value `a'

3682

| with respect to the corresponding value `b'. The operation is performed

3683

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

3684

*----------------------------------------------------------------------------*/

3685

3686

float64 float64_rem( float64 a, float64 b STATUS_PARAM )

3687

{

3688

flag aSign, zSign;

3689

int_fast16_t aExp, bExp, expDiff;

3690

uint64_t aSig, bSig;

3691

uint64_t q, alternateASig;

3692

int64_t sigMean;

3693

3694

a = float64_squash_input_denormal(a STATUS_VAR);

3695

b = float64_squash_input_denormal(b STATUS_VAR);

3696

aSig = extractFloat64Frac( a );

3697

aExp = extractFloat64Exp( a );

3698

aSign = extractFloat64Sign( a );

3699

bSig = extractFloat64Frac( b );

3700

bExp = extractFloat64Exp( b );

3701

if ( aExp == 0x7FF ) {

3702

if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {

3703

return propagateFloat64NaN( a, b STATUS_VAR );

3704

}

3705

float_raise( float_flag_invalid STATUS_VAR);

3706

return float64_default_nan;

3707

}

3708

if ( bExp == 0x7FF ) {

3709

if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );

3710

return a;

3711

}

3712

if ( bExp == 0 ) {

3713

if ( bSig == 0 ) {

3714

float_raise( float_flag_invalid STATUS_VAR);

3715

return float64_default_nan;

3716

}

3717

normalizeFloat64Subnormal( bSig, &bExp, &bSig );

3718

}

3719

if ( aExp == 0 ) {

3720

if ( aSig == 0 ) return a;

3721

normalizeFloat64Subnormal( aSig, &aExp, &aSig );

3722

}

3723

expDiff = aExp - bExp;

3724

aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;

3725

bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;

3726

if ( expDiff < 0 ) {

3727

if ( expDiff < -1 ) return a;

3728

aSig >>= 1;

3729

}

3730

q = ( bSig <= aSig );

3731

if ( q ) aSig -= bSig;

3732

expDiff -= 64;

3733

while ( 0 < expDiff ) {

3734

q = estimateDiv128To64( aSig, 0, bSig );

3735

q = ( 2 < q ) ? q - 2 : 0;

3736

aSig = - ( ( bSig>>2 ) * q );

3737

expDiff -= 62;

3738

}

3739

expDiff += 64;

3740

if ( 0 < expDiff ) {

3741

q = estimateDiv128To64( aSig, 0, bSig );

3742

q = ( 2 < q ) ? q - 2 : 0;

3743

q >>= 64 - expDiff;

3744

bSig >>= 2;

3745

aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;

3746

}

3747

else {

3748

aSig >>= 2;

3749

bSig >>= 2;

3750

}

3751

do {

3752

alternateASig = aSig;

3753

++q;

3754

aSig -= bSig;

3755

} while ( 0 <= (int64_t) aSig );

3756

sigMean = aSig + alternateASig;

3757

if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {

3758

aSig = alternateASig;

3759

}

3760

zSign = ( (int64_t) aSig < 0 );

3761

if ( zSign ) aSig = - aSig;

3762

return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );

3763

3764

}

3765

3766

/*----------------------------------------------------------------------------

3767

| Returns the result of multiplying the double-precision floating-point values

3768

| `a' and `b' then adding 'c', with no intermediate rounding step after the

3769

| multiplication. The operation is performed according to the IEC/IEEE

3770

| Standard for Binary Floating-Point Arithmetic 754-2008.

3771

| The flags argument allows the caller to select negation of the

3772

| addend, the intermediate product, or the final result. (The difference

3773

| between this and having the caller do a separate negation is that negating

3774

| externally will flip the sign bit on NaNs.)

3775

*----------------------------------------------------------------------------*/

3776

3777

float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM)

3778

{

3779

flag aSign, bSign, cSign, zSign;

3780

int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;

3781

uint64_t aSig, bSig, cSig;

3782

flag pInf, pZero, pSign;

3783

uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;

3784

int shiftcount;

3785

flag signflip, infzero;

3786

3787

a = float64_squash_input_denormal(a STATUS_VAR);

3788

b = float64_squash_input_denormal(b STATUS_VAR);

3789

c = float64_squash_input_denormal(c STATUS_VAR);

3790

aSig = extractFloat64Frac(a);

3791

aExp = extractFloat64Exp(a);

3792

aSign = extractFloat64Sign(a);

3793

bSig = extractFloat64Frac(b);

3794

bExp = extractFloat64Exp(b);

3795

bSign = extractFloat64Sign(b);

3796

cSig = extractFloat64Frac(c);

3797

cExp = extractFloat64Exp(c);

3798

cSign = extractFloat64Sign(c);

3799

3800

infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||

3801

(aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));

3802

3803

/* It is implementation-defined whether the cases of (0,inf,qnan)

3804

* and (inf,0,qnan) raise InvalidOperation or not (and what QNaN

3805

* they return if they do), so we have to hand this information

3806

* off to the target-specific pick-a-NaN routine.

3807

3808

if (((aExp == 0x7ff) && aSig) ||

3809

((bExp == 0x7ff) && bSig) ||

3810

((cExp == 0x7ff) && cSig)) {

3811

return propagateFloat64MulAddNaN(a, b, c, infzero STATUS_VAR);

3812

}

3813

3814

if (infzero) {

3815

float_raise(float_flag_invalid STATUS_VAR);

3816

return float64_default_nan;

3817

}

3818

3819

if (flags & float_muladd_negate_c) {

3820

cSign ^= 1;

3821

}

3822

3823

signflip = (flags & float_muladd_negate_result) ? 1 : 0;

3824

3825

/* Work out the sign and type of the product */

3826

pSign = aSign ^ bSign;

3827

if (flags & float_muladd_negate_product) {

3828

pSign ^= 1;

3829

}

3830

pInf = (aExp == 0x7ff) || (bExp == 0x7ff);

3831

pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);

3832

3833

if (cExp == 0x7ff) {

3834

if (pInf && (pSign ^ cSign)) {

3835

/* addition of opposite-signed infinities => InvalidOperation */

3836

float_raise(float_flag_invalid STATUS_VAR);

3837

return float64_default_nan;

3838

}

3839

/* Otherwise generate an infinity of the same sign */

3840

return packFloat64(cSign ^ signflip, 0x7ff, 0);

3841

}

3842

3843

if (pInf) {

3844

return packFloat64(pSign ^ signflip, 0x7ff, 0);

3845

}

3846

3847

if (pZero) {

3848

if (cExp == 0) {

3849

if (cSig == 0) {

3850

/* Adding two exact zeroes */

3851

if (pSign == cSign) {

3852

zSign = pSign;

3853

} else if (STATUS(float_rounding_mode) == float_round_down) {

3854

zSign = 1;

3855

} else {

3856

zSign = 0;

3857

}

3858

return packFloat64(zSign ^ signflip, 0, 0);

3859

}

3860

/* Exact zero plus a denorm */

3861

if (STATUS(flush_to_zero)) {

3862

float_raise(float_flag_output_denormal STATUS_VAR);

3863

return packFloat64(cSign ^ signflip, 0, 0);

3864

}

3865

}

3866

/* Zero plus something non-zero : just return the something */

3867

return packFloat64(cSign ^ signflip, cExp, cSig);

3868

}

3869

3870

if (aExp == 0) {

3871

normalizeFloat64Subnormal(aSig, &aExp, &aSig);

3872

}

3873

if (bExp == 0) {

3874

normalizeFloat64Subnormal(bSig, &bExp, &bSig);

3875

}

3876

3877

/* Calculate the actual result a * b + c */

3878

3879

/* Multiply first; this is easy. */

3880

/* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff

3881

* because we want the true exponent, not the "one-less-than"

3882

* flavour that roundAndPackFloat64() takes.

3883

3884

pExp = aExp + bExp - 0x3fe;

3885

aSig = (aSig | LIT64(0x0010000000000000))<<10;

3886

bSig = (bSig | LIT64(0x0010000000000000))<<11;

3887

mul64To128(aSig, bSig, &pSig0, &pSig1);

3888

if ((int64_t)(pSig0 << 1) >= 0) {

3889

shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);

3890

pExp--;

3891

}

3892

3893

zSign = pSign ^ signflip;

3894

3895

/* Now [pSig0:pSig1] is the significand of the multiply, with the explicit

3896

* bit in position 126.

3897

3898

if (cExp == 0) {

3899

if (!cSig) {

3900

/* Throw out the special case of c being an exact zero now */

3901

shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);

3902

return roundAndPackFloat64(zSign, pExp - 1,

3903

pSig1 STATUS_VAR);

3904

}

3905

normalizeFloat64Subnormal(cSig, &cExp, &cSig);

3906

}

3907

3908

/* Shift cSig and add the explicit bit so [cSig0:cSig1] is the

3909

* significand of the addend, with the explicit bit in position 126.

3910

3911

cSig0 = cSig << (126 - 64 - 52);

3912

cSig1 = 0;

3913

cSig0 |= LIT64(0x4000000000000000);

3914

expDiff = pExp - cExp;

3915

3916

if (pSign == cSign) {

3917

/* Addition */

3918

if (expDiff > 0) {

3919

/* scale c to match p */

3920

shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);

3921

zExp = pExp;

3922

} else if (expDiff < 0) {

3923

/* scale p to match c */

3924

shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);

3925

zExp = cExp;

3926

} else {

3927

/* no scaling needed */

3928

zExp = cExp;

3929

}

3930

/* Add significands and make sure explicit bit ends up in posn 126 */

3931

add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);

3932

if ((int64_t)zSig0 < 0) {

3933

shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);

3934

} else {

3935

zExp--;

3936

}

3937

shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);

3938

return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR);

3939

} else {

3940

/* Subtraction */

3941

if (expDiff > 0) {

3942

shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);

3943

sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);

3944

zExp = pExp;

3945

} else if (expDiff < 0) {

3946

shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);

3947

sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);

3948

zExp = cExp;

3949

zSign ^= 1;

3950

} else {

3951

zExp = pExp;

3952

if (lt128(cSig0, cSig1, pSig0, pSig1)) {

3953

sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);

3954

} else if (lt128(pSig0, pSig1, cSig0, cSig1)) {

3955

sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);

3956

zSign ^= 1;

3957

} else {

3958

/* Exact zero */

3959

zSign = signflip;

3960

if (STATUS(float_rounding_mode) == float_round_down) {

3961

zSign ^= 1;

3962

}

3963

return packFloat64(zSign, 0, 0);

3964

}

3965

}

3966

--zExp;

3967

/* Do the equivalent of normalizeRoundAndPackFloat64() but

3968

* starting with the significand in a pair of uint64_t.

3969

3970

if (zSig0) {

3971

shiftcount = countLeadingZeros64(zSig0) - 1;

3972

shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);

3973

if (zSig1) {

3974

zSig0 |= 1;

3975

}

3976

zExp -= shiftcount;

3977

} else {

3978

shiftcount = countLeadingZeros64(zSig1);

3979

if (shiftcount == 0) {

3980

zSig0 = (zSig1 >> 1) | (zSig1 & 1);

3981

zExp -= 63;

3982

} else {

3983

shiftcount--;

3984

zSig0 = zSig1 << shiftcount;

3985

zExp -= (shiftcount + 64);

3986

}

3987

}

3988

return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR);

3989

}

3990

}

3991

3992

/*----------------------------------------------------------------------------

3993

| Returns the square root of the double-precision floating-point value `a'.

3994

| The operation is performed according to the IEC/IEEE Standard for Binary

3995

| Floating-Point Arithmetic.

3996

*----------------------------------------------------------------------------*/

3997

3998

float64 float64_sqrt( float64 a STATUS_PARAM )

3999

{

4000

flag aSign;

4001

int_fast16_t aExp, zExp;

4002

uint64_t aSig, zSig, doubleZSig;

4003

uint64_t rem0, rem1, term0, term1;

4004

a = float64_squash_input_denormal(a STATUS_VAR);

4005

4006

aSig = extractFloat64Frac( a );

4007

aExp = extractFloat64Exp( a );

4008

aSign = extractFloat64Sign( a );

4009

if ( aExp == 0x7FF ) {

4010

if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );

4011

if ( ! aSign ) return a;

4012

float_raise( float_flag_invalid STATUS_VAR);

4013

return float64_default_nan;

4014

}

4015

if ( aSign ) {

4016

if ( ( aExp | aSig ) == 0 ) return a;

4017

float_raise( float_flag_invalid STATUS_VAR);

4018

return float64_default_nan;

4019

}

4020

if ( aExp == 0 ) {

4021

if ( aSig == 0 ) return float64_zero;

4022

normalizeFloat64Subnormal( aSig, &aExp, &aSig );

4023

}

4024

zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;

4025

aSig |= LIT64( 0x0010000000000000 );

4026

zSig = estimateSqrt32( aExp, aSig>>21 );

4027

aSig <<= 9 - ( aExp & 1 );

4028

zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );

4029

if ( ( zSig & 0x1FF ) <= 5 ) {

4030

doubleZSig = zSig<<1;

4031

mul64To128( zSig, zSig, &term0, &term1 );

4032

sub128( aSig, 0, term0, term1, &rem0, &rem1 );

4033

while ( (int64_t) rem0 < 0 ) {

4034

--zSig;

4035

doubleZSig -= 2;

4036

add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );

4037

}

4038

zSig |= ( ( rem0 | rem1 ) != 0 );

4039

}

4040

return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );

4041

4042

}

4043

4044

/*----------------------------------------------------------------------------

4045

| Returns the binary log of the double-precision floating-point value `a'.

4046

| The operation is performed according to the IEC/IEEE Standard for Binary

4047

| Floating-Point Arithmetic.

4048

*----------------------------------------------------------------------------*/

4049

float64 float64_log2( float64 a STATUS_PARAM )

4050

{

4051

flag aSign, zSign;

4052

int_fast16_t aExp;

4053

uint64_t aSig, aSig0, aSig1, zSig, i;

4054

a = float64_squash_input_denormal(a STATUS_VAR);

4055

4056

aSig = extractFloat64Frac( a );

4057

aExp = extractFloat64Exp( a );

4058

aSign = extractFloat64Sign( a );

4059

4060

if ( aExp == 0 ) {

4061

if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );

4062

normalizeFloat64Subnormal( aSig, &aExp, &aSig );

4063

}

4064

if ( aSign ) {

4065

float_raise( float_flag_invalid STATUS_VAR);

4066

return float64_default_nan;

4067

}

4068

if ( aExp == 0x7FF ) {

4069

if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );

4070

return a;

4071

}

4072

4073

aExp -= 0x3FF;

4074

aSig |= LIT64( 0x0010000000000000 );

4075

zSign = aExp < 0;

4076

zSig = (uint64_t)aExp << 52;

4077

for (i = 1LL << 51; i > 0; i >>= 1) {

4078

mul64To128( aSig, aSig, &aSig0, &aSig1 );

4079

aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );

4080

if ( aSig & LIT64( 0x0020000000000000 ) ) {

4081

aSig >>= 1;

4082

zSig |= i;

4083

}

4084

}

4085

4086

if ( zSign )

4087

zSig = -zSig;

4088

return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );

4089

}

4090

4091

/*----------------------------------------------------------------------------

4092

| Returns 1 if the double-precision floating-point value `a' is equal to the

4093

| corresponding value `b', and 0 otherwise. The invalid exception is raised

4094

| if either operand is a NaN. Otherwise, the comparison is performed

4095

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

4096

*----------------------------------------------------------------------------*/

4097

4098

int float64_eq( float64 a, float64 b STATUS_PARAM )

4099

{

4100

uint64_t av, bv;

4101

a = float64_squash_input_denormal(a STATUS_VAR);

4102

b = float64_squash_input_denormal(b STATUS_VAR);

4103

4104

if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )

4105

|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )

4106

) {

4107

float_raise( float_flag_invalid STATUS_VAR);

4108

return 0;

4109

}

4110

av = float64_val(a);

4111

bv = float64_val(b);

4112

return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );

4113

4114

}

4115

4116

/*----------------------------------------------------------------------------

4117

| Returns 1 if the double-precision floating-point value `a' is less than or

4118

| equal to the corresponding value `b', and 0 otherwise. The invalid

4119

| exception is raised if either operand is a NaN. The comparison is performed

4120

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

4121

*----------------------------------------------------------------------------*/

4122

4123

int float64_le( float64 a, float64 b STATUS_PARAM )

4124

{

4125

flag aSign, bSign;

4126

uint64_t av, bv;

4127

a = float64_squash_input_denormal(a STATUS_VAR);

4128

b = float64_squash_input_denormal(b STATUS_VAR);

4129

4130

if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )

4131

|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )

4132

) {

4133

float_raise( float_flag_invalid STATUS_VAR);

4134

return 0;

4135

}

4136

aSign = extractFloat64Sign( a );

4137

bSign = extractFloat64Sign( b );

4138

av = float64_val(a);

4139

bv = float64_val(b);

4140

if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );

4141

return ( av == bv ) || ( aSign ^ ( av < bv ) );

4142

4143

}

4144

4145

/*----------------------------------------------------------------------------

4146

| Returns 1 if the double-precision floating-point value `a' is less than

4147

| the corresponding value `b', and 0 otherwise. The invalid exception is

4148

| raised if either operand is a NaN. The comparison is performed according

4149

| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

4150

*----------------------------------------------------------------------------*/

4151

4152

int float64_lt( float64 a, float64 b STATUS_PARAM )

4153

{

4154

flag aSign, bSign;

4155

uint64_t av, bv;

4156

4157

a = float64_squash_input_denormal(a STATUS_VAR);

4158

b = float64_squash_input_denormal(b STATUS_VAR);

4159

if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )

4160

|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )

4161

) {

4162

float_raise( float_flag_invalid STATUS_VAR);

4163

return 0;

4164

}

4165

aSign = extractFloat64Sign( a );

4166

bSign = extractFloat64Sign( b );

4167

av = float64_val(a);

4168

bv = float64_val(b);

4169

if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );

4170

return ( av != bv ) && ( aSign ^ ( av < bv ) );

4171

4172

}

4173

4174

/*----------------------------------------------------------------------------

4175

| Returns 1 if the double-precision floating-point values `a' and `b' cannot

4176

| be compared, and 0 otherwise. The invalid exception is raised if either

4177

| operand is a NaN. The comparison is performed according to the IEC/IEEE

4178

| Standard for Binary Floating-Point Arithmetic.

4179

*----------------------------------------------------------------------------*/

4180

4181

int float64_unordered( float64 a, float64 b STATUS_PARAM )

4182

{

4183

a = float64_squash_input_denormal(a STATUS_VAR);

4184

b = float64_squash_input_denormal(b STATUS_VAR);

4185

4186

if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )

4187

|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )

4188

) {

4189

float_raise( float_flag_invalid STATUS_VAR);

4190

return 1;

4191

}

4192

return 0;

4193

}

4194

4195

/*----------------------------------------------------------------------------

4196

| Returns 1 if the double-precision floating-point value `a' is equal to the

4197

| corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an

4198

| exception.The comparison is performed according to the IEC/IEEE Standard

4199

| for Binary Floating-Point Arithmetic.

4200

*----------------------------------------------------------------------------*/

4201

4202

int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )

4203

{

4204

uint64_t av, bv;

4205

a = float64_squash_input_denormal(a STATUS_VAR);

4206

b = float64_squash_input_denormal(b STATUS_VAR);

4207

4208

if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )

4209

|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )

4210

) {

4211

if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {

4212

float_raise( float_flag_invalid STATUS_VAR);

4213

}

4214

return 0;

4215

}

4216

av = float64_val(a);

4217

bv = float64_val(b);

4218

return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );

4219

4220

}

4221

4222

/*----------------------------------------------------------------------------

4223

| Returns 1 if the double-precision floating-point value `a' is less than or

4224

| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not

4225

| cause an exception. Otherwise, the comparison is performed according to the

4226

| IEC/IEEE Standard for Binary Floating-Point Arithmetic.

4227

*----------------------------------------------------------------------------*/

4228

4229

int float64_le_quiet( float64 a, float64 b STATUS_PARAM )

4230

{

4231

flag aSign, bSign;

4232

uint64_t av, bv;

4233

a = float64_squash_input_denormal(a STATUS_VAR);

4234

b = float64_squash_input_denormal(b STATUS_VAR);

4235

4236

if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )

4237

|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )

4238

) {

4239

if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {

4240

float_raise( float_flag_invalid STATUS_VAR);

4241

}

4242

return 0;

4243

}

4244

aSign = extractFloat64Sign( a );

4245

bSign = extractFloat64Sign( b );

4246

av = float64_val(a);

4247

bv = float64_val(b);

4248

if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );

4249

return ( av == bv ) || ( aSign ^ ( av < bv ) );

4250

4251

}

4252

4253

/*----------------------------------------------------------------------------

4254

| Returns 1 if the double-precision floating-point value `a' is less than

4255

| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an

4256

| exception. Otherwise, the comparison is performed according to the IEC/IEEE

4257

| Standard for Binary Floating-Point Arithmetic.

4258

*----------------------------------------------------------------------------*/

4259

4260

int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )

4261

{

4262

flag aSign, bSign;

4263

uint64_t av, bv;

4264

a = float64_squash_input_denormal(a STATUS_VAR);

4265

b = float64_squash_input_denormal(b STATUS_VAR);

4266

4267

if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )

4268

|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )

4269

) {

4270

if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {

4271

float_raise( float_flag_invalid STATUS_VAR);

4272

}

4273

return 0;

4274

}

4275

aSign = extractFloat64Sign( a );

4276

bSign = extractFloat64Sign( b );

4277

av = float64_val(a);

4278

bv = float64_val(b);

4279

if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );

4280

return ( av != bv ) && ( aSign ^ ( av < bv ) );

4281

4282

}

4283

4284

/*----------------------------------------------------------------------------

4285

| Returns 1 if the double-precision floating-point values `a' and `b' cannot

4286

| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The

4287

| comparison is performed according to the IEC/IEEE Standard for Binary

4288

| Floating-Point Arithmetic.

4289

*----------------------------------------------------------------------------*/

4290

4291

int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )

4292

{

4293

a = float64_squash_input_denormal(a STATUS_VAR);

4294

b = float64_squash_input_denormal(b STATUS_VAR);

4295

4296

if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )

4297

|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )

4298

) {

4299

if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {

4300

float_raise( float_flag_invalid STATUS_VAR);

4301

}

4302

return 1;

4303

}

4304

return 0;

4305

}

4306

4307

/*----------------------------------------------------------------------------

4308

| Returns the result of converting the extended double-precision floating-

4309

| point value `a' to the 32-bit two's complement integer format. The

4310

| conversion is performed according to the IEC/IEEE Standard for Binary

4311

| Floating-Point Arithmetic---which means in particular that the conversion

4312

| is rounded according to the current rounding mode. If `a' is a NaN, the

4313

| largest positive integer is returned. Otherwise, if the conversion

4314

| overflows, the largest integer with the same sign as `a' is returned.

4315

*----------------------------------------------------------------------------*/

4316

4317

int32 floatx80_to_int32( floatx80 a STATUS_PARAM )

4318

{

4319

flag aSign;

4320

int32 aExp, shiftCount;

4321

uint64_t aSig;

4322

4323

aSig = extractFloatx80Frac( a );

4324

aExp = extractFloatx80Exp( a );

4325

aSign = extractFloatx80Sign( a );

4326

if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;

4327

shiftCount = 0x4037 - aExp;

4328

if ( shiftCount <= 0 ) shiftCount = 1;

4329

shift64RightJamming( aSig, shiftCount, &aSig );

4330

return roundAndPackInt32( aSign, aSig STATUS_VAR );

4331

4332

}

4333

4334

/*----------------------------------------------------------------------------

4335

| Returns the result of converting the extended double-precision floating-

4336

| point value `a' to the 32-bit two's complement integer format. The

4337

| conversion is performed according to the IEC/IEEE Standard for Binary

4338

| Floating-Point Arithmetic, except that the conversion is always rounded

4339

| toward zero. If `a' is a NaN, the largest positive integer is returned.

4340

| Otherwise, if the conversion overflows, the largest integer with the same

4341

| sign as `a' is returned.

4342

*----------------------------------------------------------------------------*/

4343

4344

int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )

4345

{

4346

flag aSign;

4347

int32 aExp, shiftCount;

4348

uint64_t aSig, savedASig;

4349

int32_t z;

4350

4351

aSig = extractFloatx80Frac( a );

4352

aExp = extractFloatx80Exp( a );

4353

aSign = extractFloatx80Sign( a );

4354

if ( 0x401E < aExp ) {

4355

if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;

4356

goto invalid;

4357

}

4358

else if ( aExp < 0x3FFF ) {

4359

if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;

4360

return 0;

4361

}

4362

shiftCount = 0x403E - aExp;

4363

savedASig = aSig;

4364

aSig >>= shiftCount;

4365

z = aSig;

4366

if ( aSign ) z = - z;

4367

if ( ( z < 0 ) ^ aSign ) {

4368

invalid:

4369

float_raise( float_flag_invalid STATUS_VAR);

4370

return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;

4371

}

4372

if ( ( aSig<<shiftCount ) != savedASig ) {

4373

STATUS(float_exception_flags) |= float_flag_inexact;

4374

}

4375

return z;

4376

4377

}

4378

4379

/*----------------------------------------------------------------------------

4380

| Returns the result of converting the extended double-precision floating-

4381

| point value `a' to the 64-bit two's complement integer format. The

4382

| conversion is performed according to the IEC/IEEE Standard for Binary

4383

| Floating-Point Arithmetic---which means in particular that the conversion

4384

| is rounded according to the current rounding mode. If `a' is a NaN,

4385

| the largest positive integer is returned. Otherwise, if the conversion

4386

| overflows, the largest integer with the same sign as `a' is returned.

4387

*----------------------------------------------------------------------------*/

4388

4389

int64 floatx80_to_int64( floatx80 a STATUS_PARAM )

4390

{

4391

flag aSign;

4392

int32 aExp, shiftCount;

4393

uint64_t aSig, aSigExtra;

4394

4395

aSig = extractFloatx80Frac( a );

4396

aExp = extractFloatx80Exp( a );

4397

aSign = extractFloatx80Sign( a );

4398

shiftCount = 0x403E - aExp;

4399

if ( shiftCount <= 0 ) {

4400

if ( shiftCount ) {

4401

float_raise( float_flag_invalid STATUS_VAR);

4402

if ( ! aSign

4403

|| ( ( aExp == 0x7FFF )

4404

&& ( aSig != LIT64( 0x8000000000000000 ) ) )

4405

) {

4406

return LIT64( 0x7FFFFFFFFFFFFFFF );

4407

}

4408

return (int64_t) LIT64( 0x8000000000000000 );

4409

}

4410

aSigExtra = 0;

4411

}

4412

else {

4413

shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );

4414

}

4415

return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );

4416

4417

}

4418

4419

/*----------------------------------------------------------------------------

4420

| Returns the result of converting the extended double-precision floating-

4421

| point value `a' to the 64-bit two's complement integer format. The

4422

| conversion is performed according to the IEC/IEEE Standard for Binary

4423

| Floating-Point Arithmetic, except that the conversion is always rounded

4424

| toward zero. If `a' is a NaN, the largest positive integer is returned.

4425

| Otherwise, if the conversion overflows, the largest integer with the same

4426

| sign as `a' is returned.

4427

*----------------------------------------------------------------------------*/

4428

4429

int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )

4430

{

4431

flag aSign;

4432

int32 aExp, shiftCount;

4433

uint64_t aSig;

4434

int64 z;

4435

4436

aSig = extractFloatx80Frac( a );

4437

aExp = extractFloatx80Exp( a );

4438

aSign = extractFloatx80Sign( a );

4439

shiftCount = aExp - 0x403E;

4440

if ( 0 <= shiftCount ) {

4441

aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );

4442

if ( ( a.high != 0xC03E ) || aSig ) {

4443

float_raise( float_flag_invalid STATUS_VAR);

4444

if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {

4445

return LIT64( 0x7FFFFFFFFFFFFFFF );

4446

}

4447

}

4448

return (int64_t) LIT64( 0x8000000000000000 );

4449

}

4450

else if ( aExp < 0x3FFF ) {

4451

if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;

4452

return 0;

4453

}

4454

z = aSig>>( - shiftCount );

4455

if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {

4456

STATUS(float_exception_flags) |= float_flag_inexact;

4457

}

4458

if ( aSign ) z = - z;

4459

return z;

4460

4461

}

4462

4463

/*----------------------------------------------------------------------------

4464

| Returns the result of converting the extended double-precision floating-

4465

| point value `a' to the single-precision floating-point format. The

4466

| conversion is performed according to the IEC/IEEE Standard for Binary

4467

| Floating-Point Arithmetic.

4468

*----------------------------------------------------------------------------*/

4469

4470

float32 floatx80_to_float32( floatx80 a STATUS_PARAM )

4471

{

4472

flag aSign;

4473

int32 aExp;

4474

uint64_t aSig;

4475

4476

aSig = extractFloatx80Frac( a );

4477

aExp = extractFloatx80Exp( a );

4478

aSign = extractFloatx80Sign( a );

4479

if ( aExp == 0x7FFF ) {

4480

if ( (uint64_t) ( aSig<<1 ) ) {

4481

return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );

4482

}

4483

return packFloat32( aSign, 0xFF, 0 );

4484

}

4485

shift64RightJamming( aSig, 33, &aSig );

4486

if ( aExp || aSig ) aExp -= 0x3F81;

4487

return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );

4488

4489

}

4490

4491

/*----------------------------------------------------------------------------

4492

| Returns the result of converting the extended double-precision floating-

4493

| point value `a' to the double-precision floating-point format. The

4494

| conversion is performed according to the IEC/IEEE Standard for Binary

4495

| Floating-Point Arithmetic.

4496

*----------------------------------------------------------------------------*/

4497

4498

float64 floatx80_to_float64( floatx80 a STATUS_PARAM )

4499

{

4500

flag aSign;

4501

int32 aExp;

4502

uint64_t aSig, zSig;

4503

4504

aSig = extractFloatx80Frac( a );

4505

aExp = extractFloatx80Exp( a );

4506

aSign = extractFloatx80Sign( a );

4507

if ( aExp == 0x7FFF ) {

4508

if ( (uint64_t) ( aSig<<1 ) ) {

4509

return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );

4510

}

4511

return packFloat64( aSign, 0x7FF, 0 );

4512

}

4513

shift64RightJamming( aSig, 1, &zSig );

4514

if ( aExp || aSig ) aExp -= 0x3C01;

4515

return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );

4516

4517

}

4518

4519

/*----------------------------------------------------------------------------

4520

| Returns the result of converting the extended double-precision floating-

4521

| point value `a' to the quadruple-precision floating-point format. The

4522

| conversion is performed according to the IEC/IEEE Standard for Binary

4523

| Floating-Point Arithmetic.

4524

*----------------------------------------------------------------------------*/

4525

4526

float128 floatx80_to_float128( floatx80 a STATUS_PARAM )

4527

{

4528

flag aSign;

4529

int_fast16_t aExp;

4530

uint64_t aSig, zSig0, zSig1;

4531

4532

aSig = extractFloatx80Frac( a );

4533

aExp = extractFloatx80Exp( a );

4534

aSign = extractFloatx80Sign( a );

4535

if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {

4536

return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );

4537

}

4538

shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );

4539

return packFloat128( aSign, aExp, zSig0, zSig1 );

4540

4541

}

4542

4543

/*----------------------------------------------------------------------------

4544

| Rounds the extended double-precision floating-point value `a' to an integer,

4545

| and returns the result as an extended quadruple-precision floating-point

4546

| value. The operation is performed according to the IEC/IEEE Standard for

4547

| Binary Floating-Point Arithmetic.

4548

*----------------------------------------------------------------------------*/

4549

4550

floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )

4551

{

4552

flag aSign;

4553

int32 aExp;

4554

uint64_t lastBitMask, roundBitsMask;

4555

int8 roundingMode;

4556

floatx80 z;

4557

4558

aExp = extractFloatx80Exp( a );

4559

if ( 0x403E <= aExp ) {

4560

if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {

4561

return propagateFloatx80NaN( a, a STATUS_VAR );

4562

}

4563

return a;

4564

}

4565

if ( aExp < 0x3FFF ) {

4566

if ( ( aExp == 0 )

4567

&& ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {

4568

return a;

4569

}

4570

STATUS(float_exception_flags) |= float_flag_inexact;

4571

aSign = extractFloatx80Sign( a );

4572

switch ( STATUS(float_rounding_mode) ) {

4573

case float_round_nearest_even:

4574

if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )

4575

) {

4576

return

4577

packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );

4578

}

4579

break;

4580

case float_round_down:

4581

return

4582

aSign ?

4583

packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )

4584

: packFloatx80( 0, 0, 0 );

4585

case float_round_up:

4586

return

4587

aSign ? packFloatx80( 1, 0, 0 )

4588

: packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );

4589

}

4590

return packFloatx80( aSign, 0, 0 );

4591

}

4592

lastBitMask = 1;

4593

lastBitMask <<= 0x403E - aExp;

4594

roundBitsMask = lastBitMask - 1;

4595

z = a;

4596

roundingMode = STATUS(float_rounding_mode);

4597

if ( roundingMode == float_round_nearest_even ) {

4598

z.low += lastBitMask>>1;

4599

if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;

4600

}

4601

else if ( roundingMode != float_round_to_zero ) {

4602

if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {

4603

z.low += roundBitsMask;

4604

}

4605

}

4606

z.low &= ~ roundBitsMask;

4607

if ( z.low == 0 ) {

4608

++z.high;

4609

z.low = LIT64( 0x8000000000000000 );

4610

}

4611

if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;

4612

return z;

4613

4614

}

4615

4616

/*----------------------------------------------------------------------------

4617

| Returns the result of adding the absolute values of the extended double-

4618

| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is

4619

| negated before being returned. `zSign' is ignored if the result is a NaN.

4620

| The addition is performed according to the IEC/IEEE Standard for Binary

4621

| Floating-Point Arithmetic.

4622

*----------------------------------------------------------------------------*/

4623

4624

static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)

4625

{

4626

int32 aExp, bExp, zExp;

4627

uint64_t aSig, bSig, zSig0, zSig1;

4628

int32 expDiff;

4629

4630

aSig = extractFloatx80Frac( a );

4631

aExp = extractFloatx80Exp( a );

4632

bSig = extractFloatx80Frac( b );

4633

bExp = extractFloatx80Exp( b );

4634

expDiff = aExp - bExp;

4635

if ( 0 < expDiff ) {

4636

if ( aExp == 0x7FFF ) {

4637

if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );

4638

return a;

4639

}

4640

if ( bExp == 0 ) --expDiff;

4641

shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );

4642

zExp = aExp;

4643

}

4644

else if ( expDiff < 0 ) {

4645

if ( bExp == 0x7FFF ) {

4646

if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );

4647

return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );

4648

}

4649

if ( aExp == 0 ) ++expDiff;

4650

shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );

4651

zExp = bExp;

4652

}

4653

else {

4654

if ( aExp == 0x7FFF ) {

4655

if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {

4656

return propagateFloatx80NaN( a, b STATUS_VAR );

4657

}

4658

return a;

4659

}

4660

zSig1 = 0;

4661

zSig0 = aSig + bSig;

4662

if ( aExp == 0 ) {

4663

normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );

4664

goto roundAndPack;

4665

}

4666

zExp = aExp;

4667

goto shiftRight1;

4668

}

4669

zSig0 = aSig + bSig;

4670

if ( (int64_t) zSig0 < 0 ) goto roundAndPack;

4671

shiftRight1:

4672

shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );

4673

zSig0 |= LIT64( 0x8000000000000000 );

4674

++zExp;

4675

roundAndPack:

4676

return

4677

roundAndPackFloatx80(

4678

STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );

4679

4680

}

4681

4682

/*----------------------------------------------------------------------------

4683

| Returns the result of subtracting the absolute values of the extended

4684

| double-precision floating-point values `a' and `b'. If `zSign' is 1, the

4685

| difference is negated before being returned. `zSign' is ignored if the

4686

| result is a NaN. The subtraction is performed according to the IEC/IEEE

4687

| Standard for Binary Floating-Point Arithmetic.

4688

*----------------------------------------------------------------------------*/

4689

4690

static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )

4691

{

4692

int32 aExp, bExp, zExp;

4693

uint64_t aSig, bSig, zSig0, zSig1;

4694

int32 expDiff;

4695

floatx80 z;

4696

4697

aSig = extractFloatx80Frac( a );

4698

aExp = extractFloatx80Exp( a );

4699

bSig = extractFloatx80Frac( b );

4700

bExp = extractFloatx80Exp( b );

4701

expDiff = aExp - bExp;

4702

if ( 0 < expDiff ) goto aExpBigger;

4703

if ( expDiff < 0 ) goto bExpBigger;

4704

if ( aExp == 0x7FFF ) {

4705

if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {

4706

return propagateFloatx80NaN( a, b STATUS_VAR );

4707

}

4708

float_raise( float_flag_invalid STATUS_VAR);

4709

z.low = floatx80_default_nan_low;

4710

z.high = floatx80_default_nan_high;

4711

return z;

4712

}

4713

if ( aExp == 0 ) {

4714

aExp = 1;

4715

bExp = 1;

4716

}

4717

zSig1 = 0;

4718

if ( bSig < aSig ) goto aBigger;

4719

if ( aSig < bSig ) goto bBigger;

4720

return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );

4721

bExpBigger:

4722

if ( bExp == 0x7FFF ) {

4723

if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );

4724

return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );

4725

}

4726

if ( aExp == 0 ) ++expDiff;

4727

shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );

4728

bBigger:

4729

sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );

4730

zExp = bExp;

4731

zSign ^= 1;

4732

goto normalizeRoundAndPack;

4733

aExpBigger:

4734

if ( aExp == 0x7FFF ) {

4735

if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );

4736

return a;

4737

}

4738

if ( bExp == 0 ) --expDiff;

4739

shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );

4740

aBigger:

4741

sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );

4742

zExp = aExp;

4743

normalizeRoundAndPack:

4744

return

4745

normalizeRoundAndPackFloatx80(

4746

STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );

4747

4748

}

4749

4750

/*----------------------------------------------------------------------------

4751

| Returns the result of adding the extended double-precision floating-point

4752

| values `a' and `b'. The operation is performed according to the IEC/IEEE

4753

| Standard for Binary Floating-Point Arithmetic.

4754

*----------------------------------------------------------------------------*/

4755

4756

floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )

4757

{

4758

flag aSign, bSign;

4759

4760

aSign = extractFloatx80Sign( a );

4761

bSign = extractFloatx80Sign( b );

4762

if ( aSign == bSign ) {

4763

return addFloatx80Sigs( a, b, aSign STATUS_VAR );

4764

}

4765

else {

4766

return subFloatx80Sigs( a, b, aSign STATUS_VAR );

4767

}

4768

4769

}

4770

4771

/*----------------------------------------------------------------------------

4772

| Returns the result of subtracting the extended double-precision floating-

4773

| point values `a' and `b'. The operation is performed according to the

4774

| IEC/IEEE Standard for Binary Floating-Point Arithmetic.

4775

*----------------------------------------------------------------------------*/

4776

4777

floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )

4778

{

4779

flag aSign, bSign;

4780

4781

aSign = extractFloatx80Sign( a );

4782

bSign = extractFloatx80Sign( b );

4783

if ( aSign == bSign ) {

4784

return subFloatx80Sigs( a, b, aSign STATUS_VAR );

4785

}

4786

else {

4787

return addFloatx80Sigs( a, b, aSign STATUS_VAR );

4788

}

4789

4790

}

4791

4792

/*----------------------------------------------------------------------------

4793

| Returns the result of multiplying the extended double-precision floating-

4794

| point values `a' and `b'. The operation is performed according to the

4795

| IEC/IEEE Standard for Binary Floating-Point Arithmetic.

4796

*----------------------------------------------------------------------------*/

4797

4798

floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )

4799

{

4800

flag aSign, bSign, zSign;

4801

int32 aExp, bExp, zExp;

4802

uint64_t aSig, bSig, zSig0, zSig1;

4803

floatx80 z;

4804

4805

aSig = extractFloatx80Frac( a );

4806

aExp = extractFloatx80Exp( a );

4807

aSign = extractFloatx80Sign( a );

4808

bSig = extractFloatx80Frac( b );

4809

bExp = extractFloatx80Exp( b );

4810

bSign = extractFloatx80Sign( b );

4811

zSign = aSign ^ bSign;

4812

if ( aExp == 0x7FFF ) {

4813

if ( (uint64_t) ( aSig<<1 )

4814

|| ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {

4815

return propagateFloatx80NaN( a, b STATUS_VAR );

4816

}

4817

if ( ( bExp | bSig ) == 0 ) goto invalid;

4818

return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );

4819

}

4820

if ( bExp == 0x7FFF ) {

4821

if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );

4822

if ( ( aExp | aSig ) == 0 ) {

4823

invalid:

4824

float_raise( float_flag_invalid STATUS_VAR);

4825

z.low = floatx80_default_nan_low;

4826

z.high = floatx80_default_nan_high;

4827

return z;

4828

}

4829

return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );

4830

}

4831

if ( aExp == 0 ) {

4832

if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );

4833

normalizeFloatx80Subnormal( aSig, &aExp, &aSig );

4834

}

4835

if ( bExp == 0 ) {

4836

if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );

4837

normalizeFloatx80Subnormal( bSig, &bExp, &bSig );

4838

}

4839

zExp = aExp + bExp - 0x3FFE;

4840

mul64To128( aSig, bSig, &zSig0, &zSig1 );

4841

if ( 0 < (int64_t) zSig0 ) {

4842

shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );

4843

--zExp;

4844

}

4845

return

4846

roundAndPackFloatx80(

4847

STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );

4848

4849

}

4850

4851

/*----------------------------------------------------------------------------

4852

| Returns the result of dividing the extended double-precision floating-point

4853

| value `a' by the corresponding value `b'. The operation is performed

4854

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

4855

*----------------------------------------------------------------------------*/

4856

4857

floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )

4858

{

4859

flag aSign, bSign, zSign;

4860

int32 aExp, bExp, zExp;

4861

uint64_t aSig, bSig, zSig0, zSig1;

4862

uint64_t rem0, rem1, rem2, term0, term1, term2;

4863

floatx80 z;

4864

4865

aSig = extractFloatx80Frac( a );

4866

aExp = extractFloatx80Exp( a );

4867

aSign = extractFloatx80Sign( a );

4868

bSig = extractFloatx80Frac( b );

4869

bExp = extractFloatx80Exp( b );

4870

bSign = extractFloatx80Sign( b );

4871

zSign = aSign ^ bSign;

4872

if ( aExp == 0x7FFF ) {

4873

if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );

4874

if ( bExp == 0x7FFF ) {

4875

if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );

4876

goto invalid;

4877

}

4878

return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );

4879

}

4880

if ( bExp == 0x7FFF ) {

4881

if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );

4882

return packFloatx80( zSign, 0, 0 );

4883

}

4884

if ( bExp == 0 ) {

4885

if ( bSig == 0 ) {

4886

if ( ( aExp | aSig ) == 0 ) {

4887

invalid:

4888

float_raise( float_flag_invalid STATUS_VAR);

4889

z.low = floatx80_default_nan_low;

4890

z.high = floatx80_default_nan_high;

4891

return z;

4892

}

4893

float_raise( float_flag_divbyzero STATUS_VAR);

4894

return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );

4895

}

4896

normalizeFloatx80Subnormal( bSig, &bExp, &bSig );

4897

}

4898

if ( aExp == 0 ) {

4899

if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );

4900

normalizeFloatx80Subnormal( aSig, &aExp, &aSig );

4901

}

4902

zExp = aExp - bExp + 0x3FFE;

4903

rem1 = 0;

4904

if ( bSig <= aSig ) {

4905

shift128Right( aSig, 0, 1, &aSig, &rem1 );

4906

++zExp;

4907

}

4908

zSig0 = estimateDiv128To64( aSig, rem1, bSig );

4909

mul64To128( bSig, zSig0, &term0, &term1 );

4910

sub128( aSig, rem1, term0, term1, &rem0, &rem1 );

4911

while ( (int64_t) rem0 < 0 ) {

4912

--zSig0;

4913

add128( rem0, rem1, 0, bSig, &rem0, &rem1 );

4914

}

4915

zSig1 = estimateDiv128To64( rem1, 0, bSig );

4916

if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {

4917

mul64To128( bSig, zSig1, &term1, &term2 );

4918

sub128( rem1, 0, term1, term2, &rem1, &rem2 );

4919

while ( (int64_t) rem1 < 0 ) {

4920

--zSig1;

4921

add128( rem1, rem2, 0, bSig, &rem1, &rem2 );

4922

}

4923

zSig1 |= ( ( rem1 | rem2 ) != 0 );

4924

}

4925

return

4926

roundAndPackFloatx80(

4927

STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );

4928

4929

}

4930

4931

/*----------------------------------------------------------------------------

4932

| Returns the remainder of the extended double-precision floating-point value

4933

| `a' with respect to the corresponding value `b'. The operation is performed

4934

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

4935

*----------------------------------------------------------------------------*/

4936

4937

floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )

4938

{

4939

flag aSign, zSign;

4940

int32 aExp, bExp, expDiff;

4941

uint64_t aSig0, aSig1, bSig;

4942

uint64_t q, term0, term1, alternateASig0, alternateASig1;

4943

floatx80 z;

4944

4945

aSig0 = extractFloatx80Frac( a );

4946

aExp = extractFloatx80Exp( a );

4947

aSign = extractFloatx80Sign( a );

4948

bSig = extractFloatx80Frac( b );

4949

bExp = extractFloatx80Exp( b );

4950

if ( aExp == 0x7FFF ) {

4951

if ( (uint64_t) ( aSig0<<1 )

4952

|| ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {

4953

return propagateFloatx80NaN( a, b STATUS_VAR );

4954

}

4955

goto invalid;

4956

}

4957

if ( bExp == 0x7FFF ) {

4958

if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );

4959

return a;

4960

}

4961

if ( bExp == 0 ) {

4962

if ( bSig == 0 ) {

4963

invalid:

4964

float_raise( float_flag_invalid STATUS_VAR);

4965

z.low = floatx80_default_nan_low;

4966

z.high = floatx80_default_nan_high;

4967

return z;

4968

}

4969

normalizeFloatx80Subnormal( bSig, &bExp, &bSig );

4970

}

4971

if ( aExp == 0 ) {

4972

if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;

4973

normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );

4974

}

4975

bSig |= LIT64( 0x8000000000000000 );

4976

zSign = aSign;

4977

expDiff = aExp - bExp;

4978

aSig1 = 0;

4979

if ( expDiff < 0 ) {

4980

if ( expDiff < -1 ) return a;

4981

shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );

4982

expDiff = 0;

4983

}

4984

q = ( bSig <= aSig0 );

4985

if ( q ) aSig0 -= bSig;

4986

expDiff -= 64;

4987

while ( 0 < expDiff ) {

4988

q = estimateDiv128To64( aSig0, aSig1, bSig );

4989

q = ( 2 < q ) ? q - 2 : 0;

4990

mul64To128( bSig, q, &term0, &term1 );

4991

sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );

4992

shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );

4993

expDiff -= 62;

4994

}

4995

expDiff += 64;

4996

if ( 0 < expDiff ) {

4997

q = estimateDiv128To64( aSig0, aSig1, bSig );

4998

q = ( 2 < q ) ? q - 2 : 0;

4999

q >>= 64 - expDiff;

5000

mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );

5001

sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );

5002

shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );

5003

while ( le128( term0, term1, aSig0, aSig1 ) ) {

5004

++q;

5005

sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );

5006

}

5007

}

5008

else {

5009

term1 = 0;

5010

term0 = bSig;

5011

}

5012

sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );

5013

if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )

5014

|| ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )

5015

&& ( q & 1 ) )

5016

) {

5017

aSig0 = alternateASig0;

5018

aSig1 = alternateASig1;

5019

zSign = ! zSign;

5020

}

5021

return

5022

normalizeRoundAndPackFloatx80(

5023

80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );

5024

5025

}

5026

5027

/*----------------------------------------------------------------------------

5028

| Returns the square root of the extended double-precision floating-point

5029

| value `a'. The operation is performed according to the IEC/IEEE Standard

5030

| for Binary Floating-Point Arithmetic.

5031

*----------------------------------------------------------------------------*/

5032

5033

floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )

5034

{

5035

flag aSign;

5036

int32 aExp, zExp;

5037

uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;

5038

uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

5039

floatx80 z;

5040

5041

aSig0 = extractFloatx80Frac( a );

5042

aExp = extractFloatx80Exp( a );

5043

aSign = extractFloatx80Sign( a );

5044

if ( aExp == 0x7FFF ) {

5045

if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );

5046

if ( ! aSign ) return a;

5047

goto invalid;

5048

}

5049

if ( aSign ) {

5050

if ( ( aExp | aSig0 ) == 0 ) return a;

5051

invalid:

5052

float_raise( float_flag_invalid STATUS_VAR);

5053

z.low = floatx80_default_nan_low;

5054

z.high = floatx80_default_nan_high;

5055

return z;

5056

}

5057

if ( aExp == 0 ) {

5058

if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );

5059

normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );

5060

}

5061

zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;

5062

zSig0 = estimateSqrt32( aExp, aSig0>>32 );

5063

shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );

5064

zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );

5065

doubleZSig0 = zSig0<<1;

5066

mul64To128( zSig0, zSig0, &term0, &term1 );

5067

sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );

5068

while ( (int64_t) rem0 < 0 ) {

5069

--zSig0;

5070

doubleZSig0 -= 2;

5071

add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );

5072

}

5073

zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );

5074

if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {

5075

if ( zSig1 == 0 ) zSig1 = 1;

5076

mul64To128( doubleZSig0, zSig1, &term1, &term2 );

5077

sub128( rem1, 0, term1, term2, &rem1, &rem2 );

5078

mul64To128( zSig1, zSig1, &term2, &term3 );

5079

sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );

5080

while ( (int64_t) rem1 < 0 ) {

5081

--zSig1;

5082

shortShift128Left( 0, zSig1, 1, &term2, &term3 );

5083

term3 |= 1;

5084

term2 |= doubleZSig0;

5085

add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );

5086

}

5087

zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );

5088

}

5089

shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );

5090

zSig0 |= doubleZSig0;

5091

return

5092

roundAndPackFloatx80(

5093

STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );

5094

5095

}

5096

5097

/*----------------------------------------------------------------------------

5098

| Returns 1 if the extended double-precision floating-point value `a' is equal

5099

| to the corresponding value `b', and 0 otherwise. The invalid exception is

5100

| raised if either operand is a NaN. Otherwise, the comparison is performed

5101

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

5102

*----------------------------------------------------------------------------*/

5103

5104

int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )

5105

{

5106

5107

if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )

5108

&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )

5109

|| ( ( extractFloatx80Exp( b ) == 0x7FFF )

5110

&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )

5111

) {

5112

float_raise( float_flag_invalid STATUS_VAR);

5113

return 0;

5114

}

5115

return

5116

( a.low == b.low )

5117

&& ( ( a.high == b.high )

5118

|| ( ( a.low == 0 )

5119

&& ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )

5120

);

5121

5122

}

5123

5124

/*----------------------------------------------------------------------------

5125

| Returns 1 if the extended double-precision floating-point value `a' is

5126

| less than or equal to the corresponding value `b', and 0 otherwise. The

5127

| invalid exception is raised if either operand is a NaN. The comparison is

5128

| performed according to the IEC/IEEE Standard for Binary Floating-Point

5129

| Arithmetic.

5130

*----------------------------------------------------------------------------*/

5131

5132

int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )

5133

{

5134

flag aSign, bSign;

5135

5136

if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )

5137

&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )

5138

|| ( ( extractFloatx80Exp( b ) == 0x7FFF )

5139

&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )

5140

) {

5141

float_raise( float_flag_invalid STATUS_VAR);

5142

return 0;

5143

}

5144

aSign = extractFloatx80Sign( a );

5145

bSign = extractFloatx80Sign( b );

5146

if ( aSign != bSign ) {

5147

return

5148

aSign

5149

|| ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )

5150

== 0 );

5151

}

5152

return

5153

aSign ? le128( b.high, b.low, a.high, a.low )

5154

: le128( a.high, a.low, b.high, b.low );

5155

5156

}

5157

5158

/*----------------------------------------------------------------------------

5159

| Returns 1 if the extended double-precision floating-point value `a' is

5160

| less than the corresponding value `b', and 0 otherwise. The invalid

5161

| exception is raised if either operand is a NaN. The comparison is performed

5162

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

5163

*----------------------------------------------------------------------------*/

5164

5165

int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )

5166

{

5167

flag aSign, bSign;

5168

5169

if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )

5170

&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )

5171

|| ( ( extractFloatx80Exp( b ) == 0x7FFF )

5172

&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )

5173

) {

5174

float_raise( float_flag_invalid STATUS_VAR);

5175

return 0;

5176

}

5177

aSign = extractFloatx80Sign( a );

5178

bSign = extractFloatx80Sign( b );

5179

if ( aSign != bSign ) {

5180

return

5181

aSign

5182

&& ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )

5183

!= 0 );

5184

}

5185

return

5186

aSign ? lt128( b.high, b.low, a.high, a.low )

5187

: lt128( a.high, a.low, b.high, b.low );

5188

5189

}

5190

5191

/*----------------------------------------------------------------------------

5192

| Returns 1 if the extended double-precision floating-point values `a' and `b'

5193

| cannot be compared, and 0 otherwise. The invalid exception is raised if

5194

| either operand is a NaN. The comparison is performed according to the

5195

| IEC/IEEE Standard for Binary Floating-Point Arithmetic.

5196

*----------------------------------------------------------------------------*/

5197

int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )

5198

{

5199

if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )

5200

&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )

5201

|| ( ( extractFloatx80Exp( b ) == 0x7FFF )

5202

&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )

5203

) {

5204

float_raise( float_flag_invalid STATUS_VAR);

5205

return 1;

5206

}

5207

return 0;

5208

}

5209

5210

/*----------------------------------------------------------------------------

5211

| Returns 1 if the extended double-precision floating-point value `a' is

5212

| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not

5213

| cause an exception. The comparison is performed according to the IEC/IEEE

5214

| Standard for Binary Floating-Point Arithmetic.

5215

*----------------------------------------------------------------------------*/

5216

5217

int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )

5218

{

5219

5220

if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )

5221

&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )

5222

|| ( ( extractFloatx80Exp( b ) == 0x7FFF )

5223

&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )

5224

) {

5225

if ( floatx80_is_signaling_nan( a )

5226

|| floatx80_is_signaling_nan( b ) ) {

5227

float_raise( float_flag_invalid STATUS_VAR);

5228

}

5229

return 0;

5230

}

5231

return

5232

( a.low == b.low )

5233

&& ( ( a.high == b.high )

5234

|| ( ( a.low == 0 )

5235

&& ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )

5236

);

5237

5238

}

5239

5240

/*----------------------------------------------------------------------------

5241

| Returns 1 if the extended double-precision floating-point value `a' is less

5242

| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs

5243

| do not cause an exception. Otherwise, the comparison is performed according

5244

| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

5245

*----------------------------------------------------------------------------*/

5246

5247

int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )

5248

{

5249

flag aSign, bSign;

5250

5251

if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )

5252

&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )

5253

|| ( ( extractFloatx80Exp( b ) == 0x7FFF )

5254

&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )

5255

) {

5256

if ( floatx80_is_signaling_nan( a )

5257

|| floatx80_is_signaling_nan( b ) ) {

5258

float_raise( float_flag_invalid STATUS_VAR);

5259

}

5260

return 0;

5261

}

5262

aSign = extractFloatx80Sign( a );

5263

bSign = extractFloatx80Sign( b );

5264

if ( aSign != bSign ) {

5265

return

5266

aSign

5267

|| ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )

5268

== 0 );

5269

}

5270

return

5271

aSign ? le128( b.high, b.low, a.high, a.low )

5272

: le128( a.high, a.low, b.high, b.low );

5273

5274

}

5275

5276

/*----------------------------------------------------------------------------

5277

| Returns 1 if the extended double-precision floating-point value `a' is less

5278

| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause

5279

| an exception. Otherwise, the comparison is performed according to the

5280

| IEC/IEEE Standard for Binary Floating-Point Arithmetic.

5281

*----------------------------------------------------------------------------*/

5282

5283

int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )

5284

{

5285

flag aSign, bSign;

5286

5287

if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )

5288

&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )

5289

|| ( ( extractFloatx80Exp( b ) == 0x7FFF )

5290

&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )

5291

) {

5292

if ( floatx80_is_signaling_nan( a )

5293

|| floatx80_is_signaling_nan( b ) ) {

5294

float_raise( float_flag_invalid STATUS_VAR);

5295

}

5296

return 0;

5297

}

5298

aSign = extractFloatx80Sign( a );

5299

bSign = extractFloatx80Sign( b );

5300

if ( aSign != bSign ) {

5301

return

5302

aSign

5303

&& ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )

5304

!= 0 );

5305

}

5306

return

5307

aSign ? lt128( b.high, b.low, a.high, a.low )

5308

: lt128( a.high, a.low, b.high, b.low );

5309

5310

}

5311

5312

/*----------------------------------------------------------------------------

5313

| Returns 1 if the extended double-precision floating-point values `a' and `b'

5314

| cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.

5315

| The comparison is performed according to the IEC/IEEE Standard for Binary

5316

| Floating-Point Arithmetic.

5317

*----------------------------------------------------------------------------*/

5318

int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )

5319

{

5320

if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )

5321

&& (uint64_t) ( extractFloatx80Frac( a )<<1 ) )

5322

|| ( ( extractFloatx80Exp( b ) == 0x7FFF )

5323

&& (uint64_t) ( extractFloatx80Frac( b )<<1 ) )

5324

) {

5325

if ( floatx80_is_signaling_nan( a )

5326

|| floatx80_is_signaling_nan( b ) ) {

5327

float_raise( float_flag_invalid STATUS_VAR);

5328

}

5329

return 1;

5330

}

5331

return 0;

5332

}

5333

5334

/*----------------------------------------------------------------------------

5335

| Returns the result of converting the quadruple-precision floating-point

5336

| value `a' to the 32-bit two's complement integer format. The conversion

5337

| is performed according to the IEC/IEEE Standard for Binary Floating-Point

5338

| Arithmetic---which means in particular that the conversion is rounded

5339

| according to the current rounding mode. If `a' is a NaN, the largest

5340

| positive integer is returned. Otherwise, if the conversion overflows, the

5341

| largest integer with the same sign as `a' is returned.

5342

*----------------------------------------------------------------------------*/

5343

5344

int32 float128_to_int32( float128 a STATUS_PARAM )

5345

{

5346

flag aSign;

5347

int32 aExp, shiftCount;

5348

uint64_t aSig0, aSig1;

5349

5350

aSig1 = extractFloat128Frac1( a );

5351

aSig0 = extractFloat128Frac0( a );

5352

aExp = extractFloat128Exp( a );

5353

aSign = extractFloat128Sign( a );

5354

if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;

5355

if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );

5356

aSig0 |= ( aSig1 != 0 );

5357

shiftCount = 0x4028 - aExp;

5358

if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );

5359

return roundAndPackInt32( aSign, aSig0 STATUS_VAR );

5360

5361

}

5362

5363

/*----------------------------------------------------------------------------

5364

| Returns the result of converting the quadruple-precision floating-point

5365

| value `a' to the 32-bit two's complement integer format. The conversion

5366

| is performed according to the IEC/IEEE Standard for Binary Floating-Point

5367

| Arithmetic, except that the conversion is always rounded toward zero. If

5368

| `a' is a NaN, the largest positive integer is returned. Otherwise, if the

5369

| conversion overflows, the largest integer with the same sign as `a' is

5370

| returned.

5371

*----------------------------------------------------------------------------*/

5372

5373

int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )

5374

{

5375

flag aSign;

5376

int32 aExp, shiftCount;

5377

uint64_t aSig0, aSig1, savedASig;

5378

int32_t z;

5379

5380

aSig1 = extractFloat128Frac1( a );

5381

aSig0 = extractFloat128Frac0( a );

5382

aExp = extractFloat128Exp( a );

5383

aSign = extractFloat128Sign( a );

5384

aSig0 |= ( aSig1 != 0 );

5385

if ( 0x401E < aExp ) {

5386

if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;

5387

goto invalid;

5388

}

5389

else if ( aExp < 0x3FFF ) {

5390

if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;

5391

return 0;

5392

}

5393

aSig0 |= LIT64( 0x0001000000000000 );

5394

shiftCount = 0x402F - aExp;

5395

savedASig = aSig0;

5396

aSig0 >>= shiftCount;

5397

z = aSig0;

5398

if ( aSign ) z = - z;

5399

if ( ( z < 0 ) ^ aSign ) {

5400

invalid:

5401

float_raise( float_flag_invalid STATUS_VAR);

5402

return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;

5403

}

5404

if ( ( aSig0<<shiftCount ) != savedASig ) {

5405

STATUS(float_exception_flags) |= float_flag_inexact;

5406

}

5407

return z;

5408

5409

}

5410

5411

/*----------------------------------------------------------------------------

5412

| Returns the result of converting the quadruple-precision floating-point

5413

| value `a' to the 64-bit two's complement integer format. The conversion

5414

| is performed according to the IEC/IEEE Standard for Binary Floating-Point

5415

| Arithmetic---which means in particular that the conversion is rounded

5416

| according to the current rounding mode. If `a' is a NaN, the largest

5417

| positive integer is returned. Otherwise, if the conversion overflows, the

5418

| largest integer with the same sign as `a' is returned.

5419

*----------------------------------------------------------------------------*/

5420

5421

int64 float128_to_int64( float128 a STATUS_PARAM )

5422

{

5423

flag aSign;

5424

int32 aExp, shiftCount;

5425

uint64_t aSig0, aSig1;

5426

5427

aSig1 = extractFloat128Frac1( a );

5428

aSig0 = extractFloat128Frac0( a );

5429

aExp = extractFloat128Exp( a );

5430

aSign = extractFloat128Sign( a );

5431

if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );

5432

shiftCount = 0x402F - aExp;

5433

if ( shiftCount <= 0 ) {

5434

if ( 0x403E < aExp ) {

5435

float_raise( float_flag_invalid STATUS_VAR);

5436

if ( ! aSign

5437

|| ( ( aExp == 0x7FFF )

5438

&& ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )

5439

)

5440

) {

5441

return LIT64( 0x7FFFFFFFFFFFFFFF );

5442

}

5443

return (int64_t) LIT64( 0x8000000000000000 );

5444

}

5445

shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );

5446

}

5447

else {

5448

shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );

5449

}

5450

return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );

5451

5452

}

5453

5454

/*----------------------------------------------------------------------------

5455

| Returns the result of converting the quadruple-precision floating-point

5456

| value `a' to the 64-bit two's complement integer format. The conversion

5457

| is performed according to the IEC/IEEE Standard for Binary Floating-Point

5458

| Arithmetic, except that the conversion is always rounded toward zero.

5459

| If `a' is a NaN, the largest positive integer is returned. Otherwise, if

5460

| the conversion overflows, the largest integer with the same sign as `a' is

5461

| returned.

5462

*----------------------------------------------------------------------------*/

5463

5464

int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )

5465

{

5466

flag aSign;

5467

int32 aExp, shiftCount;

5468

uint64_t aSig0, aSig1;

5469

int64 z;

5470

5471

aSig1 = extractFloat128Frac1( a );

5472

aSig0 = extractFloat128Frac0( a );

5473

aExp = extractFloat128Exp( a );

5474

aSign = extractFloat128Sign( a );

5475

if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );

5476

shiftCount = aExp - 0x402F;

5477

if ( 0 < shiftCount ) {

5478

if ( 0x403E <= aExp ) {

5479

aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );

5480

if ( ( a.high == LIT64( 0xC03E000000000000 ) )

5481

&& ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {

5482

if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;

5483

}

5484

else {

5485

float_raise( float_flag_invalid STATUS_VAR);

5486

if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {

5487

return LIT64( 0x7FFFFFFFFFFFFFFF );

5488

}

5489

}

5490

return (int64_t) LIT64( 0x8000000000000000 );

5491

}

5492

z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );

5493

if ( (uint64_t) ( aSig1<<shiftCount ) ) {

5494

STATUS(float_exception_flags) |= float_flag_inexact;

5495

}

5496

}

5497

else {

5498

if ( aExp < 0x3FFF ) {

5499

if ( aExp | aSig0 | aSig1 ) {

5500

STATUS(float_exception_flags) |= float_flag_inexact;

5501

}

5502

return 0;

5503

}

5504

z = aSig0>>( - shiftCount );

5505

if ( aSig1

5506

|| ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {

5507

STATUS(float_exception_flags) |= float_flag_inexact;

5508

}

5509

}

5510

if ( aSign ) z = - z;

5511

return z;

5512

5513

}

5514

5515

/*----------------------------------------------------------------------------

5516

| Returns the result of converting the quadruple-precision floating-point

5517

| value `a' to the single-precision floating-point format. The conversion

5518

| is performed according to the IEC/IEEE Standard for Binary Floating-Point

5519

| Arithmetic.

5520

*----------------------------------------------------------------------------*/

5521

5522

float32 float128_to_float32( float128 a STATUS_PARAM )

5523

{

5524

flag aSign;

5525

int32 aExp;

5526

uint64_t aSig0, aSig1;

5527

uint32_t zSig;

5528

5529

aSig1 = extractFloat128Frac1( a );

5530

aSig0 = extractFloat128Frac0( a );

5531

aExp = extractFloat128Exp( a );

5532

aSign = extractFloat128Sign( a );

5533

if ( aExp == 0x7FFF ) {

5534

if ( aSig0 | aSig1 ) {

5535

return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );

5536

}

5537

return packFloat32( aSign, 0xFF, 0 );

5538

}

5539

aSig0 |= ( aSig1 != 0 );

5540

shift64RightJamming( aSig0, 18, &aSig0 );

5541

zSig = aSig0;

5542

if ( aExp || zSig ) {

5543

zSig |= 0x40000000;

5544

aExp -= 0x3F81;

5545

}

5546

return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );

5547

5548

}

5549

5550

/*----------------------------------------------------------------------------

5551

| Returns the result of converting the quadruple-precision floating-point

5552

| value `a' to the double-precision floating-point format. The conversion

5553

| is performed according to the IEC/IEEE Standard for Binary Floating-Point

5554

| Arithmetic.

5555

*----------------------------------------------------------------------------*/

5556

5557

float64 float128_to_float64( float128 a STATUS_PARAM )

5558

{

5559

flag aSign;

5560

int32 aExp;

5561

uint64_t aSig0, aSig1;

5562

5563

aSig1 = extractFloat128Frac1( a );

5564

aSig0 = extractFloat128Frac0( a );

5565

aExp = extractFloat128Exp( a );

5566

aSign = extractFloat128Sign( a );

5567

if ( aExp == 0x7FFF ) {

5568

if ( aSig0 | aSig1 ) {

5569

return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );

5570

}

5571

return packFloat64( aSign, 0x7FF, 0 );

5572

}

5573

shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );

5574

aSig0 |= ( aSig1 != 0 );

5575

if ( aExp || aSig0 ) {

5576

aSig0 |= LIT64( 0x4000000000000000 );

5577

aExp -= 0x3C01;

5578

}

5579

return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );

5580

5581

}

5582

5583

/*----------------------------------------------------------------------------

5584

| Returns the result of converting the quadruple-precision floating-point

5585

| value `a' to the extended double-precision floating-point format. The

5586

| conversion is performed according to the IEC/IEEE Standard for Binary

5587

| Floating-Point Arithmetic.

5588

*----------------------------------------------------------------------------*/

5589

5590

floatx80 float128_to_floatx80( float128 a STATUS_PARAM )

5591

{

5592

flag aSign;

5593

int32 aExp;

5594

uint64_t aSig0, aSig1;

5595

5596

aSig1 = extractFloat128Frac1( a );

5597

aSig0 = extractFloat128Frac0( a );

5598

aExp = extractFloat128Exp( a );

5599

aSign = extractFloat128Sign( a );

5600

if ( aExp == 0x7FFF ) {

5601

if ( aSig0 | aSig1 ) {

5602

return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );

5603

}

5604

return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );

5605

}

5606

if ( aExp == 0 ) {

5607

if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );

5608

normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );

5609

}

5610

else {

5611

aSig0 |= LIT64( 0x0001000000000000 );

5612

}

5613

shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );

5614

return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );

5615

5616

}

5617

5618

/*----------------------------------------------------------------------------

5619

| Rounds the quadruple-precision floating-point value `a' to an integer, and

5620

| returns the result as a quadruple-precision floating-point value. The

5621

| operation is performed according to the IEC/IEEE Standard for Binary

5622

| Floating-Point Arithmetic.

5623

*----------------------------------------------------------------------------*/

5624

5625

float128 float128_round_to_int( float128 a STATUS_PARAM )

5626

{

5627

flag aSign;

5628

int32 aExp;

5629

uint64_t lastBitMask, roundBitsMask;

5630

int8 roundingMode;

5631

float128 z;

5632

5633

aExp = extractFloat128Exp( a );

5634

if ( 0x402F <= aExp ) {

5635

if ( 0x406F <= aExp ) {

5636

if ( ( aExp == 0x7FFF )

5637

&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )

5638

) {

5639

return propagateFloat128NaN( a, a STATUS_VAR );

5640

}

5641

return a;

5642

}

5643

lastBitMask = 1;

5644

lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;

5645

roundBitsMask = lastBitMask - 1;

5646

z = a;

5647

roundingMode = STATUS(float_rounding_mode);

5648

if ( roundingMode == float_round_nearest_even ) {

5649

if ( lastBitMask ) {

5650

add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );

5651

if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;

5652

}

5653

else {

5654

if ( (int64_t) z.low < 0 ) {

5655

++z.high;

5656

if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;

5657

}

5658

}

5659

}

5660

else if ( roundingMode != float_round_to_zero ) {

5661

if ( extractFloat128Sign( z )

5662

^ ( roundingMode == float_round_up ) ) {

5663

add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );

5664

}

5665

}

5666

z.low &= ~ roundBitsMask;

5667

}

5668

else {

5669

if ( aExp < 0x3FFF ) {

5670

if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;

5671

STATUS(float_exception_flags) |= float_flag_inexact;

5672

aSign = extractFloat128Sign( a );

5673

switch ( STATUS(float_rounding_mode) ) {

5674

case float_round_nearest_even:

5675

if ( ( aExp == 0x3FFE )

5676

&& ( extractFloat128Frac0( a )

5677

| extractFloat128Frac1( a ) )

5678

) {

5679

return packFloat128( aSign, 0x3FFF, 0, 0 );

5680

}

5681

break;

5682

case float_round_down:

5683

return

5684

aSign ? packFloat128( 1, 0x3FFF, 0, 0 )

5685

: packFloat128( 0, 0, 0, 0 );

5686

case float_round_up:

5687

return

5688

aSign ? packFloat128( 1, 0, 0, 0 )

5689

: packFloat128( 0, 0x3FFF, 0, 0 );

5690

}

5691

return packFloat128( aSign, 0, 0, 0 );

5692

}

5693

lastBitMask = 1;

5694

lastBitMask <<= 0x402F - aExp;

5695

roundBitsMask = lastBitMask - 1;

5696

z.low = 0;

5697

z.high = a.high;

5698

roundingMode = STATUS(float_rounding_mode);

5699

if ( roundingMode == float_round_nearest_even ) {

5700

z.high += lastBitMask>>1;

5701

if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {

5702

z.high &= ~ lastBitMask;

5703

}

5704

}

5705

else if ( roundingMode != float_round_to_zero ) {

5706

if ( extractFloat128Sign( z )

5707

^ ( roundingMode == float_round_up ) ) {

5708

z.high |= ( a.low != 0 );

5709

z.high += roundBitsMask;

5710

}

5711

}

5712

z.high &= ~ roundBitsMask;

5713

}

5714

if ( ( z.low != a.low ) || ( z.high != a.high ) ) {

5715

STATUS(float_exception_flags) |= float_flag_inexact;

5716

}

5717

return z;

5718

5719

}

5720

5721

/*----------------------------------------------------------------------------

5722

| Returns the result of adding the absolute values of the quadruple-precision

5723

| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated

5724

| before being returned. `zSign' is ignored if the result is a NaN.

5725

| The addition is performed according to the IEC/IEEE Standard for Binary

5726

| Floating-Point Arithmetic.

5727

*----------------------------------------------------------------------------*/

5728

5729

static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)

5730

{

5731

int32 aExp, bExp, zExp;

5732

uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;

5733

int32 expDiff;

5734

5735

aSig1 = extractFloat128Frac1( a );

5736

aSig0 = extractFloat128Frac0( a );

5737

aExp = extractFloat128Exp( a );

5738

bSig1 = extractFloat128Frac1( b );

5739

bSig0 = extractFloat128Frac0( b );

5740

bExp = extractFloat128Exp( b );

5741

expDiff = aExp - bExp;

5742

if ( 0 < expDiff ) {

5743

if ( aExp == 0x7FFF ) {

5744

if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );

5745

return a;

5746

}

5747

if ( bExp == 0 ) {

5748

--expDiff;

5749

}

5750

else {

5751

bSig0 |= LIT64( 0x0001000000000000 );

5752

}

5753

shift128ExtraRightJamming(

5754

bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );

5755

zExp = aExp;

5756

}

5757

else if ( expDiff < 0 ) {

5758

if ( bExp == 0x7FFF ) {

5759

if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );

5760

return packFloat128( zSign, 0x7FFF, 0, 0 );

5761

}

5762

if ( aExp == 0 ) {

5763

++expDiff;

5764

}

5765

else {

5766

aSig0 |= LIT64( 0x0001000000000000 );

5767

}

5768

shift128ExtraRightJamming(

5769

aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );

5770

zExp = bExp;

5771

}

5772

else {

5773

if ( aExp == 0x7FFF ) {

5774

if ( aSig0 | aSig1 | bSig0 | bSig1 ) {

5775

return propagateFloat128NaN( a, b STATUS_VAR );

5776

}

5777

return a;

5778

}

5779

add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );

5780

if ( aExp == 0 ) {

5781

if (STATUS(flush_to_zero)) {

5782

if (zSig0 | zSig1) {

5783

float_raise(float_flag_output_denormal STATUS_VAR);

5784

}

5785

return packFloat128(zSign, 0, 0, 0);

5786

}

5787

return packFloat128( zSign, 0, zSig0, zSig1 );

5788

}

5789

zSig2 = 0;

5790

zSig0 |= LIT64( 0x0002000000000000 );

5791

zExp = aExp;

5792

goto shiftRight1;

5793

}

5794

aSig0 |= LIT64( 0x0001000000000000 );

5795

add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );

5796

--zExp;

5797

if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;

5798

++zExp;

5799

shiftRight1:

5800

shift128ExtraRightJamming(

5801

zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );

5802

roundAndPack:

5803

return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );

5804

5805

}

5806

5807

/*----------------------------------------------------------------------------

5808

| Returns the result of subtracting the absolute values of the quadruple-

5809

| precision floating-point values `a' and `b'. If `zSign' is 1, the

5810

| difference is negated before being returned. `zSign' is ignored if the

5811

| result is a NaN. The subtraction is performed according to the IEC/IEEE

5812

| Standard for Binary Floating-Point Arithmetic.

5813

*----------------------------------------------------------------------------*/

5814

5815

static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)

5816

{

5817

int32 aExp, bExp, zExp;

5818

uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;

5819

int32 expDiff;

5820

float128 z;

5821

5822

aSig1 = extractFloat128Frac1( a );

5823

aSig0 = extractFloat128Frac0( a );

5824

aExp = extractFloat128Exp( a );

5825

bSig1 = extractFloat128Frac1( b );

5826

bSig0 = extractFloat128Frac0( b );

5827

bExp = extractFloat128Exp( b );

5828

expDiff = aExp - bExp;

5829

shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );

5830

shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );

5831

if ( 0 < expDiff ) goto aExpBigger;

5832

if ( expDiff < 0 ) goto bExpBigger;

5833

if ( aExp == 0x7FFF ) {

5834

if ( aSig0 | aSig1 | bSig0 | bSig1 ) {

5835

return propagateFloat128NaN( a, b STATUS_VAR );

5836

}

5837

float_raise( float_flag_invalid STATUS_VAR);

5838

z.low = float128_default_nan_low;

5839

z.high = float128_default_nan_high;

5840

return z;

5841

}

5842

if ( aExp == 0 ) {

5843

aExp = 1;

5844

bExp = 1;

5845

}

5846

if ( bSig0 < aSig0 ) goto aBigger;

5847

if ( aSig0 < bSig0 ) goto bBigger;

5848

if ( bSig1 < aSig1 ) goto aBigger;

5849

if ( aSig1 < bSig1 ) goto bBigger;

5850

return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );

5851

bExpBigger:

5852

if ( bExp == 0x7FFF ) {

5853

if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );

5854

return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );

5855

}

5856

if ( aExp == 0 ) {

5857

++expDiff;

5858

}

5859

else {

5860

aSig0 |= LIT64( 0x4000000000000000 );

5861

}

5862

shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );

5863

bSig0 |= LIT64( 0x4000000000000000 );

5864

bBigger:

5865

sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );

5866

zExp = bExp;

5867

zSign ^= 1;

5868

goto normalizeRoundAndPack;

5869

aExpBigger:

5870

if ( aExp == 0x7FFF ) {

5871

if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );

5872

return a;

5873

}

5874

if ( bExp == 0 ) {

5875

--expDiff;

5876

}

5877

else {

5878

bSig0 |= LIT64( 0x4000000000000000 );

5879

}

5880

shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );

5881

aSig0 |= LIT64( 0x4000000000000000 );

5882

aBigger:

5883

sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );

5884

zExp = aExp;

5885

normalizeRoundAndPack:

5886

--zExp;

5887

return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );

5888

5889

}

5890

5891

/*----------------------------------------------------------------------------

5892

| Returns the result of adding the quadruple-precision floating-point values

5893

| `a' and `b'. The operation is performed according to the IEC/IEEE Standard

5894

| for Binary Floating-Point Arithmetic.

5895

*----------------------------------------------------------------------------*/

5896

5897

float128 float128_add( float128 a, float128 b STATUS_PARAM )

5898

{

5899

flag aSign, bSign;

5900

5901

aSign = extractFloat128Sign( a );

5902

bSign = extractFloat128Sign( b );

5903

if ( aSign == bSign ) {

5904

return addFloat128Sigs( a, b, aSign STATUS_VAR );

5905

}

5906

else {

5907

return subFloat128Sigs( a, b, aSign STATUS_VAR );

5908

}

5909

5910

}

5911

5912

/*----------------------------------------------------------------------------

5913

| Returns the result of subtracting the quadruple-precision floating-point

5914

| values `a' and `b'. The operation is performed according to the IEC/IEEE

5915

| Standard for Binary Floating-Point Arithmetic.

5916

*----------------------------------------------------------------------------*/

5917

5918

float128 float128_sub( float128 a, float128 b STATUS_PARAM )

5919

{

5920

flag aSign, bSign;

5921

5922

aSign = extractFloat128Sign( a );

5923

bSign = extractFloat128Sign( b );

5924

if ( aSign == bSign ) {

5925

return subFloat128Sigs( a, b, aSign STATUS_VAR );

5926

}

5927

else {

5928

return addFloat128Sigs( a, b, aSign STATUS_VAR );

5929

}

5930

5931

}

5932

5933

/*----------------------------------------------------------------------------

5934

| Returns the result of multiplying the quadruple-precision floating-point

5935

| values `a' and `b'. The operation is performed according to the IEC/IEEE

5936

| Standard for Binary Floating-Point Arithmetic.

5937

*----------------------------------------------------------------------------*/

5938

5939

float128 float128_mul( float128 a, float128 b STATUS_PARAM )

5940

{

5941

flag aSign, bSign, zSign;

5942

int32 aExp, bExp, zExp;

5943

uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;

5944

float128 z;

5945

5946

aSig1 = extractFloat128Frac1( a );

5947

aSig0 = extractFloat128Frac0( a );

5948

aExp = extractFloat128Exp( a );

5949

aSign = extractFloat128Sign( a );

5950

bSig1 = extractFloat128Frac1( b );

5951

bSig0 = extractFloat128Frac0( b );

5952

bExp = extractFloat128Exp( b );

5953

bSign = extractFloat128Sign( b );

5954

zSign = aSign ^ bSign;

5955

if ( aExp == 0x7FFF ) {

5956

if ( ( aSig0 | aSig1 )

5957

|| ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {

5958

return propagateFloat128NaN( a, b STATUS_VAR );

5959

}

5960

if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;

5961

return packFloat128( zSign, 0x7FFF, 0, 0 );

5962

}

5963

if ( bExp == 0x7FFF ) {

5964

if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );

5965

if ( ( aExp | aSig0 | aSig1 ) == 0 ) {

5966

invalid:

5967

float_raise( float_flag_invalid STATUS_VAR);

5968

z.low = float128_default_nan_low;

5969

z.high = float128_default_nan_high;

5970

return z;

5971

}

5972

return packFloat128( zSign, 0x7FFF, 0, 0 );

5973

}

5974

if ( aExp == 0 ) {

5975

if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );

5976

normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );

5977

}

5978

if ( bExp == 0 ) {

5979

if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );

5980

normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );

5981

}

5982

zExp = aExp + bExp - 0x4000;

5983

aSig0 |= LIT64( 0x0001000000000000 );

5984

shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );

5985

mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );

5986

add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );

5987

zSig2 |= ( zSig3 != 0 );

5988

if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {

5989

shift128ExtraRightJamming(

5990

zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );

5991

++zExp;

5992

}

5993

return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );

5994

5995

}

5996

5997

/*----------------------------------------------------------------------------

5998

| Returns the result of dividing the quadruple-precision floating-point value

5999

| `a' by the corresponding value `b'. The operation is performed according to

6000

| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

6001

*----------------------------------------------------------------------------*/

6002

6003

float128 float128_div( float128 a, float128 b STATUS_PARAM )

6004

{

6005

flag aSign, bSign, zSign;

6006

int32 aExp, bExp, zExp;

6007

uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;

6008

uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

6009

float128 z;

6010

6011

aSig1 = extractFloat128Frac1( a );

6012

aSig0 = extractFloat128Frac0( a );

6013

aExp = extractFloat128Exp( a );

6014

aSign = extractFloat128Sign( a );

6015

bSig1 = extractFloat128Frac1( b );

6016

bSig0 = extractFloat128Frac0( b );

6017

bExp = extractFloat128Exp( b );

6018

bSign = extractFloat128Sign( b );

6019

zSign = aSign ^ bSign;

6020

if ( aExp == 0x7FFF ) {

6021

if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );

6022

if ( bExp == 0x7FFF ) {

6023

if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );

6024

goto invalid;

6025

}

6026

return packFloat128( zSign, 0x7FFF, 0, 0 );

6027

}

6028

if ( bExp == 0x7FFF ) {

6029

if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );

6030

return packFloat128( zSign, 0, 0, 0 );

6031

}

6032

if ( bExp == 0 ) {

6033

if ( ( bSig0 | bSig1 ) == 0 ) {

6034

if ( ( aExp | aSig0 | aSig1 ) == 0 ) {

6035

invalid:

6036

float_raise( float_flag_invalid STATUS_VAR);

6037

z.low = float128_default_nan_low;

6038

z.high = float128_default_nan_high;

6039

return z;

6040

}

6041

float_raise( float_flag_divbyzero STATUS_VAR);

6042

return packFloat128( zSign, 0x7FFF, 0, 0 );

6043

}

6044

normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );

6045

}

6046

if ( aExp == 0 ) {

6047

if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );

6048

normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );

6049

}

6050

zExp = aExp - bExp + 0x3FFD;

6051

shortShift128Left(

6052

aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );

6053

shortShift128Left(

6054

bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );

6055

if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {

6056

shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );

6057

++zExp;

6058

}

6059

zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );

6060

mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );

6061

sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );

6062

while ( (int64_t) rem0 < 0 ) {

6063

--zSig0;

6064

add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );

6065

}

6066

zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );

6067

if ( ( zSig1 & 0x3FFF ) <= 4 ) {

6068

mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );

6069

sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );

6070

while ( (int64_t) rem1 < 0 ) {

6071

--zSig1;

6072

add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );

6073

}

6074

zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );

6075

}

6076

shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );

6077

return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );

6078

6079

}

6080

6081

/*----------------------------------------------------------------------------

6082

| Returns the remainder of the quadruple-precision floating-point value `a'

6083

| with respect to the corresponding value `b'. The operation is performed

6084

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

6085

*----------------------------------------------------------------------------*/

6086

6087

float128 float128_rem( float128 a, float128 b STATUS_PARAM )

6088

{

6089

flag aSign, zSign;

6090

int32 aExp, bExp, expDiff;

6091

uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;

6092

uint64_t allZero, alternateASig0, alternateASig1, sigMean1;

6093

int64_t sigMean0;

6094

float128 z;

6095

6096

aSig1 = extractFloat128Frac1( a );

6097

aSig0 = extractFloat128Frac0( a );

6098

aExp = extractFloat128Exp( a );

6099

aSign = extractFloat128Sign( a );

6100

bSig1 = extractFloat128Frac1( b );

6101

bSig0 = extractFloat128Frac0( b );

6102

bExp = extractFloat128Exp( b );

6103

if ( aExp == 0x7FFF ) {

6104

if ( ( aSig0 | aSig1 )

6105

|| ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {

6106

return propagateFloat128NaN( a, b STATUS_VAR );

6107

}

6108

goto invalid;

6109

}

6110

if ( bExp == 0x7FFF ) {

6111

if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );

6112

return a;

6113

}

6114

if ( bExp == 0 ) {

6115

if ( ( bSig0 | bSig1 ) == 0 ) {

6116

invalid:

6117

float_raise( float_flag_invalid STATUS_VAR);

6118

z.low = float128_default_nan_low;

6119

z.high = float128_default_nan_high;

6120

return z;

6121

}

6122

normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );

6123

}

6124

if ( aExp == 0 ) {

6125

if ( ( aSig0 | aSig1 ) == 0 ) return a;

6126

normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );

6127

}

6128

expDiff = aExp - bExp;

6129

if ( expDiff < -1 ) return a;

6130

shortShift128Left(

6131

aSig0 | LIT64( 0x0001000000000000 ),

6132

aSig1,

6133

15 - ( expDiff < 0 ),

6134

&aSig0,

6135

&aSig1

6136

);

6137

shortShift128Left(

6138

bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );

6139

q = le128( bSig0, bSig1, aSig0, aSig1 );

6140

if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );

6141

expDiff -= 64;

6142

while ( 0 < expDiff ) {

6143

q = estimateDiv128To64( aSig0, aSig1, bSig0 );

6144

q = ( 4 < q ) ? q - 4 : 0;

6145

mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );

6146

shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );

6147

shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );

6148

sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );

6149

expDiff -= 61;

6150

}

6151

if ( -64 < expDiff ) {

6152

q = estimateDiv128To64( aSig0, aSig1, bSig0 );

6153

q = ( 4 < q ) ? q - 4 : 0;

6154

q >>= - expDiff;

6155

shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );

6156

expDiff += 52;

6157

if ( expDiff < 0 ) {

6158

shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );

6159

}

6160

else {

6161

shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );

6162

}

6163

mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );

6164

sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );

6165

}

6166

else {

6167

shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );

6168

shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );

6169

}

6170

do {

6171

alternateASig0 = aSig0;

6172

alternateASig1 = aSig1;

6173

++q;

6174

sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );

6175

} while ( 0 <= (int64_t) aSig0 );

6176

add128(

6177

aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );

6178

if ( ( sigMean0 < 0 )

6179

|| ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {

6180

aSig0 = alternateASig0;

6181

aSig1 = alternateASig1;

6182

}

6183

zSign = ( (int64_t) aSig0 < 0 );

6184

if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );

6185

return

6186

normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );

6187

6188

}

6189

6190

/*----------------------------------------------------------------------------

6191

| Returns the square root of the quadruple-precision floating-point value `a'.

6192

| The operation is performed according to the IEC/IEEE Standard for Binary

6193

| Floating-Point Arithmetic.

6194

*----------------------------------------------------------------------------*/

6195

6196

float128 float128_sqrt( float128 a STATUS_PARAM )

6197

{

6198

flag aSign;

6199

int32 aExp, zExp;

6200

uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;

6201

uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

6202

float128 z;

6203

6204

aSig1 = extractFloat128Frac1( a );

6205

aSig0 = extractFloat128Frac0( a );

6206

aExp = extractFloat128Exp( a );

6207

aSign = extractFloat128Sign( a );

6208

if ( aExp == 0x7FFF ) {

6209

if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );

6210

if ( ! aSign ) return a;

6211

goto invalid;

6212

}

6213

if ( aSign ) {

6214

if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;

6215

invalid:

6216

float_raise( float_flag_invalid STATUS_VAR);

6217

z.low = float128_default_nan_low;

6218

z.high = float128_default_nan_high;

6219

return z;

6220

}

6221

if ( aExp == 0 ) {

6222

if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );

6223

normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );

6224

}

6225

zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;

6226

aSig0 |= LIT64( 0x0001000000000000 );

6227

zSig0 = estimateSqrt32( aExp, aSig0>>17 );

6228

shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );

6229

zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );

6230

doubleZSig0 = zSig0<<1;

6231

mul64To128( zSig0, zSig0, &term0, &term1 );

6232

sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );

6233

while ( (int64_t) rem0 < 0 ) {

6234

--zSig0;

6235

doubleZSig0 -= 2;

6236

add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );

6237

}

6238

zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );

6239

if ( ( zSig1 & 0x1FFF ) <= 5 ) {

6240

if ( zSig1 == 0 ) zSig1 = 1;

6241

mul64To128( doubleZSig0, zSig1, &term1, &term2 );

6242

sub128( rem1, 0, term1, term2, &rem1, &rem2 );

6243

mul64To128( zSig1, zSig1, &term2, &term3 );

6244

sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );

6245

while ( (int64_t) rem1 < 0 ) {

6246

--zSig1;

6247

shortShift128Left( 0, zSig1, 1, &term2, &term3 );

6248

term3 |= 1;

6249

term2 |= doubleZSig0;

6250

add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );

6251

}

6252

zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );

6253

}

6254

shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );

6255

return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );

6256

6257

}

6258

6259

/*----------------------------------------------------------------------------

6260

| Returns 1 if the quadruple-precision floating-point value `a' is equal to

6261

| the corresponding value `b', and 0 otherwise. The invalid exception is

6262

| raised if either operand is a NaN. Otherwise, the comparison is performed

6263

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

6264

*----------------------------------------------------------------------------*/

6265

6266

int float128_eq( float128 a, float128 b STATUS_PARAM )

6267

{

6268

6269

if ( ( ( extractFloat128Exp( a ) == 0x7FFF )

6270

&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )

6271

|| ( ( extractFloat128Exp( b ) == 0x7FFF )

6272

&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )

6273

) {

6274

float_raise( float_flag_invalid STATUS_VAR);

6275

return 0;

6276

}

6277

return

6278

( a.low == b.low )

6279

&& ( ( a.high == b.high )

6280

|| ( ( a.low == 0 )

6281

&& ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )

6282

);

6283

6284

}

6285

6286

/*----------------------------------------------------------------------------

6287

| Returns 1 if the quadruple-precision floating-point value `a' is less than

6288

| or equal to the corresponding value `b', and 0 otherwise. The invalid

6289

| exception is raised if either operand is a NaN. The comparison is performed

6290

| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

6291

*----------------------------------------------------------------------------*/

6292

6293

int float128_le( float128 a, float128 b STATUS_PARAM )

6294

{

6295

flag aSign, bSign;

6296

6297

if ( ( ( extractFloat128Exp( a ) == 0x7FFF )

6298

&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )

6299

|| ( ( extractFloat128Exp( b ) == 0x7FFF )

6300

&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )

6301

) {

6302

float_raise( float_flag_invalid STATUS_VAR);

6303

return 0;

6304

}

6305

aSign = extractFloat128Sign( a );

6306

bSign = extractFloat128Sign( b );

6307

if ( aSign != bSign ) {

6308

return

6309

aSign

6310

|| ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )

6311

== 0 );

6312

}

6313

return

6314

aSign ? le128( b.high, b.low, a.high, a.low )

6315

: le128( a.high, a.low, b.high, b.low );

6316

6317

}

6318

6319

/*----------------------------------------------------------------------------

6320

| Returns 1 if the quadruple-precision floating-point value `a' is less than

6321

| the corresponding value `b', and 0 otherwise. The invalid exception is

6322

| raised if either operand is a NaN. The comparison is performed according

6323

| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.

6324

*----------------------------------------------------------------------------*/

6325

6326

int float128_lt( float128 a, float128 b STATUS_PARAM )

6327

{

6328

flag aSign, bSign;

6329

6330

if ( ( ( extractFloat128Exp( a ) == 0x7FFF )

6331

&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )

6332

|| ( ( extractFloat128Exp( b ) == 0x7FFF )

6333

&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )

6334

) {

6335

float_raise( float_flag_invalid STATUS_VAR);

6336

return 0;

6337

}

6338

aSign = extractFloat128Sign( a );

6339

bSign = extractFloat128Sign( b );

6340

if ( aSign != bSign ) {

6341

return

6342

aSign

6343

&& ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )

6344

!= 0 );

6345

}

6346

return

6347

aSign ? lt128( b.high, b.low, a.high, a.low )

6348

: lt128( a.high, a.low, b.high, b.low );

6349

6350

}

6351

6352

/*----------------------------------------------------------------------------

6353

| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot

6354

| be compared, and 0 otherwise. The invalid exception is raised if either

6355

| operand is a NaN. The comparison is performed according to the IEC/IEEE

6356

| Standard for Binary Floating-Point Arithmetic.

6357

*----------------------------------------------------------------------------*/

6358

6359

int float128_unordered( float128 a, float128 b STATUS_PARAM )

6360

{

6361

if ( ( ( extractFloat128Exp( a ) == 0x7FFF )

6362

&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )

6363

|| ( ( extractFloat128Exp( b ) == 0x7FFF )

6364

&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )

6365

) {

6366

float_raise( float_flag_invalid STATUS_VAR);

6367

return 1;

6368

}

6369

return 0;

6370

}

6371

6372

/*----------------------------------------------------------------------------

6373

| Returns 1 if the quadruple-precision floating-point value `a' is equal to

6374

| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an

6375

| exception. The comparison is performed according to the IEC/IEEE Standard

6376

| for Binary Floating-Point Arithmetic.

6377

*----------------------------------------------------------------------------*/

6378

6379

int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )

6380

{

6381

6382

if ( ( ( extractFloat128Exp( a ) == 0x7FFF )

6383

&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )

6384

|| ( ( extractFloat128Exp( b ) == 0x7FFF )

6385

&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )

6386

) {

6387

if ( float128_is_signaling_nan( a )

6388

|| float128_is_signaling_nan( b ) ) {

6389

float_raise( float_flag_invalid STATUS_VAR);

6390

}

6391

return 0;

6392

}

6393

return

6394

( a.low == b.low )

6395

&& ( ( a.high == b.high )

6396

|| ( ( a.low == 0 )

6397

&& ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )

6398

);

6399

6400

}

6401

6402

/*----------------------------------------------------------------------------

6403

| Returns 1 if the quadruple-precision floating-point value `a' is less than

6404

| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not

6405

| cause an exception. Otherwise, the comparison is performed according to the

6406

| IEC/IEEE Standard for Binary Floating-Point Arithmetic.

6407

*----------------------------------------------------------------------------*/

6408

6409

int float128_le_quiet( float128 a, float128 b STATUS_PARAM )

6410

{

6411

flag aSign, bSign;

6412

6413

if ( ( ( extractFloat128Exp( a ) == 0x7FFF )

6414

&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )

6415

|| ( ( extractFloat128Exp( b ) == 0x7FFF )

6416

&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )

6417

) {

6418

if ( float128_is_signaling_nan( a )

6419

|| float128_is_signaling_nan( b ) ) {

6420

float_raise( float_flag_invalid STATUS_VAR);

6421

}

6422

return 0;

6423

}

6424

aSign = extractFloat128Sign( a );

6425

bSign = extractFloat128Sign( b );

6426

if ( aSign != bSign ) {

6427

return

6428

aSign

6429

|| ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )

6430

== 0 );

6431

}

6432

return

6433

aSign ? le128( b.high, b.low, a.high, a.low )

6434

: le128( a.high, a.low, b.high, b.low );

6435

6436

}

6437

6438

/*----------------------------------------------------------------------------

6439

| Returns 1 if the quadruple-precision floating-point value `a' is less than

6440

| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an

6441

| exception. Otherwise, the comparison is performed according to the IEC/IEEE

6442

| Standard for Binary Floating-Point Arithmetic.

6443

*----------------------------------------------------------------------------*/

6444

6445

int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )

6446

{

6447

flag aSign, bSign;

6448

6449

if ( ( ( extractFloat128Exp( a ) == 0x7FFF )

6450

&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )

6451

|| ( ( extractFloat128Exp( b ) == 0x7FFF )

6452

&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )

6453

) {

6454

if ( float128_is_signaling_nan( a )

6455

|| float128_is_signaling_nan( b ) ) {

6456

float_raise( float_flag_invalid STATUS_VAR);

6457

}

6458

return 0;

6459

}

6460

aSign = extractFloat128Sign( a );

6461

bSign = extractFloat128Sign( b );

6462

if ( aSign != bSign ) {

6463

return

6464

aSign

6465

&& ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )

6466

!= 0 );

6467

}

6468

return

6469

aSign ? lt128( b.high, b.low, a.high, a.low )

6470

: lt128( a.high, a.low, b.high, b.low );

6471

6472

}

6473

6474

/*----------------------------------------------------------------------------

6475

| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot

6476

| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The

6477

| comparison is performed according to the IEC/IEEE Standard for Binary

6478

| Floating-Point Arithmetic.

6479

*----------------------------------------------------------------------------*/

6480

6481

int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )

6482

{

6483

if ( ( ( extractFloat128Exp( a ) == 0x7FFF )

6484

&& ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )

6485

|| ( ( extractFloat128Exp( b ) == 0x7FFF )

6486

&& ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )

6487

) {

6488

if ( float128_is_signaling_nan( a )

6489

|| float128_is_signaling_nan( b ) ) {

6490

float_raise( float_flag_invalid STATUS_VAR);

6491

}

6492

return 1;

6493

}

6494

return 0;

6495

}

6496

6497

/* misc functions */

6498

/* Converts the 32-bit unsigned integer `a' to single precision by widening
 * it to 64 bits and delegating to int64_to_float32(). */
float32 uint32_to_float32(uint32_t a STATUS_PARAM)
{
    int64_t widened = a;

    return int64_to_float32(widened STATUS_VAR);
}

6502

6503

/* Converts the 32-bit unsigned integer `a' to double precision by widening
 * it to 64 bits and delegating to int64_to_float64(). */
float64 uint32_to_float64(uint32_t a STATUS_PARAM)
{
    int64_t widened = a;

    return int64_to_float64(widened STATUS_VAR);
}

6507

6508

/* Converts `a' to a 32-bit unsigned integer by way of float32_to_int64().
 * A result outside [0, 0xffffffff] saturates to the nearer bound; in that
 * case the exception flags are reset to their value on entry and only the
 * invalid exception is raised. */
uint32 float32_to_uint32( float32 a STATUS_PARAM )
{
    int saved_flags = get_float_exception_flags(status);
    int64_t wide = float32_to_int64(a STATUS_VAR);
    uint32 saturated;

    if (wide >= 0 && wide <= 0xffffffff) {
        return wide;
    }
    saturated = (wide < 0) ? 0 : 0xffffffff;

    /* Discard whatever the 64-bit conversion flagged; report invalid only. */
    set_float_exception_flags(saved_flags, status);
    float_raise(float_flag_invalid STATUS_VAR);
    return saturated;
}

6526

6527

/* Converts `a' to a 32-bit unsigned integer by way of
 * float32_to_int64_round_to_zero().  A result outside [0, 0xffffffff]
 * saturates to the nearer bound; in that case the exception flags are reset
 * to their value on entry and only the invalid exception is raised. */
uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
{
    int saved_flags = get_float_exception_flags(status);
    int64_t wide = float32_to_int64_round_to_zero(a STATUS_VAR);
    uint32 saturated;

    if (wide >= 0 && wide <= 0xffffffff) {
        return wide;
    }
    saturated = (wide < 0) ? 0 : 0xffffffff;

    /* Discard whatever the 64-bit conversion flagged; report invalid only. */
    set_float_exception_flags(saved_flags, status);
    float_raise(float_flag_invalid STATUS_VAR);
    return saturated;
}

6545

6546

/* Converts `a' to a signed 16-bit integer by way of float32_to_int32().
 * A result outside [-0x8000, 0x7fff] saturates to the nearer bound; in that
 * case the exception flags are reset to their value on entry and only the
 * invalid exception is raised. */
int_fast16_t float32_to_int16(float32 a STATUS_PARAM)
{
    int saved_flags = get_float_exception_flags(status);
    int32_t wide = float32_to_int32(a STATUS_VAR);
    int_fast16_t saturated;

    if (wide >= -0x8000 && wide <= 0x7fff) {
        return wide;
    }
    saturated = (wide < -0x8000) ? -0x8000 : 0x7fff;

    /* Discard whatever the 32-bit conversion flagged; report invalid only. */
    set_float_exception_flags(saved_flags, status);
    float_raise(float_flag_invalid STATUS_VAR);
    return saturated;
}

6565

6566

/* Converts `a' to an unsigned 16-bit integer by way of float32_to_int32().
 * A result outside [0, 0xffff] saturates to the nearer bound; in that case
 * the exception flags are reset to their value on entry and only the
 * invalid exception is raised. */
uint_fast16_t float32_to_uint16(float32 a STATUS_PARAM)
{
    int saved_flags = get_float_exception_flags(status);
    int32_t wide = float32_to_int32(a STATUS_VAR);
    uint_fast16_t saturated;

    if (wide >= 0 && wide <= 0xffff) {
        return wide;
    }
    saturated = (wide < 0) ? 0 : 0xffff;

    /* Discard whatever the 32-bit conversion flagged; report invalid only. */
    set_float_exception_flags(saved_flags, status);
    float_raise(float_flag_invalid STATUS_VAR);
    return saturated;
}

6585

6586

/* Converts `a' to an unsigned 16-bit integer by way of
 * float32_to_int64_round_to_zero().  A result outside [0, 0xffff] saturates
 * to the nearer bound; in that case the exception flags are reset to their
 * value on entry and only the invalid exception is raised. */
uint_fast16_t float32_to_uint16_round_to_zero(float32 a STATUS_PARAM)
{
    int saved_flags = get_float_exception_flags(status);
    int64_t wide = float32_to_int64_round_to_zero(a STATUS_VAR);
    uint_fast16_t saturated;

    if (wide >= 0 && wide <= 0xffff) {
        return wide;
    }
    saturated = (wide < 0) ? 0 : 0xffff;

    /* Discard whatever the 64-bit conversion flagged; report invalid only. */
    set_float_exception_flags(saved_flags, status);
    float_raise(float_flag_invalid STATUS_VAR);
    return saturated;
}

6604

6605

/* Converts `a' to a 32-bit unsigned integer by way of float64_to_int64().
 * A result outside [0, 0xffffffff] saturates to the nearer bound and raises
 * the invalid exception.
 *
 * As in float32_to_uint32(), the exception flags are restored to their value
 * on entry before invalid is raised, so that flags set by the intermediate
 * 64-bit conversion (e.g. inexact) are not reported together with invalid
 * for an out-of-range operand. */
uint32 float64_to_uint32( float64 a STATUS_PARAM )
{
    int64_t v;
    uint32 res;
    int old_exc_flags = get_float_exception_flags(status);

    v = float64_to_int64(a STATUS_VAR);
    if (v < 0) {
        res = 0;
    } else if (v > 0xffffffff) {
        res = 0xffffffff;
    } else {
        /* In range: keep whatever flags the conversion raised. */
        return v;
    }
    set_float_exception_flags(old_exc_flags, status);
    float_raise(float_flag_invalid STATUS_VAR);
    return res;
}

6622

6623

/* Converts `a' to a 32-bit unsigned integer by way of
 * float64_to_int64_round_to_zero().  A result outside [0, 0xffffffff]
 * saturates to the nearer bound and raises the invalid exception.
 *
 * As in float32_to_uint32_round_to_zero(), the exception flags are restored
 * to their value on entry before invalid is raised, so that flags set by the
 * intermediate 64-bit conversion (e.g. inexact) are not reported together
 * with invalid for an out-of-range operand. */
uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
{
    int64_t v;
    uint32 res;
    int old_exc_flags = get_float_exception_flags(status);

    v = float64_to_int64_round_to_zero(a STATUS_VAR);
    if (v < 0) {
        res = 0;
    } else if (v > 0xffffffff) {
        res = 0xffffffff;
    } else {
        /* In range: keep whatever flags the conversion raised. */
        return v;
    }
    set_float_exception_flags(old_exc_flags, status);
    float_raise(float_flag_invalid STATUS_VAR);
    return res;
}

6640

6641

/* Converts `a' to a signed 16-bit integer by way of float64_to_int32().
 * A result outside [-0x8000, 0x7fff] saturates to the nearer bound; in that
 * case the exception flags are reset to their value on entry and only the
 * invalid exception is raised. */
int_fast16_t float64_to_int16(float64 a STATUS_PARAM)
{
    int saved_flags = get_float_exception_flags(status);
    int64_t wide = float64_to_int32(a STATUS_VAR);
    int_fast16_t saturated;

    if (wide >= -0x8000 && wide <= 0x7fff) {
        return wide;
    }
    saturated = (wide < -0x8000) ? -0x8000 : 0x7fff;

    /* Discard whatever the 32-bit conversion flagged; report invalid only. */
    set_float_exception_flags(saved_flags, status);
    float_raise(float_flag_invalid STATUS_VAR);
    return saturated;
}

6660

6661

/* Converts `a' to an unsigned 16-bit integer by way of float64_to_int32().
 * A result outside [0, 0xffff] saturates to the nearer bound; in that case
 * the exception flags are reset to their value on entry and only the
 * invalid exception is raised. */
uint_fast16_t float64_to_uint16(float64 a STATUS_PARAM)
{
    int saved_flags = get_float_exception_flags(status);
    int64_t wide = float64_to_int32(a STATUS_VAR);
    uint_fast16_t saturated;

    if (wide >= 0 && wide <= 0xffff) {
        return wide;
    }
    saturated = (wide < 0) ? 0 : 0xffff;

    /* Discard whatever the 32-bit conversion flagged; report invalid only. */
    set_float_exception_flags(saved_flags, status);
    float_raise(float_flag_invalid STATUS_VAR);
    return saturated;
}

6680

6681

/* Converts `a' to an unsigned 16-bit integer by way of
 * float64_to_int64_round_to_zero().  A result outside [0, 0xffff] saturates
 * to the nearer bound; in that case the exception flags are reset to their
 * value on entry and only the invalid exception is raised. */
uint_fast16_t float64_to_uint16_round_to_zero(float64 a STATUS_PARAM)
{
    int saved_flags = get_float_exception_flags(status);
    int64_t wide = float64_to_int64_round_to_zero(a STATUS_VAR);
    uint_fast16_t saturated;

    if (wide >= 0 && wide <= 0xffff) {
        return wide;
    }
    saturated = (wide < 0) ? 0 : 0xffff;

    /* Discard whatever the 64-bit conversion flagged; report invalid only. */
    set_float_exception_flags(saved_flags, status);
    float_raise(float_flag_invalid STATUS_VAR);
    return saturated;
}

6699

6700

/*----------------------------------------------------------------------------

6701

| Returns the result of converting the double-precision floating-point value

6702

| `a' to the 64-bit unsigned integer format. The conversion is

6703

| performed according to the IEC/IEEE Standard for Binary Floating-Point

6704

| Arithmetic---which means in particular that the conversion is rounded

6705

| according to the current rounding mode. If `a' is a NaN, the largest

6706

| positive integer is returned. If the conversion overflows, the

6707

| largest unsigned integer is returned. If 'a' is negative, the value is

6708

| rounded and zero is returned; negative values that do not round to zero

6709

| will raise the inexact exception.

6710

*----------------------------------------------------------------------------*/

6711

6712

uint64_t float64_to_uint64(float64 a STATUS_PARAM)

6713

{

6714

flag aSign;

6715

int_fast16_t aExp, shiftCount;

6716

uint64_t aSig, aSigExtra;

6717

a = float64_squash_input_denormal(a STATUS_VAR);

6718

6719

aSig = extractFloat64Frac(a);

6720

aExp = extractFloat64Exp(a);

6721

aSign = extractFloat64Sign(a);

6722

if (aSign && (aExp > 1022)) {

6723

float_raise(float_flag_invalid STATUS_VAR);

6724

if (float64_is_any_nan(a)) {

6725

return LIT64(0xFFFFFFFFFFFFFFFF);

6726

} else {

6727

return 0;

6728

}

6729

}

6730

if (aExp) {

6731

aSig |= LIT64(0x0010000000000000);

6732

}

6733

shiftCount = 0x433 - aExp;

6734

if (shiftCount <= 0) {

6735

if (0x43E < aExp) {

6736

float_raise(float_flag_invalid STATUS_VAR);

6737

return LIT64(0xFFFFFFFFFFFFFFFF);

6738

}

6739

aSigExtra = 0;

6740

aSig <<= -shiftCount;

6741

} else {

6742

shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);

6743

}

6744

return roundAndPackUint64(aSign, aSig, aSigExtra STATUS_VAR);

6745

}

6746

6747

/* Converts `a' to a 64-bit unsigned integer, rounding toward zero.
 *
 * The current rounding mode is saved, round-to-zero is forced for the
 * duration of a float64_to_uint64() call, and the saved mode is restored
 * before returning.  NaN, overflow and negative operands are therefore
 * handled exactly as float64_to_uint64() handles them.
 *
 * The previous implementation added the raw bit pattern of (double)INT64_MIN
 * to the raw bit pattern of `a' as integers, which is not a floating-point
 * addition, and then subtracted INT64_MIN from an int64_t. */
uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
{
    signed char saved_rounding_mode = STATUS(float_rounding_mode);
    uint64_t v;

    set_float_rounding_mode(float_round_to_zero STATUS_VAR);
    v = float64_to_uint64(a STATUS_VAR);
    set_float_rounding_mode(saved_rounding_mode STATUS_VAR);
    return v;
}

6757

6758

#define COMPARE(s, nan_exp) \

6759

INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \

6760

int is_quiet STATUS_PARAM ) \

6761

{ \

6762

flag aSign, bSign; \

6763

uint ## s ## _t av, bv; \

6764

a = float ## s ## _squash_input_denormal(a STATUS_VAR); \

6765

b = float ## s ## _squash_input_denormal(b STATUS_VAR); \

6766

6767

if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \

6768

extractFloat ## s ## Frac( a ) ) || \

6769

( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \

6770

extractFloat ## s ## Frac( b ) )) { \

6771

if (!is_quiet || \

6772

float ## s ## _is_signaling_nan( a ) || \

6773

float ## s ## _is_signaling_nan( b ) ) { \

6774

float_raise( float_flag_invalid STATUS_VAR); \

6775

} \

6776

return float_relation_unordered; \

6777

} \

6778

aSign = extractFloat ## s ## Sign( a ); \

6779

bSign = extractFloat ## s ## Sign( b ); \

6780

av = float ## s ## _val(a); \

6781

bv = float ## s ## _val(b); \

6782

if ( aSign != bSign ) { \

6783

if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \

6784

/* zero case */ \

6785

return float_relation_equal; \

6786

} else { \

6787

return 1 - (2 * aSign); \

6788

} \

6789

} else { \

6790

if (av == bv) { \

6791

return float_relation_equal; \

6792

} else { \

6793

return 1 - 2 * (aSign ^ ( av < bv )); \

6794

} \

6795

} \

6796

} \

6797

6798

int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \

6799

{ \

6800

return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \

6801

} \

6802

6803

int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \

6804

{ \

6805

return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \

6806

}

6807

6808

COMPARE(32, 0xff)

6809

COMPARE(64, 0x7ff)

6810

6811

INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,

6812

int is_quiet STATUS_PARAM )

6813

{

6814

flag aSign, bSign;

6815

6816

if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&

6817

( extractFloatx80Frac( a )<<1 ) ) ||

6818

( ( extractFloatx80Exp( b ) == 0x7fff ) &&

6819

( extractFloatx80Frac( b )<<1 ) )) {

6820

if (!is_quiet ||

6821

floatx80_is_signaling_nan( a ) ||

6822

floatx80_is_signaling_nan( b ) ) {

6823

float_raise( float_flag_invalid STATUS_VAR);

6824

}

6825

return float_relation_unordered;

6826

}

6827

aSign = extractFloatx80Sign( a );

6828

bSign = extractFloatx80Sign( b );

6829

if ( aSign != bSign ) {

6830

6831

if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&

6832

( ( a.low | b.low ) == 0 ) ) {

6833

/* zero case */

6834

return float_relation_equal;

6835

} else {

6836

return 1 - (2 * aSign);

6837

}

6838

} else {

6839

if (a.low == b.low && a.high == b.high) {

6840

return float_relation_equal;

6841

} else {

6842

return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));

6843

}

6844

}

6845

}

6846

6847

/* Signaling comparison: any NaN operand raises the invalid exception. */
int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
{
    const int is_quiet = 0;

    return floatx80_compare_internal(a, b, is_quiet STATUS_VAR);
}

6851

6852

/* Quiet comparison: only a signaling NaN operand raises the invalid
 * exception. */
int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
{
    const int is_quiet = 1;

    return floatx80_compare_internal(a, b, is_quiet STATUS_VAR);
}

6856

6857

INLINE int float128_compare_internal( float128 a, float128 b,

6858

int is_quiet STATUS_PARAM )

6859

{

6860

flag aSign, bSign;

6861

6862

if (( ( extractFloat128Exp( a ) == 0x7fff ) &&

6863

( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||

6864

( ( extractFloat128Exp( b ) == 0x7fff ) &&

6865

( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {

6866

if (!is_quiet ||

6867

float128_is_signaling_nan( a ) ||

6868

float128_is_signaling_nan( b ) ) {

6869

float_raise( float_flag_invalid STATUS_VAR);

6870

}

6871

return float_relation_unordered;

6872

}

6873

aSign = extractFloat128Sign( a );

6874

bSign = extractFloat128Sign( b );

6875

if ( aSign != bSign ) {

6876

if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {

6877

/* zero case */

6878

return float_relation_equal;

6879

} else {

6880

return 1 - (2 * aSign);

6881

}

6882

} else {

6883

if (a.low == b.low && a.high == b.high) {

6884

return float_relation_equal;

6885

} else {

6886

return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));

6887

}

6888

}

6889

}

6890

6891

/* Signaling comparison: any NaN operand raises the invalid exception. */
int float128_compare( float128 a, float128 b STATUS_PARAM )
{
    const int is_quiet = 0;

    return float128_compare_internal(a, b, is_quiet STATUS_VAR);
}

6895

6896

/* Quiet comparison: only a signaling NaN operand raises the invalid
 * exception. */
int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
{
    const int is_quiet = 1;

    return float128_compare_internal(a, b, is_quiet STATUS_VAR);
}

6900

6901

/* min() and max() functions. These can't be implemented as

6902

* 'compare and pick one input' because that would mishandle

6903

* NaNs and +0 vs -0.

6904

6905

* minnum() and maxnum() functions. These are similar to the min()

6906

* and max() functions but if one of the arguments is a QNaN and

6907

* the other is numerical then the numerical argument is returned.

6908

* minnum() and maxnum correspond to the IEEE 754-2008 minNum()

6909

* and maxNum() operations. min() and max() are the typical min/max

6910

* semantics provided by many CPUs which predate that specification.

6911

6912

#define MINMAX(s, nan_exp) \

6913

INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b, \

6914

int ismin, int isieee STATUS_PARAM) \

6915

{ \

6916

flag aSign, bSign; \

6917

uint ## s ## _t av, bv; \

6918

a = float ## s ## _squash_input_denormal(a STATUS_VAR); \

6919

b = float ## s ## _squash_input_denormal(b STATUS_VAR); \

6920

if (float ## s ## _is_any_nan(a) || \

6921

float ## s ## _is_any_nan(b)) { \

6922

if (isieee) { \

6923

if (float ## s ## _is_quiet_nan(a) && \

6924

!float ## s ##_is_any_nan(b)) { \

6925

return b; \

6926

} else if (float ## s ## _is_quiet_nan(b) && \

6927

!float ## s ## _is_any_nan(a)) { \

6928

return a; \

6929

} \

6930

} \

6931

return propagateFloat ## s ## NaN(a, b STATUS_VAR); \

6932

} \

6933

aSign = extractFloat ## s ## Sign(a); \

6934

bSign = extractFloat ## s ## Sign(b); \

6935

av = float ## s ## _val(a); \

6936

bv = float ## s ## _val(b); \

6937

if (aSign != bSign) { \

6938

if (ismin) { \

6939

return aSign ? a : b; \

6940

} else { \

6941

return aSign ? b : a; \

6942

} \

6943

} else { \

6944

if (ismin) { \

6945

return (aSign ^ (av < bv)) ? a : b; \

6946

} else { \

6947

return (aSign ^ (av < bv)) ? b : a; \

6948

} \

6949

} \

6950

} \

6951

6952

float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM) \

6953

{ \

6954

return float ## s ## _minmax(a, b, 1, 0 STATUS_VAR); \

6955

} \

6956

6957

float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM) \

6958

{ \

6959

return float ## s ## _minmax(a, b, 0, 0 STATUS_VAR); \

6960

} \

6961

6962

float ## s float ## s ## _minnum(float ## s a, float ## s b STATUS_PARAM) \

6963

{ \

6964

return float ## s ## _minmax(a, b, 1, 1 STATUS_VAR); \

6965

} \

6966

6967

float ## s float ## s ## _maxnum(float ## s a, float ## s b STATUS_PARAM) \

6968

{ \

6969

return float ## s ## _minmax(a, b, 0, 1 STATUS_VAR); \

6970

}

6971

6972

MINMAX(32, 0xff)

6973

MINMAX(64, 0x7ff)

6974

6975

6976

/* Multiply A by 2 raised to the power N. */

6977

float32 float32_scalbn( float32 a, int n STATUS_PARAM )

6978

{

6979

flag aSign;

6980

int16_t aExp;

6981

uint32_t aSig;

6982

6983

a = float32_squash_input_denormal(a STATUS_VAR);

6984

aSig = extractFloat32Frac( a );

6985

aExp = extractFloat32Exp( a );

6986

aSign = extractFloat32Sign( a );

6987

6988

if ( aExp == 0xFF ) {

6989

if ( aSig ) {

6990

return propagateFloat32NaN( a, a STATUS_VAR );

6991

}

6992

return a;

6993

}

6994

if (aExp != 0) {

6995

aSig |= 0x00800000;

6996

} else if (aSig == 0) {

6997

return a;

6998

} else {

6999

aExp++;

7000

}

7001

7002

if (n > 0x200) {

7003

n = 0x200;

7004

} else if (n < -0x200) {

7005

n = -0x200;

7006

}

7007

7008

aExp += n - 1;

7009

aSig <<= 7;

7010

return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );

7011

}

7012

7013

/* Multiply A by 2 raised to the power N (double precision).
 * NaNs are propagated; infinities and zeros are returned unchanged.
 * N is clamped to [-0x1000, 0x1000] before it is added to the exponent
 * (presumably to keep the int16_t exponent arithmetic in range). */
float64 float64_scalbn( float64 a, int n STATUS_PARAM )
{
    flag aSign;
    int16_t aExp;
    uint64_t aSig;

    a = float64_squash_input_denormal(a STATUS_VAR);
    aSig = extractFloat64Frac(a);
    aExp = extractFloat64Exp(a);
    aSign = extractFloat64Sign(a);

    if (aExp == 0x7FF) {
        return aSig ? propagateFloat64NaN(a, a STATUS_VAR) : a;
    }
    if (aExp == 0) {
        if (aSig == 0) {
            return a;
        }
        /* Zero exponent with a non-zero fraction: bump the exponent. */
        aExp++;
    } else {
        /* Set the bit just above the fraction field. */
        aSig |= LIT64(0x0010000000000000);
    }

    if (n > 0x1000) {
        n = 0x1000;
    }
    if (n < -0x1000) {
        n = -0x1000;
    }

    aExp += n - 1;
    aSig <<= 10;
    return normalizeRoundAndPackFloat64(aSign, aExp, aSig STATUS_VAR);
}

7048

7049

/* Multiply A by 2 raised to the power N (extended precision).
 * NaNs are propagated; infinities and zeros are returned unchanged.
 * N is clamped to [-0x10000, 0x10000] before it is added to the exponent.
 * The result is rounded using the status' floatx80_rounding_precision. */
floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
{
    flag aSign;
    int32_t aExp;
    uint64_t aSig;

    aSig = extractFloatx80Frac(a);
    aExp = extractFloatx80Exp(a);
    aSign = extractFloatx80Sign(a);

    if (aExp == 0x7FFF) {
        return (aSig << 1) ? propagateFloatx80NaN(a, a STATUS_VAR) : a;
    }
    if (aExp == 0) {
        if (aSig == 0) {
            return a;
        }
        /* Zero exponent with a non-zero significand: bump the exponent. */
        aExp++;
    }

    if (n > 0x10000) {
        n = 0x10000;
    }
    if (n < -0x10000) {
        n = -0x10000;
    }

    aExp += n;
    return normalizeRoundAndPackFloatx80(STATUS(floatx80_rounding_precision),
                                         aSign, aExp, aSig, 0 STATUS_VAR);
}

7083

7084

/* Multiply A by 2 raised to the power N (quadruple precision).
 * NaNs are propagated; infinities and zeros are returned unchanged.
 * N is clamped to [-0x10000, 0x10000] before it is added to the exponent. */
float128 float128_scalbn( float128 a, int n STATUS_PARAM )
{
    flag aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1(a);
    aSig0 = extractFloat128Frac0(a);
    aExp = extractFloat128Exp(a);
    aSign = extractFloat128Sign(a);

    if (aExp == 0x7FFF) {
        return (aSig0 | aSig1) ? propagateFloat128NaN(a, a STATUS_VAR) : a;
    }
    if (aExp == 0) {
        if (aSig0 == 0 && aSig1 == 0) {
            return a;
        }
        /* Zero exponent with a non-zero fraction: bump the exponent. */
        aExp++;
    } else {
        /* Set the bit just above the fraction field. */
        aSig0 |= LIT64(0x0001000000000000);
    }

    if (n > 0x10000) {
        n = 0x10000;
    }
    if (n < -0x10000) {
        n = -0x10000;
    }

    aExp += n - 1;
    return normalizeRoundAndPackFloat128(aSign, aExp, aSig0, aSig1
                                         STATUS_VAR);
}

Older »