From 0424577f6cbbab0be491cc493b6c9d9ca55770e8 Mon Sep 17 00:00:00 2001 From: Alex Huszagh Date: Thu, 12 Dec 2024 22:18:17 -0600 Subject: [PATCH] Default to not using intrinsics when they would be lossy. Related to https://github.com/starkat99/half-rs/issues/116 --- src/binary16.rs | 110 +++++++++++++++++++++++++++++++++++++-- src/binary16/arch.rs | 4 +- src/binary16/arch/x86.rs | 14 +++++ 3 files changed, 121 insertions(+), 7 deletions(-) diff --git a/src/binary16.rs b/src/binary16.rs index bda9147..41affe5 100644 --- a/src/binary16.rs +++ b/src/binary16.rs @@ -54,10 +54,13 @@ impl f16 { /// Exponents that underflow the minimum 16-bit exponent will result in /// 16-bit subnormals or ±0. All other values are truncated and rounded /// to the nearest representable 16-bit value. + /// + /// This will prefer correctness over speed. Currently, this always + /// uses an intrinsic if available. #[inline] #[must_use] pub fn from_f32(value: f32) -> f16 { - f16(arch::f32_to_f16(value)) + Self::from_f32_instrinsic(value) } /// Constructs a 16-bit floating point value from a 32-bit floating point @@ -80,6 +83,21 @@ impl f16 { f16(arch::f32_to_f16_fallback(value)) } + /// Constructs a 16-bit floating point value from a 32-bit floating point + /// value. + /// + /// This operation is lossy. If the 32-bit value is to large to fit in + /// 16-bits, ±∞ will result. NaN values are preserved. 32-bit subnormal + /// values are too tiny to be represented in 16-bits and result in ±0. + /// Exponents that underflow the minimum 16-bit exponent will result in + /// 16-bit subnormals or ±0. All other values are truncated and rounded + /// to the nearest representable 16-bit value. + #[inline] + #[must_use] + pub fn from_f32_instrinsic(value: f32) -> f16 { + f16(arch::f32_to_f16(value)) + } + /// Constructs a 16-bit floating point value from a 64-bit floating point /// value. /// @@ -89,10 +107,18 @@ impl f16 { /// Exponents that underflow the minimum 16-bit exponent will result in /// 16-bit subnormals or ±0. All other values are truncated and rounded /// to the nearest representable 16-bit value. + /// + /// This will prefer correctness over speed: on x86 systems, this currently + /// uses a software rather than an instrinsic implementation on x86. #[inline] #[must_use] pub fn from_f64(value: f64) -> f16 { - f16(arch::f64_to_f16(value)) + // FIXME: Once `_mm_cvtpd_ph` is stablized, move to using the intrinsic. + if cfg!(any(target_arch = "x86", target_arch = "x86_64")) { + Self::from_f64_const(value) + } else { + Self::from_f64_instrinsic(value) + } } /// Constructs a 16-bit floating point value from a 64-bit floating point @@ -115,6 +141,25 @@ impl f16 { f16(arch::f64_to_f16_fallback(value)) } + /// Constructs a 16-bit floating point value from a 64-bit floating point + /// value. + /// + /// This operation is lossy. If the 64-bit value is to large to fit in + /// 16-bits, ±∞ will result. NaN values are preserved. 64-bit subnormal + /// values are too tiny to be represented in 16-bits and result in ±0. + /// Exponents that underflow the minimum 16-bit exponent will result in + /// 16-bit subnormals or ±0. All other values are truncated and rounded + /// to the nearest representable 16-bit value. + /// + /// This prefers to use vendor instrinsics if possible, otherwise, it + /// goes to a fallback. On x86 and x86_64, this can be more lossy than + /// `from_f64`. + #[inline] + #[must_use] + pub fn from_f64_instrinsic(value: f64) -> f16 { + f16(arch::f64_to_f16(value)) + } + /// Converts a [`struct@f16`] into the underlying bit representation. #[inline] #[must_use] @@ -238,10 +283,13 @@ impl f16 { /// /// This conversion is lossless as all 16-bit floating point values can be /// represented exactly in 32-bit floating point. + /// + /// This will prefer correctness over speed. Currently, this always + /// uses an intrinsic if available. #[inline] #[must_use] pub fn to_f32(self) -> f32 { - arch::f16_to_f32(self.0) + self.to_f32_intrinsic() } /// Converts a [`struct@f16`] value into a `f32` value. @@ -259,6 +307,16 @@ impl f16 { arch::f16_to_f32_fallback(self.0) } + /// Converts a [`struct@f16`] value into a `f32` value. + /// + /// This conversion is lossless as all 16-bit floating point values can be + /// represented exactly in 32-bit floating point. + #[inline] + #[must_use] + pub fn to_f32_intrinsic(self) -> f32 { + arch::f16_to_f32(self.0) + } + /// Convert the data to an `f32` type, used for numerical operations. #[inline(always)] pub fn as_f32(self) -> f32 { @@ -275,10 +333,13 @@ impl f16 { /// /// This conversion is lossless as all 16-bit floating point values can be /// represented exactly in 64-bit floating point. + /// + /// This will prefer correctness over speed: on x86 systems, this currently + /// uses a software rather than an instrinsic implementation on x86. #[inline] #[must_use] pub fn to_f64(self) -> f64 { - arch::f16_to_f64(self.0) + self.to_f64_const() } /// Converts a [`struct@f16`] value into a `f64` value. @@ -296,6 +357,16 @@ impl f16 { arch::f16_to_f64_fallback(self.0) } + /// Converts a [`struct@f16`] value into a `f32` value. + /// + /// This conversion is lossless as all 16-bit floating point values can be + /// represented exactly in 32-bit floating point. + #[inline] + #[must_use] + pub fn to_f64_intrinsic(self) -> f64 { + arch::f16_to_f64(self.0) + } + /// Convert the data to an `f64` type, used for numerical operations. #[inline(always)] pub fn as_f64(self) -> f64 { @@ -1395,8 +1466,8 @@ const fn ge(lhs: f16, rhs: f16) -> bool { #[allow(clippy::cognitive_complexity, clippy::float_cmp, clippy::neg_cmp_op_on_partial_ord)] #[cfg(test)] mod test { - #[allow(unused_imports)] use core::cmp::Ordering; + use core::mem; use super::*; @@ -1808,4 +1879,33 @@ mod test { assert_eq!(f16::from_f32(4.) / f16::from_f32(2.), f16::from_f32(2.)); assert_eq!(f16::from_f32(4.) % f16::from_f32(3.), f16::from_f32(1.)); } + + #[test] + fn issue_116() { + // SEE: https://github.com/starkat99/half-rs/issues/116 + // This is lossy until `_mm_cvtpd_ph` will be stable on x86. + let max_diff = if cfg!(any(target_arch = "x86", target_arch = "x86_64")) { + 1 + } else { + 0 + }; + + // from the round-to-even section of the test case + let x: f64 = unsafe { mem::transmute(0x3f0ffbfffffffffcu64) }; + let bits = f16::from_f64(x).to_bits(); + let const_bits = f16::from_f64_const(x).to_bits(); + let inst_bits = f16::from_f64_instrinsic(x).to_bits(); + assert_eq!(const_bits, bits); + assert!(inst_bits.abs_diff(bits) <= max_diff); + + // from the double rounding section of the test case + // comment from the cpython test case: should be 2047, if double-rounded + // 64>32>16, becomes 2048 + let x: f64 = unsafe { mem::transmute(0x409ffdffffff0000u64) }; + let bits = f16::from_f64(x).to_bits(); + let const_bits = f16::from_f64_const(x).to_bits(); + let inst_bits = f16::from_f64_instrinsic(x).to_bits(); + assert_eq!(const_bits, bits); + assert!(inst_bits.abs_diff(bits) <= max_diff); + } } diff --git a/src/binary16/arch.rs b/src/binary16/arch.rs index 3aa533d..bf59b33 100644 --- a/src/binary16/arch.rs +++ b/src/binary16/arch.rs @@ -81,7 +81,7 @@ pub(crate) fn f32_to_f16(f: f32) -> u16 { pub(crate) fn f64_to_f16(f: f64) -> u16 { convert_fn! { if x86_feature("f16c") { - unsafe { x86::f32_to_f16_x86_f16c(f as f32) } + unsafe { x86::f64_to_f16_x86_f16c(f) } } else if aarch64_feature("fp16") { unsafe { aarch64::f64_to_f16_fp16(f) } } else { @@ -107,7 +107,7 @@ pub(crate) fn f16_to_f32(i: u16) -> f32 { pub(crate) fn f16_to_f64(i: u16) -> f64 { convert_fn! { if x86_feature("f16c") { - unsafe { x86::f16_to_f32_x86_f16c(i) as f64 } + unsafe { x86::f16_to_f64_x86_f16c(i) } } else if aarch64_feature("fp16") { unsafe { aarch64::f16_to_f64_fp16(i) } } else { diff --git a/src/binary16/arch/x86.rs b/src/binary16/arch/x86.rs index 8fe781b..4f7373f 100644 --- a/src/binary16/arch/x86.rs +++ b/src/binary16/arch/x86.rs @@ -41,6 +41,13 @@ pub(super) unsafe fn f16_to_f32_x86_f16c(i: u16) -> f32 { *(&retval as *const __m128).cast() } +#[inline] +#[target_feature(enable = "f16c")] +pub(super) unsafe fn f16_to_f64_x86_f16c(i: u16) -> f64 { + // FIXME: Change to use `_mm_cvtph_pd` when stable. + f16_to_f32_x86_f16c(i) as f64 +} + #[inline] #[target_feature(enable = "f16c")] pub(super) unsafe fn f32_to_f16_x86_f16c(f: f32) -> u16 { @@ -50,6 +57,13 @@ pub(super) unsafe fn f32_to_f16_x86_f16c(f: f32) -> u16 { *(&retval as *const __m128i).cast() } +#[inline] +#[target_feature(enable = "f16c")] +pub(super) unsafe fn f64_to_f16_x86_f16c(f: f64) -> u16 { + // FIXME: Change to use `_mm_cvtpd_ph` when stable. + f32_to_f16_x86_f16c(f as f32) +} + #[inline] #[target_feature(enable = "f16c")] pub(super) unsafe fn f16x4_to_f32x4_x86_f16c(v: &[u16; 4]) -> [f32; 4] {