Skip to content

Commit

Permalink
Default to not using intrinsics when they would be lossy.
Browse files Browse the repository at this point in the history
Related to starkat99#116
  • Loading branch information
Alexhuszagh committed Dec 13, 2024
1 parent 16d87e9 commit 0424577
Show file tree
Hide file tree
Showing 3 changed files with 121 additions and 7 deletions.
110 changes: 105 additions & 5 deletions src/binary16.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,13 @@ impl f16 {
/// Exponents that underflow the minimum 16-bit exponent will result in
/// 16-bit subnormals or ±0. All other values are truncated and rounded
/// to the nearest representable 16-bit value.
///
/// This will prefer correctness over speed. Currently, this always
/// uses an intrinsic if available.
#[inline]
#[must_use]
pub fn from_f32(value: f32) -> f16 {
f16(arch::f32_to_f16(value))
Self::from_f32_instrinsic(value)
}

/// Constructs a 16-bit floating point value from a 32-bit floating point
Expand All @@ -80,6 +83,21 @@ impl f16 {
f16(arch::f32_to_f16_fallback(value))
}

/// Constructs a 16-bit floating point value from a 32-bit floating point
/// value, preferring a vendor intrinsic when one is available.
///
/// This operation is lossy. If the 32-bit value is too large to fit in
/// 16-bits, ±∞ will result. NaN values are preserved. 32-bit subnormal
/// values are too tiny to be represented in 16-bits and result in ±0.
/// Exponents that underflow the minimum 16-bit exponent will result in
/// 16-bit subnormals or ±0. All other values are truncated and rounded
/// to the nearest representable 16-bit value.
// NOTE(review): the public name misspells "intrinsic" as "instrinsic";
// renaming would break callers, so it is documented but left unchanged.
#[inline]
#[must_use]
pub fn from_f32_instrinsic(value: f32) -> f16 {
    f16(arch::f32_to_f16(value))
}

/// Constructs a 16-bit floating point value from a 64-bit floating point
/// value.
///
Expand All @@ -89,10 +107,18 @@ impl f16 {
/// Exponents that underflow the minimum 16-bit exponent will result in
/// 16-bit subnormals or ±0. All other values are truncated and rounded
/// to the nearest representable 16-bit value.
///
/// This will prefer correctness over speed: on x86 systems, this currently
/// uses a software rather than an instrinsic implementation on x86.
#[inline]
#[must_use]
pub fn from_f64(value: f64) -> f16 {
    // FIXME: Once `_mm_cvtpd_ph` is stabilized, move to using the intrinsic.
    // On x86/x86_64 the available intrinsic path narrows through `f32` and
    // can double-round, so prefer the software implementation there.
    let prefer_software = cfg!(any(target_arch = "x86", target_arch = "x86_64"));
    if prefer_software {
        Self::from_f64_const(value)
    } else {
        Self::from_f64_instrinsic(value)
    }
}

/// Constructs a 16-bit floating point value from a 64-bit floating point
Expand All @@ -115,6 +141,25 @@ impl f16 {
f16(arch::f64_to_f16_fallback(value))
}

/// Constructs a 16-bit floating point value from a 64-bit floating point
/// value.
///
/// This operation is lossy. If the 64-bit value is too large to fit in
/// 16-bits, ±∞ will result. NaN values are preserved. 64-bit subnormal
/// values are too tiny to be represented in 16-bits and result in ±0.
/// Exponents that underflow the minimum 16-bit exponent will result in
/// 16-bit subnormals or ±0. All other values are truncated and rounded
/// to the nearest representable 16-bit value.
///
/// This prefers to use vendor intrinsics if possible, otherwise, it
/// goes to a fallback. On x86 and x86_64 the intrinsic path narrows
/// through `f32` first, so the result can double-round and be more
/// lossy than `from_f64`.
#[inline]
#[must_use]
pub fn from_f64_instrinsic(value: f64) -> f16 {
    f16(arch::f64_to_f16(value))
}

/// Converts a [`struct@f16`] into the underlying bit representation.
#[inline]
#[must_use]
Expand Down Expand Up @@ -238,10 +283,13 @@ impl f16 {
///
/// This conversion is lossless as all 16-bit floating point values can be
/// represented exactly in 32-bit floating point.
///
/// This will prefer correctness over speed. Currently, this always
/// uses an intrinsic if available.
#[inline]
#[must_use]
pub fn to_f32(self) -> f32 {
arch::f16_to_f32(self.0)
self.to_f32_intrinsic()
}

/// Converts a [`struct@f16`] value into a `f32` value.
Expand All @@ -259,6 +307,16 @@ impl f16 {
arch::f16_to_f32_fallback(self.0)
}

/// Converts a [`struct@f16`] value into a `f32` value, preferring a
/// vendor intrinsic when one is available.
///
/// This conversion is lossless as all 16-bit floating point values can be
/// represented exactly in 32-bit floating point.
#[inline]
#[must_use]
pub fn to_f32_intrinsic(self) -> f32 {
    arch::f16_to_f32(self.0)
}

/// Convert the data to an `f32` type, used for numerical operations.
#[inline(always)]
pub fn as_f32(self) -> f32 {
Expand All @@ -275,10 +333,13 @@ impl f16 {
///
/// This conversion is lossless as all 16-bit floating point values can be
/// represented exactly in 64-bit floating point.
///
/// This will prefer correctness over speed: on x86 systems, this currently
/// uses a software rather than an instrinsic implementation on x86.
#[inline]
#[must_use]
pub fn to_f64(self) -> f64 {
arch::f16_to_f64(self.0)
self.to_f64_const()
}

/// Converts a [`struct@f16`] value into a `f64` value.
Expand All @@ -296,6 +357,16 @@ impl f16 {
arch::f16_to_f64_fallback(self.0)
}

/// Converts a [`struct@f16`] value into a `f64` value, preferring a
/// vendor intrinsic when one is available.
///
/// This conversion is lossless as all 16-bit floating point values can be
/// represented exactly in 64-bit floating point.
#[inline]
#[must_use]
pub fn to_f64_intrinsic(self) -> f64 {
    arch::f16_to_f64(self.0)
}

/// Convert the data to an `f64` type, used for numerical operations.
#[inline(always)]
pub fn as_f64(self) -> f64 {
Expand Down Expand Up @@ -1395,8 +1466,8 @@ const fn ge(lhs: f16, rhs: f16) -> bool {
#[allow(clippy::cognitive_complexity, clippy::float_cmp, clippy::neg_cmp_op_on_partial_ord)]
#[cfg(test)]
mod test {
#[allow(unused_imports)]
use core::cmp::Ordering;
use core::mem;

use super::*;

Expand Down Expand Up @@ -1808,4 +1879,33 @@ mod test {
assert_eq!(f16::from_f32(4.) / f16::from_f32(2.), f16::from_f32(2.));
assert_eq!(f16::from_f32(4.) % f16::from_f32(3.), f16::from_f32(1.));
}

#[test]
fn issue_116() {
    // SEE: https://github.com/starkat99/half-rs/issues/116
    // This is lossy until `_mm_cvtpd_ph` is stable on x86: the intrinsic
    // path there narrows through `f32`, so allow a 1-ulp difference.
    let max_diff = if cfg!(any(target_arch = "x86", target_arch = "x86_64")) {
        1
    } else {
        0
    };

    // From the round-to-even section of the test case.
    // `f64::from_bits` replaces the previous `mem::transmute`, which was
    // needlessly `unsafe` for a bits -> float conversion.
    let x = f64::from_bits(0x3f0ffbfffffffffcu64);
    let bits = f16::from_f64(x).to_bits();
    let const_bits = f16::from_f64_const(x).to_bits();
    let inst_bits = f16::from_f64_instrinsic(x).to_bits();
    assert_eq!(const_bits, bits);
    assert!(inst_bits.abs_diff(bits) <= max_diff);

    // From the double rounding section of the test case.
    // Comment from the cpython test case: should be 2047; if double-rounded
    // 64>32>16, becomes 2048.
    let x = f64::from_bits(0x409ffdffffff0000u64);
    let bits = f16::from_f64(x).to_bits();
    let const_bits = f16::from_f64_const(x).to_bits();
    let inst_bits = f16::from_f64_instrinsic(x).to_bits();
    assert_eq!(const_bits, bits);
    assert!(inst_bits.abs_diff(bits) <= max_diff);
}
}
4 changes: 2 additions & 2 deletions src/binary16/arch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ pub(crate) fn f32_to_f16(f: f32) -> u16 {
pub(crate) fn f64_to_f16(f: f64) -> u16 {
convert_fn! {
if x86_feature("f16c") {
unsafe { x86::f32_to_f16_x86_f16c(f as f32) }
unsafe { x86::f64_to_f16_x86_f16c(f) }
} else if aarch64_feature("fp16") {
unsafe { aarch64::f64_to_f16_fp16(f) }
} else {
Expand All @@ -107,7 +107,7 @@ pub(crate) fn f16_to_f32(i: u16) -> f32 {
pub(crate) fn f16_to_f64(i: u16) -> f64 {
convert_fn! {
if x86_feature("f16c") {
unsafe { x86::f16_to_f32_x86_f16c(i) as f64 }
unsafe { x86::f16_to_f64_x86_f16c(i) }
} else if aarch64_feature("fp16") {
unsafe { aarch64::f16_to_f64_fp16(i) }
} else {
Expand Down
14 changes: 14 additions & 0 deletions src/binary16/arch/x86.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,13 @@ pub(super) unsafe fn f16_to_f32_x86_f16c(i: u16) -> f32 {
*(&retval as *const __m128).cast()
}

/// Converts raw `f16` bits to `f64` via the x86 f16c unit.
///
/// Widens in two exact steps (f16 -> f32 -> f64), so the result is
/// lossless.
///
/// # Safety
///
/// The caller must ensure the running CPU supports the `f16c` feature.
#[inline]
#[target_feature(enable = "f16c")]
pub(super) unsafe fn f16_to_f64_x86_f16c(i: u16) -> f64 {
    // FIXME: Change to use `_mm_cvtph_pd` when stable.
    f16_to_f32_x86_f16c(i) as f64
}

#[inline]
#[target_feature(enable = "f16c")]
pub(super) unsafe fn f32_to_f16_x86_f16c(f: f32) -> u16 {
Expand All @@ -50,6 +57,13 @@ pub(super) unsafe fn f32_to_f16_x86_f16c(f: f32) -> u16 {
*(&retval as *const __m128i).cast()
}

/// Converts an `f64` to raw `f16` bits via the x86 f16c unit.
///
/// Narrows through `f32` first, so this can double-round and differ
/// from a single correctly-rounded f64 -> f16 conversion.
///
/// # Safety
///
/// The caller must ensure the running CPU supports the `f16c` feature.
#[inline]
#[target_feature(enable = "f16c")]
pub(super) unsafe fn f64_to_f16_x86_f16c(f: f64) -> u16 {
    // FIXME: Change to use `_mm_cvtpd_ph` when stable.
    f32_to_f16_x86_f16c(f as f32)
}

#[inline]
#[target_feature(enable = "f16c")]
pub(super) unsafe fn f16x4_to_f32x4_x86_f16c(v: &[u16; 4]) -> [f32; 4] {
Expand Down

0 comments on commit 0424577

Please sign in to comment.