From 6e3bf04174983c9502cbea795103c9008d3c3053 Mon Sep 17 00:00:00 2001 From: Alex Huszagh Date: Sat, 11 Jan 2025 11:22:29 -0600 Subject: [PATCH 1/2] Add minor performance enhancements via better inlining. --- lexical-parse-float/src/binary.rs | 1 - lexical-parse-float/src/slow.rs | 5 +++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lexical-parse-float/src/binary.rs b/lexical-parse-float/src/binary.rs index a1baee63..1e787deb 100644 --- a/lexical-parse-float/src/binary.rs +++ b/lexical-parse-float/src/binary.rs @@ -149,7 +149,6 @@ pub fn parse_u64_digits<'a, Iter, const FORMAT: u128>( /// /// This avoids the need for arbitrary-precision arithmetic, since the result /// will always be a near-halfway representation where rounded-down it's even. -#[cfg_attr(not(feature = "compact"), inline(always))] pub fn slow_binary(num: Number) -> ExtendedFloat80 { let format = NumberFormat::<{ FORMAT }> {}; let radix = format.radix(); diff --git a/lexical-parse-float/src/slow.rs b/lexical-parse-float/src/slow.rs index 6aa1e4e2..8ab711a5 100644 --- a/lexical-parse-float/src/slow.rs +++ b/lexical-parse-float/src/slow.rs @@ -44,8 +44,8 @@ use crate::shared; /// any value before or equal to `16777217.0` must be rounded down /// to `16777216.0`. These near-halfway conversions therefore may require /// a large number of digits to unambiguously determine how to round. +#[cold] #[must_use] -#[inline(always)] #[allow(clippy::unwrap_used)] // reason = "none is a developer error" pub fn slow_radix( num: Number, @@ -98,7 +98,6 @@ pub fn slow_radix( /// digits to the theoretical digits for `b` and determine if we /// need to round-up. #[must_use] -#[inline(always)] #[allow(clippy::cast_possible_wrap)] // reason = "the value range is [-324, 308]" pub fn digit_comp( num: Number, @@ -119,6 +118,7 @@ pub fn digit_comp( /// Generate the significant digits with a positive exponent relative to /// mantissa. #[must_use] +#[inline(always)] #[allow(clippy::unwrap_used)] // reason = "none is a developer error" #[allow(clippy::cast_possible_wrap)] // reason = "can't wrap in practice: max is ~1000 limbs" #[allow(clippy::missing_inline_in_public_items)] // reason = "only public for testing" @@ -174,6 +174,7 @@ pub fn positive_digit_comp( /// /// This allows us to compare both floats using integers efficiently /// without any loss of precision. +#[inline(always)] #[allow(clippy::match_bool)] // reason = "simplifies documentation" #[allow(clippy::unwrap_used)] // reason = "unwrap panics if a developer error" #[allow(clippy::comparison_chain)] // reason = "logically different conditions for algorithm" From a80aa9bf0443227872aeeb4fb267a19c5be0efb0 Mon Sep 17 00:00:00 2001 From: Alex Huszagh Date: Sat, 11 Jan 2025 11:58:26 -0600 Subject: [PATCH 2/2] Fix some more inlining for the slow-path algorithms. --- CHANGELOG | 1 + lexical-parse-float/src/lemire.rs | 2 ++ lexical-parse-float/src/parse.rs | 5 +++++ lexical-parse-float/src/slow.rs | 3 ++- 4 files changed, 10 insertions(+), 1 deletion(-) diff --git a/CHANGELOG b/CHANGELOG index 0481b000..6a4eb8e6 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -29,6 +29,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed - Bug where the `radix` feature wasn't enabling `power-of-two` in `lexical-core` or `lexical`. +- Fixed performance issues due to a lack of inlining on the Eisel-Lemire algorithm (#210). ## [1.0.5] 2024-12-08 diff --git a/lexical-parse-float/src/lemire.rs b/lexical-parse-float/src/lemire.rs index 8ee48224..a3840a4e 100644 --- a/lexical-parse-float/src/lemire.rs +++ b/lexical-parse-float/src/lemire.rs @@ -51,6 +51,7 @@ pub fn lemire(num: &Number, lossy: bool) -> ExtendedFloat80 { /// at a Gigabyte per Second" in section 5, "Fast Algorithm", and /// section 6, "Exact Numbers And Ties", available online: /// . +#[inline] #[must_use] #[allow(clippy::missing_inline_in_public_items)] // reason="public for testing only" pub fn compute_float(q: i64, mut w: u64, lossy: bool) -> ExtendedFloat80 { @@ -201,6 +202,7 @@ const fn full_multiplication(a: u64, b: u64) -> (u64, u64) { // 64-bit words approximating the result, with the "high" part corresponding to // the most significant bits and the low part corresponding to the least // significant bits. +#[inline] fn compute_product_approx(q: i64, w: u64, precision: usize) -> (u64, u64) { debug_assert!(q >= SMALLEST_POWER_OF_FIVE as i64, "must be within our required pow5 range"); debug_assert!(q <= LARGEST_POWER_OF_FIVE as i64, "must be within our required pow5 range"); diff --git a/lexical-parse-float/src/parse.rs b/lexical-parse-float/src/parse.rs index ccf82bd0..93714a20 100644 --- a/lexical-parse-float/src/parse.rs +++ b/lexical-parse-float/src/parse.rs @@ -238,6 +238,7 @@ macro_rules! to_native { } /// Parse a float from bytes using a complete parser. +#[inline(always)] #[allow(clippy::missing_inline_in_public_items)] // reason = "only public for testing" pub fn parse_complete( bytes: &[u8], @@ -280,6 +281,7 @@ pub fn parse_complete( } /// Parse a float using only the fast path as a complete parser. +#[inline(always)] #[allow(clippy::missing_inline_in_public_items)] // reason = "only public for testing" pub fn fast_path_complete( bytes: &[u8], @@ -304,6 +306,7 @@ pub fn fast_path_complete( } /// Parse a float from bytes using a partial parser. +#[inline(always)] #[allow(clippy::missing_inline_in_public_items)] // reason = "only public for testing" pub fn parse_partial( bytes: &[u8], @@ -352,6 +355,7 @@ pub fn parse_partial( } /// Parse a float using only the fast path as a partial parser. +#[inline(always)] #[allow(clippy::missing_inline_in_public_items)] // reason = "only public for testing" pub fn fast_path_partial( bytes: &[u8], @@ -825,6 +829,7 @@ pub fn parse_number<'a, const FORMAT: u128, const IS_PARTIAL: bool>( )) } +#[inline(always)] pub fn parse_partial_number<'a, const FORMAT: u128>( byte: Bytes<'a, FORMAT>, is_negative: bool, diff --git a/lexical-parse-float/src/slow.rs b/lexical-parse-float/src/slow.rs index 8ab711a5..343159cb 100644 --- a/lexical-parse-float/src/slow.rs +++ b/lexical-parse-float/src/slow.rs @@ -44,7 +44,6 @@ use crate::shared; /// any value before or equal to `16777217.0` must be rounded down /// to `16777216.0`. These near-halfway conversions therefore may require /// a large number of digits to unambiguously determine how to round. -#[cold] #[must_use] #[allow(clippy::unwrap_used)] // reason = "none is a developer error" pub fn slow_radix( @@ -98,6 +97,7 @@ pub fn slow_radix( /// digits to the theoretical digits for `b` and determine if we /// need to round-up. #[must_use] +#[inline(always)] #[allow(clippy::cast_possible_wrap)] // reason = "the value range is [-324, 308]" pub fn digit_comp( num: Number, @@ -174,6 +174,7 @@ pub fn positive_digit_comp( /// /// This allows us to compare both floats using integers efficiently /// without any loss of precision. +#[must_use] #[inline(always)] #[allow(clippy::match_bool)] // reason = "simplifies documentation" #[allow(clippy::unwrap_used)] // reason = "unwrap panics if a developer error"