From e8a7616160f8834f53927d72532aab12ec27bf2a Mon Sep 17 00:00:00 2001 From: Rafal Michalski Date: Mon, 27 May 2024 16:43:59 +0200 Subject: [PATCH 1/7] BACKEND BREAKING: implement fallible API --- Cargo.toml | 2 +- src/backend/bucket/fixed_str.rs | 20 +++++---- src/backend/bucket/mod.rs | 73 +++++++++++++++++++++++---------- src/backend/buffer.rs | 69 +++++++++++++++++++++---------- src/backend/mod.rs | 15 ++++--- src/backend/string.rs | 48 +++++++++++++++------- src/error.rs | 38 +++++++++++++++++ src/interner.rs | 72 +++++++++++++++++++++++++++----- src/lib.rs | 4 ++ 9 files changed, 259 insertions(+), 82 deletions(-) create mode 100644 src/error.rs diff --git a/Cargo.toml b/Cargo.toml index 4fdf110..813002d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,7 +15,7 @@ edition = "2021" [dependencies] cfg-if = "1.0" -hashbrown = { version = "0.14.0", default-features = false, features = ["ahash"] } +hashbrown = { version = "0.14.0", default-features = false, features = ["ahash", "raw"] } serde = { version = "1.0", default-features = false, features = ["alloc"], optional = true } [dev-dependencies] diff --git a/src/backend/bucket/fixed_str.rs b/src/backend/bucket/fixed_str.rs index a9c256c..1b09e75 100644 --- a/src/backend/bucket/fixed_str.rs +++ b/src/backend/bucket/fixed_str.rs @@ -1,6 +1,7 @@ use super::InternedStr; #[cfg(not(feature = "std"))] use alloc::string::String; +use crate::Result; #[derive(Debug, Default, Clone, PartialEq, Eq)] pub struct FixedString { @@ -10,10 +11,14 @@ pub struct FixedString { impl FixedString { /// Creates a new fixed string with the given fixed capacity. #[inline] - pub fn with_capacity(cap: usize) -> Self { - Self { - contents: String::with_capacity(cap), - } + pub fn try_with_capacity(cap: usize) -> Result { + let mut contents = String::new(); + contents.try_reserve(cap)?; + Ok(Self { contents }) + // FIXME: try_with_capacity #91913, replace with the following: + // Ok(Self { + // contents: String::try_with_capacity(cap)?, + // }) } /// Returns the underlying [`Box`]. @@ -43,16 +48,17 @@ impl FixedString { #[inline] pub fn push_str(&mut self, string: &str) -> Option { let len = self.len(); - if self.capacity() < len + string.len() { + let new_len = len + string.len(); + if self.capacity() < new_len { return None; } self.contents.push_str(string); - debug_assert_eq!(self.contents.len(), len + string.len()); + debug_assert_eq!(self.contents.len(), new_len); Some(InternedStr::new( // SAFETY: We convert from bytes to utf8 from which we know through the // input string that they must represent valid utf8. unsafe { - core::str::from_utf8_unchecked(&self.contents.as_bytes()[len..len + string.len()]) + core::str::from_utf8_unchecked(&self.contents.as_bytes()[len..new_len]) }, )) } diff --git a/src/backend/bucket/mod.rs b/src/backend/bucket/mod.rs index e5c95c2..b1be2f5 100644 --- a/src/backend/bucket/mod.rs +++ b/src/backend/bucket/mod.rs @@ -5,7 +5,7 @@ mod interned_str; use self::{fixed_str::FixedString, interned_str::InternedStr}; use super::Backend; -use crate::{symbol::expect_valid_symbol, DefaultSymbol, Symbol}; +use crate::{symbol::expect_valid_symbol, DefaultSymbol, Error, Result, Symbol}; #[cfg(not(feature = "std"))] use alloc::string::String; use alloc::vec::Vec; @@ -91,25 +91,30 @@ where fn with_capacity(cap: usize) -> Self { Self { spans: Vec::with_capacity(cap), - head: FixedString::with_capacity(cap), + head: FixedString::try_with_capacity(cap).unwrap(), full: Vec::new(), marker: Default::default(), } } #[inline] - fn intern(&mut self, string: &str) -> Self::Symbol { + fn try_intern(&mut self, string: &str) -> Result { // SAFETY: This is safe because we never hand out the returned // interned string instance to the outside and only operate // on it within this backend. - let interned = unsafe { self.alloc(string) }; - self.push_span(interned) + let interned = unsafe { self.try_alloc(string)? }; + self.try_push_span(interned) } #[cfg_attr(feature = "inline-more", inline)] - fn intern_static(&mut self, string: &'static str) -> Self::Symbol { + fn try_intern_static(&mut self, string: &'static str) -> Result { let interned = InternedStr::new(string); - self.push_span(interned) + self.try_push_span(interned) + } + + fn try_reserve(&mut self, additional: usize) -> Result<()> { + self.spans.try_reserve(additional)?; + self.try_reserve_head(additional) } fn shrink_to_fit(&mut self) { @@ -142,29 +147,53 @@ where S: Symbol, { /// Returns the next available symbol. - fn next_symbol(&self) -> S { - expect_valid_symbol(self.spans.len()) + fn try_next_symbol(&self) -> Result { + S::try_from_usize(self.spans.len()).ok_or(Error::OutOfSymbols) } - /// Pushes the given interned string into the spans and returns its symbol. - fn push_span(&mut self, interned: InternedStr) -> S { - let symbol = self.next_symbol(); + /// Pushes the given interned string into the spans and returns its symbol on success. + fn try_push_span(&mut self, interned: InternedStr) -> Result { + let symbol = self.try_next_symbol()?; + // FIXME: vec_push_within_capacity #100486, replace the following with: + // + // if let Err(value) = self.spans.push_within_capacity(interned) { + // self.spans.try_reserve(1)?; + // // this cannot fail, the previous line either returned or added at least 1 free slot + // let _ = self.spans.push_within_capacity(value); + // } + self.spans.try_reserve(1)?; self.spans.push(interned); - symbol + + Ok(symbol) } - /// Interns a new string into the backend and returns a reference to it. - unsafe fn alloc(&mut self, string: &str) -> InternedStr { + /// Ensure head has enough reserved capacity or replace it with a new one. + #[cfg_attr(feature = "inline-more", inline)] + fn try_reserve_head(&mut self, additional: usize) -> Result<()> { let cap = self.head.capacity(); - if cap < self.head.len() + string.len() { - let new_cap = (usize::max(cap, string.len()) + 1).next_power_of_two(); - let new_head = FixedString::with_capacity(new_cap); + if cap < self.head.len() + additional { + let new_cap = (usize::max(cap, additional) + 1).next_power_of_two(); + let new_head = FixedString::try_with_capacity(new_cap)?; let old_head = core::mem::replace(&mut self.head, new_head); - self.full.push(old_head.finish()); + let old_string = old_head.finish(); + // FIXME: vec_push_within_capacity #100486, replace the following with: + // + // if let Err(value) = self.full.push_within_capacity(old_string) { + // self.full.try_reserve(1)?; + // // this cannot fail, the previous line either returned or added at least 1 free slot + // let _ = self.full.push_within_capacity(value); + // } + self.full.try_reserve(1)?; + self.full.push(old_string); } - self.head + Ok(()) + } + /// Interns a new string into the backend and returns a reference to it. + unsafe fn try_alloc(&mut self, string: &str) -> Result { + self.try_reserve_head(string.len())?; + Ok(self.head .push_str(string) - .expect("encountered invalid head capacity (2)") + .expect("encountered invalid head capacity (2)")) } } @@ -174,7 +203,7 @@ impl Clone for BucketBackend { // head string leaving the cloned `full` empty. let new_head_cap = self.head.capacity() + self.full.iter().fold(0, |lhs, rhs| lhs + rhs.len()); - let mut head = FixedString::with_capacity(new_head_cap); + let mut head = FixedString::try_with_capacity(new_head_cap).unwrap(); let mut spans = Vec::with_capacity(self.spans.len()); for span in &self.spans { let string = span.as_str(); diff --git a/src/backend/buffer.rs b/src/backend/buffer.rs index 03f6539..e6ab58e 100644 --- a/src/backend/buffer.rs +++ b/src/backend/buffer.rs @@ -1,9 +1,9 @@ #![cfg(feature = "backends")] use super::Backend; -use crate::{symbol::expect_valid_symbol, DefaultSymbol, Symbol}; +use crate::{DefaultSymbol, Error, Result, Symbol}; use alloc::vec::Vec; -use core::{marker::PhantomData, mem, str}; +use core::{marker::PhantomData, str}; /// An interner backend that appends all interned string information in a single buffer. /// @@ -78,8 +78,8 @@ where { /// Returns the next available symbol. #[inline] - fn next_symbol(&self) -> S { - expect_valid_symbol(self.buffer.len()) + fn try_next_symbol(&self) -> Result { + S::try_from_usize(self.buffer.len()).ok_or(Error::OutOfSymbols) } /// Resolves the string for the given symbol if any. @@ -123,30 +123,39 @@ where unsafe { str::from_utf8_unchecked(str_bytes) } } - /// Pushes the given value onto the buffer with `var7` encoding. + /// Pushes the given length value onto the buffer with `var7` encoding. + /// Ensures there's enough capacity for the `var7` encoded length and + /// the following string bytes ahead of pushing. /// /// Returns the amount of `var7` encoded bytes. #[inline] - fn encode_var_usize(&mut self, value: usize) -> usize { - encode_var_usize(&mut self.buffer, value) + fn try_encode_var_length(&mut self, length: usize) -> Result<()> { + let add_len = length + calculate_var7_size(length); + self.buffer.try_reserve(add_len)?; + encode_var_usize(&mut self.buffer, length); + Ok(()) } - /// Pushes the given string into the buffer and returns its span. + /// Pushes the given string into the buffer and returns its span on success. /// - /// # Panics + /// # Errors /// - /// If the backend ran out of symbols. - fn push_string(&mut self, string: &str) -> S { - let symbol = self.next_symbol(); + /// Returns an [`Error`] if the backend ran out of symbols or memory. + fn try_push_string(&mut self, string: &str) -> Result { + let symbol = self.try_next_symbol()?; let str_len = string.len(); let str_bytes = string.as_bytes(); - self.encode_var_usize(str_len); - self.buffer.extend(str_bytes); + self.try_encode_var_length(str_len)?; + // try_encode_var_length ensures there's enough space left for str_bytes + self.buffer.extend_from_slice(str_bytes); self.len_strings += 1; - symbol + Ok(symbol) } } +/// According to google the approx. word length is 5. +const DEFAULT_STR_LEN: usize = 5; + impl Backend for BufferBackend where S: Symbol, @@ -158,11 +167,9 @@ where #[cfg_attr(feature = "inline-more", inline)] fn with_capacity(capacity: usize) -> Self { - /// We encode the `usize` string length into the buffer as well. - const LEN_USIZE: usize = mem::size_of::(); - /// According to google the approx. word length is 5. - const DEFAULT_STR_LEN: usize = 5; - let bytes_per_string = DEFAULT_STR_LEN + LEN_USIZE; + // We encode the `usize` string length into the buffer as well. + let var7_len: usize = calculate_var7_size(capacity); + let bytes_per_string = DEFAULT_STR_LEN + var7_len; Self { len_strings: 0, buffer: Vec::with_capacity(capacity * bytes_per_string), @@ -171,8 +178,8 @@ where } #[inline] - fn intern(&mut self, string: &str) -> Self::Symbol { - self.push_string(string) + fn try_intern(&mut self, string: &str) -> Result { + self.try_push_string(string) } #[inline] @@ -183,6 +190,14 @@ where } } + fn try_reserve(&mut self, additional: usize) -> Result<()> { + // We encode the `usize` string length into the buffer as well. + let var7_len: usize = calculate_var7_size(self.buffer.len() + additional); + let bytes_per_string = DEFAULT_STR_LEN + var7_len; + self.buffer.try_reserve(additional * bytes_per_string)?; + Ok(()) + } + fn shrink_to_fit(&mut self) { self.buffer.shrink_to_fit(); } @@ -200,6 +215,16 @@ where } } +/// Calculate var7 encoded size from a given value. +#[inline] +fn calculate_var7_size(value: usize) -> usize { + // number of bits to encode + // value = 0 would give 0 bits, hence: |1, could be anything up to |0x7F as well + let bits = usize::BITS - (value|1).leading_zeros(); + // (bits to encode / 7).ceil() + ((bits + 6) / 7) as usize +} + /// Encodes the value using variable length encoding into the buffer. /// /// Returns the amount of bytes used for the encoding. diff --git a/src/backend/mod.rs b/src/backend/mod.rs index 1609291..d41abf8 100644 --- a/src/backend/mod.rs +++ b/src/backend/mod.rs @@ -10,7 +10,7 @@ mod string; #[cfg(feature = "backends")] pub use self::{bucket::BucketBackend, buffer::BufferBackend, string::StringBackend}; -use crate::Symbol; +use crate::{Result, Symbol}; /// The default backend recommended for general use. #[cfg(feature = "backends")] @@ -35,28 +35,31 @@ pub trait Backend: Default { /// The capacity denotes how many strings are expected to be interned. fn with_capacity(cap: usize) -> Self; - /// Interns the given string and returns its interned ref and symbol. + /// Interns the given string and returns its interned ref and symbol on success. /// /// # Note /// /// The backend must make sure that the returned symbol maps back to the /// original string in its [`resolve`](`Backend::resolve`) method. - fn intern(&mut self, string: &str) -> Self::Symbol; + fn try_intern(&mut self, string: &str) -> Result; - /// Interns the given static string and returns its interned ref and symbol. + /// Interns the given static string and returns its interned ref and symbol on success. /// /// # Note /// /// The backend must make sure that the returned symbol maps back to the /// original string in its [`resolve`](`Backend::resolve`) method. #[inline] - fn intern_static(&mut self, string: &'static str) -> Self::Symbol { + fn try_intern_static(&mut self, string: &'static str) -> Result { // The default implementation simply forwards to the normal [`intern`] // implementation. Backends that can optimize for this use case should // implement this method. - self.intern(string) + self.try_intern(string) } + /// Try to reserve capacity for at least additional more symbols. + fn try_reserve(&mut self, additional: usize) -> Result<()>; + /// Shrink backend capacity to fit interned symbols exactly. fn shrink_to_fit(&mut self); diff --git a/src/backend/string.rs b/src/backend/string.rs index 42b0a12..5391e6f 100644 --- a/src/backend/string.rs +++ b/src/backend/string.rs @@ -1,7 +1,7 @@ #![cfg(feature = "backends")] use super::Backend; -use crate::{symbol::expect_valid_symbol, DefaultSymbol, Symbol}; +use crate::{symbol::expect_valid_symbol, DefaultSymbol, Error, Result, Symbol}; use alloc::{string::String, vec::Vec}; use core::{iter::Enumerate, marker::PhantomData, slice}; @@ -96,8 +96,9 @@ where S: Symbol, { /// Returns the next available symbol. - fn next_symbol(&self) -> S { - expect_valid_symbol(self.ends.len()) + #[inline] + fn try_next_symbol(&self) -> Result { + S::try_from_usize(self.ends.len()).ok_or(Error::OutOfSymbols) } /// Returns the string associated to the span. @@ -130,20 +131,35 @@ where Span { from, to } } - /// Pushes the given string into the buffer and returns its span. + /// Pushes the given string into the buffer and returns its span on success. /// - /// # Panics + /// # Errors /// - /// If the backend ran out of symbols. - fn push_string(&mut self, string: &str) -> S { + /// Returns an [`Error`] if the backend ran out of symbols or memory. + fn try_push_string(&mut self, string: &str) -> Result { + // reserve required capacity ahead of pushing a string + self.buffer.try_reserve(string.len())?; + // The following cannot panic, we already reserved enough capacity for a string. self.buffer.push_str(string); let to = self.buffer.as_bytes().len(); - let symbol = self.next_symbol(); + let symbol = self.try_next_symbol()?; + // FIXME: vec_push_within_capacity #100486, replace the following with: + // + // if let Err(value) = self.ends.push_within_capacity(to) { + // self.ends.try_reserve(1)?; + // // this cannot fail, the previous line either returned or added at least 1 free slot + // let _ = self.ends.push_within_capacity(value); + // } + self.ends.try_reserve(1)?; self.ends.push(to); - symbol + + Ok(symbol) } } +// According to google the approx. word length is 5. +const DEFAULT_WORD_LEN: usize = 5; + impl Backend for StringBackend where S: Symbol, @@ -155,18 +171,16 @@ where #[cfg_attr(feature = "inline-more", inline)] fn with_capacity(cap: usize) -> Self { - // According to google the approx. word length is 5. - let default_word_len = 5; Self { ends: Vec::with_capacity(cap), - buffer: String::with_capacity(cap * default_word_len), + buffer: String::with_capacity(cap * DEFAULT_WORD_LEN), marker: Default::default(), } } #[inline] - fn intern(&mut self, string: &str) -> Self::Symbol { - self.push_string(string) + fn try_intern(&mut self, string: &str) -> Result { + self.try_push_string(string) } #[inline] @@ -175,6 +189,12 @@ where .map(|span| self.span_to_str(span)) } + fn try_reserve(&mut self, additional: usize) -> Result<()> { + self.ends.try_reserve(additional)?; + self.buffer.try_reserve(additional * DEFAULT_WORD_LEN)?; + Ok(()) + } + fn shrink_to_fit(&mut self) { self.ends.shrink_to_fit(); self.buffer.shrink_to_fit(); diff --git a/src/error.rs b/src/error.rs new file mode 100644 index 0000000..fc83915 --- /dev/null +++ b/src/error.rs @@ -0,0 +1,38 @@ +use alloc::collections; +use core::fmt; + +/// An error object returned from fallible methods of the string-interner. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Error { + /// The interner already interns the maximum number of strings possible by the chosen symbol type. + OutOfSymbols, + /// An operation could not be completed, because it failed to allocate enough memory. + OutOfMemory +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(match self { + Error::OutOfSymbols => "no more symbols available", + Error::OutOfMemory => "out of memory" + }) + } +} + +#[cfg(feature = "std")] +impl std::error::Error for Error {} + +impl From for Error { + fn from(_: collections::TryReserveError) -> Self { + Error::OutOfMemory + } +} + +impl From for Error { + fn from(_: hashbrown::TryReserveError) -> Self { + Error::OutOfMemory + } +} + +/// The type returned by fallible methods of the string-interner. +pub type Result = core::result::Result; diff --git a/src/interner.rs b/src/interner.rs index db01de1..0e977ef 100644 --- a/src/interner.rs +++ b/src/interner.rs @@ -1,4 +1,4 @@ -use crate::{backend::Backend, Symbol}; +use crate::{backend::Backend, Result, Symbol}; use core::{ fmt, fmt::{Debug, Formatter}, @@ -158,6 +158,17 @@ where self.len() == 0 } + /// Try to reserve capacity for at least additional more symbols. + pub fn try_reserve(&mut self, additional: usize) -> Result<()> { + self.dedup.raw_table_mut().try_reserve(additional, |(symbol, ())| { + // SAFETY: This is safe because we only operate on symbols that + // we receive from our backend making them valid. + let string = unsafe { self.backend.resolve_unchecked(*symbol) }; + make_hash(&self.hasher, string) + })?; + self.backend.try_reserve(additional) + } + /// Returns the symbol for the given string if any. /// /// Can be used to query if a string has already been interned without interning. @@ -190,11 +201,11 @@ where /// [1]: [`StringInterner::get_or_intern`] /// [2]: [`StringInterner::get_or_intern_static`] #[cfg_attr(feature = "inline-more", inline)] - fn get_or_intern_using( + fn try_get_or_intern_using( &mut self, string: T, - intern_fn: fn(&mut B, T) -> ::Symbol, - ) -> ::Symbol + intern_fn: fn(&mut B, T) -> Result<::Symbol>, + ) -> Result<::Symbol> where T: Copy + Hash + AsRef + for<'a> PartialEq<&'a str>, { @@ -204,6 +215,13 @@ where backend, } = self; let hash = make_hash(hasher, string.as_ref()); + // this call requires hashbrown `raw` feature enabled + dedup.raw_table_mut().try_reserve(1, |(symbol, ())| { + // SAFETY: This is safe because we only operate on symbols that + // we receive from our backend making them valid. + let string = unsafe { backend.resolve_unchecked(*symbol) }; + make_hash(hasher, string) + })?; let entry = dedup.raw_entry_mut().from_hash(hash, |symbol| { // SAFETY: This is safe because we only operate on symbols that // we receive from our backend making them valid. @@ -213,7 +231,7 @@ where let (&mut symbol, &mut ()) = match entry { RawEntryMut::Occupied(occupied) => occupied.into_key_value(), RawEntryMut::Vacant(vacant) => { - let symbol = intern_fn(backend, string); + let symbol = intern_fn(backend, string)?; vacant.insert_with_hasher(hash, symbol, (), |symbol| { // SAFETY: This is safe because we only operate on symbols that // we receive from our backend making them valid. @@ -222,7 +240,23 @@ where }) } }; - symbol + Ok(symbol) + } + + /// Interns the given string fallibly. + /// + /// Returns a symbol for resolution into the original string on success. + /// + /// # Errors + /// + /// If the interner already interns the maximum number of strings possible + /// by the chosen symbol type or when running out of heap memory. + #[inline] + pub fn try_get_or_intern(&mut self, string: T) -> Result<::Symbol> + where + T: AsRef, + { + self.try_get_or_intern_using(string.as_ref(), B::try_intern) } /// Interns the given string. @@ -232,13 +266,31 @@ where /// # Panics /// /// If the interner already interns the maximum number of strings possible - /// by the chosen symbol type. + /// by the chosen symbol type or when running out of heap memory. #[inline] pub fn get_or_intern(&mut self, string: T) -> ::Symbol where T: AsRef, { - self.get_or_intern_using(string.as_ref(), B::intern) + self.try_get_or_intern(string).unwrap() + } + + /// Interns the given `'static` string fallibly. + /// + /// Returns a symbol for resolution into the original string. + /// + /// # Note + /// + /// This is more efficient than [`StringInterner::get_or_intern`] since it might + /// avoid some memory allocations if the backends supports this. + /// + /// # Errors + /// + /// If the interner already interns the maximum number of strings possible + /// by the chosen symbol type or when running out of heap memory. + #[inline] + pub fn try_get_or_intern_static(&mut self, string: &'static str) -> Result<::Symbol> { + self.try_get_or_intern_using(string, B::try_intern_static) } /// Interns the given `'static` string. @@ -253,10 +305,10 @@ where /// # Panics /// /// If the interner already interns the maximum number of strings possible - /// by the chosen symbol type. + /// by the chosen symbol type or when running out of heap memory. #[inline] pub fn get_or_intern_static(&mut self, string: &'static str) -> ::Symbol { - self.get_or_intern_using(string, B::intern_static) + self.try_get_or_intern_static(string).unwrap() } /// Shrink backend capacity to fit the interned strings exactly. diff --git a/src/lib.rs b/src/lib.rs index 06ea97c..79269bf 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -130,6 +130,7 @@ mod serde_impl; pub mod backend; mod interner; pub mod symbol; +mod error; /// A convenience [`StringInterner`] type based on the [`DefaultBackend`]. #[cfg(feature = "backends")] @@ -147,3 +148,6 @@ pub use self::{ #[doc(inline)] pub use hashbrown::hash_map::DefaultHashBuilder; + +#[doc(inline)] +pub use error::{Error, Result}; From 9ad151b91a2404335facb38cf2a807b4691cbc28 Mon Sep 17 00:00:00 2001 From: Rafal Michalski Date: Mon, 27 May 2024 16:44:18 +0200 Subject: [PATCH 2/7] backend/bucket fix docs --- src/backend/bucket/fixed_str.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/backend/bucket/fixed_str.rs b/src/backend/bucket/fixed_str.rs index 1b09e75..b06c3c3 100644 --- a/src/backend/bucket/fixed_str.rs +++ b/src/backend/bucket/fixed_str.rs @@ -20,8 +20,7 @@ impl FixedString { // contents: String::try_with_capacity(cap)?, // }) } - - /// Returns the underlying [`Box`]. + /// Returns the underlying [`String`]. /// /// Guarantees not to perform any reallocations in this process. #[inline] From 27c77cc69086826a42f15bd7c8ab3f79f93e3514 Mon Sep 17 00:00:00 2001 From: Rafal Michalski Date: Tue, 11 Jun 2024 13:04:23 +0200 Subject: [PATCH 3/7] remove Backend::try_reserve method --- src/backend/bucket/mod.rs | 5 ----- src/backend/buffer.rs | 8 -------- src/backend/mod.rs | 3 --- src/backend/string.rs | 6 ------ src/interner.rs | 12 +++++++++--- 5 files changed, 9 insertions(+), 25 deletions(-) diff --git a/src/backend/bucket/mod.rs b/src/backend/bucket/mod.rs index b1be2f5..b823368 100644 --- a/src/backend/bucket/mod.rs +++ b/src/backend/bucket/mod.rs @@ -112,11 +112,6 @@ where self.try_push_span(interned) } - fn try_reserve(&mut self, additional: usize) -> Result<()> { - self.spans.try_reserve(additional)?; - self.try_reserve_head(additional) - } - fn shrink_to_fit(&mut self) { self.spans.shrink_to_fit(); // Commenting out the below line fixes: https://github.com/Robbepop/string-interner/issues/46 diff --git a/src/backend/buffer.rs b/src/backend/buffer.rs index e6ab58e..5a46dbb 100644 --- a/src/backend/buffer.rs +++ b/src/backend/buffer.rs @@ -190,14 +190,6 @@ where } } - fn try_reserve(&mut self, additional: usize) -> Result<()> { - // We encode the `usize` string length into the buffer as well. - let var7_len: usize = calculate_var7_size(self.buffer.len() + additional); - let bytes_per_string = DEFAULT_STR_LEN + var7_len; - self.buffer.try_reserve(additional * bytes_per_string)?; - Ok(()) - } - fn shrink_to_fit(&mut self) { self.buffer.shrink_to_fit(); } diff --git a/src/backend/mod.rs b/src/backend/mod.rs index d41abf8..9c3b269 100644 --- a/src/backend/mod.rs +++ b/src/backend/mod.rs @@ -57,9 +57,6 @@ pub trait Backend: Default { self.try_intern(string) } - /// Try to reserve capacity for at least additional more symbols. - fn try_reserve(&mut self, additional: usize) -> Result<()>; - /// Shrink backend capacity to fit interned symbols exactly. fn shrink_to_fit(&mut self); diff --git a/src/backend/string.rs b/src/backend/string.rs index 5391e6f..c31dfa7 100644 --- a/src/backend/string.rs +++ b/src/backend/string.rs @@ -189,12 +189,6 @@ where .map(|span| self.span_to_str(span)) } - fn try_reserve(&mut self, additional: usize) -> Result<()> { - self.ends.try_reserve(additional)?; - self.buffer.try_reserve(additional * DEFAULT_WORD_LEN)?; - Ok(()) - } - fn shrink_to_fit(&mut self) { self.ends.shrink_to_fit(); self.buffer.shrink_to_fit(); diff --git a/src/interner.rs b/src/interner.rs index 0e977ef..f4a25c8 100644 --- a/src/interner.rs +++ b/src/interner.rs @@ -158,15 +158,21 @@ where self.len() == 0 } - /// Try to reserve capacity for at least additional more symbols. + /// Try to reserve capacity for at least `additional` more symbols. + /// + /// # Note + /// + /// This function only reserves capacity in the hashmap responsible for + /// string deduplication. + /// It __does not__ reserve capacity in the backend that stores strings. pub fn try_reserve(&mut self, additional: usize) -> Result<()> { self.dedup.raw_table_mut().try_reserve(additional, |(symbol, ())| { // SAFETY: This is safe because we only operate on symbols that // we receive from our backend making them valid. let string = unsafe { self.backend.resolve_unchecked(*symbol) }; make_hash(&self.hasher, string) - })?; - self.backend.try_reserve(additional) + }) + .map_err(From::from) } /// Returns the symbol for the given string if any. From 55df1e7a366ac2a5db0a6f43b6530a4a37065697 Mon Sep 17 00:00:00 2001 From: Rafal Michalski Date: Tue, 11 Jun 2024 13:05:50 +0200 Subject: [PATCH 4/7] fixes docs and mentions `try_get_or_intern` in the root doc of the StringInterner --- src/backend/mod.rs | 4 ++-- src/interner.rs | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/backend/mod.rs b/src/backend/mod.rs index 9c3b269..6b0e99c 100644 --- a/src/backend/mod.rs +++ b/src/backend/mod.rs @@ -69,8 +69,8 @@ pub trait Backend: Default { /// /// Does not perform validity checks on the given symbol and relies /// on the caller to be provided with a symbol that has been generated - /// by the [`intern`](`Backend::intern`) or - /// [`intern_static`](`Backend::intern_static`) methods of the same + /// by the [`try_intern`](`Backend::try_intern`) or + /// [`try_intern_static`](`Backend::try_intern_static`) methods of the same /// interner backend. unsafe fn resolve_unchecked(&self, symbol: Self::Symbol) -> &str; diff --git a/src/interner.rs b/src/interner.rs index f4a25c8..423a8da 100644 --- a/src/interner.rs +++ b/src/interner.rs @@ -24,7 +24,8 @@ where /// /// The following API covers the main functionality: /// -/// - [`StringInterner::get_or_intern`]: To intern a new string. +/// - [`StringInterner::get_or_intern`] or +/// [`StringInterner::try_get_or_intern`]: To intern a new string. /// - This maps from `string` type to `symbol` type. /// - [`StringInterner::resolve`]: To resolve your already interned strings. /// - This maps from `symbol` type to `string` type. @@ -287,7 +288,7 @@ where /// /// # Note /// - /// This is more efficient than [`StringInterner::get_or_intern`] since it might + /// This is more efficient than [`StringInterner::try_get_or_intern`] since it might /// avoid some memory allocations if the backends supports this. /// /// # Errors From 5b022daebbddb6e9a20f45bebd565f27b54f9a56 Mon Sep 17 00:00:00 2001 From: Rafal Michalski Date: Tue, 11 Jun 2024 13:30:08 +0200 Subject: [PATCH 5/7] buffer backend: test calculate_var7_size alongside encode_var_usize --- src/backend/buffer.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/backend/buffer.rs b/src/backend/buffer.rs index 5a46dbb..86c0879 100644 --- a/src/backend/buffer.rs +++ b/src/backend/buffer.rs @@ -325,7 +325,7 @@ fn decode_var_usize_cold(buffer: &[u8]) -> Option<(usize, usize)> { #[cfg(test)] mod tests { - use super::{decode_var_usize, encode_var_usize}; + use super::{decode_var_usize, encode_var_usize, calculate_var7_size}; #[cfg(not(feature = "std"))] use alloc::vec::Vec; @@ -334,6 +334,7 @@ mod tests { let mut buffer = Vec::new(); for i in 0..2usize.pow(7) { buffer.clear(); + assert_eq!(calculate_var7_size(i), 1); assert_eq!(encode_var_usize(&mut buffer, i), 1); assert_eq!(buffer, [i as u8]); assert_eq!(decode_var_usize(&buffer), Some((i, 1))); @@ -345,6 +346,7 @@ mod tests { let mut buffer = Vec::new(); for i in 2usize.pow(7)..2usize.pow(14) { buffer.clear(); + assert_eq!(calculate_var7_size(i), 2); assert_eq!(encode_var_usize(&mut buffer, i), 2); assert_eq!(buffer, [0x80 | ((i & 0x7F) as u8), (0x7F & (i >> 7) as u8)]); assert_eq!(decode_var_usize(&buffer), Some((i, 2))); @@ -357,6 +359,7 @@ mod tests { let mut buffer = Vec::new(); for i in 2usize.pow(14)..2usize.pow(21) { buffer.clear(); + assert_eq!(calculate_var7_size(i), 3); assert_eq!(encode_var_usize(&mut buffer, i), 3); assert_eq!( buffer, @@ -376,6 +379,7 @@ mod tests { let mut buffer = Vec::new(); for i in range { buffer.clear(); + assert_eq!(calculate_var7_size(i), 4); assert_eq!(encode_var_usize(&mut buffer, i), 4); assert_eq!( buffer, @@ -418,6 +422,7 @@ mod tests { fn encode_var_u32_max_works() { let mut buffer = Vec::new(); let i = u32::MAX as usize; + assert_eq!(calculate_var7_size(i), 5); assert_eq!(encode_var_usize(&mut buffer, i), 5); assert_eq!(buffer, [0xFF, 0xFF, 0xFF, 0xFF, 0x0F]); assert_eq!(decode_var_usize(&buffer), Some((i, 5))); @@ -427,6 +432,7 @@ mod tests { fn encode_var_u64_max_works() { let mut buffer = Vec::new(); let i = u64::MAX as usize; + assert_eq!(calculate_var7_size(i), 10); assert_eq!(encode_var_usize(&mut buffer, i), 10); assert_eq!( buffer, From a1b11751790711955c663ea526d46240f4099534 Mon Sep 17 00:00:00 2001 From: Rafal Michalski Date: Tue, 11 Jun 2024 14:22:48 +0200 Subject: [PATCH 6/7] tests: memory consuption - update number of alloc/deallocations of the updated buffer backend to reflect changes in backend --- tests/tests.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/tests.rs b/tests/tests.rs index 1f4a55c..0b64af5 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -52,8 +52,8 @@ impl BackendStats for backend::StringBackend { impl BackendStats for backend::BufferBackend { const MIN_OVERHEAD: f64 = 1.35; const MAX_OVERHEAD: f64 = 1.58; - const MAX_ALLOCATIONS: usize = 43; - const MAX_DEALLOCATIONS: usize = 41; + const MAX_ALLOCATIONS: usize = 42; + const MAX_DEALLOCATIONS: usize = 40; const NAME: &'static str = "BufferBackend"; } From 474f2b9c2c4c42d41b942e8a10dce42acc646d50 Mon Sep 17 00:00:00 2001 From: Rafal Michalski Date: Thu, 20 Jun 2024 13:13:45 +0200 Subject: [PATCH 7/7] fix formatting --- src/backend/bucket/fixed_str.rs | 6 ++---- src/backend/bucket/mod.rs | 3 ++- src/backend/buffer.rs | 4 ++-- src/error.rs | 4 ++-- src/interner.rs | 21 +++++++++++++-------- src/lib.rs | 2 +- 6 files changed, 22 insertions(+), 18 deletions(-) diff --git a/src/backend/bucket/fixed_str.rs b/src/backend/bucket/fixed_str.rs index b06c3c3..1b15925 100644 --- a/src/backend/bucket/fixed_str.rs +++ b/src/backend/bucket/fixed_str.rs @@ -1,7 +1,7 @@ use super::InternedStr; +use crate::Result; #[cfg(not(feature = "std"))] use alloc::string::String; -use crate::Result; #[derive(Debug, Default, Clone, PartialEq, Eq)] pub struct FixedString { @@ -56,9 +56,7 @@ impl FixedString { Some(InternedStr::new( // SAFETY: We convert from bytes to utf8 from which we know through the // input string that they must represent valid utf8. - unsafe { - core::str::from_utf8_unchecked(&self.contents.as_bytes()[len..new_len]) - }, + unsafe { core::str::from_utf8_unchecked(&self.contents.as_bytes()[len..new_len]) }, )) } } diff --git a/src/backend/bucket/mod.rs b/src/backend/bucket/mod.rs index b823368..114f061 100644 --- a/src/backend/bucket/mod.rs +++ b/src/backend/bucket/mod.rs @@ -186,7 +186,8 @@ where /// Interns a new string into the backend and returns a reference to it. unsafe fn try_alloc(&mut self, string: &str) -> Result { self.try_reserve_head(string.len())?; - Ok(self.head + Ok(self + .head .push_str(string) .expect("encountered invalid head capacity (2)")) } diff --git a/src/backend/buffer.rs b/src/backend/buffer.rs index 86c0879..ffd761a 100644 --- a/src/backend/buffer.rs +++ b/src/backend/buffer.rs @@ -212,7 +212,7 @@ where fn calculate_var7_size(value: usize) -> usize { // number of bits to encode // value = 0 would give 0 bits, hence: |1, could be anything up to |0x7F as well - let bits = usize::BITS - (value|1).leading_zeros(); + let bits = usize::BITS - (value | 1).leading_zeros(); // (bits to encode / 7).ceil() ((bits + 6) / 7) as usize } @@ -325,7 +325,7 @@ fn decode_var_usize_cold(buffer: &[u8]) -> Option<(usize, usize)> { #[cfg(test)] mod tests { - use super::{decode_var_usize, encode_var_usize, calculate_var7_size}; + use super::{calculate_var7_size, decode_var_usize, encode_var_usize}; #[cfg(not(feature = "std"))] use alloc::vec::Vec; diff --git a/src/error.rs b/src/error.rs index fc83915..ad72656 100644 --- a/src/error.rs +++ b/src/error.rs @@ -7,14 +7,14 @@ pub enum Error { /// The interner already interns the maximum number of strings possible by the chosen symbol type. OutOfSymbols, /// An operation could not be completed, because it failed to allocate enough memory. - OutOfMemory + OutOfMemory, } impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.write_str(match self { Error::OutOfSymbols => "no more symbols available", - Error::OutOfMemory => "out of memory" + Error::OutOfMemory => "out of memory", }) } } diff --git a/src/interner.rs b/src/interner.rs index 423a8da..11dfcfe 100644 --- a/src/interner.rs +++ b/src/interner.rs @@ -167,13 +167,15 @@ where /// string deduplication. /// It __does not__ reserve capacity in the backend that stores strings. pub fn try_reserve(&mut self, additional: usize) -> Result<()> { - self.dedup.raw_table_mut().try_reserve(additional, |(symbol, ())| { - // SAFETY: This is safe because we only operate on symbols that - // we receive from our backend making them valid. - let string = unsafe { self.backend.resolve_unchecked(*symbol) }; - make_hash(&self.hasher, string) - }) - .map_err(From::from) + self.dedup + .raw_table_mut() + .try_reserve(additional, |(symbol, ())| { + // SAFETY: This is safe because we only operate on symbols that + // we receive from our backend making them valid. + let string = unsafe { self.backend.resolve_unchecked(*symbol) }; + make_hash(&self.hasher, string) + }) + .map_err(From::from) } /// Returns the symbol for the given string if any. @@ -296,7 +298,10 @@ where /// If the interner already interns the maximum number of strings possible /// by the chosen symbol type or when running out of heap memory. #[inline] - pub fn try_get_or_intern_static(&mut self, string: &'static str) -> Result<::Symbol> { + pub fn try_get_or_intern_static( + &mut self, + string: &'static str, + ) -> Result<::Symbol> { self.try_get_or_intern_using(string, B::try_intern_static) } diff --git a/src/lib.rs b/src/lib.rs index 79269bf..a400adb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -128,9 +128,9 @@ extern crate std as alloc; mod serde_impl; pub mod backend; +mod error; mod interner; pub mod symbol; -mod error; /// A convenience [`StringInterner`] type based on the [`DefaultBackend`]. #[cfg(feature = "backends")]