diff --git a/Cargo.lock b/Cargo.lock index ecf962537..ec41eb3b8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,6 +17,18 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +[[package]] +name = "ahash" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91429305e9f0a25f6205c5b8e0d2db09e0708a7a6df0f42212bb56c32c8ac97a" +dependencies = [ + "cfg-if", + "once_cell", + "version_check", + "zerocopy", +] + [[package]] name = "aho-corasick" version = "1.1.2" @@ -26,6 +38,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "allocator-api2" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5" + [[package]] name = "anes" version = "0.1.6" @@ -814,6 +832,10 @@ name = "hashbrown" version = "0.14.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f93e7192158dbcda357bdec5fb5788eebf8bbac027f3f33e719d29135ae84156" +dependencies = [ + "ahash", + "allocator-api2", +] [[package]] name = "heck" @@ -1087,6 +1109,15 @@ version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" +[[package]] +name = "lru" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4a83fb7698b3643a0e34f9ae6f2e8f0178c0fd42f8b59d493aa271ff3a5bf21" +dependencies = [ + "hashbrown 0.14.2", +] + [[package]] name = "macros" version = "0.0.1" @@ -1199,8 +1230,12 @@ dependencies = [ name = "nemo-benches" version = "0.3.1-dev" dependencies = [ + "clap 4.4.6", + "colored", "criterion", "env_logger 0.10.0", + "flate2", + "log", "nemo", "nemo-physical", "rand", @@ -1234,11 +1269,13 @@ dependencies = [ "howlong", "linked-hash-map", "log", + "lru", "num", "once_cell", "quickcheck", "quickcheck_macros", "rand", + "regex", "reqwest", "rio_turtle", "rio_xml", @@ -2570,3 +2607,23 @@ dependencies = [ "cfg-if", "windows-sys", ] + +[[package]] +name = "zerocopy" +version = "0.7.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "686b7e407015242119c33dab17b8f61ba6843534de936d94368856528eae4dcc" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "020f3dfe25dfc38dfea49ce62d5d45ecdd7f0d8a724fa63eb36b6eba4ec76806" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.38", +] diff --git a/nemo-benches/Cargo.toml b/nemo-benches/Cargo.toml index ca29d1f5b..83730132c 100644 --- a/nemo-benches/Cargo.toml +++ b/nemo-benches/Cargo.toml @@ -9,13 +9,21 @@ license.workspace = true readme = "README.md" repository.workspace = true +[[bin]] +name = "dict-bench" +path = "src/bin/dict-bench.rs" + [dependencies] nemo-physical = { path = "../nemo-physical", default-features = false } nemo = { path = "../nemo", default-features = false } rand = "0.8.5" +flate2 = "1" +log = { version = "0.4", features = [ "max_level_trace", "release_max_level_trace" ] } +clap = { version = "4.0.32", features = [ "derive", "cargo", "env" ] } +colored = "2" +env_logger = "*" [dev-dependencies] -env_logger = "*" criterion = { version = "0.5", features = [ "html_reports" ] } rand_pcg = "0.3" diff --git a/nemo-benches/benches/input.rs b/nemo-benches/benches/input.rs index 7b1136c2e..9b6f0196b 100644 
--- a/nemo-benches/benches/input.rs +++ b/nemo-benches/benches/input.rs @@ -7,7 +7,7 @@ use nemo_physical::{ builder_proxy::{ ColumnBuilderProxy, PhysicalBuilderProxyEnum, PhysicalStringColumnBuilderProxy, }, - dictionary::PrefixedStringDictionary, + dictionary::HashMapDictionary, }; use rand::{distributions::Alphanumeric, prelude::*}; use rand_pcg::Pcg64; @@ -35,7 +35,7 @@ pub fn benchmark_input(c: &mut Criterion) { group.bench_function("read_strings", |b| { b.iter_batched( || { - let dict = std::cell::RefCell::new(PrefixedStringDictionary::default()); + let dict = std::cell::RefCell::new(HashMapDictionary::default()); (strings.clone(), dict) }, |(input, dict)| { @@ -53,7 +53,7 @@ pub fn benchmark_input(c: &mut Criterion) { group.bench_function("read_terms", |b| { b.iter_batched( || { - let dict = std::cell::RefCell::new(PrefixedStringDictionary::default()); + let dict = std::cell::RefCell::new(HashMapDictionary::default()); (terms.clone(), dict) }, |(input, dict)| { @@ -71,7 +71,7 @@ pub fn benchmark_input(c: &mut Criterion) { group.bench_function("read_iris", |b| { b.iter_batched( || { - let dict = std::cell::RefCell::new(PrefixedStringDictionary::default()); + let dict = std::cell::RefCell::new(HashMapDictionary::default()); (iris.clone(), dict) }, |(input, dict)| { diff --git a/nemo-benches/src/bin/dict-bench.rs b/nemo-benches/src/bin/dict-bench.rs new file mode 100644 index 000000000..61f22fc16 --- /dev/null +++ b/nemo-benches/src/bin/dict-bench.rs @@ -0,0 +1,119 @@ +use flate2::read::MultiGzDecoder; +use std::env; +use std::fs::File; +use std::io::prelude::*; +use std::io::stdin; +use std::io::BufReader; + +use nemo::meta::{timing::TimedDisplay, TimedCode}; +use nemo_physical::dictionary::{ + hash_map_dictionary::HashMapDictionary, meta_dictionary::MetaDictionary, + prefixed_string_dictionary::PrefixedStringDictionary, string_dictionary::StringDictionary, + AddResult, Dictionary, +}; + +fn create_dictionary(dict_type: &str) -> Box { + match dict_type { + "hash" => { + println!("Using StringDictionary."); + Box::new(StringDictionary::new()) + } + "hashmap" => { + println!("Using HashMapDictionary."); + Box::new(HashMapDictionary::new()) + } + "prefix" => { + println!("Using PrefixedStringDictionary."); + Box::new(PrefixedStringDictionary::new()) + } + "meta" => { + println!("Using MetaDictionary."); + Box::new(MetaDictionary::new()) + } + _ => panic!("Unexpected dictionary type '{}'.", dict_type), + } +} + +fn main() { + env_logger::init(); + TimedCode::instance().start(); + + let args: Vec<_> = env::args().collect(); + if args.len() < 3 { + println!("Usage: dict-bench "); + println!( + " File with dictionary entries, one per line, possibly with duplicates." + ); + println!( + " Identifier for the dictionary to test, e.g., \"hash\" or \"prefix\"." + ); + println!( + " If anything is given here, the program will terminate without asking for a prompt." 
+ ); + } + + let filename = &args[1]; + let dicttype = &args[2]; + + let reader = BufReader::new(MultiGzDecoder::new( + File::open(filename).expect("Cannot open file."), + )); + + let mut dict = create_dictionary(dicttype); + let mut count_lines = 0; + let mut count_unique = 0; + let mut bytes = 0; + + TimedCode::instance().sub("Dictionary filling").start(); + + println!("Starting to fill dictionary ..."); + + for l in reader.lines() { + let s = l.unwrap(); + let b = s.len(); + + let entry_status = dict.add_string(s); + match entry_status { + AddResult::Fresh(_value) => { + bytes += b; + count_unique += 1; + } + AddResult::Known(_value) => {} + AddResult::Rejected => {} + } + + count_lines += 1; + } + + TimedCode::instance().sub("Dictionary filling").stop(); + + println!( + "Processed {} strings (dictionary contains {} unique strings with {} bytes overall).", + count_lines, count_unique, bytes + ); + + TimedCode::instance().stop(); + + println!( + "\n{}", + TimedCode::instance().create_tree_string( + "dict-bench", + &[ + TimedDisplay::default(), + TimedDisplay::default(), + TimedDisplay::new(nemo::meta::timing::TimedSorting::LongestThreadTime, 0) + ] + ) + ); + + if args.len() < 4 { + println!("All done. Press return to end benchmark (and free all memory)."); + let mut s = String::new(); + stdin().read_line(&mut s).expect("No string entered?"); + } + + if dict.len() == 123456789 { + // FWIW, prevent dict from going out of scope before really finishing + println!("Today is your lucky day."); + } +} diff --git a/nemo-cli/Cargo.toml b/nemo-cli/Cargo.toml index eb40664ec..63733e87f 100644 --- a/nemo-cli/Cargo.toml +++ b/nemo-cli/Cargo.toml @@ -9,9 +9,6 @@ license.workspace = true readme = "README.md" repository.workspace = true -[features] -no-prefixed-string-dictionary = ["nemo/no-prefixed-string-dictionary"] - [[bin]] name = "nmo" path = "src/main.rs" diff --git a/nemo-physical/Cargo.toml b/nemo-physical/Cargo.toml index dcee77494..0bb478377 100644 --- a/nemo-physical/Cargo.toml +++ b/nemo-physical/Cargo.toml @@ -14,7 +14,6 @@ default = ["timing"] # Enables time measurements using the "howlong" crate # If this feature is not enabled, all time measurements will display zero instead timing = ["dep:howlong"] -no-prefixed-string-dictionary = [] [dependencies] log = "0.4" @@ -24,10 +23,12 @@ num = "0.4.0" ascii_tree = "0.1.1" once_cell = "1" linked-hash-map = "0.5.6" +lru = "0.11.1" howlong = { version = "0.1", optional = true } rio_turtle = "0.8.4" rio_xml = "0.8.4" reqwest = "0.11.18" +regex = "1.9.5" [dev-dependencies] arbitrary = { version = "1", features = ["derive"] } diff --git a/nemo-physical/src/arithmetic/traits.rs b/nemo-physical/src/arithmetic/traits.rs index ce140cd62..31720670b 100644 --- a/nemo-physical/src/arithmetic/traits.rs +++ b/nemo-physical/src/arithmetic/traits.rs @@ -114,7 +114,7 @@ impl CheckedPow for usize { impl CheckedPow for u8 { fn checked_pow(self, exponent: Self) -> Option { - num::checked_pow(self, exponent.try_into().ok()?) 
+ num::checked_pow(self, exponent.into()) } } diff --git a/nemo-physical/src/builder_proxy.rs b/nemo-physical/src/builder_proxy.rs index 4bdf5e767..c4e3567c5 100644 --- a/nemo-physical/src/builder_proxy.rs +++ b/nemo-physical/src/builder_proxy.rs @@ -95,7 +95,13 @@ impl ColumnBuilderProxy for PhysicalStringColumnBuilderProxy<'_> generic_trait_impl_without_add!(VecT::U64); fn add(&mut self, input: PhysicalString) -> Result<(), ReadingError> { self.commit(); - self.value = Some(self.dict.borrow_mut().add(input.into()).try_into()?); + self.value = Some( + self.dict + .borrow_mut() + .add_string(input.into()) + .value() + .try_into()?, + ); Ok(()) } } diff --git a/nemo-physical/src/datatypes/data_value.rs b/nemo-physical/src/datatypes/data_value.rs index 4740d0c55..8e2b9b5d4 100644 --- a/nemo-physical/src/datatypes/data_value.rs +++ b/nemo-physical/src/datatypes/data_value.rs @@ -73,7 +73,12 @@ impl DataValueT { match self { Self::String(val) => { // dictionary indices - StorageValueT::U64(dict.add(val.clone().into()).try_into().unwrap()) + StorageValueT::U64( + dict.add_string(val.clone().into()) + .value() + .try_into() + .unwrap(), + ) } Self::U32(val) => StorageValueT::U32(*val), Self::U64(val) => StorageValueT::U64(*val), @@ -88,7 +93,7 @@ impl DataValueT { match self { Self::String(val) => Some(StorageValueT::U64( // dictionary indices - dict.index_of(val.into())?.try_into().unwrap(), + dict.fetch_id(val.into())?.try_into().unwrap(), )), Self::U32(val) => Some(StorageValueT::U32(*val)), Self::U64(val) => Some(StorageValueT::U64(*val)), diff --git a/nemo-physical/src/dictionary.rs b/nemo-physical/src/dictionary.rs index a147dc7c7..fb9720754 100644 --- a/nemo-physical/src/dictionary.rs +++ b/nemo-physical/src/dictionary.rs @@ -1,39 +1,128 @@ -//! This module provides different dictionary functionalities -//! In general these dictionary functionalities allow to represent [String] values as [usize] values +//! This module provides functionalities for creating and maintaining dictionaries. +//! A dictionary is a data structure that assigns numeric ids to complex objects (such as [String]s), +//! and that provides an bijective (invertible) mapping between the two. use std::fmt::Debug; -/// Module to define a [PrefixedStringDictionary] -/// This will provide a more memory-efficient storage of [String] values if they share equivalent prefixes (such as IRIs) -/// The prefixes of the [String] will be stored as a Triestructure. +/// Module to define the [DictionaryString] +pub mod dictionary_string; +pub use dictionary_string::DictionaryString; +/// Module to define the [PrefixedStringDictionary] pub mod prefixed_string_dictionary; pub use prefixed_string_dictionary::PrefixedStringDictionary; /// Module to define a simple [StringDictionary] pub mod string_dictionary; pub use string_dictionary::StringDictionary; +/// Module to define [HashMapDictionary] +pub mod hash_map_dictionary; +pub use hash_map_dictionary::HashMapDictionary; +/// Module to define [InfixDictionary] +pub mod infix_dictionary; +pub use infix_dictionary::InfixDictionary; +/// Module to define [MetaDictionary] +pub mod meta_dictionary; +pub use meta_dictionary::MetaDictionary; /// Module mapping physical types into logical types into Strings pub mod value_serializer; pub use value_serializer::ValueSerializer; -/// This Dictionary Trait defines dictionaries, which keep ownership of the inserted elements. 
-pub trait Dictionary: Debug + Default + Clone {
-    /// Construct a new and empty [`Dictionary`]
-    fn new() -> Self
-    where
-        Self: Sized + Default,
-    {
-        Self::default()
+/// Fake id that dictionaries use to indicate that an entry has no id.
+const NONEXISTING_ID_MARK: usize = usize::MAX;
+/// Fake id that dictionaries use to indicate that an entry is known
+/// in some other dictionary (indicating that a search across multiple dictionaries
+/// should be continued).
+const KNOWN_ID_MARK: usize = usize::MAX - 1;
+
+/// Result of adding new values to a dictionary.
+/// It indicates if the operation was successful, and whether the value was previously present or not.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum AddResult {
+    /// Element was new and has been freshly assigned the given id.
+    Fresh(usize),
+    /// Element was already known and has the given id.
+    Known(usize),
+    /// Element not supported by dictionary.
+    Rejected,
+}
+
+impl AddResult {
+    /// Returns the actual index.
+    /// In case of [AddResult::Rejected], a fake id is returned ([usize::MAX]).
+    pub fn value(&self) -> usize {
+        match self {
+            AddResult::Fresh(value) => *value,
+            AddResult::Known(value) => *value,
+            AddResult::Rejected => NONEXISTING_ID_MARK,
+        }
+    }
+}
+
+/// A Dictionary represents a bijective (invertible) mapping from objects to numeric ids.
+/// The "objects" are provided when the dictionary is used, whereas the ids are newly
+/// assigned by the dictionary itself.
+pub trait Dictionary: Debug {
+    /// Adds a new string to the dictionary. If the string is not known yet, it will
+    /// be assigned a new id. Unsupported strings can also be rejected, which specialized
+    /// dictionary implementations might do.
+    ///
+    /// The result is an [AddResult] that indicates if the string was newly added,
+    /// previously present, or rejected. In the first two cases, the result yields
+    /// the string's id.
+    fn add_string(&mut self, string: String) -> AddResult;
+
+    /// Adds a new string to the dictionary. If the string is not known yet, it will
+    /// be assigned a new id. Unsupported strings can also be rejected, which specialized
+    /// dictionary implementations might do.
+    ///
+    /// The result is an [AddResult] that indicates if the string was newly added,
+    /// previously present, or rejected. In the first two cases, the result yields
+    /// the string's id.
+    fn add_str(&mut self, string: &str) -> AddResult;
+
+    /// Adds a new string to the dictionary. This method is similar to `add_string()` but uses a
+    /// pre-processed string. Some dictionary implementations may extract only parts of
+    /// the string to fit internal assumptions (e.g., a dictionary that requires a fixed
+    /// prefix may ignore the prefix and only store the rest, as if the prefix would
+    /// match). To perform checks and possibly reject data, `add_string()` or `add_str()` should be used.
+    fn add_dictionary_string(&mut self, ds: DictionaryString) -> AddResult;
+
+    /// Looks for a given [&str] slice and returns `Some(id)` if it is in the dictionary, and `None` otherwise.
+    fn fetch_id(&self, string: &str) -> Option<usize>;
+
+    /// Looks for a string and returns `Some(id)` if it is in the dictionary, and `None` otherwise.
+    /// This method is similar to `fetch_id()` but uses a pre-processed string. Some dictionary implementations
+    /// may extract only parts of the string to fit internal assumptions (e.g., a dictionary that requires a fixed
+    /// prefix may ignore the prefix and only look up the rest, as if the prefix would
+    /// match).
+    /// To perform checks and possibly reject data, `fetch_id()` should be used.
+    fn fetch_id_for_dictionary_string(&self, ds: &DictionaryString) -> Option<usize> {
+        self.fetch_id(ds.as_str())
     }
-    /// Add a new string to the dictionary
-    /// and returns the associated [usize] value to the added string
-    /// Note that duplicates will not be added and the existing [usize] will be returned
-    fn add(&mut self, entry: String) -> usize;
-    /// Looks for a given [&str] slice and returns `Some(position)` if there is a match or `None` if there is no match.
-    fn index_of(&self, entry: &str) -> Option<usize>;
-    /// Returns an equivalent [String] to the one associated with the `index` or None if the `index` is out of bounds
-    fn entry(&self, index: usize) -> Option<String>;
-    /// Returns the number of elements in the dictionary.
+
+    /// Returns the [String] associated with the `id`, or None if the `id` is out of bounds
+    fn get(&self, id: usize) -> Option<String>;
+
+    /// Returns the number of elements in the dictionary. For dictionaries that support marking elements as
+    /// known without giving IDs to them, such elements should not be counted.
     fn len(&self) -> usize;
-    /// Returns whether the dictionary is empty.
-    fn is_empty(&self) -> bool;
+
+    /// Returns true if the dictionary is empty, and false otherwise.
+    fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    /// Marks the given string as being known ---using the special id [usize::MAX] - 1--- without
+    /// assigning an id of its own to it. If the entry exists already, the old id will be kept and
+    /// returned. It is possible to return [AddResult::Rejected] to indicate that the dictionary
+    /// does not support marking of strings. Implementors of [Dictionary::mark_str] must also implement [Dictionary::has_marked].
+    fn mark_str(&mut self, _string: &str) -> AddResult {
+        AddResult::Rejected
+    }
+
+    /// Returns true if the dictionary contains any marked elements (see [Dictionary::mark_str]). The intention is that code marks
+    /// all elements that are relevant to this dictionary, or none at all, so that a return value of `true` indicates
+    /// that one can rely on unknown and non-marked elements to be missing in all dictionaries. Implementors of
+    /// [Dictionary::has_marked] must also implement [Dictionary::mark_str].
+    fn has_marked(&self) -> bool {
+        false
+    }
 }
diff --git a/nemo-physical/src/dictionary/dictionary_string.rs b/nemo-physical/src/dictionary/dictionary_string.rs
new file mode 100644
index 000000000..3731cca75
--- /dev/null
+++ b/nemo-physical/src/dictionary/dictionary_string.rs
@@ -0,0 +1,223 @@
+use std::cell::UnsafeCell;
+
+pub(crate) const LONG_STRING_THRESHOLD: usize = 1000;
+
+/// Internal struct where we keep locations extracted from strings. This is separated
+/// to enable an interior mutability pattern.
+#[derive(Debug, Clone, Copy)]
+struct DictionaryStringLocations {
+    prefix_length: usize,
+    infix_length: usize,
+    infix_done: bool,
+}
+
+impl DictionaryStringLocations {
+    fn new() -> Self {
+        DictionaryStringLocations {
+            prefix_length: 0,
+            infix_length: 0,
+            infix_done: false,
+        }
+    }
+}
+
+/// String that computes and caches checks relevant for dictionary selection.
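+/// For illustration, the standard split of an IRI-like string into prefix, infix, and suffix
+/// (the IRI below is a made-up example):
+///
+/// ```
+/// use nemo_physical::dictionary::DictionaryString;
+///
+/// let ds = DictionaryString::new("<http://example.org/people/alice>");
+/// assert_eq!(ds.prefix(), "<http://example.org/people/");
+/// assert_eq!(ds.infix(), "alice");
+/// assert_eq!(ds.suffix(), ">");
+/// ```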
+#[derive(Debug)] +pub struct DictionaryString { + string: String, + positions: UnsafeCell, +} +impl DictionaryString { + /// Constructor + pub fn new(s: &str) -> Self { + DictionaryString { + string: s.to_string(), + positions: UnsafeCell::new(DictionaryStringLocations::new()), + } + } + + /// Constructor, taking ownership of the given string + pub fn from_string(s: String) -> Self { + DictionaryString { + string: s, + positions: UnsafeCell::new(DictionaryStringLocations::new()), + } + } + + /// Returns true if the string is considered "long". Long strings may be handled differently + /// in some dictionaries. + pub fn is_long(&self) -> bool { + self.string.len() > LONG_STRING_THRESHOLD + } + + /// Returns the complete string data. + pub fn as_str(&self) -> &str { + self.string.as_str() + } + + /// Returns the first part of the standard split into pieces + pub fn prefix(&self) -> &str { + self.set_pieces(); + unsafe { + let prefix_length = (*self.positions.get()).prefix_length; + self.string.as_str().get_unchecked(..prefix_length) + } + } + + /// Returns the middle part of the standard split into pieces + pub fn infix(&self) -> &str { + self.set_pieces(); + unsafe { + let prefix_end = (*self.positions.get()).prefix_length; + let infix_end = prefix_end + (*self.positions.get()).infix_length; + self.string.as_str().get_unchecked(prefix_end..infix_end) + } + } + + /// Returns the last part of the standard split into pieces + pub fn suffix(&self) -> &str { + self.set_pieces(); + unsafe { + let prefix_end = (*self.positions.get()).prefix_length; + let infix_end = prefix_end + (*self.positions.get()).infix_length; + self.string.as_str().get_unchecked(infix_end..) + } + } + + /// Checks if the string can be viewed as an infix that is enclosed by the given prefix and suffix. + /// Note that this uses the standard splitting approach to determine prefix and suffix, which means + /// that the funciton may return `false` even if the string actually starts and ends with the prefix + /// and suffix given (if these are not the ones detected). + pub fn has_infix(&self, prefix: &str, suffix: &str) -> bool { + self.prefix() == prefix && self.suffix() == suffix + } + + /// Checks if the string has a non-empty prefix or suffix. + pub fn infixable(&self) -> bool { + self.set_pieces(); + unsafe { (*self.positions.get()).infix_length < self.string.len() } + } + + /// Computes the pieces from the string. + fn set_pieces(&self) { + if (unsafe { *self.positions.get() }).infix_done { + return; + } + + let bytes = self.string.as_bytes(); + + let mut prefix_length: usize = 0; + let mut infix_length: usize = bytes.len(); + + if infix_length > 0 && bytes[infix_length - 1] == b'>' { + if bytes[0] == b'<' { + // using bytes is safe at pos 0; we know string!="" from above + let pos = DictionaryString::rfind_hashslash_plus(bytes); + if pos > 0 { + prefix_length = pos; // note that pos is +1 the actual pos + infix_length = bytes.len() - prefix_length - 1; + } + } else if bytes[0] == b'"' { + // using bytes is safe at pos 0; we know string!="" from above + let pos = DictionaryString::find_quote_plus(unsafe { bytes.get_unchecked(1..) 
}); + if pos > 0 { + prefix_length = 1; + infix_length = pos - 1; // note that pos is relative to the slice that starts at 1, and that it is +1 the actual position + } + } + } // else: use defaults from above + + let positions = unsafe { &mut *self.positions.get() }; + positions.prefix_length = prefix_length; + positions.infix_length = infix_length; + positions.infix_done = true; + } + + /// Finds the last position in UTF-8 str slice (given as `&[u8]` bytes) where the characters '/' or '#' occur, + /// and returns the successor of that position. If the character is not found, 0 is returned. + /// The method avoids any UTF decoding, because it is unnecessary for characters in the ASCII range (<128). + #[inline(always)] + fn rfind_hashslash_plus(s: &[u8]) -> usize { + let mut pos: usize = s.len(); + let mut iter = s.iter().copied(); + while let Some(ch) = iter.next_back() { + if ch == b'/' || ch == b'#' { + return pos; + } + pos -= 1; + } + pos + } + + /// Finds the first position in UTF-8 str slice (given as `&[u8]` bytes) where the character '"' occurs, + /// and returns the successor of that position. If the character is not found, 0 is returned. + /// The method avoids any UTF decoding, because it is unnecessary for characters in the ASCII range (<128). + #[inline(always)] + fn find_quote_plus(s: &[u8]) -> usize { + let mut pos: usize = 1; + for ch in s.iter().copied() { + if ch == b'"' { + return pos; + } + pos += 1; + } + 0 + } +} + +#[cfg(test)] +mod test { + use super::DictionaryString; + + #[test] + fn split_parts_qids() { + let ds = DictionaryString::new(""); + assert_eq!(ds.prefix(), ""); + } + + #[test] + fn split_parts_rdf_type() { + let ds = DictionaryString::new(""); + assert_eq!(ds.prefix(), ""); + } + + #[test] + fn split_parts_integer() { + let ds = DictionaryString::new("\"305\"^^"); + assert_eq!(ds.prefix(), "\""); + assert_eq!(ds.infix(), "305"); + assert_eq!( + ds.suffix(), + "\"^^" + ); + } + + #[test] + fn findr_hashslash_plus() { + let s1 = "A täst /stri#ng / with non-ASCII unicöde in it"; + let s2 = "A täst /st#ring # with non-ASCII unicöde in it"; + let s3 = "A täst string with non-ASCII unicöde in it"; + let pos1 = DictionaryString::rfind_hashslash_plus(s1.as_bytes()); + let pos2 = DictionaryString::rfind_hashslash_plus(s2.as_bytes()); + let pos3 = DictionaryString::rfind_hashslash_plus(s3.as_bytes()); + + assert_eq!(pos1, s1.rfind(|c: char| c == '/' || c == '#').unwrap() + 1); + assert_eq!(pos2, s2.rfind(|c: char| c == '/' || c == '#').unwrap() + 1); + assert_eq!(pos3, 0); + } + + #[test] + fn find_quote_plus() { + let s1 = "A täst /stri\"ng / with non-ASC\"II unicöde in it"; + let pos1 = DictionaryString::find_quote_plus(s1.as_bytes()); + let s2 = "A täst /string / with non-ASCII unicöde in it"; + let pos2 = DictionaryString::find_quote_plus(s2.as_bytes()); + + assert_eq!(pos1, s1.find('"').unwrap() + 1); + assert_eq!(pos2, 0); + } +} diff --git a/nemo-physical/src/dictionary/hash_map_dictionary.rs b/nemo-physical/src/dictionary/hash_map_dictionary.rs new file mode 100644 index 000000000..2d43a75aa --- /dev/null +++ b/nemo-physical/src/dictionary/hash_map_dictionary.rs @@ -0,0 +1,423 @@ +use super::{AddResult, Dictionary, DictionaryString}; + +use std::{ + collections::HashMap, + fmt::Display, + hash::{Hash, Hasher}, +}; + +use std::sync::atomic::{AtomicBool, Ordering}; + +/// Global string buffer for dictionary data. 
+/// This is global here to allow keys in the hashmap to access it for computing equality and hashes,
+/// without the need to re-implement the whole hashmap to inject such an object.
+static mut BUFFER: StringBuffer = StringBuffer::new();
+// The following code is needed if allocations are done while constructing [StringBuffer]:
+//use once_cell::sync::Lazy;
+//static mut BUFFER: Lazy<StringBuffer> = Lazy::new(||StringBuffer::new());
+
+/// Address size of pages in the string buffer
+const PAGE_ADDR_BITS: usize = 25; // 32MB
+/// Size of pages in the string buffer
+const PAGE_SIZE: usize = 1 << PAGE_ADDR_BITS;
+/// Bit mask that keeps only the (lower) PAGE_ADDR_BITS-1 bits, for extracting a string's length
+const LENGTH_BITS_MASK: u64 = (1 << (PAGE_ADDR_BITS - 1)) - 1;
+
+/// A buffer for string data using compact memory regions that are managed in pages.
+/// New buffers need to be initialized, upon which they will receive an identifying buffer id
+/// that is used whenever the data is accessed.
+///
+/// The implementation is not fully thread-safe, but it is thread-safe as long as each buffer
+/// is used in only one thread. That is, parallel threads can safely create buffers (which will
+/// have different ids), as long as all their operations use the buffer id that they were given.
+struct StringBuffer {
+    /// Vector of buffer ids and string buffers
+    pages: Vec<(usize, String)>,
+    /// Single temporary string per buffer. [StringRef] uses this for representing strings that are not in the buffer.
+    ///
+    /// TODO: It would be possible and more elegant to have a special alternative key implementation for our hashmap,
+    /// where the key has the relevant data instead of pointing to a temporary buffer.
+    tmp_strings: Vec<String>,
+    /// Currently active page for each buffer
+    cur_pages: Vec<usize>,
+    /// Lock to guard page assignment operations when using multiple threads
+    lock: AtomicBool,
+}
+
+impl StringBuffer {
+    /// Constructor.
+    const fn new() -> Self {
+        StringBuffer {
+            pages: Vec::new(),
+            tmp_strings: Vec::new(),
+            cur_pages: Vec::new(),
+            lock: AtomicBool::new(false),
+        }
+    }
+
+    /// Initializes a new buffer and returns a handle that can henceforth be used to access it.
+    fn init_buffer(&mut self) -> usize {
+        self.acquire_page_lock();
+        let buf_id = self.cur_pages.len();
+        self.pages.push((buf_id, String::with_capacity(PAGE_SIZE)));
+        self.cur_pages.push(self.pages.len() - 1);
+        self.tmp_strings.push(String::new());
+        self.release_page_lock();
+        buf_id
+    }
+
+    /// Frees the memory used by the pages of the dropped buffer.
+    /// No other pages are affected or moved.
+    fn drop_buffer(&mut self, buffer: usize) {
+        self.acquire_page_lock();
+        for (b, s) in self.pages.iter_mut() {
+            if buffer == *b {
+                s.clear();
+                s.shrink_to_fit();
+                *b = usize::MAX;
+            }
+        }
+        self.release_page_lock();
+    }
+
+    /// Inserts a string into the buffer and returns a [StringRef] that points to it.
+    ///
+    /// TODO: Allocation of new pages could re-use freed pages instead of always appending.
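+    /// For illustration: with `PAGE_ADDR_BITS` = 25, the returned [StringRef] encodes the
+    /// address `page_num * PAGE_SIZE + offset`, so a reader recovers the page as
+    /// `address >> PAGE_ADDR_BITS` and the offset within that page as `address % PAGE_SIZE`
+    /// (this is what `get_str()` below does).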
+ fn push_str(&mut self, buffer: usize, s: &str) -> StringRef { + let len = s.len(); + assert!(len < PAGE_SIZE); + let mut page_num = self.cur_pages[buffer]; + if self.pages[page_num].1.len() + len > PAGE_SIZE { + self.acquire_page_lock(); + self.pages.push((buffer, String::with_capacity(PAGE_SIZE))); + page_num = self.pages.len() - 1; + self.cur_pages[buffer] = page_num; + self.release_page_lock(); + } + let page_addr = self.pages[page_num].1.len(); + self.pages[page_num].1.push_str(s); + + StringRef::new(page_num * PAGE_SIZE + page_addr, s.len()) + } + + /// Returns a direct string slice reference for this data. + /// This is a pointer to global mutable data, and cannot be used safely. + fn get_str(&self, address: usize, length: usize) -> &str { + let page_num = address >> PAGE_ADDR_BITS; + let page_addr = address % PAGE_SIZE; + unsafe { + self.pages[page_num] + .1 + .get_unchecked(page_addr..page_addr + length) + } + } + + /// Creates a reference to the given string without adding the string to the buffer. + fn get_tmp_string_ref(&mut self, buffer: usize, s: &str) -> StringRef { + self.tmp_strings[buffer].clear(); + self.tmp_strings[buffer].push_str(s); + StringRef::new_tmp(buffer) + } + + /// Returns the current contents of the temporary string. + fn get_tmp_string(&self, buffer: usize) -> &str { + self.tmp_strings[buffer].as_str() + } + + /// Acquire the lock that we use for operations that add new pages or change + /// the assignment of pages to buffers in any way. + fn acquire_page_lock(&mut self) { + while self + .lock + .compare_exchange_weak(false, true, Ordering::Acquire, Ordering::Acquire) + .is_err() + {} + } + + /// Release the lock. + fn release_page_lock(&mut self) { + self.lock.store(false, Ordering::Release); + } +} + +const STRINGREF_STRING_LEGHT_BITS: usize = 24; +const STRINGREF_STARTING_ADDRESS_BITS: usize = 40; +const MAX_STRINGREF_STRING_LEGHT: usize = 1 << STRINGREF_STRING_LEGHT_BITS; +const MAX_STRINGREF_STARTING_ADDRESS: usize = 1 << STRINGREF_STARTING_ADDRESS_BITS; + +/// Memory-optimized reference to a string in the dictionary. +#[derive(Clone, Copy, Debug, Default)] +struct StringRef { + /// The 64bits reference consists of 40bits that encode a starting address within + /// the buffer, and 24bits that encode the string length. + /// This limits the maximal buffer size to 1TB of string data, and the maximal length + /// of a single string to 16M bytes. + reference: u64, +} + +impl StringRef { + /// Creates an object that refers to the current contents of the + /// buffer's temporary String. + fn new_tmp(buffer: usize) -> Self { + assert!(buffer < MAX_STRINGREF_STRING_LEGHT); + let u64buffer: u64 = buffer.try_into().unwrap(); + StringRef { + reference: (u64::MAX << STRINGREF_STRING_LEGHT_BITS) + u64buffer, + } + } + + /// Creates a reference to the specific string slice in the buffer. + /// It is not checked if that slice is allocated. + fn new(address: usize, len: usize) -> Self { + assert!(len < MAX_STRINGREF_STRING_LEGHT); + assert!(address < MAX_STRINGREF_STARTING_ADDRESS); + let u64add: u64 = address.try_into().unwrap(); + let u64len: u64 = len.try_into().unwrap(); + StringRef { + reference: (u64add << STRINGREF_STRING_LEGHT_BITS) + u64len, + } + } + + /// Returns the stored start address for the string that this refers to. + /// For temporary references that do not point to the buffer, the result is meaningless. 
+ fn address(&self) -> usize { + (self.reference >> STRINGREF_STRING_LEGHT_BITS) + .try_into() + .unwrap() + } + + /// Returns the stored length of the string that this refers to. + /// For temporary references that do not point to the buffer, the result is meaningless. + fn len(&self) -> usize { + (self.reference & LENGTH_BITS_MASK).try_into().unwrap() + } + + /// Returns a direct string slice reference for this data. + /// This is a pointer to global mutable data, and cannot be used safely. + fn as_str(&self) -> &str { + if ((!self.reference) >> STRINGREF_STRING_LEGHT_BITS) != 0 { + unsafe { BUFFER.get_str(self.address(), self.len()) } + } else { + unsafe { BUFFER.get_tmp_string(self.len()) } + } + } +} + +impl Display for StringRef { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write! {f, "{}", self.as_str()} + } +} + +impl Hash for StringRef { + fn hash(&self, state: &mut H) + where + H: Hasher, + { + self.as_str().hash(state) + } +} + +impl PartialEq for StringRef { + fn eq(&self, other: &StringRef) -> bool { + self.as_str().eq(other.as_str()) + } +} + +impl Eq for StringRef {} + +/// A read-only, hashmap-based [Dictionary] to implement a bijection between strings and integers. +/// Strings are stored in a compact buffer to reduce memory overhead and fragmentation. +#[derive(Clone, Debug)] +pub struct HashMapDictionary { + store: Vec, + mapping: HashMap, + buffer: usize, + has_known_mark: bool, +} + +impl HashMapDictionary { + /// Construct a new and empty dictionary. + pub fn new() -> Self { + Self::default() + } + + /// Check if a string is already in the dictionary, and if not, + /// set its id to the given value. IDs are normally assigned consecutively + /// by this dictionary, but one can also assign [super::KNOWN_ID_MARK] to + /// merely mark a string as known. Other non-consecutive assignments + /// will generally lead to errors, since the same ID might be assigned + /// later on again. + /// + /// If the given string is known but not assigned an ID (indicated by + /// [super::KNOWN_ID_MARK]), then the operation will still not assign an + /// ID either. In such a case, another dictionary should have that ID. 
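+    /// For illustration (internal helper): on an empty dictionary, `add_str_with_id("a", 0)`
+    /// stores `"a"` and returns `AddResult::Fresh(0)`; a second call for `"a"` (with any id)
+    /// returns `AddResult::Known(0)`. Passing [super::KNOWN_ID_MARK] as the id records the
+    /// string in the mapping only, so lookups will see the mark, but nothing is appended to
+    /// the id-to-string store.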
+ #[inline(always)] + fn add_str_with_id(&mut self, string: &str, id: usize) -> AddResult { + match self + .mapping + .get(unsafe { &BUFFER.get_tmp_string_ref(self.buffer, string) }) + { + Some(idx) => { + // if *idx == super::KNOWN_ID_MARK { + // println!("Got KID for {} when attempting to add id {}",string,id); + // } + AddResult::Known(*idx) + } + None => { + let sref = unsafe { BUFFER.push_str(self.buffer, string) }; + if id != super::KNOWN_ID_MARK { + self.store.push(sref); + } + self.mapping.insert(sref, id); + AddResult::Fresh(id) + } + } + } +} + +impl Default for HashMapDictionary { + fn default() -> Self { + HashMapDictionary { + store: Vec::new(), + mapping: HashMap::new(), + buffer: unsafe { BUFFER.init_buffer() }, + has_known_mark: false, + } + } +} + +impl Drop for HashMapDictionary { + fn drop(&mut self) { + unsafe { + BUFFER.drop_buffer(self.buffer); + } + } +} + +impl Dictionary for HashMapDictionary { + fn add_string(&mut self, string: String) -> AddResult { + self.add_str_with_id(string.as_str(), self.store.len()) + } + + fn add_str(&mut self, string: &str) -> AddResult { + self.add_str_with_id(string, self.store.len()) + } + + fn add_dictionary_string(&mut self, ds: DictionaryString) -> AddResult { + self.add_str_with_id(ds.as_str(), self.store.len()) + } + + fn fetch_id(&self, string: &str) -> Option { + self.mapping + .get(unsafe { &BUFFER.get_tmp_string_ref(self.buffer, string) }) + .copied() + } + + fn get(&self, id: usize) -> Option { + self.store + .get(id) + .map(|entry| -> String { entry.to_string() }) + } + + fn len(&self) -> usize { + self.store.len() + } + + fn mark_str(&mut self, string: &str) -> AddResult { + self.has_known_mark = true; + self.add_str_with_id(string, super::KNOWN_ID_MARK) + } + + fn has_marked(&self) -> bool { + self.has_known_mark + } +} + +#[cfg(test)] +mod test { + use std::borrow::Borrow; + + use crate::dictionary::AddResult; + use crate::dictionary::Dictionary; + + use super::HashMapDictionary; + + fn create_dict() -> HashMapDictionary { + let mut dict = HashMapDictionary::default(); + let vec: Vec<&str> = vec![ + "a", + "b", + "c", + "a", + "b", + "c", + "Position 3", + "Position 4", + "Position 3", + "Position 5", + ]; + + for i in vec { + dict.add_string(i.to_string()); + } + dict + } + + #[test] + fn get() { + let mut dict = create_dict(); + + let mut dict2 = HashMapDictionary::default(); + dict2.add_string("entry0".to_string()); + dict.add_string("another entry".to_string()); + dict2.add_string("entry1".to_string()); + + assert_eq!(dict.get(0), Some("a".to_string())); + assert_eq!(dict.get(1), Some("b".to_string())); + assert_eq!(dict.get(2), Some("c".to_string())); + assert_eq!(dict.get(3), Some("Position 3".to_string())); + assert_eq!(dict.get(4), Some("Position 4".to_string())); + assert_eq!(dict.get(5), Some("Position 5".to_string())); + assert_eq!(dict.get(6), Some("another entry".to_string())); + assert_eq!(dict.get(7), None); + assert_eq!(dict.get(3), Some("Position 3".to_string())); + + assert_eq!(dict2.get(0), Some("entry0".to_string())); + assert_eq!(dict2.get(1), Some("entry1".to_string())); + assert_eq!(dict2.get(2), None); + } + + #[test] + fn fetch_id() { + let dict = create_dict(); + assert_eq!(dict.fetch_id("a".to_string().borrow()), Some(0)); + assert_eq!(dict.fetch_id("b".to_string().borrow()), Some(1)); + assert_eq!(dict.fetch_id("c".to_string().borrow()), Some(2)); + assert_eq!(dict.fetch_id("Position 3".to_string().borrow()), Some(3)); + assert_eq!(dict.fetch_id("Position 4".to_string().borrow()), Some(4)); + 
assert_eq!(dict.fetch_id("Position 5".to_string().borrow()), Some(5)); + assert_eq!(dict.fetch_id("d".to_string().borrow()), None); + assert_eq!(dict.fetch_id("Pos".to_string().borrow()), None); + assert_eq!(dict.fetch_id("Pos"), None); + assert_eq!(dict.fetch_id("b"), Some(1)); + } + + #[test] + fn add() { + let mut dict = create_dict(); + assert_eq!(dict.add_string("a".to_string()), AddResult::Known(0)); + assert_eq!( + dict.add_string("new value".to_string()), + AddResult::Fresh(6) + ); + } + + #[test] + fn empty_str() { + let mut dict = HashMapDictionary::default(); + assert_eq!(dict.add_string("".to_string()), AddResult::Fresh(0)); + assert_eq!(dict.get(0), Some("".to_string())); + assert_eq!(dict.fetch_id(""), Some(0)); + } +} diff --git a/nemo-physical/src/dictionary/infix_dictionary.rs b/nemo-physical/src/dictionary/infix_dictionary.rs new file mode 100644 index 000000000..67932fbeb --- /dev/null +++ b/nemo-physical/src/dictionary/infix_dictionary.rs @@ -0,0 +1,143 @@ +use super::hash_map_dictionary::HashMapDictionary; +use super::AddResult; +use super::Dictionary; +use super::DictionaryString; + +/// A read-only [Dictionary] to implement a bijection between integer ids and strings that start and end +/// with a certain fixed prefix and postfix, respectively. Strings that do not have this shape will be +/// rejected. +#[derive(Clone, Debug)] +pub struct InfixDictionary { + dict: HashMapDictionary, + prefix: String, + suffix: String, +} + +impl InfixDictionary { + /// Construct a new and empty dictionary for the given prefix and suffix. + pub fn new(prefix: String, suffix: String) -> Self { + InfixDictionary { + dict: HashMapDictionary::new(), + prefix, + suffix, + } + } + + /// Add a given infix string to the internal dictionary + fn add_infix_str(&mut self, string: &str) -> AddResult { + self.dict.add_str(string) + } +} + +impl Dictionary for InfixDictionary { + /// Adds a string to the disctionary. It is checked if the string has the required prefix and suffix + /// to be in this dictionary. If this check fails, the string is rejected. + fn add_string(&mut self, string: String) -> AddResult { + self.add_str(string.as_str()) + } + + /// Adds a string to the disctionary. It is checked if the string has the required prefix and suffix + /// to be in this dictionary. If this check fails, the string is rejected. + fn add_str(&mut self, string: &str) -> AddResult { + if string.starts_with(self.prefix.as_str()) && string.ends_with(self.suffix.as_str()) { + self.add_infix_str(&string[self.prefix.len()..string.len() - self.suffix.len()]) + } else { + AddResult::Rejected + } + } + + /// Add a string to the dictionary, but assume that the added string + /// consists of the fixed prefix, followed by the given string's infix, + /// followed by the fixed suffix. There is no check that the given prefix + /// and suffix are actually the same as the fixed ones. + fn add_dictionary_string(&mut self, ds: DictionaryString) -> AddResult { + self.add_infix_str(ds.infix()) + } + + fn fetch_id(&self, string: &str) -> Option { + if string.starts_with(self.prefix.as_str()) && string.ends_with(self.suffix.as_str()) { + self.dict.fetch_id(unsafe { + string.get_unchecked(self.prefix.len()..string.len() - self.suffix.len()) + }) + } else { + None + } + } + + /// Look up a string in the dictionary, but assume that the added string + /// consists of the fixed prefix, followed by the given string's infix, + /// followed by the fixed suffix. 
There is no check that the given prefix + /// and suffix are actually the same as the fixed ones. + fn fetch_id_for_dictionary_string(&self, ds: &DictionaryString) -> Option { + self.dict.fetch_id(ds.infix()) + } + + fn get(&self, id: usize) -> Option { + let subresult = self.dict.get(id); + if let Some(inner) = subresult { + return Some(self.prefix.clone() + inner.as_str() + self.suffix.as_str()); + } + None + } + + fn len(&self) -> usize { + self.dict.len() + } + + /// Marks a string for this dictionary, as described in the documentation of [Dictionary]. + /// The given string must use the dictionary's prefix and suffix: no further checks are + /// performed here. + fn mark_str(&mut self, string: &str) -> AddResult { + self.dict + .mark_str(&string[self.prefix.len()..string.len() - self.suffix.len()]) + } + + fn has_marked(&self) -> bool { + self.dict.has_marked() + } +} + +#[cfg(test)] +mod test { + + use crate::dictionary::AddResult; + use crate::dictionary::Dictionary; + use crate::dictionary::DictionaryString; + + use super::InfixDictionary; + + #[test] + fn add_and_get() { + let mut dict = InfixDictionary::new("".to_string()); + + assert_eq!( + dict.add_str(""), + AddResult::Fresh(0) + ); + assert_eq!( + dict.add_str(""), + AddResult::Rejected + ); + assert_eq!( + dict.add_str("wrongsuffix"), + AddResult::Rejected + ); + assert_eq!(dict.get(0), Some("".to_string())); + assert_eq!(dict.fetch_id(""), Some(0)); + } + + #[test] + fn add_and_get_ds() { + let mut dict = InfixDictionary::new("".to_string()); + + let ds1 = DictionaryString::new(""); + let ds2 = DictionaryString::new(""); // should still be accepted + + assert_eq!(dict.add_dictionary_string(ds1), AddResult::Fresh(0)); + assert_eq!(dict.add_dictionary_string(ds2), AddResult::Fresh(1)); + assert_eq!(dict.get(0), Some("".to_string())); + assert_eq!(dict.get(1), Some("".to_string())); + assert_eq!(dict.fetch_id(""), Some(0)); + assert_eq!(dict.fetch_id(""), Some(1)); + } +} diff --git a/nemo-physical/src/dictionary/meta_dictionary.rs b/nemo-physical/src/dictionary/meta_dictionary.rs new file mode 100644 index 000000000..3973ec228 --- /dev/null +++ b/nemo-physical/src/dictionary/meta_dictionary.rs @@ -0,0 +1,541 @@ +use super::AddResult; +use super::Dictionary; +use super::DictionaryString; +use super::HashMapDictionary; +use super::InfixDictionary; + +use lru::LruCache; +use std::borrow::Borrow; +use std::collections::HashMap; +use std::hash::{Hash, Hasher}; +use std::num::NonZeroUsize; + +/// Number of recent occurrences of a string pattern required for creating a bespoke dictionary +const DICT_THRESHOLD: u32 = 500; + +/// Bits in the size of address blocks allocated to sub-dictionaries. +/// For example 24bit blocks each contain 2^24 addresses, and there are +/// 2^8=256 such blocks available within the u32 address range (and +/// 2^40 in 64bits). +const BLOCKSIZE: u32 = 24; + +// The code for [StringPair] and [StringPairKey] is inspired by +// https://stackoverflow.com/a/50478038 ("How to avoid temporary allocations when using a complex key for a HashMap?"). +// The goal is just that, since we have very frequent hashmap lookups here. 
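+// For illustration, a lookup can then pass a borrowed `(&str, &str)` pair instead of building an
+// owned `StringPair` (saving two `String` allocations per lookup), roughly:
+// `infix_dicts.get::<dyn StringPairKey>(&(prefix, suffix))`, as done in `DictIterator::next` below.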
+#[derive(Debug, Eq, Hash, PartialEq)] +struct StringPair { + first: String, + second: String, +} + +impl StringPair { + fn new(first: impl Into, second: impl Into) -> Self { + StringPair { + first: first.into(), + second: second.into(), + } + } +} + +trait StringPairKey { + fn to_key(&self) -> (&str, &str); +} + +impl Hash for dyn StringPairKey + '_ { + fn hash(&self, state: &mut H) { + self.to_key().hash(state) + } +} + +impl PartialEq for dyn StringPairKey + '_ { + fn eq(&self, other: &Self) -> bool { + self.to_key() == other.to_key() + } +} + +impl Eq for dyn StringPairKey + '_ {} + +impl StringPairKey for StringPair { + fn to_key(&self) -> (&str, &str) { + (&self.first, &self.second) + } +} + +impl<'a> StringPairKey for (&'a str, &'a str) { + fn to_key(&self) -> (&str, &str) { + (self.0, self.1) + } +} + +impl<'a> Borrow for StringPair { + fn borrow(&self) -> &(dyn StringPairKey + 'a) { + self + } +} +impl<'a> Borrow for (&'a str, &'a str) { + fn borrow(&self) -> &(dyn StringPairKey + 'a) { + self + } +} +// End of code for [StringPair]. + +/// Enum to specify what kind of data a dictionary supports. +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +enum DictionaryType { + /// Plain string dictionary + String, + /// Dictionary for long strings (blobs) + Blob, + /// Dictionary for strings with a fixed prefix and suffix + Infix { prefix: String, suffix: String }, + // /// Dictionary for numeric strings with a fixed prefix and suffix + //NumInfix { prefix: String, suffix: String }, + // /// Dictionary for named (actually: "numbered") nulls + // NULL, +} + +impl DictionaryType { + /// Returns true if the given string is supported by a dictinoary of this type. + fn supports(&self, ds: &DictionaryString) -> bool { + match self { + DictionaryType::String => !ds.is_long(), + DictionaryType::Blob => ds.is_long(), + DictionaryType::Infix { prefix, suffix } => { + !ds.is_long() && ds.has_infix(prefix, suffix) + } //DictionaryType::NumInfix { prefix, suffix } => false, // TODO + } + } +} + +/// Struct to hold relevant information about a sub-dictionary. +#[derive(Debug)] +pub struct DictRecord { + /// Pointer to the actual dictionary object + dict: Box, + /// Type of the dictionary + dict_type: DictionaryType, + /// Vector to associate local address block numbers to global block numbers + gblocks: Vec, +} + +/// Iterator-like struct for cycling over suitable dictionaries for some [DictionaryString]. +/// It is mostly a device for reusing some iteration code, and requires all calls to provide +/// the data to work on. +struct DictIterator { + /// Internal encoding of a "position" in a single value. + /// It is interpreted as follows: 0 is the initial state ("before" any value), + /// 1 is the "fitting infix dictionary" (if any) + position: usize, +} + +impl DictIterator { + /// Constructor. + fn new() -> Self { + DictIterator { position: 0 } + } + + /// Advance iterator, and return the id of the next dictionary. 
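+    /// For illustration of the order: for a string with prefix `p` and suffix `s`, a matching
+    /// infix dictionary registered for `(p, s)` (if any) is returned first; afterwards the
+    /// generic dictionaries are returned, skipping those whose type does not support the
+    /// string. A result of [usize::MAX] signals that no further dictionaries are left.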
+ fn next(&mut self, ds: &DictionaryString, md: &MetaDictionary) -> usize { + // First look for infix dictionary: + if self.position == 0 { + self.position = 1; + if ds.infixable() { + if let Some(dict_idx) = md + .infix_dicts + .get::(&(ds.prefix(), ds.suffix())) + { + return *dict_idx; + } + } + } + + // Finally, interpret self.position-1 as an index in md.generic_dicts: + while self.position <= md.generic_dicts.len() { + self.position += 1; + if md.dicts[md.generic_dicts[self.position - 2]] + .dict_type + .supports(ds) + { + return md.generic_dicts[self.position - 2]; + } + } + + usize::MAX // No further dictionaries left + } +} + +/// A dictionary that combines several other dictionaries. +#[derive(Debug)] +pub struct MetaDictionary { + /// Vector to map global block numbers to pairs (sub-dictionary, local block number) + dictblocks: Vec<(usize, usize)>, + /// Vector of all sub-dictionaries, indexed by their sub-dictionary number + dicts: Vec, + /// Data structure to hold counters for recently encountered dictionary types that we might + /// want to make a dictionary for. + dict_candidates: LruCache, + /// Auxiliary datastructure for finding fitting infix dictionaries. + infix_dicts: HashMap, + /// Auxiliary datastructure for finding fitting general purpose dictionaries. + generic_dicts: Vec, + /// Keep track of total number of entries for faster checks + size: usize, +} + +impl Default for MetaDictionary { + /// Initialise a [MetaDictionary]. + /// Sets up relevant default dictionaries for basic blocks. + fn default() -> Self { + let mut result = Self { + dictblocks: Vec::new(), + dicts: Vec::new(), + dict_candidates: LruCache::new(NonZeroUsize::new(150).unwrap()), + infix_dicts: HashMap::new(), + generic_dicts: Vec::new(), + size: 0, + }; + + result.add_dictionary(DictionaryType::Blob); + result.add_dictionary(DictionaryType::String); + + result + } +} + +impl MetaDictionary { + /// Construct a new and empty dictionary. + pub fn new() -> Self { + Self::default() + } + + /// Convert the local ID of a given dictionary to a global ID. + /// The function assumes that the given local id exists, and will crash + /// otherwise. It can safely be used for conversion of previously stored data. + fn local_to_global_unchecked(&self, dict: usize, local_id: usize) -> usize { + let lblock = local_id >> BLOCKSIZE; + let offset = local_id % (1 << BLOCKSIZE); + let gblock = self.dicts[dict].gblocks[lblock]; // Could fail if: (1) dictionary does not exist, or (2) block not used by dict + + (gblock << BLOCKSIZE) + offset + } + + /// Convert the local ID of a given dictionary to a global ID. + /// The function will check if the local id is supported by a previously + /// reserved address block, and will reserve a new block for this + /// dictionary otherwise. This is used when converting newly created ids. + fn local_to_global(&mut self, dict: usize, local_id: usize) -> usize { + let lblock = local_id >> BLOCKSIZE; + let offset = local_id % (1 << BLOCKSIZE); + let gblock = self.allocate_block(dict, lblock); + + (gblock << BLOCKSIZE) + offset + } + + /// Find a global block that is allocated for the given dictionary and local block. If not + /// allocated yet, a new block is reserved for this purpose. Allocation tries to preserve relative + /// order and distance, and to keep some distance from other dictionary's blocks. 
+ fn allocate_block(&mut self, dict: usize, local_block: usize) -> usize { + if self.dicts[dict].gblocks.len() <= local_block { + // make space for block records up to required length + self.dicts[dict].gblocks.resize(local_block + 1, usize::MAX); + } + if self.dicts[dict].gblocks[local_block] == usize::MAX { + // allocate necessary new block + let mut btl_index = local_block + 1; // index of first allocated block to the left, +1 (so 0 means "no block to left") + while btl_index > 0 && self.dicts[dict].gblocks[btl_index - 1] == usize::MAX { + btl_index -= 1; + } + + let mut new_block = if btl_index > 0 { + // extrapolate where global block should be relative to last allocated local block + self.dicts[dict].gblocks[btl_index - 1] + btl_index - 1 + } else { + // TODO determine "good" initial block for this dictionary + 0 + }; + // Find first empty block right of the chosen new block + while new_block < self.dictblocks.len() && self.dictblocks[new_block].1 != usize::MAX { + new_block += 1; + } + if new_block >= self.dictblocks.len() { + self.dictblocks + .resize(new_block + 1, (usize::MAX, usize::MAX)); + } + self.dicts[dict].gblocks[local_block] = new_block; + self.dictblocks[new_block] = (dict, local_block); + new_block + } else { + self.dicts[dict].gblocks[local_block] + } + } + + /// Creates and adds a new (sub)dictionary of the given type. + /// No blocks are allocated yet. It is not checked if a similar dictionary is already there. + fn add_dictionary(&mut self, dt: DictionaryType) { + let dict: Box; + match dt { + DictionaryType::String => { + dict = Box::new(HashMapDictionary::new()); + self.generic_dicts.push(self.dicts.len()); + } + DictionaryType::Blob => { + dict = Box::new(HashMapDictionary::new()); + self.generic_dicts.push(self.dicts.len()); + } + DictionaryType::Infix { + ref prefix, + ref suffix, + } => { + dict = Box::new(InfixDictionary::new(prefix.to_string(), suffix.to_string())); + self.infix_dicts.insert( + StringPair::new(prefix.to_string(), suffix.to_string()), + self.dicts.len(), + ); + } + } + let dr = DictRecord { + dict, + dict_type: dt, + gblocks: Vec::new(), + }; + self.dicts.push(dr); + } + + #[inline(always)] + fn add_dictionary_string_inline(&mut self, ds: DictionaryString) -> AddResult { + let mut best_dict_idx = usize::MAX; + + // Look up new entry in all applicable dictionaries. + let mut d_it = DictIterator::new(); + let mut dict_idx: usize; + while { + dict_idx = d_it.next(&ds, self); + dict_idx + } != usize::MAX + { + if best_dict_idx == usize::MAX { + best_dict_idx = dict_idx; + } + if let Some(idx) = self.dicts[dict_idx] + .dict + .fetch_id_for_dictionary_string(&ds) + { + if idx != super::KNOWN_ID_MARK { + return AddResult::Known(self.local_to_global_unchecked(dict_idx, idx)); + } // else: marked, continue search for real id + } else if self.dicts[dict_idx].dict.has_marked() { + // neither found nor marked in marked dict -> give up search + break; + } + } + // Performance note: The remaining code is only executed once per unique string (i.e., typically much fewer times than the above). + assert!(best_dict_idx < self.dicts.len()); + + // Consider creating a new dictionary for the new entry: + if ds.infixable() { + if let DictionaryType::String = self.dicts[best_dict_idx].dict_type { + #[allow(trivial_casts)] + if let Some(count) = self + .dict_candidates + .get_mut((ds.prefix(), ds.suffix()).borrow() as &dyn StringPairKey) + { + *count += 1; + if *count > DICT_THRESHOLD { + // Performance note: The following code is very rarely executed. 
+ self.dict_candidates.pop(&StringPair::new( + ds.prefix().to_string(), + ds.suffix().to_string(), + )); + best_dict_idx = self.dicts.len(); + self.add_dictionary(DictionaryType::Infix { + prefix: ds.prefix().to_string(), + suffix: ds.suffix().to_string(), + }); + log::info!( + "Initialized new infix dictionary (#{}) for '{}...{}'.", + best_dict_idx, + ds.prefix(), + ds.suffix() + ); + // Mark previously added strings to enable one-shot misses when looking up elements: + if self.size < 50000 { + let mut i: usize = 0; + let mut c: usize = 0; + let min_len = ds.prefix().len() + ds.suffix().len(); // presumably lets us discard many strings more quickly + while let Some(string) = self.dicts[1].dict.get(i) { + i += 1; + if string.len() >= min_len + && string.starts_with(ds.prefix()) + && string.ends_with(ds.suffix()) + { + c += 1; + self.dicts[best_dict_idx].dict.mark_str(string.as_str()); + } + } + log::info!("Marked {} older strings of that type, iterating {} strings overall.",c,i); + } + } + } else { + self.dict_candidates.put( + StringPair::new(ds.prefix().to_string(), ds.suffix().to_string()), + 1, + ); + } + } + } + + // Add entry to preferred dictionary + self.size += 1; + let local_id = self.dicts[best_dict_idx] + .dict + .add_dictionary_string(ds) + .value(); + // Compute global id based on block and local id, possibly allocating new block in the process + AddResult::Fresh(self.local_to_global(best_dict_idx, local_id)) + } +} + +impl Dictionary for MetaDictionary { + fn add_string(&mut self, string: String) -> AddResult { + self.add_dictionary_string_inline(DictionaryString::from_string(string)) + } + + fn add_str(&mut self, string: &str) -> AddResult { + self.add_dictionary_string_inline(DictionaryString::new(string)) + } + + fn add_dictionary_string(&mut self, ds: DictionaryString) -> AddResult { + self.add_dictionary_string_inline(ds) + } + + fn fetch_id(&self, string: &str) -> Option { + let ds = DictionaryString::new(string); + + // Look up new entry in all applicable dictionaries. + let mut d_it = DictIterator::new(); + let mut dict_idx: usize; + while { + dict_idx = d_it.next(&ds, self); + dict_idx + } != usize::MAX + { + if let Some(idx) = self.dicts[dict_idx].dict.fetch_id(string) { + return Some(self.local_to_global_unchecked(dict_idx, idx)); + } + } + None + } + + fn get(&self, id: usize) -> Option { + let gblock = id >> BLOCKSIZE; + let offset = id % (1 << BLOCKSIZE); + if self.dictblocks.len() <= gblock || self.dictblocks[gblock] == (usize::MAX, usize::MAX) { + return None; + } + let (dict_id, lblock) = self.dictblocks[gblock]; + + self.dicts[dict_id].dict.get((lblock >> BLOCKSIZE) + offset) + } + + fn len(&self) -> usize { + let mut len = 0; + log::info!("Computing total meta dict length ..."); + for dr in self.dicts.iter() { + log::info!("+ {} entries in dict {:?}", dr.dict.len(), dr.dict_type); + len += dr.dict.len(); + } + log::info!("Total len {}", len); + len + } +} + +#[cfg(test)] +mod test { + use crate::dictionary::AddResult; + use crate::dictionary::Dictionary; + + use super::MetaDictionary; + use super::DICT_THRESHOLD; + use crate::dictionary::dictionary_string::LONG_STRING_THRESHOLD; + + /// Pads a string to make it longer than the threshold applied to distinguish blobs. 
+    fn long_string(s: &str) -> String {
+        "#".to_string().repeat(LONG_STRING_THRESHOLD + 1) + s
+    }
+
+    #[test]
+    fn add_and_get() {
+        let mut dict = MetaDictionary::default();
+
+        let res1 = dict.add_string("entry0".to_string());
+        let res2 = dict.add_string("entry1".to_string());
+        let res3 = dict.add_string("entry0".to_string());
+        let res4 = dict.add_string(long_string("long1"));
+        let res5 = dict.add_string("entry2".to_string());
+        let res6 = dict.add_string(long_string("long2"));
+        let res7 = dict.add_string(long_string("long1"));
+
+        let get1 = dict.get(res1.value());
+        let get2 = dict.get(res2.value());
+        let get4 = dict.get(res4.value());
+        let getnone1 = dict.get(res6.value() + 1); // unused but in an allocated block
+        let getnone2 = dict.get(1 << 30); // out of any allocated block
+
+        assert_eq!(res1, AddResult::Fresh(res1.value()));
+        assert_eq!(res2, AddResult::Fresh(res2.value()));
+        assert_eq!(res3, AddResult::Known(res1.value()));
+        assert_eq!(res4, AddResult::Fresh(res4.value()));
+        assert_eq!(res5, AddResult::Fresh(res5.value()));
+        assert_eq!(res6, AddResult::Fresh(res6.value()));
+        assert_eq!(res7, AddResult::Known(res4.value()));
+
+        assert_eq!(dict.fetch_id("entry0"), Some(res1.value()));
+        assert_eq!(dict.fetch_id("entry1"), Some(res2.value()));
+        assert_eq!(
+            dict.fetch_id(long_string("long1").as_str()),
+            Some(res4.value())
+        );
+
+        assert_eq!(get1.unwrap(), "entry0".to_string());
+        assert_eq!(get2.unwrap(), "entry1".to_string());
+        assert_eq!(get4.unwrap(), long_string("long1"));
+
+        assert_eq!(getnone1, None);
+        assert_eq!(getnone2, None);
+    }
+
+    #[test]
+    fn add_and_get_prefix() {
+        let mut dict = MetaDictionary::default();
+
+        let res1 = dict.add_string("entry0".to_string());
+
+        for i in 0..DICT_THRESHOLD + 2 {
+            dict.add_string(
+                "",
+            );
+            dict.add_string(
+                "\"".to_string()
+                    + i.to_string().as_str()
+                    + "\"^^",
+            );
+        }
+
+        let res2 = dict.add_string("".to_string());
+        let res3 =
+            dict.add_string("\"42.3\"^^".to_string());
+
+        let res1known = dict.add_string("entry0".to_string());
+        let res2known = dict.add_string("".to_string());
+        let res3known =
+            dict.add_string("\"42.3\"^^".to_string());
+
+        assert_eq!(res1, AddResult::Fresh(res1.value()));
+        assert_eq!(res2, AddResult::Fresh(res2.value()));
+        assert_eq!(res3, AddResult::Fresh(res3.value()));
+
+        assert_eq!(res1known, AddResult::Known(res1.value()));
+        assert_eq!(res2known, AddResult::Known(res2.value()));
+        assert_eq!(res3known, AddResult::Known(res3.value()));
+    }
+}
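A note on the id scheme used by the new meta dictionary above: a global id packs a global block number together with a within-block offset, which is what allocate_block and get juggle. The following standalone sketch (not part of the patch) only illustrates that arithmetic; the BLOCKSIZE value and the helper names are assumptions for illustration, the real constant and the local_to_global helpers live elsewhere in meta_dictionary.rs.

// Illustrative sketch, not patch code. BLOCKSIZE = 12 is an assumed value.
const BLOCKSIZE: usize = 12;

/// Combine a global block number and a within-block offset into a global id.
fn to_global_id(gblock: usize, offset: usize) -> usize {
    debug_assert!(offset < (1 << BLOCKSIZE));
    (gblock << BLOCKSIZE) + offset
}

/// Split a global id back into block number and offset, mirroring `get` above.
fn from_global_id(id: usize) -> (usize, usize) {
    (id >> BLOCKSIZE, id % (1 << BLOCKSIZE))
}

fn main() {
    let id = to_global_id(3, 42);
    assert_eq!(from_global_id(id), (3, 42));
    println!("global id {id} decodes to block 3, offset 42");
}

Each sub-dictionary hands out its own local ids, and the block table (dictblocks) records which sub-dictionary and local block own each global block, so decoding an id only costs the shift and modulo shown above.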
diff --git a/nemo-physical/src/dictionary/prefixed_string_dictionary.rs b/nemo-physical/src/dictionary/prefixed_string_dictionary.rs
index 1ebf1c968..f8f052d81 100644
--- a/nemo-physical/src/dictionary/prefixed_string_dictionary.rs
+++ b/nemo-physical/src/dictionary/prefixed_string_dictionary.rs
@@ -6,7 +6,9 @@ use std::{
     rc::{Rc, Weak},
 };
 
+use super::AddResult;
 use super::Dictionary;
+use super::DictionaryString;
 
 /// Represents a node, which is either a [TrieNode::Root], or some non-special [TrieNode::Node]
 enum TrieNode {
@@ -317,6 +319,13 @@ pub struct PrefixedStringDictionary {
     store: Rc<RefCell<TrieNode>>,
 }
 
+impl PrefixedStringDictionary {
+    /// Construct a new and empty dictionary.
+    pub fn new() -> Self {
+        Self::default()
+    }
+}
+
 impl Default for PrefixedStringDictionary {
     /// Initialise a Default Prefixedstringdictionary
     /// It contains the empty string as the first element (position 0)
@@ -335,50 +344,62 @@
 }
 
 impl Dictionary for PrefixedStringDictionary {
-    fn add(&mut self, entry: String) -> usize {
+    fn add_string(&mut self, entry: String) -> AddResult {
         log::trace!("add {entry:?} to {self:?}");
-        *self.mapping.entry(entry.clone()).or_insert_with(|| {
-            let prefixes: Vec<&str> = Prefixer::new(entry.as_str()).collect();
-            log::trace!("prefixes: {prefixes:?}");
-            let (real_prefixes, real_entry) = prefixes.split_at(prefixes.len() - 1);
-            log::trace!("reals: {real_prefixes:?}, {real_entry:?}");
-            let (mut cur_node, remaining_prefixes) =
-                TrieNode::find_last_match(self.store.to_owned(), real_prefixes);
-            log::trace!("cur_node: {cur_node:?}, remaining: {remaining_prefixes:?}");
-            for element in remaining_prefixes {
-                let new_node = Rc::new(RefCell::new(TrieNode::create_node(
+        match self.mapping.get(&entry) {
+            Some(idx) => AddResult::Known(*idx),
+            None => {
+                let prefixes: Vec<&str> = Prefixer::new(entry.as_str()).collect();
+                log::trace!("prefixes: {prefixes:?}");
+                let (real_prefixes, real_entry) = prefixes.split_at(prefixes.len() - 1);
+                log::trace!("reals: {real_prefixes:?}, {real_entry:?}");
+                let (mut cur_node, remaining_prefixes) =
+                    TrieNode::find_last_match(self.store.to_owned(), real_prefixes);
+                log::trace!("cur_node: {cur_node:?}, remaining: {remaining_prefixes:?}");
+                for element in remaining_prefixes {
+                    let new_node = Rc::new(RefCell::new(TrieNode::create_node(
+                        Rc::clone(&cur_node),
+                        element.to_string(),
+                    )));
+                    log::trace!("{element:?} ({remaining_prefixes:?}): new_node: {new_node:?}");
+                    cur_node
+                        .as_ref()
+                        .borrow_mut()
+                        .add_node(Rc::clone(&new_node));
+                    cur_node = Rc::clone(&new_node);
+                    log::trace!("{element:?} ({remaining_prefixes:?}): cur_node: {cur_node:?}");
+                }
+                let entry_string = Rc::new(real_entry[0].to_string());
+                log::trace!("entry_string: {entry_string:?}");
+                log::trace!(
+                    "pair: {:?}",
+                    TrieNodeStringPair(Rc::clone(&cur_node), Rc::clone(&entry_string))
+                );
+                let value = self.ordering.len();
+                self.ordering.push(TrieNodeStringPair(
                     Rc::clone(&cur_node),
-                    element.to_string(),
-                )));
-                log::trace!("{element:?} ({remaining_prefixes:?}): new_node: {new_node:?}");
-                cur_node
-                    .as_ref()
-                    .borrow_mut()
-                    .add_node(Rc::clone(&new_node));
-                cur_node = Rc::clone(&new_node);
-                log::trace!("{element:?} ({remaining_prefixes:?}): cur_node: {cur_node:?}");
+                    Rc::clone(&entry_string),
+                ));
+                log::trace!("ordering: {:?}, value: {value:?}", self.ordering);
+                self.mapping.insert(entry.clone(), value);
+                AddResult::Fresh(value)
             }
-            let entry_string = Rc::new(real_entry[0].to_string());
-            log::trace!("entry_string: {entry_string:?}");
-            log::trace!(
-                "pair: {:?}",
-                TrieNodeStringPair(Rc::clone(&cur_node), Rc::clone(&entry_string))
-            );
-            let value = self.ordering.len();
-            self.ordering.push(TrieNodeStringPair(
-                Rc::clone(&cur_node),
-                Rc::clone(&entry_string),
-            ));
-            log::trace!("ordering: {:?}, value: {value:?}", self.ordering);
-            value
-        })
+        }
     }
 
-    fn index_of(&self, entry: &str) -> Option<usize> {
+    fn add_str(&mut self, string: &str) -> AddResult {
+        self.add_string(string.to_string())
+    }
+
+    fn add_dictionary_string(&mut self, ds: DictionaryString) -> AddResult {
+        self.add_str(ds.as_str())
+    }
+
+    fn fetch_id(&self, entry: &str) -> Option<usize> {
         self.mapping.get(&entry.to_string()).cloned()
     }
-    fn entry(&self, index: usize) -> Option<String> {
+    fn get(&self, index: usize) -> Option<String> {
         if index < self.ordering.len() {
             Some(format!(
                 "{}{}",
@@ -393,10 +414,6 @@ impl Dictionary for PrefixedStringDictionary {
     fn len(&self) -> usize {
         self.ordering.len()
     }
-
-    fn is_empty(&self) -> bool {
-        self.len() == 0
-    }
 }
 
 /// The [Prefixer] allows to split a given [&str] into its prefixes.
@@ -436,6 +453,7 @@ impl<'a> Iterator for Prefixer<'a> {
 mod test {
     use std::borrow::Borrow;
 
+    use crate::dictionary::AddResult;
     use crate::dictionary::Dictionary;
 
     use super::PrefixedStringDictionary;
@@ -458,7 +476,7 @@ mod test {
         ];
 
         for i in vec {
-            dict.add(i.to_string());
+            dict.add_string(i.to_string());
         }
         dict
     }
@@ -466,41 +484,51 @@ mod test {
     #[test]
    fn empty_str() {
         let mut dict = PrefixedStringDictionary::default();
-        dict.add("".to_string());
-        assert_eq!(dict.entry(0), Some("".to_string()));
+        dict.add_string("".to_string());
+        assert_eq!(dict.get(0), Some("".to_string()));
         let mut dict = create_dict();
-        dict.add("".to_string());
-        assert_eq!(dict.entry(0), Some("".to_string()));
-        assert_eq!(dict.entry(8), None);
+        dict.add_string("".to_string());
+        assert_eq!(dict.get(0), Some("".to_string()));
+        assert_eq!(dict.get(8), None);
     }
 
     #[test]
     fn entry() {
         let dict = create_dict();
-        assert_eq!(dict.entry(1), Some("a".to_string()));
-        assert_eq!(dict.entry(2), Some("b".to_string()));
-        assert_eq!(dict.entry(3), Some("c".to_string()));
-        assert_eq!(dict.entry(4), Some("Position 3".to_string()));
-        assert_eq!(dict.entry(5), Some("Position 4".to_string()));
-        assert_eq!(dict.entry(6), Some("Position 5".to_string()));
-        assert_eq!(dict.entry(7), None);
-        assert_eq!(dict.entry(4), Some("Position 3".to_string()));
+        assert_eq!(dict.get(1), Some("a".to_string()));
+        assert_eq!(dict.get(2), Some("b".to_string()));
+        assert_eq!(dict.get(3), Some("c".to_string()));
+        assert_eq!(dict.get(4), Some("Position 3".to_string()));
+        assert_eq!(dict.get(5), Some("Position 4".to_string()));
+        assert_eq!(dict.get(6), Some("Position 5".to_string()));
+        assert_eq!(dict.get(7), None);
+        assert_eq!(dict.get(4), Some("Position 3".to_string()));
     }
 
     #[test]
     fn index_of() {
         let dict = create_dict();
-        assert_eq!(dict.index_of("a".to_string().borrow()), Some(1));
-        assert_eq!(dict.index_of("b".to_string().borrow()), Some(2));
-        assert_eq!(dict.index_of("c".to_string().borrow()), Some(3));
-        assert_eq!(dict.index_of("Position 3".to_string().borrow()), Some(4));
-        assert_eq!(dict.index_of("Position 4".to_string().borrow()), Some(5));
-        assert_eq!(dict.index_of("Position 5".to_string().borrow()), Some(6));
-        assert_eq!(dict.index_of("d".to_string().borrow()), None);
-        assert_eq!(dict.index_of("Pos".to_string().borrow()), None);
-        assert_eq!(dict.index_of("Pos"), None);
-        assert_eq!(dict.index_of("b"), Some(2));
+        assert_eq!(dict.fetch_id("a".to_string().borrow()), Some(1));
+        assert_eq!(dict.fetch_id("b".to_string().borrow()), Some(2));
+        assert_eq!(dict.fetch_id("c".to_string().borrow()), Some(3));
+        assert_eq!(dict.fetch_id("Position 3".to_string().borrow()), Some(4));
+        assert_eq!(dict.fetch_id("Position 4".to_string().borrow()), Some(5));
+        assert_eq!(dict.fetch_id("Position 5".to_string().borrow()), Some(6));
+        assert_eq!(dict.fetch_id("d".to_string().borrow()), None);
+        assert_eq!(dict.fetch_id("Pos".to_string().borrow()), None);
+        assert_eq!(dict.fetch_id("Pos"), None);
+        assert_eq!(dict.fetch_id("b"), Some(2));
+    }
+
+    #[test]
+    fn add() {
+        let mut dict = create_dict();
+        assert_eq!(dict.add_string("a".to_string()), AddResult::Known(1));
+        assert_eq!(
+            dict.add_string("new value".to_string()),
+            AddResult::Fresh(7)
+        );
     }
 
     #[test]
@@ -508,11 +536,11 @@ mod test {
         let mut dict = create_dict();
         // no prefixes, so no children
         assert!(dict.store.as_ref().borrow().children().is_empty());
-        dict.add("https://wikidata.org/entity/Q42".to_string());
+        dict.add_string("https://wikidata.org/entity/Q42".to_string());
         // now we need some children
         assert!(!dict.store.as_ref().borrow().children().is_empty());
         assert_eq!(
-            dict.entry(7),
+            dict.get(7),
             Some("https://wikidata.org/entity/Q42".to_string())
         );
     }
@@ -522,11 +550,11 @@ mod test {
         let mut dict = create_dict();
         // no prefixes, so no children
         assert!(dict.store.as_ref().borrow().children().is_empty());
-        dict.add("https://wikidata.org/entity/Q42".to_string());
+        dict.add_string("https://wikidata.org/entity/Q42".to_string());
         // now we need some children
         assert!(!dict.store.as_ref().borrow().children().is_empty());
         assert_eq!(
-            dict.entry(7),
+            dict.get(7),
             Some("https://wikidata.org/entity/Q42".to_string())
         );
         drop(dict);
@@ -541,16 +569,16 @@
         ];
 
         for (i, str) in vec.iter().enumerate() {
-            assert_eq!(dict.add(str.to_string()), i + 1);
+            assert_eq!(dict.add_string(str.to_string()).value(), i + 1);
         }
 
         // duplicates
         for (i, str) in vec.iter().enumerate() {
-            assert_eq!(dict.add(str.to_string()), i + 1);
+            assert_eq!(dict.add_string(str.to_string()).value(), i + 1);
         }
 
         for (id, result) in vec.iter().enumerate() {
-            assert_eq!(dict.entry(id + 1).unwrap(), result.to_string());
-            assert_eq!(dict.index_of(result), Some(id + 1));
+            assert_eq!(dict.get(id + 1).unwrap(), result.to_string());
+            assert_eq!(dict.fetch_id(result), Some(id + 1));
         }
     }
 }
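The reworked trait shown above replaces the bare usize returned by the old add with an AddResult that distinguishes fresh from known entries. A minimal usage sketch (not part of the patch), using the PrefixedStringDictionary::new constructor added here and the import paths that the tests use:

use nemo_physical::dictionary::{AddResult, Dictionary, PrefixedStringDictionary};

fn main() {
    let mut dict = PrefixedStringDictionary::new();

    let first = dict.add_string("https://wikidata.org/entity/Q42".to_string());
    let again = dict.add_string("https://wikidata.org/entity/Q42".to_string());

    // A first insertion reports Fresh, a repetition reports Known with the same id;
    // value() collapses both cases to the plain numeric id.
    assert_eq!(first, AddResult::Fresh(first.value()));
    assert_eq!(again, AddResult::Known(first.value()));

    let id = first.value();
    assert_eq!(dict.fetch_id("https://wikidata.org/entity/Q42"), Some(id));
    assert_eq!(
        dict.get(id),
        Some("https://wikidata.org/entity/Q42".to_string())
    );
}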
diff --git a/nemo-physical/src/dictionary/string_dictionary.rs b/nemo-physical/src/dictionary/string_dictionary.rs
index 1092b1d74..379b245cf 100644
--- a/nemo-physical/src/dictionary/string_dictionary.rs
+++ b/nemo-physical/src/dictionary/string_dictionary.rs
@@ -1,4 +1,6 @@
+use super::AddResult;
 use super::Dictionary;
+use super::DictionaryString;
 use std::collections::HashMap;
 use std::rc::Rc;
 
@@ -9,24 +11,39 @@ pub struct StringDictionary {
     mapping: HashMap<Rc<String>, usize>,
 }
 
+impl StringDictionary {
+    /// Construct a new and empty dictionary.
+    pub fn new() -> Self {
+        Self::default()
+    }
+}
+
 impl Dictionary for StringDictionary {
-    fn add(&mut self, entry: String) -> usize {
+    fn add_string(&mut self, entry: String) -> AddResult {
         match self.mapping.get(&entry) {
-            Some(idx) => *idx,
+            Some(idx) => AddResult::Known(*idx),
             None => {
                 let len = self.store.len();
                 self.store.push(Rc::new(entry));
                 self.mapping.insert(self.store[len].clone(), len);
-                len
+                AddResult::Fresh(len)
             }
         }
     }
 
-    fn index_of(&self, entry: &str) -> Option<usize> {
+    fn add_str(&mut self, string: &str) -> AddResult {
+        self.add_string(string.to_string())
+    }
+
+    fn add_dictionary_string(&mut self, ds: DictionaryString) -> AddResult {
+        self.add_str(ds.as_str())
+    }
+
+    fn fetch_id(&self, entry: &str) -> Option<usize> {
         self.mapping.get(&entry.to_string()).copied()
     }
 
-    fn entry(&self, index: usize) -> Option<String> {
+    fn get(&self, index: usize) -> Option<String> {
         self.store
             .get(index)
             .map(|entry| -> String { Rc::clone(entry).to_string() })
@@ -35,16 +52,13 @@ impl Dictionary for StringDictionary {
     fn len(&self) -> usize {
         self.mapping.len()
     }
-
-    fn is_empty(&self) -> bool {
-        self.mapping.is_empty()
-    }
 }
 
 #[cfg(test)]
 mod test {
     use std::borrow::Borrow;
 
+    use crate::dictionary::AddResult;
     use crate::dictionary::Dictionary;
 
     use super::StringDictionary;
@@ -65,7 +79,7 @@
         ];
 
         for i in vec {
-            dict.add(i.to_string());
+            dict.add_string(i.to_string());
         }
         dict
     }
@@ -73,28 +87,38 @@
     #[test]
     fn entry() {
         let dict = create_dict();
-        assert_eq!(dict.entry(0), Some("a".to_string()));
-        assert_eq!(dict.entry(1), Some("b".to_string()));
-        assert_eq!(dict.entry(2), Some("c".to_string()));
-        assert_eq!(dict.entry(3), Some("Position 3".to_string()));
-        assert_eq!(dict.entry(4), Some("Position 4".to_string()));
-        assert_eq!(dict.entry(5), Some("Position 5".to_string()));
-        assert_eq!(dict.entry(6), None);
-        assert_eq!(dict.entry(3), Some("Position 3".to_string()));
+        assert_eq!(dict.get(0), Some("a".to_string()));
+        assert_eq!(dict.get(1), Some("b".to_string()));
+        assert_eq!(dict.get(2), Some("c".to_string()));
+        assert_eq!(dict.get(3), Some("Position 3".to_string()));
+        assert_eq!(dict.get(4), Some("Position 4".to_string()));
+        assert_eq!(dict.get(5), Some("Position 5".to_string()));
+        assert_eq!(dict.get(6), None);
+        assert_eq!(dict.get(3), Some("Position 3".to_string()));
     }
 
     #[test]
     fn index_of() {
         let dict = create_dict();
-        assert_eq!(dict.index_of("a".to_string().borrow()), Some(0));
-        assert_eq!(dict.index_of("b".to_string().borrow()), Some(1));
-        assert_eq!(dict.index_of("c".to_string().borrow()), Some(2));
-        assert_eq!(dict.index_of("Position 3".to_string().borrow()), Some(3));
-        assert_eq!(dict.index_of("Position 4".to_string().borrow()), Some(4));
-        assert_eq!(dict.index_of("Position 5".to_string().borrow()), Some(5));
-        assert_eq!(dict.index_of("d".to_string().borrow()), None);
-        assert_eq!(dict.index_of("Pos".to_string().borrow()), None);
-        assert_eq!(dict.index_of("Pos"), None);
-        assert_eq!(dict.index_of("b"), Some(1));
+        assert_eq!(dict.fetch_id("a".to_string().borrow()), Some(0));
+        assert_eq!(dict.fetch_id("b".to_string().borrow()), Some(1));
+        assert_eq!(dict.fetch_id("c".to_string().borrow()), Some(2));
+        assert_eq!(dict.fetch_id("Position 3".to_string().borrow()), Some(3));
+        assert_eq!(dict.fetch_id("Position 4".to_string().borrow()), Some(4));
+        assert_eq!(dict.fetch_id("Position 5".to_string().borrow()), Some(5));
+        assert_eq!(dict.fetch_id("d".to_string().borrow()), None);
+        assert_eq!(dict.fetch_id("Pos".to_string().borrow()), None);
+        assert_eq!(dict.fetch_id("Pos"), None);
+        assert_eq!(dict.fetch_id("b"), Some(1));
+    }
+
+    #[test]
+    fn add() {
+        let mut dict = create_dict();
+        assert_eq!(dict.add_string("a".to_string()), AddResult::Known(0));
+        assert_eq!(
+            dict.add_string("new value".to_string()),
+            AddResult::Fresh(6)
+        );
     }
 }
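For callers, the renaming in this file and the previous one is mechanical: add becomes add_string (returning AddResult), index_of becomes fetch_id, and entry becomes get. A small before/after sketch (illustrative only; the old calls are shown as comments because they no longer compile):

use nemo_physical::dictionary::{Dictionary, StringDictionary};

fn main() {
    let mut dict = StringDictionary::new();

    // Old trait, before this patch:
    //     let id = dict.add("example".to_string());   // bare usize
    //     let found = dict.index_of("example");       // Option<usize>
    //     let back = dict.entry(id);                  // Option<String>

    // New trait, after this patch:
    let id = dict.add_string("example".to_string()).value();
    assert_eq!(dict.fetch_id("example"), Some(id)); // replaces index_of
    assert_eq!(dict.get(id), Some("example".to_string())); // replaces entry
    assert_eq!(dict.len(), 1); // per-impl is_empty definitions are gone
}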
diff --git a/nemo-physical/src/dictionary/value_serializer.rs b/nemo-physical/src/dictionary/value_serializer.rs
index bc4314950..a8dbbb126 100644
--- a/nemo-physical/src/dictionary/value_serializer.rs
+++ b/nemo-physical/src/dictionary/value_serializer.rs
@@ -21,7 +21,7 @@ where
 {
     usize::try_from(constant)
         .ok()
-        .and_then(|constant| dict.entry(constant))
+        .and_then(|constant| dict.get(constant))
         .unwrap_or_else(|| format!("{NULL_PREFIX}{constant}"))
         .into()
 }
diff --git a/nemo-physical/src/management/database.rs b/nemo-physical/src/management/database.rs
index 18413bf40..f4164f82c 100644
--- a/nemo-physical/src/management/database.rs
+++ b/nemo-physical/src/management/database.rs
@@ -43,12 +43,8 @@ use super::{
     ByteSized, ExecutionPlan,
 };
 
-#[cfg(feature = "no-prefixed-string-dictionary")]
 /// Dictionary Implementation used in the current configuration
-pub type Dict = crate::dictionary::StringDictionary;
-#[cfg(not(feature = "no-prefixed-string-dictionary"))]
-/// Dictionary Implementation used in the current configuration
-pub type Dict = crate::dictionary::PrefixedStringDictionary;
+pub type Dict = crate::dictionary::hash_map_dictionary::HashMapDictionary;
 
 /// Type that represents a reordering of the columns of a table.
 /// It is given in form of a permutation which encodes the transformation
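Since database.rs now points the Dict alias at HashMapDictionary, downstream code that only names the alias and the Dictionary trait is unaffected by the swap. A minimal sketch under that assumption (HashMapDictionary itself is not shown in this patch):

use nemo_physical::dictionary::Dictionary;
use nemo_physical::management::database::Dict;

fn main() {
    // Dict used to be PrefixedStringDictionary or StringDictionary, selected by the
    // removed "no-prefixed-string-dictionary" feature; it is now HashMapDictionary.
    let mut dict = Dict::default();
    let id = dict.add_str("RoleGroup").value();
    assert_eq!(dict.get(id), Some("RoleGroup".to_string()));
    assert_eq!(dict.fetch_id("RoleGroup"), Some(id));
}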
diff --git a/nemo-physical/src/tabular/operations/triescan_project.rs b/nemo-physical/src/tabular/operations/triescan_project.rs
index e4034cb0c..a319e84f1 100644
--- a/nemo-physical/src/tabular/operations/triescan_project.rs
+++ b/nemo-physical/src/tabular/operations/triescan_project.rs
@@ -799,11 +799,27 @@
     #[test]
     fn spurious_tuples_in_reorder_bug() {
         let mut dict = Dict::default();
-        let x = dict.add("72".to_owned()).try_into().unwrap();
-        let a = dict.add("139".to_owned()).try_into().unwrap();
-        let b = dict.add("141".to_owned()).try_into().unwrap();
-        let u = dict.add("140".to_owned()).try_into().unwrap();
-        let v = dict.add("134".to_owned()).try_into().unwrap();
+        let x = dict.add_string("72".to_owned()).value().try_into().unwrap();
+        let a = dict
+            .add_string("139".to_owned())
+            .value()
+            .try_into()
+            .unwrap();
+        let b = dict
+            .add_string("141".to_owned())
+            .value()
+            .try_into()
+            .unwrap();
+        let u = dict
+            .add_string("140".to_owned())
+            .value()
+            .try_into()
+            .unwrap();
+        let v = dict
+            .add_string("134".to_owned())
+            .value()
+            .try_into()
+            .unwrap();
 
         let fst = vec![x];
         let snd = vec![a, b];
@@ -831,8 +847,9 @@
     #[test]
     fn spurious_tuples_in_reorder_mk2_bug() {
         let mut dict = Dict::default();
-        let mut intern =
-            |term: &str| StorageValueT::U64(dict.add(term.to_owned()).try_into().unwrap());
+        let mut intern = |term: &str| {
+            StorageValueT::U64(dict.add_string(term.to_owned()).value().try_into().unwrap())
+        };
 
         let a = intern("genid:cc18ce3a-be8a-3445-8b68-2027a2e1b1be");
         let b = intern("genid:0f18d187-7a4f-35c6-b645-c57ee51d277d");
@@ -876,8 +893,9 @@
     #[test]
     fn spurious_tuples_in_reorder_mk2_minimised_bug() {
         let mut dict = Dict::default();
-        let mut intern =
-            |term: &str| StorageValueT::U64(dict.add(term.to_owned()).try_into().unwrap());
+        let mut intern = |term: &str| {
+            StorageValueT::U64(dict.add_string(term.to_owned()).value().try_into().unwrap())
+        };
 
         let a = intern("genid:cc18ce3a-be8a-3445-8b68-2027a2e1b1be");
         let b = intern("genid:0f18d187-7a4f-35c6-b645-c57ee51d277d");
@@ -902,17 +920,57 @@
     fn spurious_tuples_in_reorder_bug2() {
         let mut dict = Dict::default();
 
-        let rg = dict.add("RoleGroup".to_owned()).try_into().unwrap();
-        let a = dict.add("4_1_6".to_owned()).try_into().unwrap();
-        let b = dict.add("4_1_21".to_owned()).try_into().unwrap();
-        let c = dict.add("4_1_22".to_owned()).try_into().unwrap();
-
-        let u = dict.add("32_1_58".to_owned()).try_into().unwrap();
-        let v = dict.add("32_1_72".to_owned()).try_into().unwrap();
-        let w = dict.add("32_1_81".to_owned()).try_into().unwrap();
-        let x = dict.add("32_1_74".to_owned()).try_into().unwrap();
-        let y = dict.add("32_1_83".to_owned()).try_into().unwrap();
-        let z = dict.add("32_1_60".to_owned()).try_into().unwrap();
+        let rg = dict
+            .add_string("RoleGroup".to_owned())
+            .value()
+            .try_into()
+            .unwrap();
+        let a = dict
+            .add_string("4_1_6".to_owned())
+            .value()
+            .try_into()
+            .unwrap();
+        let b = dict
+            .add_string("4_1_21".to_owned())
+            .value()
+            .try_into()
+            .unwrap();
+        let c = dict
+            .add_string("4_1_22".to_owned())
+            .value()
+            .try_into()
+            .unwrap();
+
+        let u = dict
+            .add_string("32_1_58".to_owned())
+            .value()
+            .try_into()
+            .unwrap();
+        let v = dict
+            .add_string("32_1_72".to_owned())
+            .value()
+            .try_into()
+            .unwrap();
+        let w = dict
+            .add_string("32_1_81".to_owned())
+            .value()
+            .try_into()
+            .unwrap();
+        let x = dict
+            .add_string("32_1_74".to_owned())
+            .value()
+            .try_into()
+            .unwrap();
+        let y = dict
+            .add_string("32_1_83".to_owned())
+            .value()
+            .try_into()
+            .unwrap();
+        let z = dict
+            .add_string("32_1_60".to_owned())
+            .value()
+            .try_into()
+            .unwrap();
 
         let fst = vec![a, b, c];
         let snd = vec![rg, rg, rg];
diff --git a/nemo/Cargo.toml b/nemo/Cargo.toml
index f27880e65..9311f5119 100644
--- a/nemo/Cargo.toml
+++ b/nemo/Cargo.toml
@@ -15,7 +15,6 @@ default = ["timing"]
 # Enables the "js" feature of the "getrandom" crate
 # This feature cannot be used together with the "timing" feature, because the "howlong" crate does not support web assembly environments
 js = ["getrandom/js"]
-no-prefixed-string-dictionary = ["nemo-physical/no-prefixed-string-dictionary"]
 timing = ["nemo-physical/timing"]
 
 [dependencies]
diff --git a/nemo/src/builder_proxy.rs b/nemo/src/builder_proxy.rs
index ccd648c4a..21fc11090 100644
--- a/nemo/src/builder_proxy.rs
+++ b/nemo/src/builder_proxy.rs
@@ -394,8 +394,7 @@ where
 #[cfg(test)]
 mod test {
     use nemo_physical::{
-        datatypes::storage_value::VecT,
-        dictionary::{Dictionary, PrefixedStringDictionary},
+        datatypes::storage_value::VecT, dictionary::Dictionary, management::database::Dict,
     };
     use test_log::test;
@@ -557,7 +556,7 @@
             })
             .unwrap();
 
-        let mut dict = std::cell::RefCell::new(PrefixedStringDictionary::default());
+        let mut dict = std::cell::RefCell::new(Dict::default());
         let physical_builder_for_any_column = PhysicalStringColumnBuilderProxy::new(&dict);
         let physical_builder_for_string_column = PhysicalStringColumnBuilderProxy::new(&dict);
@@ -640,11 +639,11 @@
         let any_result: Vec<String> = any_result_indices
             .into_iter()
-            .map(|idx| dict.get_mut().entry(idx.try_into().unwrap()).unwrap())
+            .map(|idx| dict.get_mut().get(idx.try_into().unwrap()).unwrap())
             .collect();
         let string_result: Vec<String> = string_result_indices
             .into_iter()
-            .map(|idx| dict.get_mut().entry(idx.try_into().unwrap()).unwrap())
+            .map(|idx| dict.get_mut().get(idx.try_into().unwrap()).unwrap())
             .collect();
         let VecT::I64(integer_result) = phys_enum_for_integer.finalize() else {
             unreachable!()
diff --git a/nemo/src/io/formats/dsv.rs b/nemo/src/io/formats/dsv.rs
index 8ba6cfb5f..37ddb427b 100644
--- a/nemo/src/io/formats/dsv.rs
+++ b/nemo/src/io/formats/dsv.rs
@@ -70,7 +70,7 @@
 //! # ],
 //! # );
 //! # let table_reader:Box<dyn TableReader> = Box::new(csv_reader);
-//! # let mut dict = RefCell::new(nemo_physical::dictionary::PrefixedStringDictionary::default());
+//! # let mut dict = RefCell::new(nemo_physical::management::database::Dict::default());
 //! let mut builder = vec![
 //!     PhysicalBuilderProxyEnum::String(PhysicalStringColumnBuilderProxy::new(&dict)),
 //!     PhysicalBuilderProxyEnum::I64(Default::default()),
@@ -260,7 +260,8 @@ mod test {
             data_value::{DataValueIteratorT, PhysicalString},
             storage_value::VecT,
         },
-        dictionary::{Dictionary, PrefixedStringDictionary},
+        dictionary::Dictionary,
+        management::database::Dict,
     };
 
     #[test]
@@ -274,7 +275,7 @@ Boston;United States;4628910
             .delimiter(b';')
             .from_reader(data.as_bytes());
 
-        let mut dict = std::cell::RefCell::new(PrefixedStringDictionary::default());
+        let mut dict = std::cell::RefCell::new(Dict::default());
         let csvreader = DSVReader::dsv(
             ResourceProviders::empty(),
             &DsvFile::csv_file(
@@ -312,7 +313,7 @@ Boston;United States;4628910
         let dvit = DataValueIteratorT::String(Box::new(x.into_iter().map(|vt| {
             dict.get_mut()
-                .entry(usize::try_from(u64::try_from(vt.get(0).unwrap()).unwrap()).unwrap())
+                .get(usize::try_from(u64::try_from(vt.get(0).unwrap()).unwrap()).unwrap())
                 .map(PhysicalString::from)
                 .unwrap()
         })));
@@ -367,7 +368,7 @@ The next 2 columns are empty;;;789
             .delimiter(b';')
             .from_reader(data.as_bytes());
 
-        let mut dict = std::cell::RefCell::new(PrefixedStringDictionary::default());
+        let mut dict = std::cell::RefCell::new(Dict::default());
         let csvreader = DSVReader::dsv(
             ResourceProviders::empty(),
             &DsvFile::csv_file(
@@ -417,7 +418,7 @@ The next 2 columns are empty;;;789
                 col0_idx
                     .iter()
                     .copied()
-                    .map(|idx| dict.get_mut().entry(idx.try_into().unwrap()).unwrap())
+                    .map(|idx| dict.get_mut().get(idx.try_into().unwrap()).unwrap())
                     .map(PhysicalString::from)
                     .collect::<Vec<_>>()
                     .into_iter(),
             );
@@ -426,7 +427,7 @@
                 col1_idx
                     .iter()
                     .copied()
-                    .map(|idx| dict.get_mut().entry(idx.try_into().unwrap()).unwrap())
+                    .map(|idx| dict.get_mut().get(idx.try_into().unwrap()).unwrap())
                     .map(PhysicalString::from)
                     .collect::<Vec<_>>()
                     .into_iter(),
             );
@@ -435,7 +436,7 @@
                 col2_idx
                     .iter()
                     .copied()
-                    .map(|idx| dict.get_mut().entry(idx.try_into().unwrap()).unwrap())
+                    .map(|idx| dict.get_mut().get(idx.try_into().unwrap()).unwrap())
                     .map(PhysicalString::from)
                     .collect::<Vec<_>>()
                     .into_iter(),
             );
@@ -474,7 +475,7 @@ node03;123;123;13;55;123;invalid
             .has_headers(false)
             .from_reader(data.as_bytes());
 
-        let dict = std::cell::RefCell::new(PrefixedStringDictionary::default());
+        let dict = std::cell::RefCell::new(Dict::default());
         let csvreader: DSVReader = DSVReader::dsv(
             ResourceProviders::empty(),
             &DsvFile::csv_file(
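The test changes in builder_proxy.rs and dsv.rs all follow one pattern: ids produced by the physical string builder proxies are resolved back to strings through the shared RefCell around the Dict, now via get instead of entry. A small helper sketch of that pattern (names are illustrative, not part of the patch):

use std::cell::RefCell;

use nemo_physical::dictionary::Dictionary;
use nemo_physical::management::database::Dict;

// Turn a column of u64 ids back into the strings stored in the shared dictionary,
// mirroring the adjusted tests above.
fn ids_to_strings(dict: &RefCell<Dict>, ids: &[u64]) -> Vec<String> {
    ids.iter()
        .map(|&id| {
            dict.borrow_mut()
                .get(usize::try_from(id).expect("id fits into usize"))
                .expect("id was handed out by this dictionary")
        })
        .collect()
}

fn main() {
    let dict = RefCell::new(Dict::default());
    let ids: Vec<u64> = ["Boston", "United States"]
        .into_iter()
        .map(|s| dict.borrow_mut().add_str(s).value() as u64)
        .collect();
    assert_eq!(ids_to_strings(&dict, &ids), vec!["Boston", "United States"]);
}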
diff --git a/nemo/src/io/formats/rdf_triples.rs b/nemo/src/io/formats/rdf_triples.rs
index a10ec8479..30c1113d2 100644
--- a/nemo/src/io/formats/rdf_triples.rs
+++ b/nemo/src/io/formats/rdf_triples.rs
@@ -218,7 +218,8 @@ mod test {
     use nemo_physical::{
         builder_proxy::{PhysicalColumnBuilderProxy, PhysicalStringColumnBuilderProxy},
         datatypes::data_value::{DataValueIteratorT, PhysicalString},
-        dictionary::{Dictionary, PrefixedStringDictionary},
+        dictionary::Dictionary,
+        management::database::Dict,
     };
     use rio_turtle::TurtleParser;
     use test_log::test;
@@ -235,7 +236,7 @@ mod test {
             _:subject2 "object2" .
         "#.as_bytes();
 
-        let dict = RefCell::new(PrefixedStringDictionary::default());
+        let dict = RefCell::new(Dict::default());
         let mut builders = vec![
             PhysicalBuilderProxyEnum::String(PhysicalStringColumnBuilderProxy::new(&dict)),
             PhysicalBuilderProxyEnum::String(PhysicalStringColumnBuilderProxy::new(&dict)),
@@ -264,7 +265,7 @@ mod test {
                     .get(idx)
                     .and_then(|value| value.try_into().ok())
                     .and_then(|u64: u64| usize::try_from(u64).ok())
-                    .and_then(|usize| dict.borrow_mut().entry(usize))
+                    .and_then(|usize| dict.borrow_mut().get(usize))
                     .unwrap()
             })
             .map(PhysicalString::from)
@@ -302,7 +303,7 @@ mod test {
         "#
         .as_bytes();
 
-        let dict = RefCell::new(PrefixedStringDictionary::default());
+        let dict = RefCell::new(Dict::default());
         let mut builders = vec![
             PhysicalBuilderProxyEnum::String(PhysicalStringColumnBuilderProxy::new(&dict)),
             PhysicalBuilderProxyEnum::String(PhysicalStringColumnBuilderProxy::new(&dict)),