Skip to content

Commit

Permalink
Add deserialize methods for rkyv
Browse files Browse the repository at this point in the history
  • Loading branch information
ChillFish8 committed Nov 2, 2022
1 parent ed6ce7b commit b716efb
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 17 deletions.
3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,9 @@ exclude = ["data/*"]
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
rkyv = { version = "0.7.31", features = ["alloc", "hashbrown"] }
rkyv = { version = "0.7.31", features = ["alloc", "hashbrown", "validation"] }

bytecheck = "0.6.9"
memmap2 = "0.5.2"
hashbrown = "0.12.1"
ahash = "0.7.6"
Expand Down
11 changes: 7 additions & 4 deletions src/symspell.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@ use std::io::{BufRead, BufReader};
use std::path::Path;
use std::{cmp, i64};

use bytecheck::CheckBytes;
use deunicode::deunicode;
use hashbrown::{HashMap, HashSet};
use rkyv::{Archive, Deserialize, Serialize};

use crate::composition::Composition;
use crate::edit_distance;
use crate::suggestion::Suggestion;
use crate::wordmaps::{MemBackedWordMap, WordRepr};
use crate::wordmaps::{WordMap, WordRepr};

#[derive(Eq, PartialEq, Debug)]
pub enum Verbosity {
Expand Down Expand Up @@ -63,14 +65,16 @@ mod ascii {
const WORD_COUNT: i64 = 1_024_908_267_229;
const PREFIX_LENGTH: i64 = 7;

#[derive(Archive, Deserialize, Serialize)]
#[archive_attr(derive(CheckBytes))]
pub struct SymSpell {
/// Maximum edit distance for doing lookups.
max_dictionary_edit_distance: i64,
/// The minimum frequency count for dictionary words to be considered correct spellings.
count_threshold: i64,
max_length: usize,
words: HashMap<String, i64>,
pub deletes: MemBackedWordMap,
pub deletes: WordMap,
}

impl Default for SymSpell {
Expand Down Expand Up @@ -189,7 +193,7 @@ impl SymSpell {
}
}

self.deletes = MemBackedWordMap::with_dictionary(deletes);
self.deletes = WordMap::with_dictionary(deletes);
}

/// Find suggested spellings for a given input word, using the maximum
Expand Down Expand Up @@ -844,7 +848,6 @@ fn edits(
}
}


#[cfg(test)]
mod tests {
use super::*;
Expand Down
28 changes: 16 additions & 12 deletions src/wordmaps.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,18 @@ use std::hash::{Hash, Hasher};
use std::iter::FromIterator;
use std::ops::Deref;

use bytecheck::CheckBytes;
use hashbrown::HashMap;
use rkyv::{Archive, Serialize};
use rkyv::{Archive, Deserialize, Serialize};

/// A 32 bit sized pointer to a given word.
///
/// This is used so much we want to reduce the size of our pointers as much as possible.
/// 32 bits is really all that we require, as any array longer than a 32-bit length won't
/// fit in memory or be able to be used regardless.
#[derive(Archive, Serialize, Copy, Clone)]
#[derive(Archive, Deserialize, Serialize, Copy, Clone)]
#[archive(compare(PartialEq))]
#[archive_attr(derive(Debug))]
#[archive_attr(derive(Debug, CheckBytes))]
pub struct WordRef(u32);

impl Debug for WordRef {
Expand Down Expand Up @@ -44,9 +45,9 @@ impl WordRepr for rkyv::Archived<Word> {
/// as_str which performs a checked transmute.
///
/// The Archived variant of this type only implements `as_str` and derives Debug and EQ.
#[derive(Archive, Serialize, Clone, Default)]
#[derive(Archive, Deserialize, Serialize, Clone, Default)]
#[archive(compare(PartialEq))]
#[archive_attr(derive(Debug))]
#[archive_attr(derive(Debug, CheckBytes))]
pub struct Word(Box<[u8]>);

impl WordRepr for Word {
Expand Down Expand Up @@ -135,19 +136,20 @@ impl Hash for Word {
/// `u64 (hash of the string) -> Box<[u32]>` and then heavily de-duplicates words, which
/// are then inserted as the `word_references`. This is just an array containing a `Word`; each
/// `WordRef` is just an index into this array, used to retrieve words.
#[derive(Archive, Serialize, Default)]
pub struct MemBackedWordMap {
#[derive(Archive, Deserialize, Serialize, Default)]
#[archive_attr(derive(CheckBytes))]
pub struct WordMap {
data: HashMap<u64, Box<[WordRef]>>,
word_references: Box<[Word]>,
}

impl Debug for MemBackedWordMap {
impl Debug for WordMap {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
write!(f, "{:?}", self.data)
}
}

impl MemBackedWordMap {
impl WordMap {
/// Gets the word which is located at the given `WordRef` pointer.
#[inline]
pub fn word_at(&self, word_ref: &WordRef) -> &Word {
Expand All @@ -162,7 +164,9 @@ impl MemBackedWordMap {
}

/// Creates a new `WordMap` from a given dictionary.
pub fn with_dictionary<K: AsRef<str>>(mut dictionary: HashMap<K, Vec<String>>) -> Self {
pub fn with_dictionary<K: AsRef<str>>(
mut dictionary: HashMap<K, Vec<String>>,
) -> Self {
let (ref_words, lookup) = {
let mut lookup_index: HashMap<String, u32> = HashMap::new();
let mut ref_words = Vec::new();
Expand Down Expand Up @@ -226,7 +230,7 @@ mod tests {
#[test]
fn test_basic_map() {
let words = get_words();
let map = MemBackedWordMap::with_dictionary(words);
let map = WordMap::with_dictionary(words);

let word = map.get("hello");
assert!(word.is_some());
Expand All @@ -238,7 +242,7 @@ mod tests {
#[test]
fn bench_basic_map() {
let words = get_words();
let map = MemBackedWordMap::with_dictionary(words);
let map = WordMap::with_dictionary(words);

let start = std::time::Instant::now();
for _ in 0..1_000 {
Expand Down

0 comments on commit b716efb

Please sign in to comment.