Skip to content

Commit

Permalink
Vendor the en_US dictionary
Browse files Browse the repository at this point in the history
Making the dictionary lookup infallible will be a little friendlier for
anyone trying out the examples or benchmarks. We will eventually want to
use a real dictionary for integration tests, and we can drop the xdg
dependency. The file size is around half a MB, so I don't feel too bad
about pulling this in.
  • Loading branch information
the-mikedavis committed Aug 26, 2023
1 parent c954ebc commit 97e7d29
Show file tree
Hide file tree
Showing 12 changed files with 49,593 additions and 82 deletions.
9 changes: 0 additions & 9 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 0 additions & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,3 @@ rust-version = "1.60"
[dependencies]
# TODO: investigate using smartstring to cut down on allocations since
# there are plenty of small strings.

[dev-dependencies]
xdg = "2.5"
7 changes: 0 additions & 7 deletions bench/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion bench/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,3 @@ rust-version = "1.66"
[dependencies]
spellbook = { path = "../" }
brunch = "0.5"
xdg = "2.5"
30 changes: 5 additions & 25 deletions bench/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,33 +3,13 @@ use std::hint::black_box;
use brunch::Bench;
use spellbook::Dictionary;

const EN_US_DIC: &str = include_str!("../../vendor/en_US/en_US.dic");
const EN_US_AFF: &str = include_str!("../../vendor/en_US/en_US.aff");

const SAMPLES: u32 = 500_000;

fn main() {
let base = xdg::BaseDirectories::new().expect("Could not determine XDG directories");
let (dic_path, aff_path) = match base.get_data_dirs().iter().find_map(|dir| {
let subdir = dir.join("hunspell");
if !subdir.is_dir() {
return None;
}

let dic = subdir.join("en_US.dic");
let aff = subdir.join("en_US.aff");
if dic.is_file() && aff.is_file() {
Some((dic, aff))
} else {
None
}
}) {
Some((dic, aff)) => (dic, aff),
None => {
eprintln!("Could not find the en_US dictionary");
std::process::exit(1);
}
};
let dic_text = std::fs::read_to_string(dic_path).unwrap();
let aff_text = std::fs::read_to_string(aff_path).unwrap();
let dict = Dictionary::compile(&aff_text, &dic_text).unwrap();
let dict = Dictionary::compile(EN_US_AFF, EN_US_DIC).unwrap();

eprintln!("Starting benchmarks...");
eprintln!();
Expand All @@ -39,7 +19,7 @@ fn main() {

// Compilation
Bench::new("Compile en_US")
.run(|| Dictionary::compile(black_box(&aff_text), black_box(&dic_text))),
.run(|| Dictionary::compile(black_box(EN_US_AFF), black_box(EN_US_DIC))),
Bench::spacer(),

// Checking
Expand Down
43 changes: 7 additions & 36 deletions examples/check.rs
Original file line number Diff line number Diff line change
@@ -1,52 +1,23 @@
use std::time::Instant;

use spellbook::Dictionary;
use xdg::BaseDirectories;

const EN_US_DIC: &str = include_str!("../vendor/en_US/en_US.dic");
const EN_US_AFF: &str = include_str!("../vendor/en_US/en_US.aff");

fn main() {
let mut args = std::env::args().skip(1);
let arg1 = match args.next() {
let word = match args.next() {
Some(arg) => arg,
None => {
eprintln!("Usage: check [LANG] WORD");
std::process::exit(1);
}
};
let (lang, word) = match args.next() {
Some(arg2) => (arg1, arg2),
None => ("en_US".to_string(), arg1),
};

let base = BaseDirectories::new().expect("Could not determine XDG directories");
let (dic_path, aff_path) = match base.get_data_dirs().iter().find_map(|dir| {
let subdir = dir.join("hunspell");
if !subdir.is_dir() {
return None;
}

let dic = subdir.join(format!("{lang}.dic"));
let aff = subdir.join(format!("{lang}.aff"));
if dic.is_file() && aff.is_file() {
Some((dic, aff))
} else {
None
}
}) {
Some((dic, aff)) => (dic, aff),
None => {
eprintln!("Could not find the {lang} dictionary");
eprintln!("Usage: check WORD");
std::process::exit(1);
}
};
let dic_text = std::fs::read_to_string(dic_path).unwrap();
let aff_text = std::fs::read_to_string(aff_path).unwrap();

let now = Instant::now();
let dict = Dictionary::compile(&aff_text, &dic_text).unwrap();
println!(
"Compiled the {lang} dictionary in {}ms",
now.elapsed().as_millis()
);
let dict = Dictionary::compile(EN_US_AFF, EN_US_DIC).unwrap();
println!("Compiled the dictionary in {}ms", now.elapsed().as_millis());

let now = Instant::now();
if dict.check(&word) {
Expand Down
1 change: 0 additions & 1 deletion shell.nix
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,4 @@ pkgs.mkShell {
cargo-flamegraph
];
RUST_BACKTRACE = "1";
XDG_DATA_DIRS = with pkgs; lib.makeSearchPath "share" [hunspellDicts.en_US];
}
6 changes: 6 additions & 0 deletions vendor/en_US/README.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
US English dictionary.

These files are licensed separately from spellbook. See the '*license.txt'
files in this directory.

Upstream <https://github.com/JetBrains/hunspell-dictionaries>
31 changes: 31 additions & 0 deletions vendor/en_US/WordNet_license.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
WordNet Release 2.1

This software and database is being provided to you, the LICENSEE, by
Princeton University under the following license. By obtaining, using
and/or copying this software and database, you agree that you have
read, understood, and will comply with these terms and conditions.:

Permission to use, copy, modify and distribute this software and
database and its documentation for any purpose and without fee or
royalty is hereby granted, provided that you agree to comply with
the following copyright notice and statements, including the disclaimer,
and that the same appear on ALL copies of the software, database and
documentation, including modifications that you make for internal
use or for distribution.

WordNet 2.1 Copyright 2005 by Princeton University. All rights reserved.

THIS SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND PRINCETON
UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PRINCETON
UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANT-
ABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE
OF THE LICENSED SOFTWARE, DATABASE OR DOCUMENTATION WILL NOT
INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR
OTHER RIGHTS.

The name of Princeton University or Princeton may not be used in
advertising or publicity pertaining to distribution of the software
and/or database. Title to copyright in this software, database and
any associated documentation shall at all times remain with
Princeton University and LICENSEE agrees to preserve same.
Loading

0 comments on commit 97e7d29

Please sign in to comment.