From 1f1f1a097a03151c1665141c2b2b050ad6cf13a2 Mon Sep 17 00:00:00 2001 From: salamer Date: Mon, 14 Jun 2021 02:52:32 +0800 Subject: [PATCH 01/22] Migrate: migrate from fastann --- .gitignore | 11 + Cargo.toml | 58 +++ benches/bench_metrics.rs | 91 ++++ examples/Cargo.toml | 8 + examples/src/main.rs | 3 + src/core/ann_index.rs | 164 ++++++++ src/core/arguments.rs | 96 +++++ src/core/calc.rs | 158 +++++++ src/core/heap.rs | 266 ++++++++++++ src/core/kmeans.rs | 362 ++++++++++++++++ src/core/knn.rs | 551 ++++++++++++++++++++++++ src/core/metrics.rs | 109 +++++ src/core/mod.rs | 10 + src/core/neighbor.rs | 41 ++ src/core/node.rs | 214 ++++++++++ src/core/random.rs | 18 + src/core/simd_metrics.rs | 109 +++++ src/index/bruteforce_idx.rs | 112 +++++ src/index/bruteforce_params.rs | 20 + src/index/hnsw_idx.rs | 743 +++++++++++++++++++++++++++++++++ src/index/hnsw_params.rs | 85 ++++ src/index/mod.rs | 10 + src/index/pq_idx.rs | 535 ++++++++++++++++++++++++ src/index/pq_params.rs | 103 +++++ src/index/rpt_idx.rs | 646 ++++++++++++++++++++++++++++ src/index/rpt_params.rs | 51 +++ src/index/ssg_idx.rs | 582 ++++++++++++++++++++++++++ src/index/ssg_params.rs | 69 +++ src/lib.rs | 2 + 29 files changed, 5227 insertions(+) create mode 100644 Cargo.toml create mode 100644 benches/bench_metrics.rs create mode 100644 examples/Cargo.toml create mode 100644 examples/src/main.rs create mode 100644 src/core/ann_index.rs create mode 100644 src/core/arguments.rs create mode 100644 src/core/calc.rs create mode 100644 src/core/heap.rs create mode 100644 src/core/kmeans.rs create mode 100644 src/core/knn.rs create mode 100644 src/core/metrics.rs create mode 100644 src/core/mod.rs create mode 100644 src/core/neighbor.rs create mode 100644 src/core/node.rs create mode 100644 src/core/random.rs create mode 100644 src/core/simd_metrics.rs create mode 100644 src/index/bruteforce_idx.rs create mode 100644 src/index/bruteforce_params.rs create mode 100644 src/index/hnsw_idx.rs create mode 100644 
src/index/hnsw_params.rs create mode 100644 src/index/mod.rs create mode 100644 src/index/pq_idx.rs create mode 100644 src/index/pq_params.rs create mode 100644 src/index/rpt_idx.rs create mode 100644 src/index/rpt_params.rs create mode 100644 src/index/ssg_idx.rs create mode 100644 src/index/ssg_params.rs create mode 100644 src/lib.rs diff --git a/.gitignore b/.gitignore index 088ba6b..7292a07 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,14 @@ Cargo.lock # These are backup files generated by rustfmt **/*.rs.bk + + +# Added by cargo +# +# already existing elements were commented out + +/target +#Cargo.lock +examples/Cargo.lock +examples/target +examples/target/ \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..1a5fb82 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,58 @@ +[package] +name = "hora" +version = "0.1.0" +authors = ["salamer "] +edition = "2018" +license = "Apache-2.0" + +description = "Hora Search Everywhere" +documentation = "https://docs.rs/hora/" +repository = "https://github.com/hora-search/hora" + +keywords = ["approximate nearest neighbor search", "artificial intelligence", "SIMD", "nearest-neighbor-search", "no_std"] +categories = ["artificial intelligence"] + +[profile.dev] +codegen-units = 4 +debug = 1 # required for line numbers in tests, see tikv #5049 +debug-assertions = true +incremental = true +lto = false +opt-level = 0 +overflow-checks = false +panic = 'unwind' +rpath = false + +[profile.release] +codegen-units = 1 +debug = false +debug-assertions = false +incremental = false +lto = "fat" +opt-level = 3 +overflow-checks = false +panic = 'unwind' +rpath = false + +[features] +simd = ["packed_simd"] +no_std = ["hashbrown"] + +[dependencies] +bincode = "1.3.2" +fixedbitset = "0.4.0" +hashbrown = {version = "0.11.2", optional = true} +log = "^0.4" +num = "0.4.0" +packed_simd = {version = "0.3.5", package = "packed_simd_2", optional = true} +rand = "0.8.3" +rayon = "^1.5" +serde = {version = 
"1.0", features = ["derive"]} +smallvec = {version = "1.6.1", features = ["serde"], optional = true} + +[dev-dependencies] +criterion = "0.3.4" + +[[bench]] +name = "bench_metrics" +harness = false \ No newline at end of file diff --git a/benches/bench_metrics.rs b/benches/bench_metrics.rs new file mode 100644 index 0000000..53d21d4 --- /dev/null +++ b/benches/bench_metrics.rs @@ -0,0 +1,91 @@ +use fastann::core::metrics; + +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use rand::distributions::Standard; +use rand::Rng; + +fn make_normal_distribution_clustering( + clustering_n: usize, + node_n: usize, + dimension: usize, + range: f64, +) -> ( + Vec>, // center of cluster + Vec>, // cluster data +) { + let _rng = rand::thread_rng(); + + let mut bases: Vec> = Vec::new(); + let mut ns: Vec> = Vec::new(); + for _i in 0..clustering_n { + let mut rng = rand::thread_rng(); + let mut base: Vec = Vec::with_capacity(dimension); + for _i in 0..dimension { + let n: f64 = rng.gen::() * range; // base number + base.push((n as f32)); + } + + let v_iter: Vec = rng + .sample_iter(&Standard) + .take(dimension * node_n) + .collect::>() + .clone(); + for _i in 0..node_n { + let mut vec_item = Vec::with_capacity(dimension); + for i in 0..dimension { + let vv = (v_iter[_i * dimension..(_i + 1) * dimension][i] as f32) + base[i]; // add normal distribution noise + vec_item.push(vv); + } + ns.push(vec_item); + } + bases.push(base); + } + + (bases, ns) +} + +fn metrics_dot_product_wrapper(nso: &[Vec]) { + nso.iter().for_each(|n| { + nso.iter().for_each(|m| { + metrics::dot_product(n, m).unwrap(); + }) + }) +} + +fn metrics_euclidean_distance_wrapper(nso: &[Vec]) { + nso.iter().for_each(|n| { + nso.iter().for_each(|m| { + metrics::euclidean_distance(n, m).unwrap(); + }) + }) +} + +fn metrics_manhattan_distance_wrapper(nso: &[Vec]) { + nso.iter().for_each(|n| { + nso.iter().for_each(|m| { + metrics::manhattan_distance(n, m).unwrap(); + }) + }) +} + +fn 
metrics_benchmark(c: &mut Criterion) { + let dimension = 64; + let nodes_every_cluster = 10; + let node_n = 50; + let range = 100000.0; + let (_, nso) = + make_normal_distribution_clustering(node_n, nodes_every_cluster, dimension, range); + + c.bench_function("dot_product", |b| { + b.iter(|| metrics_dot_product_wrapper(black_box(&nso))) + }); + c.bench_function("euclidean", |b| { + b.iter(|| metrics_euclidean_distance_wrapper(black_box(&nso))) + }); + c.bench_function("manhattan", |b| { + b.iter(|| metrics_manhattan_distance_wrapper(black_box(&nso))) + }); +} + +criterion_group!(benches, metrics_benchmark); +criterion_main!(benches); diff --git a/examples/Cargo.toml b/examples/Cargo.toml new file mode 100644 index 0000000..08d8075 --- /dev/null +++ b/examples/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "demo" +version = "0.1.0" +authors = ["salamer "] +edition = "2018" + +[dependencies] +hora = { package = "hora", path = "../../hora", features=["simd"]} \ No newline at end of file diff --git a/examples/src/main.rs b/examples/src/main.rs new file mode 100644 index 0000000..e7a11a9 --- /dev/null +++ b/examples/src/main.rs @@ -0,0 +1,3 @@ +fn main() { + println!("Hello, world!"); +} diff --git a/src/core/ann_index.rs b/src/core/ann_index.rs new file mode 100644 index 0000000..186a636 --- /dev/null +++ b/src/core/ann_index.rs @@ -0,0 +1,164 @@ +use crate::core::arguments; +use crate::core::metrics; +use crate::core::node; + +use serde::de::DeserializeOwned; + +/// ANNIndex trait provide the all `Approximate Nearest Neighbor Search` problem required method +/// +/// ANNIndex is the main trait that all `Approximate Nearest Neighbor Search` algorithm index have to implement +/// +/// Initial a ANNIndex and call `.build` method, it will build up the index internal to speed up the ANN search. 
+/// +/// +/// Example: +/// +/// ``` +/// let mut bf_idx = Box::new(bf::bf::BruteForceIndex::::new()); // use BruteForceIndex +/// for i in 0..embs.len() { +/// bf_idx.add_node(&core::node::Node::::new_with_idx(&embs[i], i)); // add index +/// } +/// idx.build(core::metrics::Metric::Euclidean).unwrap(); // build up index +/// println!("{embedding {}'s nearest neighbor is {}}", 0, bf_idx.search(embs[0]); +/// ``` +/// + +pub trait ANNIndex: Send + Sync { + /// build up the ANN index + /// + /// build up index with all node which have add into before, it will cost some time, and the time it cost depends on the algorithm + /// return `Err(&'static str)` if there is something wrong with the building process, and the `static str` is the debug reason + fn build(&mut self, mt: metrics::Metric) -> Result<(), &'static str>; + + /// add node internal method + /// + /// it will allocate a space in the heap(Vector), and init a `Node` + /// return `Err(&'static str)` if there is something wrong with the adding process, and the `static str` is the debug reason + fn add_node(&mut self, item: &node::Node) -> Result<(), &'static str>; + + /// add node + /// + /// call `add_node()` internal + fn add(&mut self, vs: &[E], idx: T) -> Result<(), &'static str> { + self.add_node(&node::Node::new_with_idx(vs, idx)) + } + + /// add multiple node one time + /// + /// return `Err(&'static str)` if there is something wrong with the adding process, and the `static str` is the debug reason + fn add_batch(&mut self, vss: &[&[E]], indices: &[T]) -> Result<(), &'static str> { + if vss.len() != indices.len() { + return Err("vector's size is different with index"); + } + for idx in 0..vss.len() { + let n = node::Node::new_with_idx(vss[idx], indices[idx].clone()); + if let Err(err) = self.add_node(&n) { + return Err(err); + } + } + Ok(()) + } + + /// return the index has already been built or not + /// + /// return `True` if the index has been built + fn built(&self) -> bool; + + /// to rebuild the 
index with all nodes inside + /// + /// /// return `Err(&'static str)` if there is something wrong with the rebuilding process, and the `static str` is the debug reason + fn rebuild(&mut self, _mt: metrics::Metric) -> Result<(), &'static str> { + Err("not implement") + } + + /// search for k nearest neighbors node internal method + fn node_search_k( + &self, + item: &node::Node, + k: usize, + args: &arguments::Args, + ) -> Vec<(node::Node, E)>; + + /// search for k nearest neighbors and return full info + /// + /// it will return the all node's info including the original vectors, and the metric distance + /// + /// it require the item is the slice with the same dimension with index dimension, otherwise it will panic + fn search_full(&self, item: &[E], k: usize) -> Vec<(node::Node, E)> { + assert_eq!(item.len(), self.dimension()); + self.node_search_k(&node::Node::new(item), k, &arguments::Args::new()) + } + + /// search for k nearest neighbors + /// + /// it only return the idx of the nearest node + /// + /// it require the item is the slice with the same dimension with index dimension, otherwise it will panic + fn search(&self, item: &[E], k: usize) -> Vec { + assert_eq!(item.len(), self.dimension()); + self.node_search_k(&node::Node::new(item), k, &arguments::Args::new()) + .iter() + .map(|x| x.0.idx().as_ref().unwrap().clone()) + .collect::>() + } + + /// return the name of the Index + /// format like this + /// `HNSWIndex(Hierarchical Navigable Small World Index)` + fn name(&self) -> &'static str; + + /// internal nodes' size + fn nodes_size(&self) -> usize { + 0 + } + + /// clear all nodes and index built before + fn clear(&mut self) {} + + /// return String of Index statistics informations + fn idx_info(&self) -> String { + "not implement".to_string() + } + + /// return the dimension it require + fn dimension(&self) -> usize { + 0 + } +} + +/// SerializableIndex provide the `Serialization` and `Deserialization` method for the index +/// SerializableIndex is 
the main trait that all index have to implement +/// +/// call `.dump` method to dump a binary format file in the disk, and the binary file include all nodes which have added into +/// call `.load' method to load a binary format file to load back the Index built before, and the Index loaded have all Nodes' info the binary file have +/// +/// +/// Example: +/// +/// ``` +/// let mut bf_idx = Box::new(bf::bf::BruteForceIndex::::new()); // use BruteForceIndex +/// for i in 0..embs.len() { +/// bf_idx.add_node(&core::node::Node::::new_with_idx(&embs[i], i)); // add index +/// } +/// bf_idx.dump("bf_idx.idx", &arguments::Args::new()); +/// let bf_idx2 = Box::new(bf::bf::BruteForceIndex::::load("bf_idx.idx", &argument).unwrap()); +/// ``` +/// +pub trait SerializableIndex< + E: node::FloatElement + DeserializeOwned, + T: node::IdxType + DeserializeOwned, +>: Send + Sync + ANNIndex +{ + /// load file with path + fn load(_path: &str, _args: &arguments::Args) -> Result + where + Self: Sized, + { + Err("empty implementation") + } + + /// dump the file into the path + fn dump(&mut self, _path: &str, _args: &arguments::Args) -> Result<(), &'static str> { + Err("empty implementation") + } +} diff --git a/src/core/arguments.rs b/src/core/arguments.rs new file mode 100644 index 0000000..b2667bd --- /dev/null +++ b/src/core/arguments.rs @@ -0,0 +1,96 @@ +#![allow(dead_code)] +#[cfg(feature = "without_std")] +use hashbrown::HashMap; +#[cfg(not(feature = "without_std"))] +use std::collections::HashMap; + +// TODO:L find a way to make the arguments generic; +#[derive(Clone, Debug)] +pub enum ArgsBox { + Float(f32), + Int(i32), + Str(String), + Usize(usize), +} + +// TODO: make this optional +pub struct Args { + args: HashMap, +} + +impl Default for Args { + fn default() -> Self { + Self::new() + } +} + +impl Args { + pub fn new() -> Self { + Args { + args: HashMap::new(), + } + } + + pub fn fget(&self, key: &str) -> Option { + let val = self.args.get(key)?; + match val { + 
ArgsBox::Float(s) => Some(*s), + _ => None, + } + } + + pub fn iget(&self, key: &str) -> Option { + let val = self.args.get(key)?; + match val { + ArgsBox::Int(s) => Some(*s), + _ => None, + } + } + + pub fn sget(&self, key: &str) -> Option { + let val = self.args.get(key)?; + match val { + ArgsBox::Str(s) => Some(s.clone()), + _ => None, + } + } + + pub fn uget(&self, key: &str) -> Option { + let val = self.args.get(key)?; + match val { + ArgsBox::Usize(s) => Some(*s), + _ => None, + } + } + + pub fn get(&self, key: &str) -> Option { + let val = self.args.get(key)?; + Some(val.clone()) + } + + pub fn fset(&mut self, key: &str, value: f32) -> &mut Args { + self.args.insert(key.to_string(), ArgsBox::Float(value)); + self + } + + pub fn iset(&mut self, key: &str, value: i32) -> &mut Args { + self.args.insert(key.to_string(), ArgsBox::Int(value)); + self + } + + pub fn uset(&mut self, key: &str, value: usize) -> &mut Args { + self.args.insert(key.to_string(), ArgsBox::Usize(value)); + self + } + + pub fn sset(&mut self, key: &str, value: &str) -> &mut Args { + self.args + .insert(key.to_string(), ArgsBox::Str(value.to_string())); + self + } + + pub fn set(&mut self, key: &str, value: ArgsBox) -> &mut Args { + self.args.insert(key.to_string(), value); + self + } +} diff --git a/src/core/calc.rs b/src/core/calc.rs new file mode 100644 index 0000000..d0f6a31 --- /dev/null +++ b/src/core/calc.rs @@ -0,0 +1,158 @@ +use crate::core::node::FloatElement; + +pub fn get_norm(vec1: &[T]) -> Result +where + T: FloatElement, +{ + match dot(&vec1, &vec1) { + Ok(val) => Ok(val.sqrt()), + Err(err) => Err(err), + } +} + +pub fn dot(vec1: &[T], vec2: &[T]) -> Result +where + T: FloatElement, +{ + T::dot_product(&vec1, &vec2) +} + +#[inline(always)] +pub fn same_dimension(vec1: &[T], vec2: &[T]) -> Result<(), &'static str> +where + T: FloatElement, +{ + if vec1.len() != vec2.len() { + return Result::Err("different dimensions"); + } + Result::Ok(()) +} + +pub fn split_imbalance(vec1: 
&[T], vec2: &[T]) -> f64 { + let ls = vec1.len() as f64; + let rs = vec2.len() as f64; + let f = ls / (ls + rs + 1e-9); + if f > (1.0 - f) { + f + } else { + 1.0 - f + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::core::simd_metrics::SIMDOptmized; + + use rand::distributions::Standard; + + use rand::Rng; + use std::time::{Duration, SystemTime, UNIX_EPOCH}; + fn make_normal_distribution_clustering( + clustering_n: usize, + node_n: usize, + dimension: usize, + range: f64, + ) -> ( + Vec>, // center of cluster + Vec>, // cluster data + ) { + let _rng = rand::thread_rng(); + + let mut bases: Vec> = Vec::new(); + let mut ns: Vec> = Vec::new(); + for _i in 0..clustering_n { + let mut rng = rand::thread_rng(); + let mut base: Vec = Vec::with_capacity(dimension); + for _i in 0..dimension { + let n: f64 = rng.gen::() * range; // base number + base.push(n); + } + + let v_iter: Vec = rng + .sample_iter(&Standard) + .take(dimension * node_n) + .collect::>() + .clone(); + for _i in 0..node_n { + let mut vec_item = Vec::with_capacity(dimension); + for i in 0..dimension { + let vv = v_iter[_i * dimension..(_i + 1) * dimension][i] + base[i]; // add normal distribution noise + vec_item.push(vv); + } + ns.push(vec_item); + } + bases.push(base); + } + + (bases, ns) + } + #[test] + fn test_dot() { + let a = [1., 2., 3.]; + let b = [1., 2., 3.]; + assert_eq!(dot(&a, &b).unwrap(), 14.0); + } + + #[test] + fn bench_dot() { + let dimension = 8024; + let nodes_every_cluster = 600; + let node_n = 50; + let (_, nso) = + make_normal_distribution_clustering(node_n, nodes_every_cluster, dimension, 100000.0); + println!("hello world {:?}", nso.len()); + let ns: Vec> = nso + .iter() + .map(|x| x.iter().map(|p| *p as f32).collect()) + .collect(); + + { + let base_start = SystemTime::now(); + let sumbase = ns + .iter() + .map(|nsx| { + // dot(&nsx, &nsx); + // nsx.iter().zip(nsx).map(|(p, q)| p * q).sum::() + nsx.iter() + .zip(nsx) + .map(|(p, q)| (p - q).powi(2)) + .sum::() + }) + 
.sum::(); + let base_since_the_epoch = SystemTime::now() + .duration_since(base_start) + .expect("Time went backwards"); + println!( + "test for {:?} times, base use {:?} millisecond {:?}", + ns.len(), + base_since_the_epoch.as_millis(), + sumbase + ); + } + + { + let base_start = SystemTime::now(); + let sumsimd = ns + .iter() + .map(|nsx| f32::euclidean_distance(&nsx, &nsx).unwrap()) + .sum::(); + let base_since_the_epoch = SystemTime::now() + .duration_since(base_start) + .expect("Time went backwards"); + println!( + "test for {:?} times, simd use {:?} millisecond, {:?}", + ns.len(), + base_since_the_epoch.as_millis(), + sumsimd + ); + } + + let b = 25; + println!( + "{:?}, {:?}", + f32::dot_product(&ns[b], &ns[b]), + dot(&ns[b], &ns[b]).unwrap() + ); + } +} diff --git a/src/core/heap.rs b/src/core/heap.rs new file mode 100644 index 0000000..0936aa5 --- /dev/null +++ b/src/core/heap.rs @@ -0,0 +1,266 @@ +// this lib migrate from official lib, but without std dependency; + +use core::mem::{swap, ManuallyDrop}; +use core::ptr; + +pub struct BinaryHeap { + data: Vec, +} + +impl BinaryHeap { + pub fn new() -> BinaryHeap { + BinaryHeap { data: vec![] } + } + + pub fn with_capacity(capacity: usize) -> BinaryHeap { + BinaryHeap { + data: Vec::with_capacity(capacity), + } + } + + pub fn pop(&mut self) -> Option { + self.data.pop().map(|mut item| { + if !self.is_empty() { + swap(&mut item, &mut self.data[0]); + self.sift_down_to_bottom(0); + } + item + }) + } + + pub fn push(&mut self, item: T) { + let old_len = self.len(); + self.data.push(item); + self.sift_up(0, old_len); + } + + pub fn into_sorted_vec(mut self) -> Vec { + let mut end = self.len(); + while end > 1 { + end -= 1; + // SAFETY: `end` goes from `self.len() - 1` to 1 (both included), + // so it's always a valid index to access. + // It is safe to access index 0 (i.e. `ptr`), because + // 1 <= end < self.len(), which means self.len() >= 2. 
+ unsafe { + let ptr = self.data.as_mut_ptr(); + ptr::swap(ptr, ptr.add(end)); + } + self.sift_down_range(0, end); + } + self.into_vec() + } + + fn sift_up(&mut self, start: usize, pos: usize) -> usize { + unsafe { + // Take out the value at `pos` and create a hole. + let mut hole = Hole::new(&mut self.data, pos); + + while hole.pos() > start { + let parent = (hole.pos() - 1) / 2; + if hole.element() <= hole.get(parent) { + break; + } + hole.move_to(parent); + } + hole.pos() + } + } + + fn sift_down_range(&mut self, pos: usize, end: usize) { + unsafe { + let mut hole = Hole::new(&mut self.data, pos); + let mut child = 2 * pos + 1; + while child < end - 1 { + // compare with the greater of the two children + child += (hole.get(child) <= hole.get(child + 1)) as usize; + // if we are already in order, stop. + if hole.element() >= hole.get(child) { + return; + } + hole.move_to(child); + child = 2 * hole.pos() + 1; + } + if child == end - 1 && hole.element() < hole.get(child) { + hole.move_to(child); + } + } + } + + fn sift_down(&mut self, pos: usize) { + let len = self.len(); + self.sift_down_range(pos, len); + } + + fn sift_down_to_bottom(&mut self, mut pos: usize) { + let end = self.len(); + let start = pos; + unsafe { + let mut hole = Hole::new(&mut self.data, pos); + let mut child = 2 * pos + 1; + while child < end - 1 { + child += (hole.get(child) <= hole.get(child + 1)) as usize; + hole.move_to(child); + child = 2 * hole.pos() + 1; + } + if child == end - 1 { + hole.move_to(child); + } + pos = hole.pos; + } + self.sift_up(start, pos); + } + + fn rebuild(&mut self) { + let mut n = self.len() / 2; + while n > 0 { + n -= 1; + self.sift_down(n); + } + } + + pub fn retain(&mut self, f: F) + where + F: FnMut(&T) -> bool, + { + self.data.retain(f); + self.rebuild(); + } +} + +impl BinaryHeap { + // pub fn iter(&self) -> Iter<'_, T> { + // Iter { iter: self.data.iter() } + // } + pub fn peek(&self) -> Option<&T> { + self.data.get(0) + } + + pub fn capacity(&self) -> 
usize { + self.data.capacity() + } + + pub fn reserve_exact(&mut self, additional: usize) { + self.data.reserve_exact(additional); + } + + pub fn reserve(&mut self, additional: usize) { + self.data.reserve(additional); + } + + pub fn shrink_to_fit(&mut self) { + self.data.shrink_to_fit(); + } + + pub fn into_vec(self) -> Vec { + self.into() + } + + pub fn len(&self) -> usize { + self.data.len() + } + + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + pub fn clear(&mut self) { + self.data.drain(..); + } +} + +/// Hole represents a hole in a slice i.e., an index without valid value +/// (because it was moved from or duplicated). +/// In drop, `Hole` will restore the slice by filling the hole +/// position with the value that was originally removed. +struct Hole<'a, T: 'a> { + data: &'a mut [T], + elt: ManuallyDrop, + pos: usize, +} + +impl<'a, T> Hole<'a, T> { + /// Create a new `Hole` at index `pos`. + /// + /// Unsafe because pos must be within the data slice. + #[inline] + unsafe fn new(data: &'a mut [T], pos: usize) -> Self { + debug_assert!(pos < data.len()); + // SAFE: pos should be inside the slice + let elt = unsafe { ptr::read(data.get_unchecked(pos)) }; + Hole { + data, + elt: ManuallyDrop::new(elt), + pos, + } + } + + #[inline] + fn pos(&self) -> usize { + self.pos + } + + /// Returns a reference to the element removed. + #[inline] + fn element(&self) -> &T { + &self.elt + } + + /// Returns a reference to the element at `index`. + /// + /// Unsafe because index must be within the data slice and not equal to pos. + #[inline] + unsafe fn get(&self, index: usize) -> &T { + debug_assert!(index != self.pos); + debug_assert!(index < self.data.len()); + unsafe { self.data.get_unchecked(index) } + } + + /// Move hole to new location + /// + /// Unsafe because index must be within the data slice and not equal to pos. 
+ #[inline] + unsafe fn move_to(&mut self, index: usize) { + debug_assert!(index != self.pos); + debug_assert!(index < self.data.len()); + unsafe { + let ptr = self.data.as_mut_ptr(); + let index_ptr: *const _ = ptr.add(index); + let hole_ptr = ptr.add(self.pos); + ptr::copy_nonoverlapping(index_ptr, hole_ptr, 1); + } + self.pos = index; + } +} + +impl Drop for Hole<'_, T> { + #[inline] + fn drop(&mut self) { + // fill the hole again + unsafe { + let pos = self.pos; + ptr::copy_nonoverlapping(&*self.elt, self.data.get_unchecked_mut(pos), 1); + } + } +} + +impl From> for BinaryHeap { + /// Converts a `Vec` into a `BinaryHeap`. + /// + /// This conversion happens in-place, and has *O*(*n*) time complexity. + fn from(vec: Vec) -> BinaryHeap { + let mut heap = BinaryHeap { data: vec }; + heap.rebuild(); + heap + } +} + +impl From> for Vec { + /// Converts a `BinaryHeap` into a `Vec`. + /// + /// This conversion requires no data movement or allocation, and has + /// constant time complexity. 
+ fn from(heap: BinaryHeap) -> Vec { + heap.data + } +} diff --git a/src/core/kmeans.rs b/src/core/kmeans.rs new file mode 100644 index 0000000..4911f2b --- /dev/null +++ b/src/core/kmeans.rs @@ -0,0 +1,362 @@ +#![allow(dead_code)] +use crate::core::metrics; +use crate::core::node; +use metrics::metric; +use rand::prelude::*; +use rayon::prelude::*; +use std::sync::Mutex; + +#[derive(Default, Debug)] +pub struct Kmeans { + _dimension: usize, + _n_center: usize, + _centers: Vec>, + _data_range_begin: usize, + _data_range_end: usize, + _has_residual: bool, + _residual: Vec, + mt: metrics::Metric, //compute metrics +} + +impl Kmeans { + pub fn new(dimension: usize, n_center: usize, mt: metrics::Metric) -> Kmeans { + Kmeans { + _dimension: dimension, + _n_center: n_center, + _data_range_begin: 0, + _data_range_end: dimension, + mt, + ..Default::default() + } + } + + pub fn centers(&self) -> &Vec> { + &self._centers + } + + pub fn get_distance_from_vec(&self, x: &[E], y: &[E]) -> E { + let mut z = x[self._data_range_begin..self._data_range_end].to_vec(); + if self._has_residual { + (0..self._data_range_end - self._data_range_begin) + .for_each(|i| z[i] -= self._residual[i + self._data_range_begin]); + } + return metric(&z, y, self.mt).unwrap(); + } + + pub fn set_residual(&mut self, residual: Vec) { + self._has_residual = true; + self._residual = residual; + } + + pub fn init_center(&mut self, batch_size: usize, batch_data: &[Vec]) { + let dimension = self._dimension; + let n_center = self._n_center; + let begin = self._data_range_begin; + let mut mean_center: Vec = vec![E::from_f32(0.0).unwrap(); dimension]; + + (0..batch_size).for_each(|i| { + let cur_data = &batch_data[i]; + (0..dimension).for_each(|j| { + if self._has_residual { + mean_center[j] += cur_data[begin + j] - self._residual[begin + j]; + } else { + mean_center[j] += cur_data[begin + j]; + } + }); + }); + + (0..dimension).for_each(|i| { + mean_center[i] /= E::from_usize(batch_size).unwrap(); + }); + + let 
mut new_centers: Vec> = Vec::with_capacity(n_center); + (0..n_center).for_each(|i| { + let mut cur_center: Vec = Vec::new(); + (0..dimension).for_each(|j| { + let mut val = mean_center[j]; + if i & (1 << j) == 1 { + val += E::from_f32(1.0).unwrap(); + } else { + val -= E::from_f32(1.0).unwrap(); + } + cur_center.push(val); + }); + new_centers.push(cur_center); + }); + self._centers = new_centers; + } + + pub fn update_center( + &mut self, + batch_size: usize, + batch_data: &[Vec], + assigned_center: &[usize], + ) -> Vec { + let dimension = self._dimension; + let n_center = self._n_center; + let begin = self._data_range_begin; + let mut new_centers: Vec> = Vec::with_capacity(n_center); + (0..n_center).for_each(|_| { + new_centers.push(vec![E::from_f32(0.0).unwrap(); dimension]); + }); + let mut n_assigned_per_center: Vec = vec![0; n_center]; + (0..batch_size).for_each(|i| { + let cur_data = &batch_data[i]; + let cur_center = assigned_center[i]; + n_assigned_per_center[cur_center] += 1; + (0..dimension).for_each(|j| { + if self._has_residual { + new_centers[cur_center][j] += cur_data[begin + j] - self._residual[begin + j]; + } else { + new_centers[cur_center][j] += cur_data[begin + j]; + } + }); + }); + + (0..n_center).for_each(|i| { + if n_assigned_per_center[i] == 0 { + return; + } + (0..dimension).for_each(|j| { + new_centers[i][j] /= E::from_usize(n_assigned_per_center[i]).unwrap(); + }); + }); + self._centers = new_centers; + n_assigned_per_center + } + + pub fn search_data( + &mut self, + batch_size: usize, + batch_data: &Vec>, + assigned_center: &mut Vec, + ) { + let n_center = self._n_center; + let _dimension = self._dimension; + (0..batch_size).for_each(|i| { + let mut nearist_center_id: usize = 0; + (1..n_center).for_each(|j| { + let cur_center = &self._centers[j]; + let nearist_center = &self._centers[nearist_center_id]; + if self.get_distance_from_vec(&batch_data[i], cur_center) + < self.get_distance_from_vec(&batch_data[i], nearist_center) + { + 
nearist_center_id = j; + } + }); + assigned_center.push(nearist_center_id); + }); + } + + pub fn split_center( + &mut self, + batch_size: usize, + n_assigned_per_center: &mut Vec, + ) -> Result<(), &'static str> { + let dimension = self._dimension; + let n_center = self._n_center; + + if batch_size == 0 { + return Err("None to assigned impossible split center"); + } + + (0..n_center).for_each(|i| { + if n_assigned_per_center[i] == 0 { + //rand pick split center + let mut split_center_id = (i + 1) % n_center; + loop { + let mut rng = rand::thread_rng(); + let pick_percent = + n_assigned_per_center[split_center_id] as f64 / batch_size as f64; + if rng.gen_range(0.0..1.0) < pick_percent { + break; + } + split_center_id = (split_center_id + 1) % n_center; + } + const EPS: f32 = 1.0 / 1024.0; + (0..dimension).for_each(|j| { + if j % 2 == 0 { + self._centers[i][j] = + self._centers[split_center_id][j] * E::from_f32(1.0 - EPS).unwrap(); + self._centers[split_center_id][j] *= E::from_f32(1.0 + EPS).unwrap(); + } else { + self._centers[i][j] = + self._centers[split_center_id][j] * E::from_f32(1.0 + EPS).unwrap(); + self._centers[split_center_id][j] *= E::from_f32(1.0 - EPS).unwrap(); + } + }); + n_assigned_per_center[i] = n_assigned_per_center[split_center_id] / 2; + n_assigned_per_center[split_center_id] -= n_assigned_per_center[i]; + } + }); + Ok(()) + } + + pub fn train(&mut self, batch_size: usize, batch_data: &Vec>, n_epoch: usize) { + self.init_center(batch_size, batch_data); + (0..n_epoch).for_each(|epoch| { + let mut assigned_center: Vec = Vec::with_capacity(batch_size); + self.search_data(batch_size, batch_data, &mut assigned_center); + let mut n_assigned_per_center = + self.update_center(batch_size, batch_data, &assigned_center); + if epoch < n_epoch - 1 { + self.split_center(batch_size, &mut n_assigned_per_center) + .unwrap(); + } + }); + } + + pub fn set_range(&mut self, begin: usize, end: usize) { + assert!(end - begin == self._dimension); + 
self._data_range_begin = begin; + self._data_range_end = end; + } +} + +pub fn general_kmeans( + k: usize, + epoch: usize, + nodes: &[Box>], + mt: metrics::Metric, +) -> Vec { + if nodes.is_empty() { + return Vec::new(); + } + + let mut rng = rand::thread_rng(); + let mut means = Vec::with_capacity(k); + + (0..k).for_each(|_i| { + means.push(Box::new(nodes[rng.gen_range(0..nodes.len())].clone())); + }); + + (0..epoch).for_each(|_| { + let cluster_count: Vec> = (0..k).map(|_| Mutex::new(0)).collect(); + let mut cluster_features: Vec>> = (0..k) + .map(|_| Mutex::new(vec![E::zero(); nodes[0].vectors().len()])) + .collect(); + nodes.par_iter().zip(0..nodes.len()).for_each(|(node, _j)| { + let mut idx = 0; + let mut distance = E::max_value(); + for i in 0..means.len() { + let _distance = node.metric(&means[i], mt).unwrap(); + if _distance < distance { + idx = i; + distance = _distance; + } + } + cluster_features[idx] + .lock() + .unwrap() + .iter_mut() + .zip(node.vectors()) + .for_each(|(i, j)| *i += *j); + *cluster_count[idx].lock().unwrap() += 1; + }); + + cluster_features + .iter_mut() + .zip(cluster_count) + .for_each(|(features, cnt)| { + features + .lock() + .unwrap() + .iter_mut() + .for_each(|f| *f /= E::from_usize(*cnt.lock().unwrap()).unwrap()) + }); + + means + .iter_mut() + .zip(cluster_features) + .for_each(|(mean, features)| mean.set_vectors(&features.lock().unwrap())); + }); + + means + .iter() + .map(|mean| { + let mut mean_idx = 0; + let mut mean_distance = E::max_value(); + nodes.iter().zip(0..nodes.len()).for_each(|(node, i)| { + let distance = node.metric(&mean, mt).unwrap(); + if distance < mean_distance { + mean_idx = i; + mean_distance = distance; + } + }); + mean_idx + }) + .collect() +} + +#[cfg(test)] +mod tests { + use super::*; + + use rand::distributions::Standard; + + use rand::Rng; + + fn make_normal_distribution_clustering( + clustering_n: usize, + node_n: usize, + dimension: usize, + range: f64, + ) -> ( + Vec>, // center of cluster + 
Vec>, // cluster data + ) { + let _rng = rand::thread_rng(); + + let mut bases: Vec> = Vec::new(); + let mut ns: Vec> = Vec::new(); + for _i in 0..clustering_n { + let mut rng = rand::thread_rng(); + let mut base: Vec = Vec::with_capacity(dimension); + for _i in 0..dimension { + let n: f64 = rng.gen::() * range; // base number + base.push((n as f32)); + } + + let v_iter: Vec = rng + .sample_iter(&Standard) + .take(dimension * node_n) + .collect::>() + .clone(); + for _i in 0..node_n { + let mut vec_item = Vec::with_capacity(dimension); + for i in 0..dimension { + let vv = (v_iter[_i * dimension..(_i + 1) * dimension][i] as f32) + base[i]; // add normal distribution noise + vec_item.push(vv); + } + ns.push(vec_item); + } + bases.push(base); + } + + (bases, ns) + } + + #[test] + fn test_general_kmeans() { + let dimension = 2; + let nodes_every_cluster = 10; + let node_n = 10; + let (_, nso) = + make_normal_distribution_clustering(node_n, nodes_every_cluster, dimension, 100000.0); + println!("{:?}", nso); + let ns: Vec> = nso + .iter() + .map(|x| x.iter().map(|p| *p as f32).collect()) + .collect(); + + let nodes: Vec>> = ns + .iter() + .zip(0..ns.len()) + .map(|(vs, idx)| Box::new(node::Node::new_with_idx(vs, idx))) + .collect(); + println!( + "{:?}", + general_kmeans(node_n, 30, &nodes, metrics::Metric::Euclidean) + ); + } +} diff --git a/src/core/knn.rs b/src/core/knn.rs new file mode 100644 index 0000000..2b3534c --- /dev/null +++ b/src/core/knn.rs @@ -0,0 +1,551 @@ +#![allow(dead_code)] +use crate::core::metrics; +use crate::core::neighbor::Neighbor; +use crate::core::node::{FloatElement, IdxType, Node}; +use fixedbitset::FixedBitSet; +use rand::seq::SliceRandom; +use rand::Rng; +use rayon::prelude::*; +use std::collections::BinaryHeap; +use std::sync::mpsc; + +use std::sync::{Arc, Mutex}; + +pub fn naive_build_knn_graph( + nodes: &[Box>], + mt: metrics::Metric, + k: usize, + graph: &mut Vec>>, // TODO: not use this one +) { + let tmp_graph = 
Arc::new(Mutex::new(graph)); + (0..nodes.len()).into_par_iter().for_each(|n| { + let item = &nodes[n]; + let mut heap = BinaryHeap::with_capacity(k); + (0..nodes.len()).for_each(|i| { + if i == n { + return; + } + heap.push(Neighbor::new(i, item.metric(&nodes[i], mt).unwrap())); + if heap.len() > k { + heap.pop(); + } + }); + let mut tmp = Vec::with_capacity(heap.len()); + while !heap.is_empty() { + tmp.push(heap.pop().unwrap()); + } + + tmp_graph.lock().unwrap()[n].clear(); + tmp_graph.lock().unwrap()[n] = tmp; + }); +} + +pub struct NNDescentHandler<'a, E: FloatElement, T: IdxType> { + nodes: &'a [Box>], + graph: Vec>>>>, + mt: metrics::Metric, + k: usize, + visited_id: FixedBitSet, + calculation_context: Vec<(Vec, Vec, Vec, Vec)>, // nn_new_neighbors, nn_old_neighbors, reversed_new_neighbors, reversed_old_neighbors + rho: f32, + cost: usize, + s: usize, + update_cnt: usize, +} + +impl<'a, E: FloatElement, T: IdxType> NNDescentHandler<'a, E, T> { + fn new(nodes: &'a [Box>], mt: metrics::Metric, k: usize, rho: f32) -> Self { + NNDescentHandler { + nodes, + graph: Vec::new(), // TODO: as params + mt, + k, + visited_id: FixedBitSet::with_capacity(nodes.len() * nodes.len()), + calculation_context: Vec::new(), + rho, + cost: 0, + s: (rho * k as f32) as usize, + update_cnt: 0, + } + } + + fn update( + &self, + u1: usize, + u2: usize, + my_graph: &[Arc>>>], + ) -> bool { + if u1 == u2 { + return false; + } + + self.update_nn_node(u1, u2, my_graph); + self.update_nn_node(u2, u1, my_graph); + true + } + + fn update_nn_node( + &self, + me: usize, + candidate: usize, + my_graph: &[Arc>>>], + ) -> bool { + let dist = self.nodes[me] + .metric(&self.nodes[candidate], self.mt) + .unwrap(); + if dist > my_graph[me].lock().unwrap().peek().unwrap().distance() { + false + } else { + my_graph[me] + .lock() + .unwrap() + .push(Neighbor::new(candidate, dist)); + if my_graph[me].lock().unwrap().len() > self.k { + my_graph[me].lock().unwrap().pop(); + } + true + } + } + + fn init(&mut 
self) { + self.visited_id = FixedBitSet::with_capacity(self.nodes.len() * self.nodes.len()); + self.graph.clear(); + + self.graph = (0..self.nodes.len()) + .into_par_iter() + .map(|_i| { + let mut v = BinaryHeap::with_capacity(self.k * 2); + for _j in 0..self.k { + v.push(Neighbor::new(self.nodes.len(), E::max_value())); + } + Arc::new(Mutex::new(v)) + }) + .collect(); + + self.calculation_context = (0..self.nodes.len()) + .into_par_iter() + .map(|_i| { + let mut nn_new_neighbors: Vec = Vec::with_capacity(self.s); + let nn_old_neighbors: Vec = Vec::with_capacity(self.s); + for _j in 0..self.s { + let rand_val = rand::thread_rng().gen_range(0..self.nodes.len()); + nn_new_neighbors.push(rand_val); + } + + let mut reversed_new_neighbors: Vec = Vec::with_capacity(self.s); + let reversed_old_neighbors: Vec = Vec::with_capacity(self.s); + for _j in 0..self.s { + let rand_val = rand::thread_rng().gen_range(0..self.nodes.len()); + reversed_new_neighbors.push(rand_val); + } + ( + nn_new_neighbors, + nn_old_neighbors, + reversed_new_neighbors, + reversed_old_neighbors, + ) + }) + .collect(); + } + + fn iterate_nn(&self) -> (usize, FixedBitSet) { + let my_graph = &self.graph; + let length = self.nodes.len(); + // let (sender, receiver) = mpsc::channel(); + + // cc += (0..self.nodes.len()) + self.calculation_context + .par_iter() + .map( + |( + nn_new_neighbors, + nn_old_neighbors, + reversed_new_neighbors, + reversed_old_neighbors, + )| { + let mut flags = FixedBitSet::with_capacity(length * length); + let mut ccc: usize = 0; + for j in 0..nn_new_neighbors.len() { + for k in j..nn_new_neighbors.len() { + if self.update(nn_new_neighbors[j], nn_new_neighbors[k], &my_graph) { + ccc += 1; + } + flags.insert(nn_new_neighbors[j] * length + nn_new_neighbors[k]); + flags.insert(nn_new_neighbors[k] * length + nn_new_neighbors[j]); + } + } + + nn_new_neighbors.iter().for_each(|j| { + nn_old_neighbors.iter().for_each(|k| { + if self.update(*j, *k, &my_graph) { + ccc += 1; + } + 
flags.insert(j * length + k); + flags.insert(k * length + j); + }) + }); + + for j in 0..reversed_new_neighbors.len() { + for k in j..reversed_new_neighbors.len() { + if reversed_new_neighbors[j] >= reversed_new_neighbors[k] { + continue; + } + if self.update( + reversed_new_neighbors[j], + reversed_new_neighbors[k], + &my_graph, + ) { + ccc += 1; + } + flags.insert( + reversed_new_neighbors[j] * length + reversed_new_neighbors[k], + ); + flags.insert( + reversed_new_neighbors[k] * length + reversed_new_neighbors[j], + ); + } + } + reversed_new_neighbors.iter().for_each(|j| { + reversed_old_neighbors.iter().for_each(|k| { + if self.update(*j, *k, &my_graph) { + ccc += 1; + } + flags.insert(j * length + k); + flags.insert(k * length + j); + }) + }); + + nn_new_neighbors.iter().for_each(|j| { + reversed_old_neighbors.iter().for_each(|k| { + if self.update(*j, *k, &my_graph) { + ccc += 1; + } + flags.insert(j * length + k); + flags.insert(k * length + j); + }) + }); + + nn_new_neighbors.iter().for_each(|j| { + reversed_new_neighbors.iter().for_each(|k| { + if self.update(*j, *k, &my_graph) { + ccc += 1; + } + flags.insert(j * length + k); + flags.insert(k * length + j); + }) + }); + + nn_old_neighbors.iter().for_each(|j| { + reversed_new_neighbors.iter().for_each(|k| { + if self.update(*j, *k, &my_graph) { + ccc += 1; + } + flags.insert(j * length + k); + flags.insert(k * length + j); + }) + }); + (ccc, flags) + }, + ) + .reduce( + || (0, FixedBitSet::with_capacity(length * length)), + |(ccc1, mut flags1), (ccc2, flags2)| { + flags1.union_with(&flags2); + (ccc1 + ccc2, flags1) + }, + ) + } + + fn iterate(&mut self) -> usize { + self.update_cnt = 0; + self.cost = 0; + + let (cc, flags) = self.iterate_nn(); + self.visited_id.union_with(&flags); + + // s.send(flags).unwrap(); + // ccc + // }) + // .sum::(); + + // receiver.iter().for_each(|flags| { + // flags.iter().for_each(|j| { + // self.visited_id.set(*j, true); + // }); + // }); + + 
self.graph.par_iter().for_each(|graph| { + while graph.lock().unwrap().len() > self.k { + graph.lock().unwrap().pop(); + } + }); + + self.cost += cc; + let mut t = 0; + + let (sender2, receiver2) = mpsc::channel(); + // let pending_status2: Vec<(usize, usize, Vec, Vec, Vec)> = (0..self + // .nodes + // .len()) + t += (0..self.nodes.len()) + .into_par_iter() + .map_with(sender2, |s, i| { + // .map(|i| { + let mut nn_new_neighbors = Vec::with_capacity(self.graph[i].lock().unwrap().len()); + let mut nn_old_neighbors = Vec::with_capacity(self.graph[i].lock().unwrap().len()); + let mut flags = Vec::with_capacity(self.graph[i].lock().unwrap().len()); + let graph_item: Vec> = + self.graph[i].lock().unwrap().clone().into_vec(); + + let mut tt: usize = 0; + + for (j, the_graph_item) in graph_item.iter().enumerate().take(self.k) { + if the_graph_item.idx() == self.nodes.len() { + // init value, pass + continue; + } + if self + .visited_id + .contains(self.nodes.len() * i + the_graph_item.idx()) + { + nn_new_neighbors.push(j); + } else { + nn_old_neighbors.push(the_graph_item.idx()); + } + } + + tt += nn_new_neighbors.len(); + + if nn_new_neighbors.len() > self.s { + let mut rng = rand::thread_rng(); + nn_new_neighbors.shuffle(&mut rng); + nn_new_neighbors = nn_new_neighbors[self.s..].to_vec(); + } + + nn_new_neighbors.iter_mut().for_each(|j| { + flags.push(i * self.nodes.len() + graph_item[*j].idx()); + *j = graph_item[*j].idx(); + }); + + s.send((i, nn_new_neighbors, nn_old_neighbors, flags)) + .unwrap(); + tt + // (i, tt, nn_new_neighbors, nn_old_neighbors, flags) + }) + // .collect(); + .sum::(); + + // t += pending_status2 + // .iter() + // .map(|(i, tt, nn_new_neighbors, nn_old_neighbors, flags)| { + // self.nn_new_neighbors[*i] = nn_new_neighbors.to_vec(); + // self.nn_old_neighbors[*i] = nn_old_neighbors.to_vec(); + // flags.iter().for_each(|j| { + // self.visited_id.set(*j, false); + // }); + // tt + // }) + // .sum::(); + + receiver2 + .iter() + .for_each(|(i, 
nn_new_neighbors, nn_old_neighbors, flags)| { + self.calculation_context[i].0 = nn_new_neighbors; + self.calculation_context[i].1 = nn_old_neighbors; + flags.iter().for_each(|j| { + self.visited_id.set(*j, false); + }); + }); + + let reversed_new_neighbors = vec![Arc::new(Mutex::new(Vec::new())); self.nodes.len()]; + let reversed_old_neighbors = vec![Arc::new(Mutex::new(Vec::new())); self.nodes.len()]; + + (0..self.nodes.len()).into_par_iter().for_each(|i| { + for e in 0..self.calculation_context[i].1.len() { + reversed_old_neighbors[self.calculation_context[i].1[e]] + .lock() + .unwrap() + .push(i); + } + for e in 0..self.calculation_context[i].0.len() { + reversed_new_neighbors[self.calculation_context[i].0[e]] + .lock() + .unwrap() + .push(i); + } + }); + + (0..self.nodes.len()).into_par_iter().for_each(|i| { + let mut rng = rand::thread_rng(); + if reversed_old_neighbors[i].lock().unwrap().len() > self.s { + reversed_old_neighbors[i].lock().unwrap().shuffle(&mut rng); + reversed_old_neighbors[i].lock().unwrap().resize(self.s, 0); + } + if reversed_new_neighbors[i].lock().unwrap().len() > self.s { + reversed_new_neighbors[i].lock().unwrap().shuffle(&mut rng); + reversed_new_neighbors[i].lock().unwrap().resize(self.s, 0); + } + }); + + (0..self.nodes.len()).for_each(|i| { + self.calculation_context[i].2 = reversed_new_neighbors[i] + .lock() + .unwrap() + .iter() + .copied() + .collect(); + self.calculation_context[i].3 = reversed_old_neighbors[i] + .lock() + .unwrap() + .iter() + .copied() + .collect(); + }); + + t + } + + fn graph(&self) -> Vec>> { + let mut graph: Vec>> = Vec::with_capacity(self.graph.len()); + for iter in self.graph.iter() { + graph.push(iter.lock().unwrap().iter().cloned().collect()); + } + graph + } + + fn cost(&self) -> &usize { + &self.cost + } + + fn ths_update_cnt(&self) -> &usize { + &self.update_cnt + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use crate::core::node; + use rand::distributions::{Distribution, Standard}; + use 
rand::Rng; + use std::collections::HashMap; + use std::collections::HashSet; + + use std::iter::FromIterator; + use std::time::{Duration, SystemTime, UNIX_EPOCH}; + fn make_normal_distribution_clustering( + clustering_n: usize, + node_n: usize, + dimension: usize, + range: f64, + ) -> ( + Vec>, // center of cluster + Vec>, // cluster data + ) { + let mut bases: Vec> = Vec::new(); + let mut ns: Vec> = Vec::new(); + for _i in 0..clustering_n { + let mut rng = rand::thread_rng(); + let mut base: Vec = Vec::with_capacity(dimension); + for _i in 0..dimension { + let n: f64 = rng.gen::() * range; // base number + base.push(n); + } + + let v_iter: Vec = rng + .sample_iter(&Standard) + .take(dimension * node_n) + .collect::>() + .clone(); + for _i in 0..node_n { + let mut vec_item = Vec::with_capacity(dimension); + for i in 0..dimension { + let vv = v_iter[_i * dimension..(_i + 1) * dimension][i] + base[i]; // add normal distribution noise + vec_item.push(vv); + } + ns.push(vec_item); + } + bases.push(base); + } + + (bases, ns) + } + + #[test] + fn knn_nn_descent() { + let dimension = 2; + let nodes_every_cluster = 10; + let node_n = 1000; + let (_, ns) = + make_normal_distribution_clustering(node_n, nodes_every_cluster, dimension, 10000000.0); + println!("hello world {:?}", ns.len()); + + let mut data = Vec::new(); + for i in 0..ns.len() { + data.push(Box::new(node::Node::new_with_idx(&ns[i], i))); + } + + let mut graph: Vec>> = vec![Vec::new(); data.len()]; + let base_start = SystemTime::now(); + naive_build_knn_graph::(&data, metrics::Metric::Euclidean, 100, &mut graph); + let base_since_the_epoch = SystemTime::now() + .duration_since(base_start) + .expect("Time went backwards"); + println!( + "test for {:?} times, base use {:?} millisecond", + ns.len(), + base_since_the_epoch.as_millis() + ); + + let base_start = SystemTime::now(); + let mut nn_descent_handler = + NNDescentHandler::new(&data, metrics::Metric::Euclidean, 100, 0.2); + nn_descent_handler.init(); + + let 
try_times = 8; + let mut ground_truth: HashMap> = HashMap::new(); + for i in 0..graph.len() { + ground_truth.insert(i, HashSet::from_iter(graph[i].iter().map(|x| x.idx()))); + } + // let guard = pprof::ProfilerGuard::new(100).unwrap(); + for _p in 0..try_times { + let cc = nn_descent_handler.iterate(); + let mut error = 0; + for i in 0..nn_descent_handler.graph.len() { + let nn_descent_handler_val: Vec> = nn_descent_handler.graph[i] + .lock() + .unwrap() + .iter() + .cloned() + .collect(); + for j in 0..nn_descent_handler_val.len() { + if !ground_truth[&i].contains(&nn_descent_handler_val[j].idx()) { + error += 1; + } + } + } + println!( + "error {} /{:?} cc {:?} cost {:?} update_cnt {:?}", + error, + data.len() * 10, + cc, + nn_descent_handler.cost(), + nn_descent_handler.ths_update_cnt(), + ); + } + // if let Ok(report) = guard.report().build() { + // let file = File::create("flamegraph.svg").unwrap(); + // report.flamegraph(file).unwrap(); + // }; + + let base_since_the_epoch = SystemTime::now() + .duration_since(base_start) + .expect("Time went backwards"); + println!( + "test for {:?} times, base use {:?} millisecond", + ns.len(), + base_since_the_epoch.as_millis() + ); + } +} diff --git a/src/core/metrics.rs b/src/core/metrics.rs new file mode 100644 index 0000000..bff2577 --- /dev/null +++ b/src/core/metrics.rs @@ -0,0 +1,109 @@ +extern crate num; +use crate::core::calc::dot; + +use crate::core::node::FloatElement; +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Clone, Copy, Eq, PartialEq, Serialize, Deserialize)] +pub enum Metric { + Unknown, + Manhattan, + DotProduct, + Euclidean, + CosineSimilarity, + Angular, +} + +impl Default for Metric { + fn default() -> Self { + Metric::Unknown + } +} + +// TODO: SIMD support +// TODO: make these func private +pub fn metric(vec1: &[T], vec2: &[T], mt: Metric) -> Result +where + T: FloatElement, +{ + match mt { + Metric::Euclidean => euclidean_distance(vec1, vec2), + Metric::Manhattan => 
manhattan_distance(vec1, vec2), + Metric::DotProduct => dot_product(vec1, vec2), + Metric::CosineSimilarity => cosine_similarity(vec1, vec2), + Metric::Angular => angular_distance(vec1, vec2), + Metric::Unknown => Result::Err("unknown method"), + } +} + +#[allow(dead_code)] +pub fn range_metric( + vec1: &[T], + vec2: &[T], + mt: Metric, + begin: usize, + end: usize, +) -> Result +where + T: FloatElement, +{ + metric(&vec1[begin..end], &vec2[begin..end], mt) +} + +pub fn dot_product(vec1: &[T], vec2: &[T]) -> Result +where + T: FloatElement, +{ + assert_eq!(vec1.len(), vec2.len()); + // smaller means closer. + match dot(vec1, vec2) { + Ok(x) => Result::Ok(-x), + Err(err) => Err(err), + } +} + +pub fn manhattan_distance(vec1: &[T], vec2: &[T]) -> Result +where + T: FloatElement, +{ + T::manhattan_distance(vec1, vec2) +} + +pub fn euclidean_distance(vec1: &[T], vec2: &[T]) -> Result +where + T: FloatElement, +{ + T::euclidean_distance(vec1, vec2) +} + +pub fn cosine_similarity(vec1: &[T], vec2: &[T]) -> Result +where + T: FloatElement, +{ + assert_eq!(vec1.len(), vec2.len()); + // smaller means closer. 
+ Result::Ok( + -dot(vec1, vec2).unwrap() + / (dot(vec1, vec1).unwrap().sqrt() * dot(vec2, vec2).unwrap().sqrt()), + ) +} + +// (a/|a| - b/|b|)^2 +// = a^2 / a^2 + b^2 / b^2 - 2ab/|a||b| +// = 2 - 2cos +pub fn angular_distance(vec1: &[T], vec2: &[T]) -> Result +where + T: FloatElement, +{ + assert_eq!(vec1.len(), vec2.len()); + let rhd = dot(vec1, vec1).unwrap(); + let lhd = dot(vec2, vec2).unwrap(); + let rldot = dot(vec1, vec2).unwrap(); + let rlmul = rhd * lhd; + let two = T::float_two(); + if rlmul > T::float_zero() { + Result::Ok(two - two * rldot / rlmul.sqrt()) + } else { + Result::Ok(two) + } +} diff --git a/src/core/mod.rs b/src/core/mod.rs new file mode 100644 index 0000000..ce2693a --- /dev/null +++ b/src/core/mod.rs @@ -0,0 +1,10 @@ +pub mod ann_index; +pub mod arguments; +pub mod calc; +pub mod kmeans; +pub mod knn; +pub mod metrics; +pub mod neighbor; +pub mod node; +pub mod random; +pub mod simd_metrics; diff --git a/src/core/neighbor.rs b/src/core/neighbor.rs new file mode 100644 index 0000000..be63c62 --- /dev/null +++ b/src/core/neighbor.rs @@ -0,0 +1,41 @@ +extern crate num; +use crate::core::node; +use core::cmp::Ordering; + +// util class +#[derive(Default, Clone, PartialEq, Debug)] +pub struct Neighbor { + pub _idx: T, + pub _distance: E, +} + +impl Neighbor { + pub fn new(idx: T, distance: E) -> Neighbor { + Neighbor { + _idx: idx, + _distance: distance, + } + } + + pub fn idx(&self) -> T { + self._idx.clone() + } + + pub fn distance(&self) -> E { + self._distance + } +} + +impl Ord for Neighbor { + fn cmp(&self, other: &Neighbor) -> Ordering { + self._distance.partial_cmp(&other._distance).unwrap() + } +} + +impl PartialOrd for Neighbor { + fn partial_cmp(&self, other: &Neighbor) -> Option { + Some(self.cmp(other)) + } +} + +impl Eq for Neighbor {} diff --git a/src/core/node.rs b/src/core/node.rs new file mode 100644 index 0000000..9e9f443 --- /dev/null +++ b/src/core/node.rs @@ -0,0 +1,214 @@ +#![allow(dead_code)] +use crate::core::metrics; 
+ +use crate::core::simd_metrics; + +use core::hash::Hash; +use core::iter::Sum; +use num::traits::{FromPrimitive, NumAssign}; +use serde::{Deserialize, Serialize}; +#[cfg(feature = "use_smallvec")] +use smallvec; + +#[cfg(feature = "use_smallvec")] +const SMALLVEC_HOLD_NUM: usize = 64; +#[cfg(feature = "smallvec_128")] +const SMALLVEC_HOLD_NUM: usize = 128; + +pub trait FloatElement: + FromPrimitive + + Sized + + Default + + num::Zero + + num::traits::FloatConst + + core::fmt::Debug + + Clone + + Copy + + PartialEq + + PartialOrd + + NumAssign + + num::Signed + + num::Float + + Sync + + Send + + Sum + + Serialize + + simd_metrics::SIMDOptmized +{ + // TODO: make it static + fn float_one() -> Self; + + fn float_two() -> Self; + + fn float_zero() -> Self; + + fn zero_patch_num() -> Self; +} + +pub trait IdxType: + Sized + Clone + Default + core::fmt::Debug + Eq + Ord + Sync + Send + Serialize + Hash +{ +} + +#[macro_export] +macro_rules! to_float_element { + ( $x:ident ) => { + impl FloatElement for $x { + fn float_one() -> Self { + 1.0 + } + + fn float_two() -> Self { + 1.0 + } + + fn float_zero() -> Self { + 0.0 + } + + fn zero_patch_num() -> Self { + 1.34e-6 + } + } + }; +} + +#[macro_export] +macro_rules! 
to_idx_type { + ( $x:ident ) => { + impl IdxType for $x {} + }; +} + +to_float_element!(f64); +to_float_element!(f32); +to_idx_type!(String); +to_idx_type!(usize); +to_idx_type!(i16); +to_idx_type!(i32); +to_idx_type!(i64); +to_idx_type!(i128); +to_idx_type!(u16); +to_idx_type!(u32); +to_idx_type!(u64); +to_idx_type!(u128); + +#[derive(Clone, Debug, Default, Serialize, Deserialize)] +pub struct Node { + #[cfg(any(feature = "use_smallvec", feature = "smallvec_128"))] + vectors: smallvec::SmallVec<[E; SMALLVEC_HOLD_NUM]>, // the vectors; + #[cfg(not(feature = "use_smallvec"))] + vectors: Vec, + idx: Option, // data id, it can be any type; +} + +impl Node { + #[cfg(any(feature = "use_smallvec", feature = "smallvec_128"))] + pub fn new(vectors: &[E]) -> Node { + Node::::valid_elements(vectors); + + Node { + vectors: smallvec::SmallVec::from_slice(vectors), + idx: Option::None, + } + } + + #[cfg(not(feature = "use_smallvec"))] + pub fn new(vectors: &[E]) -> Node { + Node::::valid_elements(vectors); + Node { + vectors: vectors.to_vec(), + idx: Option::None, + } + } + + pub fn new_with_idx(vectors: &[E], id: T) -> Node { + let mut n = Node::new(vectors); + n.set_idx(id); + n + } + + pub fn metric(&self, other: &Node, t: metrics::Metric) -> Result { + metrics::metric(&self.vectors, &other.vectors, t) + } + + // const value + #[cfg(any(feature = "use_smallvec", feature = "smallvec_128"))] + pub fn vectors(&self) -> &smallvec::SmallVec<[E; SMALLVEC_HOLD_NUM]> { + &self.vectors + } + + #[cfg(not(feature = "use_smallvec"))] + pub fn vectors(&self) -> &Vec { + &self.vectors + } + + #[cfg(any(feature = "use_smallvec", feature = "smallvec_128"))] + pub fn mut_vectors(&mut self) -> &mut smallvec::SmallVec<[E; SMALLVEC_HOLD_NUM]> { + &mut self.vectors + } + + #[cfg(not(feature = "use_smallvec"))] + pub fn mut_vectors(&mut self) -> &mut Vec { + &mut self.vectors + } + + #[cfg(any(feature = "use_smallvec", feature = "smallvec_128"))] + pub fn set_vectors(&mut self, v: &[E]) { + 
self.vectors = smallvec::SmallVec::from_slice(v); + } + + #[cfg(not(feature = "use_smallvec"))] + pub fn set_vectors(&mut self, v: &[E]) { + self.vectors = v.to_vec(); + } + + pub fn push(&mut self, e: &E) { + self.vectors.push(*e); + } + + pub fn len(&self) -> usize { + self.vectors.len() + } + + pub fn is_empty(&self) -> bool { + self.vectors.is_empty() + } + + pub fn idx(&self) -> &Option { + &self.idx + } + + fn set_idx(&mut self, id: T) { + self.idx = Option::Some(id); + } + + fn valid_elements(vectors: &[E]) -> bool { + for e in vectors.iter() { + if e.is_nan() || e.is_infinite() { + //TODO: log + panic!("invalid float element"); + } + } + true + } +} + +impl core::fmt::Display for Node { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "(key: {:#?}, vectors: {:#?})", self.idx, self.vectors) + } +} + +// general method + +#[cfg(test)] +#[test] +fn node_test() { + // f64 + let v = vec![0.1, 0.2]; + let v2 = vec![0.2, 0.1]; + let n = Node::::new(&v); + let n2 = Node::::new(&v2); + n.metric(&n2, metrics::Metric::Manhattan).unwrap(); +} diff --git a/src/core/random.rs b/src/core/random.rs new file mode 100644 index 0000000..5c7b05c --- /dev/null +++ b/src/core/random.rs @@ -0,0 +1,18 @@ +use rand::prelude::*; + +pub trait Random { + fn kiss() -> T; + fn flip() -> bool; + fn index(n: usize) -> usize; +} + +// TODO: use random +pub fn flip() -> bool { + let mut rng = rand::thread_rng(); + rng.gen_range(0..10) > 5 +} + +pub fn index(n: usize) -> usize { + let mut rng = rand::thread_rng(); + rng.gen_range(0..n) +} diff --git a/src/core/simd_metrics.rs b/src/core/simd_metrics.rs new file mode 100644 index 0000000..127aacf --- /dev/null +++ b/src/core/simd_metrics.rs @@ -0,0 +1,109 @@ +use crate::core::calc::same_dimension; +#[cfg(feature = "simd")] +use packed_simd::{f32x16, f32x4, f32x8, f64x4}; + +pub trait SIMDOptmized { + fn dot_product(a: &[T], b: &[T]) -> Result; + fn manhattan_distance(a: &[T], b: &[T]) -> Result; + fn 
euclidean_distance(a: &[T], b: &[T]) -> Result; +} + +macro_rules! simd_optimized_impl { + ( $type_id:ident, $simd_type:ident ,$size: expr ,$simd_size:expr) => { + impl SIMDOptmized for $type_id { + fn dot_product(a: &[$type_id], b: &[$type_id]) -> Result<$type_id, &'static str> { + assert_eq!(a.len(), b.len()); + + #[cfg(feature = $simd_size)] + { + let size = a.len() - (a.len() % $size); + let c = a + .chunks_exact($size) + .map($simd_type::from_slice_unaligned) + .zip(b.chunks_exact($size).map($simd_type::from_slice_unaligned)) + .map(|(a, b)| a * b) + .sum::<$simd_type>() + .sum(); + let d: $type_id = a[size..].iter().zip(&b[size..]).map(|(p, q)| p * q).sum(); + Ok(c + d) + } + #[cfg(not(feature = $simd_size))] + { + Ok(a.iter().zip(b).map(|(p, q)| p * q).sum::<$type_id>()) + } + } + + fn manhattan_distance( + a: &[$type_id], + b: &[$type_id], + ) -> Result<$type_id, &'static str> { + assert_eq!(a.len(), b.len()); + + #[cfg(feature = $simd_size)] + { + let size = a.len() - (a.len() % $size); + let c = a + .chunks_exact($size) + .map($simd_type::from_slice_unaligned) + .zip(b.chunks_exact($size).map($simd_type::from_slice_unaligned)) + .map(|(a, b)| (a - b).abs()) + .sum::<$simd_type>() + .sum(); + let d: $type_id = a[size..] + .iter() + .zip(&b[size..]) + .map(|(p, q)| (p - q).abs()) + .sum(); + Ok(c + d) + } + + #[cfg(not(feature = $simd_size))] + { + Ok(a.iter() + .zip(b) + .map(|(p, q)| (p - q).abs()) + .sum::<$type_id>()) + } + } + + fn euclidean_distance( + a: &[$type_id], + b: &[$type_id], + ) -> Result<$type_id, &'static str> { + same_dimension(a, b)?; + + #[cfg(feature = $simd_size)] + { + let size = a.len() - (a.len() % $size); + let c = a + .chunks_exact($size) + .map($simd_type::from_slice_unaligned) + .zip(b.chunks_exact($size).map($simd_type::from_slice_unaligned)) + .map(|(a, b)| { + let c = (a - b); + c * c + }) + .sum::<$simd_type>() + .sum(); + + let d: $type_id = a[size..] 
+ .iter() + .zip(&b[size..]) + .map(|(p, q)| (p - q).powi(2)) + .sum(); + Ok((d + c)) + } + #[cfg(not(feature = $simd_size))] + { + Ok(a.iter() + .zip(b) + .map(|(p, q)| (p - q).powi(2)) + .sum::<$type_id>()) + } + } + } + }; +} + +simd_optimized_impl!(f32, f32x16, 16, "simd"); +simd_optimized_impl!(f64, f64x8, 8, "simd"); diff --git a/src/index/bruteforce_idx.rs b/src/index/bruteforce_idx.rs new file mode 100644 index 0000000..e552c3a --- /dev/null +++ b/src/index/bruteforce_idx.rs @@ -0,0 +1,112 @@ +#![allow(dead_code)] +use crate::core::ann_index; +use crate::core::arguments; +use crate::core::metrics; +use crate::core::neighbor; +use crate::core::node; +use crate::index::bruteforce_params::BruteForceParams; +use serde::de::DeserializeOwned; +use serde::{Deserialize, Serialize}; +use std::collections::BinaryHeap; + +use std::fs::File; + +use std::io::Write; + +#[derive(Debug, Serialize, Deserialize)] +pub struct BruteForceIndex { + #[serde(skip_serializing, skip_deserializing)] + nodes: Vec>>, + tmp_nodes: Vec>, // only use for serialization scene + mt: metrics::Metric, + dimension: usize, +} + +impl BruteForceIndex { + pub fn new(dimension: usize, params: BruteForceParams) -> BruteForceIndex { + BruteForceIndex:: { + nodes: Vec::new(), + mt: metrics::Metric::Unknown, + tmp_nodes: Vec::new(), + dimension, + } + } +} + +impl ann_index::ANNIndex for BruteForceIndex { + fn build(&mut self, mt: metrics::Metric) -> Result<(), &'static str> { + self.mt = mt; + Result::Ok(()) + } + fn add_node(&mut self, item: &node::Node) -> Result<(), &'static str> { + self.nodes.push(Box::new(item.clone())); + Result::Ok(()) + } + fn built(&self) -> bool { + true + } + fn node_search_k( + &self, + item: &node::Node, + k: usize, + _args: &arguments::Args, + ) -> Vec<(node::Node, E)> { + let mut heap = BinaryHeap::with_capacity(k + 1); + self.nodes + .iter() + .zip(0..self.nodes.len()) + .for_each(|(node, i)| { + heap.push(neighbor::Neighbor::new( + // use max heap, and every time pop 
out the greatest one in the heap + i, + item.metric(node, self.mt).unwrap(), + )); + if heap.len() > k { + let _xp = heap.pop().unwrap(); + } + }); + + let mut result = Vec::with_capacity(heap.len()); + while !heap.is_empty() { + let neighbor_rev = heap.pop().unwrap(); + result.push(( + *self.nodes[neighbor_rev.idx()].clone(), + neighbor_rev.distance(), + )) + } + result.reverse(); + result + } + + fn name(&self) -> &'static str { + "BruteForceIndex" + } + + fn dimension(&self) -> usize { + self.dimension + } +} + +impl + ann_index::SerializableIndex for BruteForceIndex +{ + fn load(path: &str, _args: &arguments::Args) -> Result { + let file = File::open(path).unwrap_or_else(|_| panic!("unable to open file {:?}", path)); + let mut instance: BruteForceIndex = bincode::deserialize_from(file).unwrap(); + instance.nodes = instance + .tmp_nodes + .iter() + .map(|x| Box::new(x.clone())) + .collect(); + Ok(instance) + } + + fn dump(&mut self, path: &str, _args: &arguments::Args) -> Result<(), &'static str> { + self.tmp_nodes = self.nodes.iter().map(|x| *x.clone()).collect(); + let encoded_bytes = bincode::serialize(&self).unwrap(); + let mut file = File::create(path).unwrap(); + file.write_all(&encoded_bytes) + .unwrap_or_else(|_| panic!("unable to write file {:?}", path)); + Result::Ok(()) + } +} diff --git a/src/index/bruteforce_params.rs b/src/index/bruteforce_params.rs new file mode 100644 index 0000000..de6498e --- /dev/null +++ b/src/index/bruteforce_params.rs @@ -0,0 +1,20 @@ +#![allow(dead_code)] +use crate::core::ann_index; +use crate::core::arguments; +use crate::core::metrics; +use crate::core::neighbor; +use crate::core::node; +use serde::de::DeserializeOwned; +use serde::{Deserialize, Serialize}; +use std::collections::BinaryHeap; + +#[derive(Debug, Serialize, Deserialize)] +pub struct BruteForceParams {} + +impl BruteForceParams {} + +impl Default for BruteForceParams { + fn default() -> Self { + BruteForceParams {} + } +} diff --git a/src/index/hnsw_idx.rs 
b/src/index/hnsw_idx.rs new file mode 100644 index 0000000..ceaeaff --- /dev/null +++ b/src/index/hnsw_idx.rs @@ -0,0 +1,743 @@ +#![allow(dead_code)] +use crate::core::ann_index; +use crate::core::arguments; +use crate::core::metrics; +use crate::core::neighbor::Neighbor; +use crate::core::node; +use crate::index::hnsw_params::HNSWParams; +use fixedbitset::FixedBitSet; +#[cfg(feature = "without_std")] +use hashbrown::HashMap; +#[cfg(feature = "without_std")] +use hashbrown::HashSet; +use rand::prelude::*; +use rayon::{iter::IntoParallelIterator, prelude::*}; +use serde::de::DeserializeOwned; +use serde::{Deserialize, Serialize}; +use std::collections::BinaryHeap; + +#[cfg(not(feature = "without_std"))] +use std::collections::HashMap; +#[cfg(not(feature = "without_std"))] +use std::collections::HashSet; +use std::fs::File; +use std::io::Write; + +use std::sync::RwLock; + +#[derive(Default, Debug, Serialize, Deserialize)] +pub struct HNSWIndex { + _dimension: usize, // dimension + _n_items: usize, // next item count + _n_constructed_items: usize, + _max_item: usize, + _n_neighbor: usize, // neighbor num except level 0 + _n_neighbor0: usize, // neight num of level 0 + _max_level: usize, //max level + _cur_level: usize, //current level + #[serde(skip_serializing, skip_deserializing)] + _id2neighbor: Vec>>>, //neight_id from level 1 to level _max_level + #[serde(skip_serializing, skip_deserializing)] + _id2neighbor0: Vec>>, //neigh_id at level 0 + #[serde(skip_serializing, skip_deserializing)] + _nodes: Vec>>, // data saver + #[serde(skip_serializing, skip_deserializing)] + _item2id: HashMap, //item_id to id in Hnsw + _root_id: usize, //root of hnsw + _id2level: Vec, + _has_removed: bool, + _ef_build: usize, // num of max candidates when building + _ef_search: usize, // num of max candidates when searching + #[serde(skip_serializing, skip_deserializing)] + _delete_ids: HashSet, //save deleted ids + mt: metrics::Metric, //compute metrics + + // use for serde + 
_id2neighbor_tmp: Vec>>, + _id2neighbor0_tmp: Vec>, + _nodes_tmp: Vec>, + _item2id_tmp: Vec<(T, usize)>, + _delete_ids_tmp: Vec, +} + +impl HNSWIndex { + pub fn new(dimension: usize, params: &HNSWParams) -> HNSWIndex { + HNSWIndex { + _dimension: dimension, + _n_items: 0, + _n_constructed_items: 0, + _max_item: params.max_item, + _n_neighbor: params.n_neighbor, + _n_neighbor0: params.n_neighbor0, + _max_level: params.max_level, + _cur_level: 0, + _root_id: 0, + _has_removed: params.has_deletion, + _ef_build: params.ef_build, + _ef_search: params.ef_search, + mt: metrics::Metric::Unknown, + ..Default::default() + } + } + + fn get_random_level(&self) -> usize { + let mut rng = rand::thread_rng(); + let mut ret = 0; + while ret < self._max_level { + if rng.gen_range(0.0..1.0) > 0.5 { + ret += 1; + } else { + break; + } + } + ret + } + //input top_candidate as max top heap + //return min top heap in top_candidates, delete part candidate + fn get_neighbors_by_heuristic2( + &self, + sorted_list: &[Neighbor], + ret_size: usize, + ) -> Vec> { + let sorted_list_len = sorted_list.len(); + let mut return_list: Vec> = Vec::with_capacity(sorted_list_len); + + for iter in sorted_list.iter() { + if return_list.len() >= ret_size { + break; + } + + let idx = iter.idx(); + let distance = iter._distance; + if sorted_list_len < ret_size { + return_list.push(Neighbor::new(idx, distance)); + continue; + } + + let mut good = true; + + for ret_neighbor in return_list.iter() { + let cur2ret_dis = self.get_distance_from_id(idx, ret_neighbor.idx()); + if cur2ret_dis < distance { + good = false; + break; + } + } + + if good { + return_list.push(Neighbor::new(idx, distance)); + } + } + + return_list // from small to large + } + + fn get_neighbor(&self, id: usize, level: usize) -> &RwLock> { + if level == 0 { + return &self._id2neighbor0[id]; + } + &self._id2neighbor[id][level - 1] + } + + #[allow(dead_code)] + fn get_level(&self, id: usize) -> usize { + self._id2level[id] + } + + fn 
connect_neighbor( + &self, + cur_id: usize, + sorted_candidates: &[Neighbor], + level: usize, + is_update: bool, + ) -> Result { + let n_neigh = if level == 0 { + self._n_neighbor0 + } else { + self._n_neighbor + }; + let selected_neighbors = self.get_neighbors_by_heuristic2(sorted_candidates, n_neigh); + if selected_neighbors.len() > n_neigh { + return Err("Should be not be more than M_ candidates returned by the heuristic"); + } + // println!("{:?}",top_candidates); + if selected_neighbors.is_empty() { + return Err("top candidate is empty, impossible!"); + } + + let next_closest_entry_point = selected_neighbors[0].idx(); + + { + let mut cur_neigh = self.get_neighbor(cur_id, level).write().unwrap(); + cur_neigh.clear(); + selected_neighbors.iter().for_each(|selected_neighbor| { + cur_neigh.push(selected_neighbor.idx()); + }); + } + + for selected_neighbor in selected_neighbors.iter() { + let mut neighbor_of_selected_neighbors = self + .get_neighbor(selected_neighbor.idx(), level) + .write() + .unwrap(); + if neighbor_of_selected_neighbors.len() > n_neigh { + return Err("Bad Value of neighbor_of_selected_neighbors"); + } + if selected_neighbor.idx() == cur_id { + return Err("Trying to connect an element to itself"); + } + + let mut is_cur_id_present = false; + + if is_update { + for iter in neighbor_of_selected_neighbors.iter() { + if *iter == cur_id { + is_cur_id_present = true; + break; + } + } + } + + if !is_cur_id_present { + if neighbor_of_selected_neighbors.len() < n_neigh { + neighbor_of_selected_neighbors.push(cur_id); + } else { + let d_max = self.get_distance_from_id(cur_id, selected_neighbor.idx()); + + let mut candidates: BinaryHeap> = BinaryHeap::new(); + candidates.push(Neighbor::new(cur_id, d_max)); + for iter in neighbor_of_selected_neighbors.iter() { + let neighbor_id = *iter; + let d_neigh = + self.get_distance_from_id(neighbor_id, selected_neighbor.idx()); + candidates.push(Neighbor::new(neighbor_id, d_neigh)); + } + let return_list = + 
// (Tail of `connect_neighbor`, whose opening lies on the preceding line: the re-pruned
// candidate list replaces the selected neighbor's adjacency list.)
                        self.get_neighbors_by_heuristic2(&candidates.into_sorted_vec(), n_neigh);

                    neighbor_of_selected_neighbors.clear();
                    for neighbor_in_list in return_list {
                        neighbor_of_selected_neighbors.push(neighbor_in_list.idx());
                    }
                }
            }
        }

        Ok(next_closest_entry_point)
    }

    // NOTE(review): generic argument lists (`<E, T>`, `<E, usize>`) were lost when this file
    // was extracted; they are restored below from how the values are used — confirm against
    // `core::node` and `core::neighbor`.

    /// Marks the internal id `id` as deleted by recording it in `_delete_ids`.
    ///
    /// # Errors
    /// * `"Invalid delete id"` when `id` does not refer to a constructed item. Ids are handed
    ///   out from `0`, so the valid range is `0.._n_constructed_items`.
    /// * `"id has deleted"` when `is_deleted(id)` already reports the id as deleted.
    #[allow(dead_code)]
    fn delete_id(&mut self, id: usize) -> Result<(), &'static str> {
        // `>=`, not `>`: `_n_constructed_items` itself is one past the last constructed id.
        if id >= self._n_constructed_items {
            return Err("Invalid delete id");
        }
        if self.is_deleted(id) {
            return Err("id has deleted");
        }
        self._delete_ids.insert(id);
        Ok(())
    }

    /// Returns `true` when deletion support is enabled (`_has_removed`) and `id` has been
    /// recorded in `_delete_ids`.
    fn is_deleted(&self, id: usize) -> bool {
        self._has_removed && self._delete_ids.contains(&id)
    }

    /// Borrows the node stored under the internal id `id`.
    ///
    /// # Panics
    /// Panics when `id` is out of bounds for `_nodes`.
    fn get_data(&self, id: usize) -> &node::Node<E, T> {
        &self._nodes[id]
    }

    /// Distance between two nodes under the index metric `self.mt`.
    ///
    /// # Panics
    /// Panics when `metrics::metric` fails for the two vectors.
    fn get_distance_from_vec(&self, x: &node::Node<E, T>, y: &node::Node<E, T>) -> E {
        metrics::metric(x.vectors(), y.vectors(), self.mt).unwrap()
    }

    /// Distance between the nodes stored under the internal ids `x` and `y`.
    ///
    /// # Panics
    /// Panics when either id is out of bounds or `metrics::metric` fails.
    fn get_distance_from_id(&self, x: usize, y: usize) -> E {
        metrics::metric(
            self.get_data(x).vectors(),
            self.get_data(y).vectors(),
            self.mt,
        )
        .unwrap()
    }

    // Layer search seeded with several entry points; the remainder of the body follows on
    // the next line.
    fn search_layer_with_candidate(
        &self,
        search_data: &node::Node<E, T>,
        sorted_candidates: &[Neighbor<E, usize>],
        visited_id: &mut FixedBitSet,
        level: usize,
        ef: usize,
        has_deletion: bool,
    ) -> BinaryHeap<Neighbor<E, usize>> {
        let mut candidates: BinaryHeap<Neighbor<E, usize>> = BinaryHeap::new();
        let mut top_candidates: BinaryHeap<Neighbor<E, usize>> = BinaryHeap::new();
        for neighbor in sorted_candidates.iter() {
            let root = neighbor.idx();
            if !has_deletion || !self.is_deleted(root) {
                let dist = self.get_distance_from_vec(self.get_data(root), search_data);
                top_candidates.push(Neighbor::new(root, dist));
                candidates.push(Neighbor::new(root, -dist));
            } else {
                candidates.push(Neighbor::new(root, -E::max_value()))
            }
            visited_id.insert(root);
        }
        let mut lower_bound = if top_candidates.is_empty() {
            E::max_value() //max dist in top_candidates
        } else {
            top_candidates.peek().unwrap()._distance
        };

        while !candidates.is_empty() {
            let cur_neigh =
candidates.peek().unwrap(); + let cur_dist = -cur_neigh._distance; + let cur_id = cur_neigh.idx(); + candidates.pop(); + if cur_dist > lower_bound { + break; + } + let cur_neighbors = self.get_neighbor(cur_id, level).read().unwrap(); + cur_neighbors.iter().for_each(|neigh| { + if visited_id.contains(*neigh) { + return; + } + visited_id.insert(*neigh); + let dist = self.get_distance_from_vec(self.get_data(*neigh), search_data); + if top_candidates.len() < ef || dist < lower_bound { + candidates.push(Neighbor::new(*neigh, -dist)); + + if !self.is_deleted(*neigh) { + top_candidates.push(Neighbor::new(*neigh, dist)) + } + + if top_candidates.len() > ef { + top_candidates.pop(); + } + + if !top_candidates.is_empty() { + lower_bound = top_candidates.peek().unwrap()._distance; + } + } + }); + } + + top_candidates + } + //find ef nearist nodes to search data from root at level + fn search_layer( + &self, + root: usize, + search_data: &node::Node, + level: usize, + ef: usize, + has_deletion: bool, + ) -> BinaryHeap> { + let mut visited_id = FixedBitSet::with_capacity(self._nodes.len()); + let mut top_candidates: BinaryHeap> = BinaryHeap::new(); + let mut candidates: BinaryHeap> = BinaryHeap::new(); + let mut lower_bound: E; + + if !has_deletion || !self.is_deleted(root) { + let dist = self.get_distance_from_vec(self.get_data(root), search_data); + top_candidates.push(Neighbor::new(root, dist)); + candidates.push(Neighbor::new(root, -dist)); + lower_bound = dist; + } else { + lower_bound = E::max_value(); //max dist in top_candidates + candidates.push(Neighbor::new(root, -lower_bound)) + } + visited_id.insert(root); + + while !candidates.is_empty() { + let cur_neigh = candidates.peek().unwrap(); + let cur_dist = -cur_neigh._distance; + let cur_id = cur_neigh.idx(); + candidates.pop(); + if cur_dist > lower_bound { + break; + } + let cur_neighbors = self.get_neighbor(cur_id, level).read().unwrap(); + cur_neighbors.iter().for_each(|neigh| { + if visited_id.contains(*neigh) { + 
// (Tail of `search_layer`, whose opening lies on the preceding line.)
                    return;
                }
                visited_id.insert(*neigh);
                let dist = self.get_distance_from_vec(self.get_data(*neigh), search_data);
                if top_candidates.len() < ef || dist < lower_bound {
                    candidates.push(Neighbor::new(*neigh, -dist));

                    if !self.is_deleted(*neigh) {
                        top_candidates.push(Neighbor::new(*neigh, dist))
                    }

                    if top_candidates.len() > ef {
                        top_candidates.pop();
                    }

                    if !top_candidates.is_empty() {
                        lower_bound = top_candidates.peek().unwrap()._distance;
                    }
                }
            });
        }

        top_candidates
    }

    // NOTE(review): generic argument lists (`<E, T>`, `<E, usize>`) were lost when this file
    // was extracted; they are restored below from how the values are used — confirm against
    // `core::node` and `core::neighbor`.

    // fn search_layer_default(
    //     &self,
    //     root: usize,
    //     search_data: &node::Node<E, T>,
    //     level: usize,
    // ) -> BinaryHeap<Neighbor<E, usize>> {
    //     return self.search_layer(root, search_data, level, self._ef_build, false);
    // }

    /// Finds up to `k` stored items close to `search_data`.
    ///
    /// Starting at `_root_id` on level `_cur_level`, each level is walked greedily: the
    /// current entry point moves to any neighbor that is closer to the query, until no
    /// neighbor improves. The entry point reached on level 0 then seeds `search_layer`
    /// with `max(_ef_search, k)` candidates, and the result heap is popped down to `k`
    /// entries.
    ///
    /// Returns an empty heap when nothing has been constructed yet.
    ///
    /// # Errors
    /// `"cand error"` when an adjacency list refers to an id that has no stored node.
    ///
    /// # Panics
    /// Panics when a neighbor lock is poisoned or the metric computation fails.
    fn search_knn(
        &self,
        search_data: &node::Node<E, T>,
        k: usize,
    ) -> Result<BinaryHeap<Neighbor<E, usize>>, &'static str> {
        if self._n_constructed_items == 0 {
            return Ok(BinaryHeap::new());
        }

        let mut cur_id = self._root_id;
        let mut cur_dist = self.get_distance_from_vec(self.get_data(cur_id), search_data);
        let mut cur_level = self._cur_level;
        loop {
            let mut changed = true;
            while changed {
                changed = false;
                let cur_neighs = self.get_neighbor(cur_id, cur_level).read().unwrap();
                for neigh in cur_neighs.iter() {
                    // Guard the `_nodes` lookup below; a valid graph never trips this.
                    if *neigh >= self._nodes.len() {
                        return Err("cand error");
                    }
                    // Measure the *neighbor*, not the current entry point: comparing the
                    // entry point with itself can never be `< cur_dist`, which would pin
                    // the descent to the root.
                    let dist = self.get_distance_from_vec(self.get_data(*neigh), search_data);
                    if dist < cur_dist {
                        cur_dist = dist;
                        cur_id = *neigh;
                        changed = true;
                    }
                }
            }
            if cur_level == 0 {
                break;
            }
            cur_level -= 1;
        }

        // Never search with fewer candidates than the number of results requested.
        let search_range = self._ef_search.max(k);

        let mut top_candidate =
            self.search_layer(cur_id, search_data, 0, search_range, self._has_removed);
        while top_candidate.len() > k {
            top_candidate.pop();
        }

        Ok(top_candidate)
    }

    // Registers `data` under the next internal id; the remainder of the body follows on the
    // next line.
    fn init_item(&mut self, data: &node::Node<E, T>) -> usize {
        let cur_id = self._n_items;
        let mut cur_level =
self.get_random_level(); + if cur_id == 0 { + cur_level = self._max_level; + self._cur_level = cur_level; + self._root_id = cur_id; + } + let neigh0: RwLock> = RwLock::new(Vec::with_capacity(self._n_neighbor0)); + let mut neigh: Vec>> = Vec::with_capacity(cur_level); + for _i in 0..cur_level { + let level_neigh: RwLock> = RwLock::new(Vec::with_capacity(self._n_neighbor)); + neigh.push(level_neigh); + } + self._nodes.push(Box::new(data.clone())); + self._id2neighbor0.push(neigh0); + self._id2neighbor.push(neigh); + self._id2level.push(cur_level); + // self._item2id.insert(data.idx().unwrap(), cur_id); + self._n_items += 1; + cur_id + } + + fn batch_construct(&mut self, _mt: metrics::Metric) -> Result<(), &'static str> { + if self._n_items < self._n_constructed_items { + return Err("contruct error"); + } + + (self._n_constructed_items..self._n_items) + .into_par_iter() + .for_each(|insert_id: usize| { + self.construct_single_item(insert_id).unwrap(); + // println!("insert_id {}", insert_id); + }); + + // for insert_id in self._n_constructed_items..self._n_items{ + // // println!("insert id {}", insert_id); + // self.construct_single_item(insert_id); + // } + self._n_constructed_items = self._n_items; + Ok(()) + } + + fn add_item_not_constructed(&mut self, data: &node::Node) -> Result<(), &'static str> { + if data.len() != self._dimension { + return Err("dimension is different"); + } + { + // if self._item2id.contains_key(data.idx().unwrap()) { + // //to_do update point + // return Ok(self._item2id[data.idx().unwrap()]); + // } + + if self._n_items >= self._max_item { + return Err("The number of elements exceeds the specified limit"); + } + } + + let insert_id = self.init_item(data); + let _insert_level = self.get_level(insert_id); + Ok(()) + } + + fn add_single_item(&mut self, data: &node::Node) -> Result<(), &'static str> { + //not support asysn + if data.len() != self._dimension { + return Err("dimension is different"); + } + { + // if 
self._item2id.contains_key(data.idx().unwrap()) { + // //to_do update point + // return Ok(self._item2id[data.idx().unwrap()]); + // } + + if self._n_items >= self._max_item { + return Err("The number of elements exceeds the specified limit"); + } + } + + let insert_id = self.init_item(data); + let _insert_level = self.get_level(insert_id); + self.construct_single_item(insert_id).unwrap(); + + self._n_constructed_items += 1; + + Ok(()) + } + + fn construct_single_item(&self, insert_id: usize) -> Result<(), &'static str> { + let insert_level = self._id2level[insert_id]; + // println!("insert id {} insert_level {}", insert_id, insert_level); + // println!("self._cur_level {}", self._cur_level); + let mut cur_id = self._root_id; + // println!("insert_id {:?}",insert_id); + // println!("insert_id {:?}, insert_level {:?} ", insert_id, insert_level); + + if insert_id == 0 { + return Ok(()); + } + + if insert_level < self._cur_level { + let mut cur_dist = self.get_distance_from_id(cur_id, insert_id); + let mut cur_level = self._cur_level; + while cur_level > insert_level { + let mut changed = true; + while changed { + changed = false; + let cur_neighs = self.get_neighbor(cur_id, cur_level).read().unwrap(); + for cur_neigh in cur_neighs.iter() { + if *cur_neigh > self._n_items { + return Err("cand error"); + } + let neigh_dist = self.get_distance_from_id(*cur_neigh, insert_id); + if neigh_dist < cur_dist { + cur_dist = neigh_dist; + cur_id = *cur_neigh; + changed = true; + } + } + } + cur_level -= 1; + } + } + + let mut level = if insert_level < self._cur_level { + insert_level + } else { + self._cur_level + }; + let mut visited_id = FixedBitSet::with_capacity(self._nodes.len()); + let mut sorted_candidates: Vec> = Vec::new(); + let insert_data = self.get_data(insert_id); + visited_id.insert(insert_id); + sorted_candidates.push(Neighbor::new( + cur_id, + self.get_distance_from_id(cur_id, insert_id), + )); + loop { + // let mut visited_id: HashSet = HashSet::new(); + let 
mut top_candidates = self.search_layer_with_candidate( + insert_data, + &sorted_candidates, + &mut visited_id, + level, + self._ef_build, + false, + ); + // let mut top_candidates = self.search_layer_default(cur_id, insert_data, level); + if self.is_deleted(cur_id) { + let cur_dist = self.get_distance_from_id(cur_id, insert_id); + top_candidates.push(Neighbor::new(cur_id, cur_dist)); + if top_candidates.len() > self._ef_build { + top_candidates.pop(); + } + } + // println!("cur_id {:?}", insert_id); + // println!("{:?}", top_candidates); + sorted_candidates = top_candidates.into_sorted_vec(); + if sorted_candidates.is_empty() { + return Err("sorted sorted_candidate is empty"); + } + cur_id = self + .connect_neighbor(insert_id, &sorted_candidates, level, false) + .unwrap(); + if level == 0 { + break; + } + level -= 1; + } + Ok(()) + } +} + +impl ann_index::ANNIndex for HNSWIndex { + fn build(&mut self, mt: metrics::Metric) -> Result<(), &'static str> { + self.mt = mt; + self.batch_construct(mt) + } + fn add_node(&mut self, item: &node::Node) -> Result<(), &'static str> { + self.add_item_not_constructed(item) + } + fn built(&self) -> bool { + true + } + + fn node_search_k( + &self, + item: &node::Node, + k: usize, + _args: &arguments::Args, + ) -> Vec<(node::Node, E)> { + let mut ret: BinaryHeap> = self.search_knn(item, k).unwrap(); + let mut result: Vec<(node::Node, E)> = Vec::with_capacity(k); + let mut result_idx: Vec<(usize, E)> = Vec::with_capacity(k); + while !ret.is_empty() { + let top = ret.peek().unwrap(); + let top_idx = top.idx(); + let top_distance = top.distance(); + ret.pop(); + result_idx.push((top_idx, top_distance)) + } + for i in 0..result_idx.len() { + let cur_id = result_idx.len() - i - 1; + result.push(( + *self._nodes[result_idx[cur_id].0].clone(), + result_idx[cur_id].1, + )); + } + result + } + + fn name(&self) -> &'static str { + "HNSWIndex" + } + + fn dimension(&self) -> usize { + self._dimension + } +} + +impl + ann_index::SerializableIndex 
for HNSWIndex +{ + fn load(path: &str, _args: &arguments::Args) -> Result { + let file = File::open(path).unwrap_or_else(|_| panic!("unable to open file {:?}", path)); + let mut instance: HNSWIndex = bincode::deserialize_from(&file).unwrap(); + instance._nodes = instance + ._nodes_tmp + .iter() + .map(|x| Box::new(x.clone())) + .collect(); + instance._id2neighbor = Vec::with_capacity(instance._id2neighbor_tmp.len()); + for i in 0..instance._id2neighbor_tmp.len() { + let mut tmp = Vec::with_capacity(instance._id2neighbor_tmp[i].len()); + for j in 0..instance._id2neighbor_tmp[i].len() { + tmp.push(RwLock::new(instance._id2neighbor_tmp[i][j].clone())); + } + instance._id2neighbor.push(tmp); + } + instance._id2neighbor0 = Vec::with_capacity(instance._id2neighbor0_tmp.len()); + for i in 0..instance._id2neighbor0_tmp.len() { + instance + ._id2neighbor0 + .push(RwLock::new(instance._id2neighbor0_tmp[i].clone())); + } + + instance._item2id = HashMap::new(); + for iter in instance._item2id_tmp.iter() { + let (k, v) = &*iter; + instance._item2id.insert(k.clone(), *v); + } + + instance._delete_ids = HashSet::new(); + for iter in instance._delete_ids_tmp.iter() { + instance._delete_ids.insert(*iter); + } + instance._id2neighbor_tmp.clear(); + instance._id2neighbor0_tmp.clear(); + instance._nodes_tmp.clear(); + instance._item2id_tmp.clear(); + instance._delete_ids_tmp.clear(); + Ok(instance) + } + + fn dump(&mut self, path: &str, _args: &arguments::Args) -> Result<(), &'static str> { + self._id2neighbor_tmp = Vec::with_capacity(self._id2neighbor.len()); + for i in 0..self._id2neighbor.len() { + let mut tmp = Vec::with_capacity(self._id2neighbor[i].len()); + for j in 0..self._id2neighbor[i].len() { + tmp.push(self._id2neighbor[i][j].read().unwrap().clone()); + } + self._id2neighbor_tmp.push(tmp); + } + + self._id2neighbor0_tmp = Vec::with_capacity(self._id2neighbor0.len()); + for i in 0..self._id2neighbor0.len() { + self._id2neighbor0_tmp + 
.push(self._id2neighbor0[i].read().unwrap().clone()); + } + + self._nodes_tmp = self._nodes.iter().map(|x| *x.clone()).collect(); + self._item2id_tmp = Vec::with_capacity(self._item2id.len()); + for (k, v) in &self._item2id { + self._item2id_tmp.push((k.clone(), *v)); + } + self._delete_ids_tmp = Vec::new(); + for iter in &self._delete_ids { + self._delete_ids_tmp.push(*iter); + } + + let encoded_bytes = bincode::serialize(&self).unwrap(); + let mut file = File::create(path).unwrap(); + file.write_all(&encoded_bytes) + .unwrap_or_else(|_| panic!("unable to write file {:?}", path)); + Result::Ok(()) + } +} diff --git a/src/index/hnsw_params.rs b/src/index/hnsw_params.rs new file mode 100644 index 0000000..9f8e9e2 --- /dev/null +++ b/src/index/hnsw_params.rs @@ -0,0 +1,85 @@ +#![allow(dead_code)] +use crate::core::ann_index; +use crate::core::arguments; +use crate::core::metrics; +use crate::core::neighbor::Neighbor; +use crate::core::node; +use fixedbitset::FixedBitSet; +#[cfg(feature = "without_std")] +use hashbrown::HashMap; +#[cfg(feature = "without_std")] +use hashbrown::HashSet; +use rand::prelude::*; +use rayon::{iter::IntoParallelIterator, prelude::*}; +use serde::de::DeserializeOwned; +use std::collections::BinaryHeap; + +use serde::{Deserialize, Serialize}; + +#[cfg(not(feature = "without_std"))] +use std::collections::HashMap; +#[cfg(not(feature = "without_std"))] +use std::collections::HashSet; +use std::fs::File; +use std::io::Write; + +use std::sync::RwLock; + +#[derive(Debug, Serialize, Deserialize)] +pub struct HNSWParams { + pub max_item: usize, + pub n_neighbor: usize, + pub n_neighbor0: usize, + pub max_level: usize, + pub ef_build: usize, + pub ef_search: usize, + pub has_deletion: bool, + pub e_type: E, +} + +impl HNSWParams { + pub fn max_item(mut self, new_max_item: usize) -> Self { + self.max_item = new_max_item; + self + } + + pub fn n_neighbor(mut self, new_n_neighbor: usize) -> Self { + self.n_neighbor = new_n_neighbor; + self + } + + pub 
fn n_neighbor0(mut self, new_n_neighbor0: usize) -> Self { + self.n_neighbor0 = new_n_neighbor0; + self + } + + pub fn ef_build(mut self, new_ef_build: usize) -> Self { + self.ef_build = new_ef_build; + self + } + + pub fn ef_search(mut self, new_ef_search: usize) -> Self { + self.ef_search = new_ef_search; + self + } + + pub fn has_deletion(mut self, new_has_deletion: bool) -> Self { + self.has_deletion = new_has_deletion; + self + } +} + +impl Default for HNSWParams { + fn default() -> Self { + HNSWParams { + max_item: 1000000, + n_neighbor: 32, + n_neighbor0: 64, + max_level: 20, + ef_build: 500, + ef_search: 16, + has_deletion: false, + e_type: E::from_f32(0.0).unwrap(), + } + } +} diff --git a/src/index/mod.rs b/src/index/mod.rs new file mode 100644 index 0000000..4387cb7 --- /dev/null +++ b/src/index/mod.rs @@ -0,0 +1,10 @@ +mod bruteforce_idx; +mod bruteforce_params; +mod hnsw_idx; +mod hnsw_params; +mod pq_idx; +mod pq_params; +mod rpt_idx; +mod rpt_params; +mod ssg_idx; +mod ssg_params; diff --git a/src/index/pq_idx.rs b/src/index/pq_idx.rs new file mode 100644 index 0000000..361bb38 --- /dev/null +++ b/src/index/pq_idx.rs @@ -0,0 +1,535 @@ +#![allow(dead_code)] +use crate::core::ann_index; +use crate::core::arguments; +use crate::core::kmeans; +use crate::core::metrics; +use crate::core::neighbor::Neighbor; +use crate::core::node; +use crate::index::pq_params::IVFPQParams; +use crate::index::pq_params::PQParams; +use rayon::prelude::*; +use serde::de::DeserializeOwned; +use std::collections::BinaryHeap; + +use serde::{Deserialize, Serialize}; + +use std::fs::File; + +use std::io::Write; + +#[derive(Default, Debug, Serialize, Deserialize)] +pub struct PQIndex { + _dimension: usize, //dimension of data + _n_sub: usize, //num of subdata + _sub_dimension: usize, //dimension of subdata + _dimension_range: Vec>, //dimension preset + _sub_bits: usize, // size of subdata code + _sub_bytes: usize, //code save as byte: (_sub_bit + 7)//8 + _n_sub_center: usize, 
//num of centers per subdata code
    //n_center_per_sub = 1 << sub_bits
    _code_bytes: usize,         // byte of code
    _train_epoch: usize,        // training epoch
    _centers: Vec<Vec<Vec<E>>>, // size to be _n_sub * _n_sub_center * _sub_dimension
    _is_trained: bool,
    _has_residual: bool,
    _residual: Vec<E>,

    _n_items: usize,
    _max_item: usize,
    _nodes: Vec<Box<node::Node<E, T>>>,
    _assigned_center: Vec<Vec<usize>>,
    mt: metrics::Metric, //compute metrics
    // _item2id: HashMap<T, usize>,
    _nodes_tmp: Vec<node::Node<E, T>>,
}

// NOTE(review): generic parameter lists were lost when this file was extracted. The argument
// lists are restored from how the values are used; the `FloatElement` / `IdxType` bound names
// are reconstructed — confirm against `core::node`.
impl<E: node::FloatElement, T: node::IdxType> PQIndex<E, T> {
    /// Creates an empty, untrained product-quantization index for `dimension`-sized vectors.
    ///
    /// The vector is cut into `params.n_sub` contiguous sub-spaces that together cover
    /// every dimension: when `dimension` is not a multiple of `n_sub`, the first
    /// `dimension % n_sub` sub-spaces are one element wider. Each sub-space gets
    /// `1 << params.sub_bits` centers. The `[begin, end)` bounds of every sub-space are
    /// stored in `_dimension_range`.
    ///
    /// # Panics
    /// Panics when `params.n_sub` is zero or larger than `dimension`, or when
    /// `params.sub_bits` exceeds 32.
    pub fn new(dimension: usize, params: &PQParams<E>) -> PQIndex<E, T> {
        let n_sub = params.n_sub;
        let sub_bits = params.sub_bits;
        let train_epoch = params.train_epoch;
        // These inputs previously failed with an opaque division/remainder-by-zero panic.
        assert!(
            n_sub > 0 && n_sub <= dimension,
            "n_sub must be in 1..=dimension"
        );
        let sub_dimension = dimension / n_sub;

        let sub_bytes = (sub_bits + 7) / 8;
        assert!(sub_bits <= 32);
        // Shift a `usize`: an untyped `1` defaults to `i32`, which overflows at 31/32 bits.
        let n_center_per_sub = 1usize << sub_bits;
        let code_bytes = sub_bytes * n_sub;
        let mut new_pq = PQIndex::<E, T> {
            _dimension: dimension,
            _n_sub: n_sub,
            _sub_dimension: sub_dimension,
            _sub_bits: sub_bits,
            _sub_bytes: sub_bytes,
            _n_sub_center: n_center_per_sub,
            _code_bytes: code_bytes,
            _train_epoch: train_epoch,
            _is_trained: false,
            _n_items: 0,
            _max_item: 100000,
            _has_residual: false,
            mt: metrics::Metric::Euclidean,
            ..Default::default()
        };

        // Spread the remainder over the leading sub-spaces. The remainder is taken modulo
        // the number of sub-spaces (not modulo their width), otherwise trailing dimensions
        // are dropped whenever the remainder is at least `sub_dimension`.
        let remainder = dimension % n_sub;
        let mut begin = 0;
        for i in 0..n_sub {
            let width = if i < remainder {
                sub_dimension + 1
            } else {
                sub_dimension
            };
            let end = begin + width;
            new_pq._dimension_range.push(vec![begin, end]);
            begin = end;
        }
        new_pq
    }

    /// Stores a boxed copy of `data` and returns the internal id assigned to it.
    fn init_item(&mut self, data: &node::Node<E, T>) -> usize {
        let cur_id = self._n_items;
        // self._item2id.insert(item, cur_id);
        self._nodes.push(Box::new(data.clone()));
        self._n_items += 1;
        cur_id
    }

    /// Validates `data` and appends it to the index, returning its internal id.
    ///
    /// # Errors
    /// * `"dimension is different"` when `data.len()` differs from the index dimension.
    /// * `"The number of elements exceeds the specified limit"` when `_max_item` items are
    ///   already stored.
    fn add_item(&mut self, data: &node::Node<E, T>) -> Result<usize, &'static str> {
        if data.len() != self._dimension {
            return Err("dimension is different");
        }
        // if self._item2id.contains_key(&item) {
        //     //to_do update point
        //     return Ok(self._item2id[&item]);
        // }

        // `>=` keeps the stored count at or below `_max_item`, matching the HNSW index.
        if self._n_items >= self._max_item {
            return Err("The number of elements exceeds the specified limit");
        }

        let insert_id = self.init_item(data);
        Ok(insert_id)
    }

    /// Installs `residual` and switches the index into residual mode; from then on it is
    /// passed to each sub-space clusterer during training and subtracted from query
    /// sub-vectors before distances are computed.
    fn set_residual(&mut self, residual: Vec<E>) {
        self._has_residual = true;
        self._residual = residual;
    }

    /// Trains one k-means codebook per sub-space over all stored nodes.
    ///
    /// For each sub-space, the trained centers are stored in `_centers` and the center
    /// assigned to every item in `_assigned_center`. Results of an earlier training run
    /// are discarded first, so calling this again does not leave stale codebooks at the
    /// front of those vectors.
    fn train_center(&mut self) {
        let n_item = self._n_items;
        let n_center = self._n_sub_center;
        let n_epoch = self._train_epoch;
        // The raw vectors do not depend on the sub-space, so copy them out only once.
        let data_vec: Vec<Vec<E>> = self
            ._nodes
            .iter()
            .map(|node| node.vectors().to_vec())
            .collect();

        self._centers.clear();
        self._assigned_center.clear();
        for i in 0..self._n_sub {
            let begin = self._dimension_range[i][0];
            let end = self._dimension_range[i][1];

            let mut cluster = kmeans::Kmeans::<E>::new(end - begin, n_center, self.mt);
            cluster.set_range(begin, end);
            if self._has_residual {
                cluster.set_residual(self._residual.to_vec());
            }

            cluster.train(n_item, &data_vec, n_epoch);
            let mut assigned_center: Vec<usize> = Vec::new();
            cluster.search_data(n_item, &data_vec, &mut assigned_center);
            self._centers.push(cluster.centers().to_vec());
            self._assigned_center.push(assigned_center);
        }
        self._is_trained = true;
    }

    /// Distance between the `[begin, end)` slice of `x` and the sub-vector `y`.
    ///
    /// In residual mode the matching slice of `_residual` is subtracted from `x` first.
    ///
    /// # Panics
    /// Panics when the range is out of bounds or `metrics::metric` fails.
    fn get_distance_from_vec_range(
        &self,
        x: &node::Node<E, T>,
        y: &[E],
        begin: usize,
        end: usize,
    ) -> E {
        let mut z = x.vectors()[begin..end].to_vec();
        if self._has_residual {
            (0..end - begin).for_each(|i| z[i] -= self._residual[i + begin]);
        }
        metrics::metric(&z, y, self.mt).unwrap()
    }

    // Asymmetric-distance search over the trained codebooks; the remainder of the body
    // follows on the next line.
    fn search_knn_adc(
        &self,
        search_data: &node::Node<E, T>,
        k: usize,
    ) -> Result<BinaryHeap<Neighbor<E, usize>>, &'static str> {
        let mut dis2centers: Vec<E> = Vec::new();
        dis2centers.resize(self._n_sub * self._n_sub_center, E::from_f32(0.0).unwrap());
        dis2centers.par_iter_mut().enumerate().for_each(|(idx, x)| {
            let i = idx / self._n_sub_center;
+ let j = idx % self._n_sub_center; + let begin = self._dimension_range[i][0]; + let end = self._dimension_range[i][1]; + *x = self.get_distance_from_vec_range(search_data, &self._centers[i][j], begin, end); + }); + + let mut top_candidate: BinaryHeap> = BinaryHeap::new(); + (0..self._n_items).for_each(|i| { + let mut distance = E::from_f32(0.0).unwrap(); + (0..self._n_sub).for_each(|j| { + distance += dis2centers[j * self._n_sub_center + self._assigned_center[j][i]]; + }); + top_candidate.push(Neighbor::new(i, distance)); + }); + while top_candidate.len() > k { + top_candidate.pop(); + } + + Ok(top_candidate) + } +} + +impl ann_index::ANNIndex for PQIndex { + fn build(&mut self, _mt: metrics::Metric) -> Result<(), &'static str> { + self.mt = _mt; + self.train_center(); + Result::Ok(()) + } + fn add_node(&mut self, item: &node::Node) -> Result<(), &'static str> { + match self.add_item(item) { + Err(err) => Err(err), + _ => Ok(()), + } + } + fn built(&self) -> bool { + true + } + + fn node_search_k( + &self, + item: &node::Node, + k: usize, + _args: &arguments::Args, + ) -> Vec<(node::Node, E)> { + let mut ret: BinaryHeap> = self.search_knn_adc(item, k).unwrap(); + let mut result: Vec<(node::Node, E)> = Vec::new(); + let mut result_idx: Vec<(usize, E)> = Vec::new(); + while !ret.is_empty() { + let top = ret.peek().unwrap(); + let top_idx = top.idx(); + let top_distance = top.distance(); + ret.pop(); + result_idx.push((top_idx, top_distance)) + } + for i in 0..result_idx.len() { + let cur_id = result_idx.len() - i - 1; + result.push(( + *self._nodes[result_idx[cur_id].0].clone(), + result_idx[cur_id].1, + )); + } + result + } + + fn name(&self) -> &'static str { + "PQIndex" + } + + fn dimension(&self) -> usize { + self._dimension + } +} + +impl + ann_index::SerializableIndex for PQIndex +{ + fn load(path: &str, _args: &arguments::Args) -> Result { + let file = File::open(path).unwrap_or_else(|_| panic!("unable to open file {:?}", path)); + let mut instance: PQIndex = 
bincode::deserialize_from(&file).unwrap(); + instance._nodes = instance + ._nodes_tmp + .iter() + .map(|x| Box::new(x.clone())) + .collect(); + Ok(instance) + } + + fn dump(&mut self, path: &str, _args: &arguments::Args) -> Result<(), &'static str> { + self._nodes_tmp = self._nodes.iter().map(|x| *x.clone()).collect(); + let encoded_bytes = bincode::serialize(&self).unwrap(); + let mut file = File::create(path).unwrap(); + file.write_all(&encoded_bytes) + .unwrap_or_else(|_| panic!("unable to write file {:?}", path)); + Result::Ok(()) + } +} + +#[derive(Default, Debug, Serialize, Deserialize)] +pub struct IVFPQIndex { + _dimension: usize, //dimension of data + _n_sub: usize, //num of subdata + _sub_dimension: usize, //dimension of subdata + _sub_bits: usize, // size of subdata code + _sub_bytes: usize, //code save as byte: (_sub_bit + 7)//8 + _n_sub_center: usize, //num of centers per subdata code + //n_center_per_sub = 1 << sub_bits + _code_bytes: usize, // byte of code + _train_epoch: usize, // training epoch + _search_n_center: usize, + _n_kmeans_center: usize, + _centers: Vec>, + _ivflist: Vec>, //ivf center id + _pq_list: Vec>, + _is_trained: bool, + + _n_items: usize, + _max_item: usize, + _nodes: Vec>>, + _assigned_center: Vec>, + mt: metrics::Metric, //compute metrics + // _item2id: HashMap, + _nodes_tmp: Vec>, +} + +impl IVFPQIndex { + pub fn new(dimension: usize, params: &IVFPQParams) -> IVFPQIndex { + let n_sub = params.n_sub; + let sub_bits = params.sub_bits; + let n_kmeans_center = params.n_kmeans_center; + let search_n_center = params.search_n_center; + let train_epoch = params.train_epoch; + + let sub_dimension = dimension / n_sub; + let sub_bytes = (sub_bits + 7) / 8; + assert!(sub_bits <= 32); + let n_center_per_sub = (1 << sub_bits) as usize; + let code_bytes = sub_bytes * n_sub; + let mut ivflist: Vec> = Vec::new(); + for _i in 0..n_kmeans_center { + let ivf: Vec = Vec::new(); + ivflist.push(ivf); + } + IVFPQIndex { + _dimension: dimension, + 
_n_sub: n_sub, + _sub_dimension: sub_dimension, + _sub_bits: sub_bits, + _sub_bytes: sub_bytes, + _n_sub_center: n_center_per_sub, + _code_bytes: code_bytes, + _n_kmeans_center: n_kmeans_center, + _search_n_center: search_n_center, + _ivflist: ivflist, + _train_epoch: train_epoch, + _is_trained: false, + _n_items: 0, + _max_item: 100000, + mt: metrics::Metric::Unknown, + ..Default::default() + } + } + + fn init_item(&mut self, data: &node::Node) -> usize { + let cur_id = self._n_items; + // self._item2id.insert(item, cur_id); + self._nodes.push(Box::new(data.clone())); + self._n_items += 1; + cur_id + } + + fn add_item(&mut self, data: &node::Node) -> Result { + if data.len() != self._dimension { + return Err("dimension is different"); + } + // if self._item2id.contains_key(&item) { + // //to_do update point + // return Ok(self._item2id[&item]); + // } + + if self._n_items > self._max_item { + return Err("The number of elements exceeds the specified limit"); + } + + let insert_id = self.init_item(data); + Ok(insert_id) + } + + fn train(&mut self) { + let n_item = self._n_items; + let dimension = self._dimension; + let n_center = self._n_kmeans_center; + let n_epoch = self._train_epoch; + let mut cluster = kmeans::Kmeans::::new(dimension, n_center, self.mt); + let mut data_vec: Vec> = Vec::new(); + for node in self._nodes.iter() { + data_vec.push(node.vectors().to_vec()); + } + cluster.set_range(0, dimension); + cluster.train(n_item, &data_vec, n_epoch); + let mut assigned_center: Vec = Vec::new(); + cluster.search_data(n_item, &data_vec, &mut assigned_center); + self._centers = cluster.centers().to_vec(); + (0..n_item).for_each(|i| { + let center_id = assigned_center[i]; + self._ivflist[center_id].push(i); + }); + for i in 0..n_center { + // println!("train center {:?}", i); + // println!("train center len {:?}", self._ivflist[i].len()); + let mut center_pq = PQIndex::::new( + self._dimension, + &PQParams::default() + .n_sub(self._n_sub) + .sub_bits(self._sub_bits) 
// (Tail of `IVFPQIndex::train`, whose opening lies on the preceding line.)
                    .train_epoch(self._train_epoch),
            );

            for j in 0..self._ivflist[i].len() {
                center_pq
                    .add_item(&self._nodes[self._ivflist[i][j]].clone())
                    .unwrap();
            }
            // println!("center: {:?}", self._centers[i].to_vec())
            center_pq.set_residual(self._centers[i].to_vec());
            center_pq.train_center();
            self._pq_list.push(center_pq);
        }

        self._is_trained = true;
    }

    // NOTE(review): generic parameter lists were lost when this file was extracted. The
    // argument lists are restored from how the values are used; the `FloatElement` /
    // `IdxType` bound names further down are reconstructed — confirm against `core::node`.

    /// Distance between the `[begin, end)` slice of `x` and the vector `y` under `self.mt`.
    ///
    /// # Panics
    /// Panics when the range is out of bounds or `metrics::metric` fails.
    fn get_distance_from_vec_range(
        &self,
        x: &node::Node<E, T>,
        y: &[E],
        begin: usize,
        end: usize,
    ) -> E {
        metrics::metric(&x.vectors()[begin..end], y, self.mt).unwrap()
    }

    /// Searches the inverted lists of the coarse centers nearest to `search_data`.
    ///
    /// Every coarse center is ranked by its distance to the query. The closest
    /// `_search_n_center` of them (at most `_n_kmeans_center`) are probed: each probe asks
    /// that center's `PQIndex` for `k` hits, maps the list-local ids back to index-wide ids
    /// through `_ivflist`, and merges them into a heap that is kept at `k` entries or fewer.
    ///
    /// # Errors
    /// Propagates any error returned by a per-center `PQIndex::search_knn_adc`.
    ///
    /// # Panics
    /// Panics when the index has not been trained (the center and list tables are then
    /// shorter than `_n_kmeans_center`).
    fn search_knn_adc(
        &self,
        search_data: &node::Node<E, T>,
        k: usize,
    ) -> Result<BinaryHeap<Neighbor<E, usize>>, &'static str> {
        let mut top_centers: BinaryHeap<Neighbor<E, usize>> = BinaryHeap::new();
        let n_kmeans_center = self._n_kmeans_center;
        let dimension = self._dimension;
        for i in 0..n_kmeans_center {
            // Distances are negated so that popping yields the nearest center first.
            top_centers.push(Neighbor::new(
                i,
                -self.get_distance_from_vec_range(search_data, &self._centers[i], 0, dimension),
            ))
        }

        let mut top_candidate: BinaryHeap<Neighbor<E, usize>> = BinaryHeap::new();
        // Probe no more centers than exist; asking for more used to panic on an empty heap.
        let n_probe = self._search_n_center.min(n_kmeans_center);
        for _ in 0..n_probe {
            let center = match top_centers.pop() {
                Some(nearest) => nearest.idx(),
                None => break,
            };
            // println!("{:?}", center);
            let mut ret = self._pq_list[center].search_knn_adc(search_data, k)?;
            while let Some(mut hit) = ret.pop() {
                // Translate the position inside this inverted list into the global id.
                hit._idx = self._ivflist[center][hit._idx];
                top_candidate.push(hit);
                if top_candidate.len() > k {
                    top_candidate.pop();
                }
            }
        }
        Ok(top_candidate)
    }
}

impl<E: node::FloatElement, T: node::IdxType> ann_index::ANNIndex<E, T> for IVFPQIndex<E, T> {
    /// Records the metric and trains the coarse quantizer and the per-center PQ indexes.
    fn build(&mut self, _mt: metrics::Metric) -> Result<(), &'static str> {
        self.mt = _mt;
        self.train();
        Ok(())
    }

    /// Adds `item` to the index, discarding the internal id that `add_item` returns.
    fn add_node(&mut self, item: &node::Node<E, T>) -> Result<(), &'static str> {
        self.add_item(item).map(|_| ())
    }

    /// Always reports `true`; it does not consult `_is_trained`.
    fn built(&self) -> bool {
        true
    }

    // k-nearest search through the trait interface; the remainder of the body follows on
    // the next line.
    fn node_search_k(
        &self,
        item: &node::Node<E, T>,
        k: usize,
        _args: &arguments::Args,
    ) -> Vec<(node::Node<E, T>, E)> {
        let mut ret: BinaryHeap<Neighbor<E, usize>> =
self.search_knn_adc(item, k).unwrap(); + let mut result: Vec<(node::Node, E)> = Vec::new(); + let mut result_idx: Vec<(usize, E)> = Vec::new(); + while !ret.is_empty() { + let top = ret.peek().unwrap(); + let top_idx = top.idx(); + let top_distance = top.distance(); + ret.pop(); + result_idx.push((top_idx, top_distance)) + } + for i in 0..result_idx.len() { + let cur_id = result_idx.len() - i - 1; + result.push(( + *self._nodes[result_idx[cur_id].0].clone(), + result_idx[cur_id].1, + )); + } + result + } + + fn name(&self) -> &'static str { + "IVFPQIndex" + } + + fn dimension(&self) -> usize { + self._dimension + } +} + +impl + ann_index::SerializableIndex for IVFPQIndex +{ + fn load(path: &str, _args: &arguments::Args) -> Result { + let file = File::open(path).unwrap_or_else(|_| panic!("unable to open file {:?}", path)); + let mut instance: IVFPQIndex = bincode::deserialize_from(&file).unwrap(); + instance._nodes = instance + ._nodes_tmp + .iter() + .map(|x| Box::new(x.clone())) + .collect(); + instance._nodes_tmp.clear(); + for i in 0..instance._n_kmeans_center { + instance._pq_list[i]._nodes = instance._pq_list[i] + ._nodes_tmp + .iter() + .map(|x| Box::new(x.clone())) + .collect(); + instance._pq_list[i]._nodes_tmp.clear(); + } + Ok(instance) + } + + fn dump(&mut self, path: &str, _args: &arguments::Args) -> Result<(), &'static str> { + self._nodes_tmp = self._nodes.iter().map(|x| *x.clone()).collect(); + for i in 0..self._n_kmeans_center { + self._pq_list[i]._nodes_tmp = + self._pq_list[i]._nodes.iter().map(|x| *x.clone()).collect(); + } + let encoded_bytes = bincode::serialize(&self).unwrap(); + let mut file = File::create(path).unwrap(); + file.write_all(&encoded_bytes) + .unwrap_or_else(|_| panic!("unable to write file {:?}", path)); + Result::Ok(()) + } +} diff --git a/src/index/pq_params.rs b/src/index/pq_params.rs new file mode 100644 index 0000000..55215e5 --- /dev/null +++ b/src/index/pq_params.rs @@ -0,0 +1,103 @@ +#![allow(dead_code)] +use 
crate::core::ann_index; +use crate::core::arguments; +use crate::core::kmeans; +use crate::core::metrics; +use crate::core::neighbor::Neighbor; +use crate::core::node; + +use rayon::prelude::*; +use serde::de::DeserializeOwned; +use std::collections::BinaryHeap; + +use serde::{Deserialize, Serialize}; + +use std::fs::File; + +use std::io::Write; + +#[derive(Debug, Serialize, Deserialize)] +pub struct PQParams { + pub n_sub: usize, + pub sub_bits: usize, + pub train_epoch: usize, + pub e_type: E, +} + +impl PQParams { + pub fn n_sub(mut self, new_n_sub: usize) -> Self { + self.n_sub = new_n_sub; + self + } + + pub fn sub_bits(mut self, new_sub_bits: usize) -> Self { + self.sub_bits = new_sub_bits; + self + } + + pub fn train_epoch(mut self, new_train_epoch: usize) -> Self { + self.train_epoch = new_train_epoch; + self + } +} + +impl Default for PQParams { + fn default() -> Self { + PQParams { + n_sub: 4, + sub_bits: 4, + train_epoch: 100, + e_type: E::from_f32(0.0).unwrap(), + } + } +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct IVFPQParams { + pub n_sub: usize, + pub sub_bits: usize, + pub n_kmeans_center: usize, + pub search_n_center: usize, + pub train_epoch: usize, + pub e_type: E, +} + +impl IVFPQParams { + pub fn n_sub(mut self, new_n_sub: usize) -> Self { + self.n_sub = new_n_sub; + self + } + + pub fn sub_bits(mut self, new_sub_bits: usize) -> Self { + self.sub_bits = new_sub_bits; + self + } + + pub fn n_kmeans_center(mut self, new_n_kmeans_center: usize) -> Self { + self.n_kmeans_center = new_n_kmeans_center; + self + } + + pub fn search_n_center(mut self, new_search_n_center: usize) -> Self { + self.search_n_center = new_search_n_center; + self + } + + pub fn train_epoch(mut self, new_train_epoch: usize) -> Self { + self.train_epoch = new_train_epoch; + self + } +} + +impl Default for IVFPQParams { + fn default() -> Self { + IVFPQParams { + n_sub: 25, + sub_bits: 4, + n_kmeans_center: 256, + search_n_center: 8, + train_epoch: 100, + e_type: 
E::from_f32(0.0).unwrap(), + } + } +} diff --git a/src/index/rpt_idx.rs b/src/index/rpt_idx.rs new file mode 100644 index 0000000..eb0491d --- /dev/null +++ b/src/index/rpt_idx.rs @@ -0,0 +1,646 @@ +#![allow(dead_code)] +use crate::core::ann_index; +use crate::core::arguments; +use crate::core::calc; +use crate::core::metrics; +use crate::core::neighbor; +use crate::core::node; +use crate::core::random; +use crate::index::rpt_params::BPTParams; +use serde::de::DeserializeOwned; +use serde::{Deserialize, Serialize}; +use std::cmp::Ordering; +use std::collections::BinaryHeap; +use std::fs::File; + +use std::io::Write; + +// TODO: leaf as a trait with getter setter function +#[derive(Default, Clone, Debug, Serialize, Deserialize)] +struct Leaf { + n_descendants: i32, // tot n_descendants + children: Vec, // left and right and if it's a leaf leaf, children would be very large (depend on _K) + #[serde(skip_serializing, skip_deserializing)] + node: Box>, + tmp_node: Option>, + + // biz field + norm: E, + has_init: bool, +} + +impl Leaf { + fn new() -> Leaf { + Leaf { + children: vec![0, 0], + ..Default::default() + } + } + + fn new_with_vectors(_v: &[E]) -> Leaf { + Leaf { + children: vec![0, 0], + node: Box::new(node::Node::new(_v)), + ..Default::default() + } + } + + fn new_with_item(_v: &node::Node) -> Leaf { + Leaf { + children: vec![0, 0], + node: Box::new(_v.clone()), + ..Default::default() + } + } + + fn is_empty(&self) -> bool { + self.has_init + } + + fn init(&mut self) { + self.children = vec![0, 0]; + } + + fn clone_node(&self) -> node::Node { + *self.node.clone() + } + + fn normalize(&mut self) { + let norm = calc::get_norm(&self.node.vectors()).unwrap(); + if norm > E::float_zero() { + for i in 0..self.node.len() { + self.node.mut_vectors()[i] /= norm; + } + } + } + + fn copy(dst: &mut Leaf, src: &Leaf) { + dst.n_descendants = src.n_descendants; + dst.children = src.children.clone(); + dst.node = src.node.clone(); + dst.norm = src.norm; + } + + pub fn 
get_literal(&self) -> String { + format!( + "{{ \"n_descendants\": {:?}, \"children\": {:?}, \"has_init\": {:?} }}, \"node\": {:?},", + self.n_descendants, self.children, self.has_init, *self.node + ) + } + + // replace distance copy_leaf + fn copy_leaf(src: &Leaf) -> Leaf { + Leaf { + n_descendants: src.n_descendants, + node: src.node.clone(), + children: src.children.clone(), + ..Default::default() + } + } +} + +fn two_means( + leaves: &[Leaf], + mt: metrics::Metric, +) -> Result<(Leaf, Leaf), &'static str> { + const ITERATION_STEPS: usize = 200; + if leaves.len() < 2 { + return Err("empty leaves"); + } + + let count = leaves.len(); + + let i = random::index(count); + let mut j = random::index(count - 1); + // make sure j not equal to i; + if j >= i { + j += 1; + } + + let mut p = Leaf::copy_leaf(&leaves[i]); + let mut q = Leaf::copy_leaf(&leaves[j]); + + if mt == metrics::Metric::CosineSimilarity { + p.normalize(); + q.normalize(); + } + // TODO: dot normalize + + let one = E::float_one(); + let zero = E::float_zero(); + + let mut ic: E = one; + let mut jc: E = one; + + // produce two mean point. + for _z in 0..ITERATION_STEPS { + let k = random::index(count); + let di = ic * metrics::metric(&p.node.vectors(), &leaves[k].node.vectors(), mt).unwrap(); + let dj = jc * metrics::metric(&q.node.vectors(), &leaves[k].node.vectors(), mt).unwrap(); + + // + let mut norm = one; + if mt == metrics::Metric::CosineSimilarity { + norm = calc::get_norm(&leaves[k].node.vectors()).unwrap(); + match norm.partial_cmp(&zero) { + Some(Ordering::Equal) | Some(Ordering::Less) => continue, + _ => {} + }; + } + + // make p more closer to k in space. 
+ if di < dj { + for l in 0..p.node.len() { + p.node.mut_vectors()[l] = + (p.node.vectors()[l] * ic + leaves[k].node.vectors()[l] / norm) / (ic + one); + } + ic += one; + } else if dj < di { + for l in 0..q.node.len() { + q.node.mut_vectors()[l] = + (q.node.vectors()[l] * jc + leaves[k].node.vectors()[l] / norm) / (jc + one); + } + jc += one; + } + } + Ok((p, q)) +} + +#[derive(Default, Debug, Serialize, Deserialize)] +pub struct BPTIndex { + _dimension: usize, // dimension + _tot_items_cnt: i32, // add items count, means the physically the item count, _tot_items_cnt == leaves.size() + _tot_leaves_cnt: i32, // leaves count, whole tree leaves count + // _leaves_size: i32, // in source code, this means the memory which has been allocated, and we can use leaf's size to get data + _roots: Vec, // dummy root's children + _leaf_max_items: i32, // max number of n_descendants to fit into leaf + _built: bool, + leaves: Vec>, + mt: metrics::Metric, + _tree_num: i32, + _candidate_size: i32, +} + +impl BPTIndex { + pub fn new(dimension: usize, params: BPTParams) -> BPTIndex { + BPTIndex { + _built: false, + _dimension: dimension, + _leaf_max_items: ((dimension / 2) as i32) + 2, + _tree_num: params.tree_num, + _candidate_size: params.candidate_size, + leaves: vec![Leaf::new()], // the id count should start from 1, use a node as placeholder + ..Default::default() + } + } + + fn _add_item(&mut self, w: &node::Node) -> Result<(), &'static str> { + // TODO: remove + if w.len() != self._dimension { + return Err("dimension is different"); + } + + let mut nn = Leaf::new_with_item(w); + + nn.children[0] = 0; // TODO: as const value + nn.children[1] = 0; + nn.n_descendants = 1; // only the leaf itself, so the n_descendants include it self + + // no update method + self._tot_items_cnt += 1; + + self.leaves.push(nn); + + Ok(()) + } + + fn build(&mut self, mt: metrics::Metric) -> Result<(), &'static str> { + if self._built { + return Err("has built"); + } + + self.mt = mt; + 
self._tot_leaves_cnt = self._tot_items_cnt; // init with build. + self._build(self._tree_num, self.mt); + self._built = true; + println!( + "tree number: {:?}, leaves: {:?}, items: {:?}, leaf size: {:?}", + self._tree_num, + self._tot_leaves_cnt, + self._tot_items_cnt, + self.get_k() + ); + Ok(()) + } + + fn clear(&mut self) { + self._roots.clear(); + self._tot_leaves_cnt = self._tot_items_cnt; + self._built = false; + } + fn get_distance(&self, i: i32, j: i32) -> E { + let ni = self.get_leaf(i).unwrap(); + let nj = self.get_leaf(j).unwrap(); + return metrics::metric(&ni.node.vectors(), &nj.node.vectors(), self.mt).unwrap(); + } + + fn get_tot_items_cnt(&self) -> i32 { + self._tot_items_cnt + } + fn get_n_tree(&self) -> i32 { + self._roots.len() as i32 + } + + fn get_dimension(&self) -> usize { + self._dimension + } + + fn get_k(&self) -> i32 { + self._leaf_max_items + } + + fn get_leaf_mut(&mut self, i: i32) -> &mut Leaf { + if self.leaves.len() <= i as usize { + self.extent_leaves(i as usize); + } + &mut self.leaves[i as usize] + } + + fn extent_leaf(&mut self) -> &mut Leaf { + let i = self.leaves.len(); + self.extent_leaves(self.leaves.len()); + if self.leaves[i].is_empty() { + self.leaves[i].init(); + } + &mut self.leaves[i] + } + + fn get_leaf(&self, i: i32) -> Option<&Leaf> { + if self.leaves.len() < i as usize { + return None; + } + if self.leaves[i as usize].is_empty() { + return None; + } + Some(&self.leaves[i as usize]) + } + + fn extent_leaves(&mut self, i: usize) { + let diff = i - self.leaves.len() + 1; + if diff > 0 { + for _i in 0..diff { + self.leaves.push(Leaf::new()); + } + } + } + + // q => tree count + // TODO: build failed + fn _build(&mut self, tree_num: i32, mt: metrics::Metric) { + let mut this_root: Vec = Vec::new(); + + loop { + if tree_num == -1 { + if self._tot_leaves_cnt >= 2 * self._tot_items_cnt { + break; + } + } else if this_root.len() >= (tree_num as usize) { + break; + } + + let mut indices: Vec = Vec::new(); + for i in 
1..self._tot_items_cnt { + let leaf = self.get_leaf(i).unwrap(); + if leaf.n_descendants >= 1 { + indices.push(i as i32); + } + } + + let tree = self.make_tree(&indices, true, mt).unwrap(); + this_root.push(tree); + } + + // thread lock + self._roots.extend_from_slice(&this_root); + } + + fn make_tree( + &mut self, + indices: &[i32], + is_root: bool, + mt: metrics::Metric, + ) -> Result { + if indices.is_empty() { + return Err("empty indices"); + } + if indices.len() == 1 && !is_root { + return Ok(indices[0]); + } + + // the batch is a leaf cluster, make a parent node + if (indices.len() as i32) <= self._leaf_max_items + && (!is_root || self._tot_items_cnt <= self._leaf_max_items || indices.len() == 1) + { + self._tot_leaves_cnt += 1; + let item_cnt = self._tot_items_cnt; + let mut n = self.extent_leaf(); + + n.n_descendants = if is_root { + item_cnt + } else { + indices.len() as i32 + }; + n.children = indices.to_vec(); + + return Ok(self._tot_leaves_cnt); + } + + let mut children: Vec> = Vec::new(); + for j in indices.iter().skip(1) { + match self.get_leaf(*j) { + None => continue, + Some(leaf) => { + children.push(leaf.clone()); + } + } + } + + let mut new_parent_leaf = Leaf::new(); + let mut children_indices: [Vec; 2] = [Vec::new(), Vec::new()]; + + const ATTEMPT: usize = 5; + // find split hyperplane + for _i in 0..ATTEMPT { + children_indices[0].clear(); + children_indices[1].clear(); + self.create_split(children.as_slice(), &mut new_parent_leaf, mt) + .unwrap(); + + for leaf_idx in indices.iter().skip(1) { + let leaf = self.get_leaf(*leaf_idx as i32).unwrap(); + let side = self.side(&new_parent_leaf, &leaf.node.vectors()); + children_indices[(side as usize)].push(*leaf_idx); + } + + if calc::split_imbalance(&children_indices[0], &children_indices[1]) < 0.85 { + break; + } + } + + // don't get correct hyperplane situation + // TODO: record + while calc::split_imbalance(&children_indices[0], &children_indices[1]) > 0.98 { + children_indices[0].clear(); + 
children_indices[1].clear(); + + let is_initial = new_parent_leaf.node.len() == 0; + for z in 0..self._dimension { + if is_initial { + new_parent_leaf.node.push(&E::float_zero()); // TODO: make it const value + } else { + new_parent_leaf.node.mut_vectors()[z] = E::float_zero(); + } + } + + for j in indices.iter().skip(1) { + children_indices[random::flip() as usize].push(*j); + } + } + + let flip = (children_indices[0].len() > children_indices[1].len()) as bool; + + new_parent_leaf.n_descendants = if is_root { + self._tot_items_cnt + } else { + indices.len() as i32 + }; + + for side in 0..2 { + match self.make_tree(&children_indices[side ^ (flip as usize)], false, mt) { + Ok(tree) => { + new_parent_leaf.children[side ^ (flip as usize)] = tree; + } + Err(_e) => { + // TODO: log + continue; + } + } + } + self._tot_leaves_cnt += 1; + self.leaves.push(new_parent_leaf); + + Ok((self._tot_leaves_cnt) as i32) + } + + fn _search_k( + &self, + vectors: &[E], + n: usize, + ) -> Result, E)>, &'static str> { + let mut v_leaf = Leaf::::new(); + + v_leaf.node.set_vectors(&vectors.to_vec()); + + if self._roots.is_empty() || !self._built { + return Err("empty tree"); + } + + let mut candidate_size = self._candidate_size; + if candidate_size <= 0 { + candidate_size = (n * self._roots.len() * 2) as i32; + } + + let mut heap: BinaryHeap> = BinaryHeap::new(); // max-heap + self._roots.iter().for_each(|root| { + heap.push(neighbor::Neighbor { + _distance: self.pq_initial_value(), // float MAX + _idx: *root, + }); + }); + + // it use a heap to ensure the minest distance node will pop up + let mut nns: Vec = Vec::new(); + while nns.len() < (candidate_size as usize) && !(heap.is_empty()) { + let top = heap.peek().unwrap(); + let top_idx = top._idx; + let top_distance = top._distance; + + let nd = self.get_leaf(top_idx).unwrap(); + heap.pop(); + + if nd.n_descendants == 1 && (top_idx) < self._tot_items_cnt { + nns.push(top_idx); + } else if nd.n_descendants <= self._leaf_max_items { + 
nns.extend_from_slice(&nd.children); // push all of its children + } else { + let margin = self.margin(&nd, vectors)?; + // put two children into heap, and use distance to sort the order for poping up. + heap.push(neighbor::Neighbor { + _distance: self.pq_distance(top_distance, margin, 1), + _idx: nd.children[1], + }); + heap.push(neighbor::Neighbor { + _distance: self.pq_distance(top_distance, margin, 0), + _idx: nd.children[0], + }); + } + } + + nns.sort_unstable(); // sort id and filter dup to avoid same id; + let mut nns_vec: Vec> = Vec::new(); + let mut last = -1; + for j in nns.iter() { + if *j == last { + continue; + } + last = *j; + let leaf = self.get_leaf(*j).unwrap(); + if leaf.n_descendants == 1 { + nns_vec.push(neighbor::Neighbor::new( + *j as usize, + metrics::metric(&v_leaf.node.vectors(), &leaf.node.vectors(), self.mt).unwrap(), + )) + } + } + + nns_vec.sort_by(|a, b| a.distance().partial_cmp(&b.distance()).unwrap()); + let return_size = if n < nns_vec.len() { n } else { nns_vec.len() }; + let mut result: Vec<(node::Node, E)> = Vec::new(); + + for item in nns_vec.iter().take(return_size) { + result.push(( + self.get_leaf(item._idx as i32).unwrap().clone_node(), + item._distance, + )); + } + + Ok(result) + } + + fn show_trees(&self) { + let mut v = self._roots.clone(); + while !v.is_empty() { + let i = v.pop().unwrap(); + let item = self.get_leaf(i).unwrap(); + if item.n_descendants == 1 { + continue; + } + if !(item.children[0] == 0 && item.children[1] == 0) { + v.extend(&item.children); + } + println!( + "item {} children {:?}, vectors {:?}", + i, + item.children, + item.node.vectors() + ); + } + } + + // means same side? 
fn margin(&self, src: &Leaf<E, T>, dst: &[E]) -> Result<E, &'static str> {
        // Dot product of the split leaf's vector with `dst`.
        calc::dot(&src.node.vectors(), &dst)
    }

    /// `true` when the margin is strictly positive; if the margin cannot be
    /// computed, a random side is chosen.
    fn side(&self, src: &Leaf<E, T>, dst: &[E]) -> bool {
        match self.margin(&src, &dst) {
            Ok(x) => x > E::float_zero(),
            Err(_e) => random::flip(),
        }
    }

    /// Computes a split for `leaves`: the normalised difference of the two
    /// means from `two_means` is stored in `new_mean_leaf`.
    ///
    /// # Errors
    /// Propagates `two_means` failures and rejects a non-empty target leaf whose
    /// length differs from the computed means.
    fn create_split(
        &self,
        leaves: &[Leaf<E, T>],
        new_mean_leaf: &mut Leaf<E, T>,
        mt: metrics::Metric,
    ) -> Result<(), &'static str> {
        let (p, q) = two_means(&leaves, mt)?;

        // TODO: remove
        if new_mean_leaf.node.len() != 0 && new_mean_leaf.node.len() != p.node.len() {
            return Err("empty leaf input");
        }

        // Element-wise difference p - q.
        let mut v = Vec::with_capacity(p.node.len());
        for i in 0..p.node.len() {
            v.push(p.node.vectors()[i] - q.node.vectors()[i]);
        }
        new_mean_leaf.node.set_vectors(&v);
        new_mean_leaf.normalize();
        Ok(())
    }

    /// Priority for a child: the smaller of the parent's priority and the
    /// margin, with the margin negated for child 0.
    fn pq_distance(&self, distance: E, mut margin: E, child_nr: usize) -> E {
        if child_nr == 0 {
            margin = -margin;
        }
        if distance < margin {
            distance
        } else {
            margin
        }
    }

    fn pq_initial_value(&self) -> E {
        E::max_value()
    }
}

impl<E: node::FloatElement, T: node::IdxType> ann_index::ANNIndex<E, T> for BPTIndex<E, T> {
    fn build(&mut self, mt: metrics::Metric) -> Result<(), &'static str> {
        self.build(mt)
    }
    fn add_node(&mut self, item: &node::Node<E, T>) -> Result<(), &'static str> {
        self._add_item(item)
    }
    fn built(&self) -> bool {
        self._built
    }

    fn node_search_k(
        &self,
        item: &node::Node<E, T>,
        k: usize,
        _args: &arguments::Args,
    ) -> Vec<(node::Node<E, T>, E)> {
        self._search_k(item.vectors(), k).unwrap()
    }

    fn name(&self) -> &'static str {
        "BPForestIndex"
    }

    fn dimension(&self) -> usize {
        self._dimension
    }
}

impl<E: node::FloatElement + DeserializeOwned, T: node::IdxType + DeserializeOwned>
    ann_index::SerializableIndex<E, T> for BPTIndex<E, T>
{
    /// Reads an index written by `dump`.
    ///
    /// # Errors
    /// Returns an error (instead of panicking) when the file cannot be opened
    /// or decoded.
    fn load(path: &str, _args: &arguments::Args) -> Result<Self, &'static str> {
        let file = File::open(path).map_err(|_| "unable to open file")?;
        let mut instance: BPTIndex<E, T> =
            bincode::deserialize_from(&file).map_err(|_| "unable to deserialize index")?;

        // `node` is skipped by serde; restore it from the serialized `tmp_node`.
        // A leaf without `tmp_node` keeps its default node rather than panicking.
        for leaf in instance.leaves.iter_mut() {
            if let Some(node) = leaf.tmp_node.take() {
                leaf.node = Box::new(node);
            }
        }

        Ok(instance)
    }

    /// Serializes the index with bincode and writes it to `path`.
    ///
    /// # Errors
    /// Returns an error (instead of panicking) on encode, create or write failure.
    fn dump(&mut self, path: &str, _args: &arguments::Args) -> Result<(), &'static str> {
        self.leaves
            .iter_mut()
            .for_each(|x| x.tmp_node = Some(*x.node.clone()));
        let encoded_bytes = bincode::serialize(&self).map_err(|_| "unable to serialize index")?;
        let mut file = File::create(path).map_err(|_| "unable to create file")?;
        file.write_all(&encoded_bytes)
            .map_err(|_| "unable to write file")?;
        Result::Ok(())
    }
}
diff --git a/src/index/rpt_params.rs b/src/index/rpt_params.rs
new file mode 100644
index 0000000..ed1a997
--- /dev/null
+++ b/src/index/rpt_params.rs
@@ -0,0 +1,51 @@
#![allow(dead_code)]
use crate::core::ann_index;
use crate::core::arguments;
use crate::core::metrics;
use crate::core::neighbor;
use crate::core::node;
use fixedbitset::FixedBitSet;
#[cfg(feature = "without_std")]
use hashbrown::HashSet;
use rand::prelude::*;
use rayon::prelude::*;
use serde::de::DeserializeOwned;
use serde::{Deserialize, Serialize};
use std::cmp::Reverse;
use std::collections::BinaryHeap;
use std::collections::LinkedList;

use crate::core::kmeans;
#[cfg(not(feature = "without_std"))]
use std::collections::HashSet;
use std::collections::VecDeque;

use std::fs::File;
use std::io::Write;
use std::sync::{Arc, Mutex};

/// Builder-style parameters for `BPTIndex`.
#[derive(Debug, Serialize, Deserialize)]
pub struct BPTParams {
    pub tree_num: i32,
    pub candidate_size: i32,
}

impl BPTParams {
    pub fn tree_num(mut self, new_tree_num: i32) -> Self {
        self.tree_num = new_tree_num;
        self
    }
    pub fn candidate_size(mut self, new_candidate_size: i32) -> Self {
        self.candidate_size = new_candidate_size;
        self
    }
}

impl Default for BPTParams {
    fn default() -> Self {
        BPTParams {
            tree_num: 0,
            candidate_size: 0,
        }
    }
}
diff --git a/src/index/ssg_idx.rs b/src/index/ssg_idx.rs
new file mode 100644
index 0000000..4a8fca0
--- /dev/null
+++ b/src/index/ssg_idx.rs
@@ -0,0 +1,582
@@ +#![allow(dead_code)] +use crate::core::ann_index; +use crate::core::arguments; +use crate::core::kmeans; +use crate::core::metrics; +use crate::core::neighbor; +use crate::core::node; +use crate::index::ssg_params::SSGParams; +use fixedbitset::FixedBitSet; +#[cfg(feature = "without_std")] +use hashbrown::HashSet; +use rand::prelude::*; +use rayon::prelude::*; +use serde::de::DeserializeOwned; +use serde::{Deserialize, Serialize}; +use std::cmp::Reverse; +use std::collections::BinaryHeap; +#[cfg(not(feature = "without_std"))] +use std::collections::HashSet; +use std::collections::LinkedList; +use std::collections::VecDeque; + +use std::fs::File; +use std::io::Write; +use std::sync::{Arc, Mutex}; + +#[derive(Debug, Serialize, Deserialize)] +pub struct SSGIndex { + #[serde(skip_serializing, skip_deserializing)] + nodes: Vec>>, + tmp_nodes: Vec>, // only use for serialization scene + mt: metrics::Metric, + dimension: usize, + neighbor_neighbor_size: usize, + index_size: usize, + graph: Vec>, + knn_graph: Vec>, + init_k: usize, // as knn's k + root_nodes: Vec, + width: usize, + angle: E, + threshold: E, + root_size: usize, + + // stat + search_times: usize, +} + +impl SSGIndex { + pub fn new(dimension: usize, params: &SSGParams) -> SSGIndex { + SSGIndex:: { + nodes: Vec::new(), + tmp_nodes: Vec::new(), + mt: metrics::Metric::Unknown, + dimension, + neighbor_neighbor_size: params.neighbor_neighbor_size, + init_k: params.init_k, + graph: Vec::new(), + knn_graph: Vec::new(), + root_nodes: Vec::new(), + width: 0, + index_size: params.index_size, + angle: params.angle, + threshold: (params.angle / E::from_f32(180.0).unwrap() * E::PI()).cos(), + root_size: params.root_size, + + search_times: 0, + } + } + + fn build_knn_graph(&mut self) { + let tmp_graph = Arc::new(Mutex::new(vec![vec![0]; self.nodes.len()])); + (0..self.nodes.len()).into_par_iter().for_each(|n| { + let item = &self.nodes[n]; + let mut heap = BinaryHeap::with_capacity(self.init_k); + self.nodes + .iter() + 
.zip(0..self.nodes.len()) + .for_each(|(node, i)| { + if i == n { + return; + } + heap.push(neighbor::Neighbor::new( + i, + item.metric(&node, self.mt).unwrap(), + )); + if heap.len() > self.init_k { + heap.pop(); + } + }); + let mut tmp = Vec::with_capacity(heap.len()); + while !heap.is_empty() { + tmp.push(heap.pop().unwrap().idx()); + } + + tmp_graph.lock().unwrap()[n] = tmp; + }); + self.graph = tmp_graph.lock().unwrap().to_vec(); + self.knn_graph = tmp_graph.lock().unwrap().to_vec(); + } + + fn get_random_nodes_idx_lite(&self, indices: &mut [usize]) { + let mut rng = rand::thread_rng(); + (0..indices.len()).for_each(|i| { + indices[i] = rng.gen_range(0..self.nodes.len() - indices.len()); + }); + } + + fn get_point_neighbor_size_neighbors( + &self, + q: usize, + expand_neighbors_tmp: &mut Vec>, + ) { + let mut flags = HashSet::with_capacity(self.neighbor_neighbor_size); + + flags.insert(q); + for neighbor_id in self.graph[q].iter() { + for nn_id in self.graph[*neighbor_id].iter() { + if *neighbor_id == *nn_id { + continue; + } + if flags.contains(&nn_id) { + continue; + } + flags.insert(*nn_id); + let dist = self.nodes[q].metric(&self.nodes[*nn_id], self.mt).unwrap(); + expand_neighbors_tmp.push(neighbor::Neighbor::new(*nn_id, dist)); + if expand_neighbors_tmp.len() >= self.neighbor_neighbor_size { + return; + } + } + } + } + + fn expand_connectivity(&mut self) { + let range = self.index_size; + + let mut ids: Vec = (0..self.nodes.len()).collect(); + ids.shuffle(&mut thread_rng()); + for id in ids.iter().take(self.root_size) { + self.root_nodes.push(*id); + } + + (0..self.root_size).for_each(|i| { + let root_id = self.root_nodes[i]; + let mut flags = HashSet::new(); + let mut my_queue = VecDeque::new(); + my_queue.push_back(root_id); + flags.insert(root_id); + + let mut unknown_set: Vec = Vec::with_capacity(1); + while !unknown_set.is_empty() { + while !my_queue.is_empty() { + let q_front = my_queue.pop_front().unwrap(); + + for j in 
0..self.graph[q_front].len() { + let child = self.graph[q_front][j]; + if flags.contains(&child) { + continue; + } + flags.insert(child); + my_queue.push_back(child); + } + } + unknown_set.clear(); + for j in 0..self.nodes.len() { + if flags.contains(&j) { + continue; + } + unknown_set.push(j); + } + if !unknown_set.is_empty() { + for j in 0..self.nodes.len() { + if flags.contains(&j) && self.graph[j].len() < range { + self.graph[j].push(unknown_set[0]); + break; + } + } + my_queue.push_back(unknown_set[0]); + flags.insert(unknown_set[0]); + } + } + }); + } + + fn link_each_nodes(&mut self, pruned_graph_tmp: &mut Vec>) { + let mut expand_neighbors_tmp = Vec::new(); + (0..self.nodes.len()).for_each(|i| { + expand_neighbors_tmp.clear(); + self.get_point_neighbor_size_neighbors(i, &mut expand_neighbors_tmp); // get related one + self.prune_graph( + i, + &mut expand_neighbors_tmp, + self.threshold, + pruned_graph_tmp, + ); + }); + (0..self.nodes.len()).for_each(|i| { + self.inter_insert(i, self.index_size, pruned_graph_tmp); + }); + } + + fn prune_graph( + &mut self, + query_id: usize, + expand_neighbors_tmp: &mut Vec>, + threshold: E, + pruned_graph_tmp: &mut Vec>, + ) { + let mut start = 0; + let mut flags = HashSet::with_capacity(expand_neighbors_tmp.len()); + for iter in expand_neighbors_tmp.iter() { + flags.insert(iter.idx()); + } + self.graph[query_id].iter().for_each(|linked_id| { + if flags.contains(linked_id) { + return; + } + expand_neighbors_tmp.push(neighbor::Neighbor::new( + *linked_id, + self.nodes[query_id] + .metric(&self.nodes[*linked_id], self.mt) + .unwrap(), + )); + }); + + expand_neighbors_tmp.sort_unstable(); + let mut result = Vec::new(); + if expand_neighbors_tmp[start].idx() == query_id { + start += 1; + } + result.push(expand_neighbors_tmp[start].clone()); + + start += 1; + while result.len() < self.index_size && start < expand_neighbors_tmp.len() { + let p = &expand_neighbors_tmp[start]; + let mut occlude = false; + // TODO: check every 
metrics, and decide use euclidean forcibly. + for iter in result.iter() { + if p.idx() == iter.idx() { + // stop early + occlude = true; + break; + } + let djk = self.nodes[iter.idx()] + .metric(&self.nodes[p.idx()], self.mt) + .unwrap(); + let cos_ij = (p.distance().powi(2) + iter.distance().powi(2) - djk.powi(2)) + / (E::from_usize(2).unwrap() * (p.distance() * iter.distance())); + + if cos_ij > threshold { + occlude = true; + break; + } + } + if !occlude { + result.push(p.clone()); + } + start += 1; + } + + (0..result.len()).for_each(|t| { + pruned_graph_tmp[t + query_id * self.index_size]._idx = result[t].idx(); + pruned_graph_tmp[t + query_id * self.index_size]._distance = result[t].distance(); + }); + if result.len() < self.index_size { + (result.len()..self.index_size).for_each(|i| { + pruned_graph_tmp[query_id * self.index_size + i]._distance = E::max_value(); + pruned_graph_tmp[query_id * self.index_size + i]._idx = self.nodes.len(); + // means not exist + }); + } + } + + // to handle neighbor's graph + fn inter_insert( + &self, + n: usize, + range: usize, + pruned_graph_tmp: &mut Vec>, + ) { + (0..range).for_each(|i| { + if pruned_graph_tmp[i + n].distance() == E::max_value() { + return; + } + + let sn = neighbor::Neighbor::new(n, pruned_graph_tmp[i + n].distance()); // distance of n to i + let des = pruned_graph_tmp[i + n].idx(); + let mut temp_pool = Vec::new(); + let mut dup = false; + + for j in 0..range { + if pruned_graph_tmp[j + des * self.index_size].distance() == E::max_value() { + break; + } + // each other has neighbor relationship + if n == pruned_graph_tmp[j + des * self.index_size].idx() { + // neighbor and point meet + dup = true; + break; + } + temp_pool.push(pruned_graph_tmp[j + des * self.index_size].clone()); + // neighbor's neighbor + } + + if dup { + return; + } + + temp_pool.push(sn.clone()); + if temp_pool.len() > range { + let mut result = Vec::new(); + let mut start = 0; + temp_pool.sort_unstable(); + 
result.push(temp_pool[start].clone()); + start += 1; + while result.len() < range && start < temp_pool.len() { + let p = &temp_pool[start]; + let mut occlude = false; + for rt in result.iter() { + if p.idx() == rt.idx() { + occlude = true; + break; + } + let djk = self.nodes[rt.idx()] + .metric(&self.nodes[p.idx()], self.mt) + .unwrap(); + let cos_ij = (p.distance().powi(2) + rt.distance().powi(2) - djk.powi(2)) + / (E::from_usize(2).unwrap() * (p.distance() * rt.distance())); + + if cos_ij > self.threshold { + occlude = true; + break; + } + } + if !occlude { + result.push(p.clone()); + } + start += 1; + } + (0..result.len()).for_each(|t| { + pruned_graph_tmp[t + des * self.index_size] = result[t].clone(); + }); + + if result.len() < range { + pruned_graph_tmp[result.len() + des * self.index_size]._distance = + E::max_value(); + } + } else { + for t in 0..range { + if pruned_graph_tmp[t + des * self.index_size].distance() == E::max_value() { + pruned_graph_tmp[t + des * self.index_size] = sn.clone(); + if (t + 1) < range { + pruned_graph_tmp[t + des * self.index_size]._distance = E::max_value(); + break; + } + } + } + } + }); + } + + fn _build(&mut self) { + self.build_knn_graph(); + + let mut pruned_graph_tmp: Vec> = + Vec::with_capacity(self.nodes.len() * self.index_size); + (0..self.nodes.len() * self.index_size).for_each(|i| { + pruned_graph_tmp.push(neighbor::Neighbor::::new(i, E::float_zero())); + }); + self.link_each_nodes(&mut pruned_graph_tmp); + + for i in 0..self.nodes.len() { + let mut pool_size = 0; + for j in 0..self.index_size { + if pruned_graph_tmp[i * self.index_size + j].distance() == E::max_value() { + break; + } + pool_size = j; + } + pool_size += 1; + self.graph[i] = Vec::with_capacity(pool_size); + for j in 0..pool_size { + self.graph[i].push(pruned_graph_tmp[i * self.index_size + j].idx()); + } + } + + self.expand_connectivity(); + + self.root_nodes = kmeans::general_kmeans(self.root_size, 256, &self.nodes, self.mt); + + let mut max = 0; + 
let mut min = self.nodes.len(); + let mut avg: f32 = 0.; + for t in 0..self.nodes.len() { + let size = self.graph[t].len(); + max = if max < size { size } else { max }; + min = if min > size { size } else { min }; + avg += size as f32; + } + avg /= 1.0 * self.nodes.len() as f32; + println!( + "stat: k: {:?}, max {:?}, min {:?}, avg {:?}", + self.init_k, max, min, avg + ); + } + + fn search( + &self, + query: &node::Node, + k: usize, + _args: &arguments::Args, + ) -> Vec<(node::Node, E)> { + // let mut search_flags = HashSet::with_capacity(self.nodes.len()); + let mut search_flags = FixedBitSet::with_capacity(self.nodes.len()); + let mut heap: BinaryHeap> = BinaryHeap::new(); // max-heap + let mut search_queue: LinkedList = LinkedList::new(); + + let mut vec_tmp = Vec::with_capacity(self.root_nodes.len()); + self.root_nodes.iter().for_each(|n| { + let dist = self.nodes[*n].metric(query, self.mt).unwrap(); + vec_tmp.push(neighbor::Neighbor::new(*n, dist)); + }); + vec_tmp.sort(); + for iter in vec_tmp.iter() { + if heap.len() < k { + heap.push(iter.clone()); + search_queue.push_back(iter.idx()); + } + search_flags.insert(iter.idx()); + } + + let mut cnt = 0; + + // greedy BFS search + let mut c = Vec::new(); + while !search_queue.is_empty() { + let id = search_queue.pop_front().unwrap(); + + let mut contribute = 0; + let mut calc = 0; + let mut pass = 0; + let mut tmp = BinaryHeap::with_capacity(self.graph[id].len()); + for iter in self.graph[id].iter() { + if search_flags.contains(*iter) { + pass += 1; + continue; + } + + let dist = self.nodes[*iter].metric(query, self.mt).unwrap(); + tmp.push(Reverse(neighbor::Neighbor::new(*iter, dist))); + search_flags.insert(*iter); + calc += 1; + cnt += 1; + } + while !tmp.is_empty() { + let Reverse(item) = tmp.pop().unwrap(); + // let item = tmp.pop().unwrap(); + if item.distance() > heap.peek().unwrap().distance() { + break; + } + heap.pop(); + search_queue.push_back(item.idx()); + heap.push(item); + contribute += 1; + } + + 
c.push(( + contribute, + calc, + self.graph[id].len(), + // (contribute as f32) / (calc as f32), + // (calc as f32) / (self.graph[*id].len() as f32), + // (contribute as f32) / (self.graph[*id].len() as f32), + // (pass as f32) / (self.graph[*id].len() as f32), + )); + } + + // println!("stat_here cnt {:?}", cnt); + let mut result = Vec::with_capacity(heap.len()); + + while !heap.is_empty() { + let tmp = heap.pop().unwrap(); + result.push((*self.nodes[tmp.idx()].clone(), tmp.distance())); + } + result.reverse(); + result + } + + fn check_edge(&self, h: usize, t: usize) -> bool { + let mut flag = true; + for i in 0..self.graph[h].len() { + if t == self.graph[h][i] { + flag = false; + } + } + flag + } + + fn connectivity_profile(&self) { + let mut visited = HashSet::with_capacity(self.nodes.len()); + let mut queue = VecDeque::new(); + + queue.push_back(0); + while !queue.is_empty() { + let id = queue.pop_front().unwrap(); + if visited.contains(&id) { + continue; + } + + for x in 0..self.graph[id].len() { + queue.push_back(self.graph[id][x]); + if self.graph[id][x] > self.nodes.len() { + // println!("{:?} {:?} {:?}", self.graph[id][x], self.graph[id], id); + } + } + visited.insert(id); + } + + println!("connectivity: {:?} {:?}", self.nodes.len(), visited.len()); + } +} + +impl + ann_index::SerializableIndex for SSGIndex +{ + fn load(path: &str, _args: &arguments::Args) -> Result { + let file = File::open(path).unwrap_or_else(|_| panic!("unable to open file {:?}", path)); + let mut instance: SSGIndex = bincode::deserialize_from(&file).unwrap(); + instance.nodes = instance + .tmp_nodes + .iter() + .map(|x| Box::new(x.clone())) + .collect(); + Ok(instance) + } + + fn dump(&mut self, path: &str, _args: &arguments::Args) -> Result<(), &'static str> { + self.tmp_nodes = self.nodes.iter().map(|x| *x.clone()).collect(); + let encoded_bytes = bincode::serialize(&self).unwrap(); + let mut file = File::create(path).unwrap(); + file.write_all(&encoded_bytes) + .unwrap_or_else(|_| 
panic!("unable to write file {:?}", path)); + Result::Ok(()) + } +} + +impl ann_index::ANNIndex for SSGIndex { + fn build(&mut self, mt: metrics::Metric) -> Result<(), &'static str> { + self.mt = mt; + self._build(); + + Result::Ok(()) + } + fn add_node(&mut self, item: &node::Node) -> Result<(), &'static str> { + self.nodes.push(Box::new(item.clone())); + Result::Ok(()) + } + fn built(&self) -> bool { + true + } + fn node_search_k( + &self, + item: &node::Node, + k: usize, + args: &arguments::Args, + ) -> Vec<(node::Node, E)> { + self.search(&item, k, &args) + } + + fn name(&self) -> &'static str { + "SSGIndex" + } + + fn nodes_size(&self) -> usize { + self.nodes.len() + } + + fn dimension(&self) -> usize { + self.dimension + } +} diff --git a/src/index/ssg_params.rs b/src/index/ssg_params.rs new file mode 100644 index 0000000..2a189f9 --- /dev/null +++ b/src/index/ssg_params.rs @@ -0,0 +1,69 @@ +#![allow(dead_code)] +use crate::core::ann_index; +use crate::core::arguments; +use crate::core::metrics; +use crate::core::neighbor; +use crate::core::node; +use fixedbitset::FixedBitSet; +#[cfg(feature = "without_std")] +use hashbrown::HashSet; +use rand::prelude::*; +use rayon::prelude::*; +use serde::de::DeserializeOwned; +use serde::{Deserialize, Serialize}; +use std::cmp::Reverse; +use std::collections::BinaryHeap; +use std::collections::LinkedList; + +use crate::core::kmeans; +#[cfg(not(feature = "without_std"))] +use std::collections::HashSet; +use std::collections::VecDeque; + +use std::fs::File; +use std::io::Write; +use std::sync::{Arc, Mutex}; + +#[derive(Debug, Serialize, Deserialize)] +pub struct SSGParams { + pub angle: E, + pub init_k: usize, + pub index_size: usize, + pub neighbor_neighbor_size: usize, + pub root_size: usize, +} + +impl SSGParams { + pub fn angle(mut self, new_angle: f32) -> Self { + self.angle = E::from_f32(new_angle).unwrap(); + self + } + pub fn init_k(mut self, new_init_k: usize) -> Self { + self.init_k = new_init_k; + self + } + pub 
fn index_size(mut self, new_index_size: usize) -> Self { + self.index_size = new_index_size; + self + } + pub fn neighbor_neighbor_size(mut self, new_neighbor_neighbor_size: usize) -> Self { + self.neighbor_neighbor_size = new_neighbor_neighbor_size; + self + } + pub fn root_size(mut self, new_root_size: usize) -> Self { + self.root_size = new_root_size; + self + } +} + +impl Default for SSGParams { + fn default() -> Self { + SSGParams { + angle: E::from_f32(30.0).unwrap(), + init_k: 100, + index_size: 100, + neighbor_neighbor_size: 100, + root_size: 30, + } + } +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..7fac92e --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,2 @@ +mod core; +mod index; From 0984da0caaebc722da244f29e09a1e06c30c335a Mon Sep 17 00:00:00 2001 From: salamer Date: Mon, 14 Jun 2021 02:55:19 +0800 Subject: [PATCH 02/22] update: update name --- benches/bench_metrics.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benches/bench_metrics.rs b/benches/bench_metrics.rs index 53d21d4..10f4511 100644 --- a/benches/bench_metrics.rs +++ b/benches/bench_metrics.rs @@ -1,4 +1,4 @@ -use fastann::core::metrics; +use hora::core::metrics; use criterion::{black_box, criterion_group, criterion_main, Criterion}; use rand::distributions::Standard; From c231cdb80239f549a6c41d5c789414d6c9064dea Mon Sep 17 00:00:00 2001 From: salamer Date: Mon, 14 Jun 2021 02:58:40 +0800 Subject: [PATCH 03/22] update: update code for clippy warning --- src/core/kmeans.rs | 2 +- src/index/bruteforce_idx.rs | 2 +- src/index/bruteforce_params.rs | 8 +------- src/index/hnsw_params.rs | 18 ++---------------- src/index/mod.rs | 20 ++++++++++---------- src/index/pq_params.rs | 14 +------------- src/index/rpt_params.rs | 22 ++-------------------- src/index/ssg_idx.rs | 21 --------------------- src/index/ssg_params.rs | 22 +++------------------- src/lib.rs | 4 ++-- 10 files changed, 23 insertions(+), 110 deletions(-) diff --git a/src/core/kmeans.rs 
b/src/core/kmeans.rs index 4911f2b..e7e4b9f 100644 --- a/src/core/kmeans.rs +++ b/src/core/kmeans.rs @@ -314,7 +314,7 @@ mod tests { let mut base: Vec = Vec::with_capacity(dimension); for _i in 0..dimension { let n: f64 = rng.gen::() * range; // base number - base.push((n as f32)); + base.push(n as f32); } let v_iter: Vec = rng diff --git a/src/index/bruteforce_idx.rs b/src/index/bruteforce_idx.rs index e552c3a..00b76e7 100644 --- a/src/index/bruteforce_idx.rs +++ b/src/index/bruteforce_idx.rs @@ -23,7 +23,7 @@ pub struct BruteForceIndex { } impl BruteForceIndex { - pub fn new(dimension: usize, params: BruteForceParams) -> BruteForceIndex { + pub fn new(dimension: usize, _params: BruteForceParams) -> BruteForceIndex { BruteForceIndex:: { nodes: Vec::new(), mt: metrics::Metric::Unknown, diff --git a/src/index/bruteforce_params.rs b/src/index/bruteforce_params.rs index de6498e..e9c0d20 100644 --- a/src/index/bruteforce_params.rs +++ b/src/index/bruteforce_params.rs @@ -1,12 +1,6 @@ #![allow(dead_code)] -use crate::core::ann_index; -use crate::core::arguments; -use crate::core::metrics; -use crate::core::neighbor; -use crate::core::node; -use serde::de::DeserializeOwned; + use serde::{Deserialize, Serialize}; -use std::collections::BinaryHeap; #[derive(Debug, Serialize, Deserialize)] pub struct BruteForceParams {} diff --git a/src/index/hnsw_params.rs b/src/index/hnsw_params.rs index 9f8e9e2..bb85f5f 100644 --- a/src/index/hnsw_params.rs +++ b/src/index/hnsw_params.rs @@ -1,30 +1,16 @@ #![allow(dead_code)] -use crate::core::ann_index; -use crate::core::arguments; -use crate::core::metrics; -use crate::core::neighbor::Neighbor; + use crate::core::node; -use fixedbitset::FixedBitSet; + #[cfg(feature = "without_std")] use hashbrown::HashMap; #[cfg(feature = "without_std")] use hashbrown::HashSet; -use rand::prelude::*; -use rayon::{iter::IntoParallelIterator, prelude::*}; -use serde::de::DeserializeOwned; -use std::collections::BinaryHeap; use serde::{Deserialize, 
Serialize}; #[cfg(not(feature = "without_std"))] -use std::collections::HashMap; #[cfg(not(feature = "without_std"))] -use std::collections::HashSet; -use std::fs::File; -use std::io::Write; - -use std::sync::RwLock; - #[derive(Debug, Serialize, Deserialize)] pub struct HNSWParams { pub max_item: usize, diff --git a/src/index/mod.rs b/src/index/mod.rs index 4387cb7..8cfe04b 100644 --- a/src/index/mod.rs +++ b/src/index/mod.rs @@ -1,10 +1,10 @@ -mod bruteforce_idx; -mod bruteforce_params; -mod hnsw_idx; -mod hnsw_params; -mod pq_idx; -mod pq_params; -mod rpt_idx; -mod rpt_params; -mod ssg_idx; -mod ssg_params; +pub mod bruteforce_idx; +pub mod bruteforce_params; +pub mod hnsw_idx; +pub mod hnsw_params; +pub mod pq_idx; +pub mod pq_params; +pub mod rpt_idx; +pub mod rpt_params; +pub mod ssg_idx; +pub mod ssg_params; diff --git a/src/index/pq_params.rs b/src/index/pq_params.rs index 55215e5..7e4776b 100644 --- a/src/index/pq_params.rs +++ b/src/index/pq_params.rs @@ -1,21 +1,9 @@ #![allow(dead_code)] -use crate::core::ann_index; -use crate::core::arguments; -use crate::core::kmeans; -use crate::core::metrics; -use crate::core::neighbor::Neighbor; -use crate::core::node; -use rayon::prelude::*; -use serde::de::DeserializeOwned; -use std::collections::BinaryHeap; +use crate::core::node; use serde::{Deserialize, Serialize}; -use std::fs::File; - -use std::io::Write; - #[derive(Debug, Serialize, Deserialize)] pub struct PQParams { pub n_sub: usize, diff --git a/src/index/rpt_params.rs b/src/index/rpt_params.rs index ed1a997..7eb470f 100644 --- a/src/index/rpt_params.rs +++ b/src/index/rpt_params.rs @@ -1,29 +1,11 @@ #![allow(dead_code)] -use crate::core::ann_index; -use crate::core::arguments; -use crate::core::metrics; -use crate::core::neighbor; -use crate::core::node; -use fixedbitset::FixedBitSet; + #[cfg(feature = "without_std")] use hashbrown::HashSet; -use rand::prelude::*; -use rayon::prelude::*; -use serde::de::DeserializeOwned; + use serde::{Deserialize, 
Serialize}; -use std::cmp::Reverse; -use std::collections::BinaryHeap; -use std::collections::LinkedList; -use crate::core::kmeans; #[cfg(not(feature = "without_std"))] -use std::collections::HashSet; -use std::collections::VecDeque; - -use std::fs::File; -use std::io::Write; -use std::sync::{Arc, Mutex}; - #[derive(Debug, Serialize, Deserialize)] pub struct BPTParams { pub tree_num: i32, diff --git a/src/index/ssg_idx.rs b/src/index/ssg_idx.rs index 4a8fca0..77c0119 100644 --- a/src/index/ssg_idx.rs +++ b/src/index/ssg_idx.rs @@ -430,28 +430,19 @@ impl SSGIndex { search_flags.insert(iter.idx()); } - let mut cnt = 0; - // greedy BFS search - let mut c = Vec::new(); while !search_queue.is_empty() { let id = search_queue.pop_front().unwrap(); - let mut contribute = 0; - let mut calc = 0; - let mut pass = 0; let mut tmp = BinaryHeap::with_capacity(self.graph[id].len()); for iter in self.graph[id].iter() { if search_flags.contains(*iter) { - pass += 1; continue; } let dist = self.nodes[*iter].metric(query, self.mt).unwrap(); tmp.push(Reverse(neighbor::Neighbor::new(*iter, dist))); search_flags.insert(*iter); - calc += 1; - cnt += 1; } while !tmp.is_empty() { let Reverse(item) = tmp.pop().unwrap(); @@ -462,21 +453,9 @@ impl SSGIndex { heap.pop(); search_queue.push_back(item.idx()); heap.push(item); - contribute += 1; } - - c.push(( - contribute, - calc, - self.graph[id].len(), - // (contribute as f32) / (calc as f32), - // (calc as f32) / (self.graph[*id].len() as f32), - // (contribute as f32) / (self.graph[*id].len() as f32), - // (pass as f32) / (self.graph[*id].len() as f32), - )); } - // println!("stat_here cnt {:?}", cnt); let mut result = Vec::with_capacity(heap.len()); while !heap.is_empty() { diff --git a/src/index/ssg_params.rs b/src/index/ssg_params.rs index 2a189f9..51f3aaa 100644 --- a/src/index/ssg_params.rs +++ b/src/index/ssg_params.rs @@ -1,29 +1,13 @@ #![allow(dead_code)] -use crate::core::ann_index; -use crate::core::arguments; -use 
crate::core::metrics; -use crate::core::neighbor; + use crate::core::node; -use fixedbitset::FixedBitSet; + #[cfg(feature = "without_std")] use hashbrown::HashSet; -use rand::prelude::*; -use rayon::prelude::*; -use serde::de::DeserializeOwned; + use serde::{Deserialize, Serialize}; -use std::cmp::Reverse; -use std::collections::BinaryHeap; -use std::collections::LinkedList; -use crate::core::kmeans; #[cfg(not(feature = "without_std"))] -use std::collections::HashSet; -use std::collections::VecDeque; - -use std::fs::File; -use std::io::Write; -use std::sync::{Arc, Mutex}; - #[derive(Debug, Serialize, Deserialize)] pub struct SSGParams { pub angle: E, diff --git a/src/lib.rs b/src/lib.rs index 7fac92e..da01de0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,2 +1,2 @@ -mod core; -mod index; +pub mod core; +pub mod index; From fa7529a1de1d948dafa7638ebb3b722f55c28011 Mon Sep 17 00:00:00 2001 From: salamer Date: Mon, 14 Jun 2021 03:08:33 +0800 Subject: [PATCH 04/22] update: update code for clippy warning --- CHANGELOG.md | 0 benches/bench_metrics.rs | 4 ++-- src/core/kmeans.rs | 6 +++--- src/core/knn.rs | 2 +- src/index/rpt_idx.rs | 34 +++++++++++++++++++--------------- 5 files changed, 25 insertions(+), 21 deletions(-) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..e69de29 diff --git a/benches/bench_metrics.rs b/benches/bench_metrics.rs index 10f4511..db30a7f 100644 --- a/benches/bench_metrics.rs +++ b/benches/bench_metrics.rs @@ -22,7 +22,7 @@ fn make_normal_distribution_clustering( let mut base: Vec = Vec::with_capacity(dimension); for _i in 0..dimension { let n: f64 = rng.gen::() * range; // base number - base.push((n as f32)); + base.push(n as f32); } let v_iter: Vec = rng @@ -32,7 +32,7 @@ fn make_normal_distribution_clustering( .clone(); for _i in 0..node_n { let mut vec_item = Vec::with_capacity(dimension); - for i in 0..dimension { + for (i, _item) in base.iter().enumerate().take(dimension) { let vv 
= (v_iter[_i * dimension..(_i + 1) * dimension][i] as f32) + base[i]; // add normal distribution noise vec_item.push(vv); } diff --git a/src/core/kmeans.rs b/src/core/kmeans.rs index e7e4b9f..18b2308 100644 --- a/src/core/kmeans.rs +++ b/src/core/kmeans.rs @@ -128,7 +128,7 @@ impl Kmeans { pub fn search_data( &mut self, batch_size: usize, - batch_data: &Vec>, + batch_data: &[Vec], assigned_center: &mut Vec, ) { let n_center = self._n_center; @@ -192,7 +192,7 @@ impl Kmeans { Ok(()) } - pub fn train(&mut self, batch_size: usize, batch_data: &Vec>, n_epoch: usize) { + pub fn train(&mut self, batch_size: usize, batch_data: &[Vec], n_epoch: usize) { self.init_center(batch_size, batch_data); (0..n_epoch).for_each(|epoch| { let mut assigned_center: Vec = Vec::with_capacity(batch_size); @@ -238,7 +238,7 @@ pub fn general_kmeans( nodes.par_iter().zip(0..nodes.len()).for_each(|(node, _j)| { let mut idx = 0; let mut distance = E::max_value(); - for i in 0..means.len() { + for (i, _item) in means.iter().enumerate() { let _distance = node.metric(&means[i], mt).unwrap(); if _distance < distance { idx = i; diff --git a/src/core/knn.rs b/src/core/knn.rs index 2b3534c..1ff2c9a 100644 --- a/src/core/knn.rs +++ b/src/core/knn.rs @@ -433,7 +433,7 @@ mod tests { use std::collections::HashSet; use std::iter::FromIterator; - use std::time::{Duration, SystemTime, UNIX_EPOCH}; + use std::time::SystemTime; fn make_normal_distribution_clustering( clustering_n: usize, node_n: usize, diff --git a/src/index/rpt_idx.rs b/src/index/rpt_idx.rs index eb0491d..df5ca0c 100644 --- a/src/index/rpt_idx.rs +++ b/src/index/rpt_idx.rs @@ -117,12 +117,12 @@ fn two_means( j += 1; } - let mut p = Leaf::copy_leaf(&leaves[i]); - let mut q = Leaf::copy_leaf(&leaves[j]); + let mut first = Leaf::copy_leaf(&leaves[i]); + let mut second = Leaf::copy_leaf(&leaves[j]); if mt == metrics::Metric::CosineSimilarity { - p.normalize(); - q.normalize(); + first.normalize(); + second.normalize(); } // TODO: dot normalize @@ 
-134,14 +134,16 @@ fn two_means( // produce two mean point. for _z in 0..ITERATION_STEPS { - let k = random::index(count); - let di = ic * metrics::metric(&p.node.vectors(), &leaves[k].node.vectors(), mt).unwrap(); - let dj = jc * metrics::metric(&q.node.vectors(), &leaves[k].node.vectors(), mt).unwrap(); + let rand_k = random::index(count); + let di = ic + * metrics::metric(&first.node.vectors(), &leaves[rand_k].node.vectors(), mt).unwrap(); + let dj = jc + * metrics::metric(&second.node.vectors(), &leaves[rand_k].node.vectors(), mt).unwrap(); // let mut norm = one; if mt == metrics::Metric::CosineSimilarity { - norm = calc::get_norm(&leaves[k].node.vectors()).unwrap(); + norm = calc::get_norm(&leaves[rand_k].node.vectors()).unwrap(); match norm.partial_cmp(&zero) { Some(Ordering::Equal) | Some(Ordering::Less) => continue, _ => {} @@ -150,20 +152,22 @@ fn two_means( // make p more closer to k in space. if di < dj { - for l in 0..p.node.len() { - p.node.mut_vectors()[l] = - (p.node.vectors()[l] * ic + leaves[k].node.vectors()[l] / norm) / (ic + one); + for l in 0..first.node.len() { + first.node.mut_vectors()[l] = (first.node.vectors()[l] * ic + + leaves[rand_k].node.vectors()[l] / norm) + / (ic + one); } ic += one; } else if dj < di { - for l in 0..q.node.len() { - q.node.mut_vectors()[l] = - (q.node.vectors()[l] * jc + leaves[k].node.vectors()[l] / norm) / (jc + one); + for l in 0..second.node.len() { + second.node.mut_vectors()[l] = (second.node.vectors()[l] * jc + + leaves[rand_k].node.vectors()[l] / norm) + / (jc + one); } jc += one; } } - Ok((p, q)) + Ok((first, second)) } #[derive(Default, Debug, Serialize, Deserialize)] From 12779ec1710f32e2ca15df4cdb6d428959d0922b Mon Sep 17 00:00:00 2001 From: salamer Date: Mon, 14 Jun 2021 05:12:23 +0800 Subject: [PATCH 05/22] update: add readme --- README.md | 271 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 270 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 
82822d8..56e6eb9 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,271 @@ -# hora +# Hora + approximate nearest neighbor search library in Rust + +## Introduction + +Hora, `ほら` in Japanese, sound like `hōlə`, means `You see!` or `Look at that!`. + +## Key Features + +* **Performant** ⚡️ + * SIMD acceleration + * stuble algorithm implementation + * multiple threads design + +* **Multi Language Support** ☄️ + * `Python` + * `Javascript` + * `Java` + * `Go` + * `Ruby` + * `Swift` (WIP) + * `R` (WIP) + * `Julia` (WIP) + * and it also can serve as a service + +* **Portable** 💼 + * `no_std` support (WIP) + * `Windows`, `Linux` and `OS X` support + * `IOS` and `Android` Support (WIP) + * thanks for `LLVM`, Hora can be used in `x66` and `ARM` CPUs + +* **security** 🔒 + * thanks for rust strict compiler + * all language lib's Hora memory is managed by the Rust + * full coverage testing + +* **Multiple Index support** 🚀 + * `Hierarchical Navigable Small World Graph Index(HNSW)` ([reference](https://arxiv.org/abs/1603.09320)) + * `Satellite System Graph (SSG)` ([reference](https://arxiv.org/abs/1907.06146)) + * `Product Quantization Inverted File(PQIVF)` ([reference](https://lear.inrialpes.fr/pubs/2011/JDS11/jegou_searching_with_quantization.pdf)) + * `Random Projection Tree(RPT)`, `BruteForce` + +* **Light** 💡 + * the whole library did not dependent any heavy library, such as `BLAS` + +* **Configurable Compilation** 📕 + * Hora support some features, such as `SIMD` + +* **Multiple Distances Support** 🧮 + * `Dot Product distance` + * + * `Euclidean distance` + * + * `Manhattan distance` + * + * `cosine distance` + * + +* **Productive** ⭐ + * well documented + * elegant and simple API, which is extremely easy to learn + +## Installation + +### rust + +add this into `Cargo.toml` + +```toml +[dependencies] +hora = "0.1.0" +``` + +### Python + +```Bash +pip install hora +``` + +### Building from source + +```bash +git clone https://github.com/hora-search/hora +cargo build +``` + 
+## Interface + +All Index have already implement these method + +```Rust +pub trait ANNIndex: Send + Sync { + /// build up the ANN index + /// + /// build up index with all node which have add into before, it will cost some time, and the time it cost depends on the algorithm + /// return `Err(&'static str)` if there is something wrong with the building process, and the `static str` is the debug reason + fn build(&mut self, mt: metrics::Metric) -> Result<(), &'static str>; + + /// add node internal method + /// + /// it will allocate a space in the heap(Vector), and init a `Node` + /// return `Err(&'static str)` if there is something wrong with the adding process, and the `static str` is the debug reason + fn add_node(&mut self, item: &node::Node) -> Result<(), &'static str>; + + /// add node + /// + /// call `add_node()` internal + fn add(&mut self, vs: &[E], idx: T) -> Result<(), &'static str> { + self.add_node(&node::Node::new_with_idx(vs, idx)) + } + + /// add multiple node one time + /// + /// return `Err(&'static str)` if there is something wrong with the adding process, and the `static str` is the debug reason + fn add_batch(&mut self, vss: &[&[E]], indices: &[T]) -> Result<(), &'static str> { + if vss.len() != indices.len() { + return Err("vector's size is different with index"); + } + for idx in 0..vss.len() { + let n = node::Node::new_with_idx(vss[idx], indices[idx].clone()); + if let Err(err) = self.add_node(&n) { + return Err(err); + } + } + Ok(()) + } + + /// return the index has already been built or not + /// + /// return `True` if the index has been built + fn built(&self) -> bool; + + /// to rebuild the index with all nodes inside + /// + /// /// return `Err(&'static str)` if there is something wrong with the rebuilding process, and the `static str` is the debug reason + fn rebuild(&mut self, _mt: metrics::Metric) -> Result<(), &'static str> { + Err("not implement") + } + + /// search for k nearest neighbors node internal method + fn node_search_k( 
+ &self, + item: &node::Node, + k: usize, + args: &arguments::Args, + ) -> Vec<(node::Node, E)>; + + /// search for k nearest neighbors and return full info + /// + /// it will return the all node's info including the original vectors, and the metric distance + /// + /// it require the item is the slice with the same dimension with index dimension, otherwise it will panic + fn search_full(&self, item: &[E], k: usize) -> Vec<(node::Node, E)> { + assert_eq!(item.len(), self.dimension()); + self.node_search_k(&node::Node::new(item), k, &arguments::Args::new()) + } + + /// search for k nearest neighbors + /// + /// it only return the idx of the nearest node + /// + /// it require the item is the slice with the same dimension with index dimension, otherwise it will panic + fn search(&self, item: &[E], k: usize) -> Vec { + assert_eq!(item.len(), self.dimension()); + self.node_search_k(&node::Node::new(item), k, &arguments::Args::new()) + .iter() + .map(|x| x.0.idx().as_ref().unwrap().clone()) + .collect::>() + } + + /// return the name of the Index + /// format like this + /// `HNSWIndex(Hierarchical Navigable Small World Index)` + fn name(&self) -> &'static str; + + /// internal nodes' size + fn nodes_size(&self) -> usize { + 0 + } + + /// clear all nodes and index built before + fn clear(&mut self) {} + + /// return String of Index statistics informations + fn idx_info(&self) -> String { + "not implement".to_string() + } + + /// return the dimension it require + fn dimension(&self) -> usize { + 0 + } +} + +pub trait SerializableIndex< + E: node::FloatElement + DeserializeOwned, + T: node::IdxType + DeserializeOwned, +>: Send + Sync + ANNIndex +{ + /// load file with path + fn load(_path: &str, _args: &arguments::Args) -> Result + where + Self: Sized, + { + Err("empty implementation") + } + + /// dump the file into the path + fn dump(&mut self, _path: &str, _args: &arguments::Args) -> Result<(), &'static str> { + Err("empty implementation") + } +} +``` + +## Benchmark + 
+pic here + +## Example + +Rust usage example + +```Rust + +``` + +Python usage exmaple + +```Python + +``` + +## Related Project and Comparison + +* [Faiss](https://github.com/facebookresearch/faiss): Facebook AI Similarity Search, which is the most popular ANN library currently + * Diffrences: Faiss more focus on the GPU scene, and Hora is more light than Faiss + +* Annoy + +### Contribute + +we are pretty gald to have you to participate, any contributions is welcome, including the documentations and tests. +you can do the `Pull Requests`, `Issue` on the github, and we will review it as soon as possible. + +We use GitHub issues for tracking suggestions and bugs. + +To install for development: + +#### clone the repo + +```bash +git clone https://github.com/hora-search/hora +``` + +#### build + +```bash +cargo build +``` + +#### try the changes + +```bash +cd exmaples +cargo run +``` + +## License + +The entire repo is under Apache License. From 88808a3e6243262fbc83f76ae26cb35170af3c5c Mon Sep 17 00:00:00 2001 From: salamer Date: Mon, 14 Jun 2021 05:17:10 +0800 Subject: [PATCH 06/22] add: add github workflows --- .github/workflows/rust.yml | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 .github/workflows/rust.yml diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml new file mode 100644 index 0000000..1fb3d53 --- /dev/null +++ b/.github/workflows/rust.yml @@ -0,0 +1,24 @@ +name: Rust + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +env: + CARGO_TERM_COLOR: always + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + - name: Build + run: cargo build --verbose + - name: Build Nightly + run: cargo +nightly build --verbose + - name: Run tests + run: cargo test --verbose From 7341b83381060d1bac2994ca5f4a5350ef10f64e Mon Sep 17 00:00:00 2001 From: salamer Date: Mon, 14 Jun 2021 05:24:26 +0800 Subject: [PATCH 07/22] fix: fix readme latext display bug --- README.md 
| 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 56e6eb9..e0fe5cf 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,8 @@ Hora, `ほら` in Japanese, sound like `hōlə`, means `You see!` or `Look at th * `Hierarchical Navigable Small World Graph Index(HNSW)` ([reference](https://arxiv.org/abs/1603.09320)) * `Satellite System Graph (SSG)` ([reference](https://arxiv.org/abs/1907.06146)) * `Product Quantization Inverted File(PQIVF)` ([reference](https://lear.inrialpes.fr/pubs/2011/JDS11/jegou_searching_with_quantization.pdf)) - * `Random Projection Tree(RPT)`, `BruteForce` + * `Random Projection Tree(RPT)` + * `BruteForce` * **Light** 💡 * the whole library did not dependent any heavy library, such as `BLAS` @@ -49,13 +50,13 @@ Hora, `ほら` in Japanese, sound like `hōlə`, means `You see!` or `Look at th * **Multiple Distances Support** 🧮 * `Dot Product distance` - * + * ![equation](https://latex.codecogs.com/gif.latex?D%28x%2Cy%29%20%3D%20%5Csum%7B%28x*y%29%7D) * `Euclidean distance` - * + * ![equation](https://latex.codecogs.com/gif.latex?D%28x%2Cy%29%20%3D%20%5Csqrt%7B%5Csum%7B%28x-y%29%5E2%7D%7D) * `Manhattan distance` - * + * ![equation](https://latex.codecogs.com/gif.latex?D%28x%2Cy%29%20%3D%20%5Csum%7B%7C%28x-y%29%7C%7D) * `cosine distance` - * + * ![equation](https://latex.codecogs.com/gif.latex?D%28x%2Cy%29%20%3D%20%5Cfrac%7Bx%20*y%7D%7B%7C%7Cx%7C%7C*%7C%7Cy%7C%7C%7D) * **Productive** ⭐ * well documented From 8525e4f48937f724ecf7e77879d3e4d416e5c56f Mon Sep 17 00:00:00 2001 From: salamer Date: Mon, 14 Jun 2021 16:23:52 +0800 Subject: [PATCH 08/22] update: update readme and github workflow --- .github/workflows/rust.yml | 2 -- README.md | 54 ++++++++++++++++++++++---------------- 2 files changed, 32 insertions(+), 24 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 1fb3d53..7ae98f3 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -18,7 +18,5 @@ 
jobs: - uses: actions/checkout@v2 - name: Build run: cargo build --verbose - - name: Build Nightly - run: cargo +nightly build --verbose - name: Run tests run: cargo test --verbose diff --git a/README.md b/README.md index e0fe5cf..50fbcbc 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Hora -approximate nearest neighbor search library in Rust +a approximate nearest neighbor search library, written in Rust ## Introduction @@ -13,7 +13,7 @@ Hora, `ほら` in Japanese, sound like `hōlə`, means `You see!` or `Look at th * stuble algorithm implementation * multiple threads design -* **Multi Language Support** ☄️ +* **Multi Language Support** ☄️ * `Python` * `Javascript` * `Java` @@ -22,7 +22,14 @@ Hora, `ほら` in Japanese, sound like `hōlə`, means `You see!` or `Look at th * `Swift` (WIP) * `R` (WIP) * `Julia` (WIP) - * and it also can serve as a service + * **also can serve as a service** + +* **Multiple Index support** 🚀 + * `Hierarchical Navigable Small World Graph Index(HNSW)` ([reference](https://arxiv.org/abs/1603.09320)) + * `Satellite System Graph (SSG)` ([reference](https://arxiv.org/abs/1907.06146)) + * `Product Quantization Inverted File(PQIVF)` ([reference](https://lear.inrialpes.fr/pubs/2011/JDS11/jegou_searching_with_quantization.pdf)) + * `Random Projection Tree(RPT)` + * `BruteForce` * **Portable** 💼 * `no_std` support (WIP) @@ -30,24 +37,11 @@ Hora, `ほら` in Japanese, sound like `hōlə`, means `You see!` or `Look at th * `IOS` and `Android` Support (WIP) * thanks for `LLVM`, Hora can be used in `x66` and `ARM` CPUs -* **security** 🔒 +* **Security** 🔒 * thanks for rust strict compiler * all language lib's Hora memory is managed by the Rust * full coverage testing -* **Multiple Index support** 🚀 - * `Hierarchical Navigable Small World Graph Index(HNSW)` ([reference](https://arxiv.org/abs/1603.09320)) - * `Satellite System Graph (SSG)` ([reference](https://arxiv.org/abs/1907.06146)) - * `Product Quantization Inverted File(PQIVF)` 
([reference](https://lear.inrialpes.fr/pubs/2011/JDS11/jegou_searching_with_quantization.pdf)) - * `Random Projection Tree(RPT)` - * `BruteForce` - -* **Light** 💡 - * the whole library did not dependent any heavy library, such as `BLAS` - -* **Configurable Compilation** 📕 - * Hora support some features, such as `SIMD` - * **Multiple Distances Support** 🧮 * `Dot Product distance` * ![equation](https://latex.codecogs.com/gif.latex?D%28x%2Cy%29%20%3D%20%5Csum%7B%28x*y%29%7D) @@ -58,6 +52,12 @@ Hora, `ほら` in Japanese, sound like `hōlə`, means `You see!` or `Look at th * `cosine distance` * ![equation](https://latex.codecogs.com/gif.latex?D%28x%2Cy%29%20%3D%20%5Cfrac%7Bx%20*y%7D%7B%7C%7Cx%7C%7C*%7C%7Cy%7C%7C%7D) +* **Light** 💡 + * the whole library did not dependent any heavy library, such as `BLAS` + +* **Configurable** 📕 + * Hora support some features, such as `SIMD`, `no_std` + * **Productive** ⭐ * well documented * elegant and simple API, which is extremely easy to learn @@ -76,16 +76,20 @@ hora = "0.1.0" ### Python ```Bash -pip install hora +$ pip install hora ``` ### Building from source ```bash -git clone https://github.com/hora-search/hora -cargo build +$ git clone https://github.com/hora-search/hora +$ cargo build ``` +## Benchmark + +pic here + ## Interface All Index have already implement these method @@ -214,9 +218,7 @@ pub trait SerializableIndex< } ``` -## Benchmark -pic here ## Example @@ -232,6 +234,14 @@ Python usage exmaple ``` +## Roadmap + +- [ ] Full Coverage Test +- [ ] implement a [EFANNA](http://arxiv.org/abs/1609.07228) to achieve faster KNN buiding +- [ ] Swift Support and also IOS/Mac OS deployment example +- [ ] R Support +- [ ] mmap file support + ## Related Project and Comparison * [Faiss](https://github.com/facebookresearch/faiss): Facebook AI Similarity Search, which is the most popular ANN library currently From e11faf601df479b6063c3629fc5abf13a1bbefee Mon Sep 17 00:00:00 2001 From: salamer Date: Mon, 21 Jun 2021 01:16:14 +0800 Subject: 
[PATCH 09/22] chore: fix typo --- src/index/bruteforce_idx.rs | 2 +- src/index/hnsw_params.rs | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/index/bruteforce_idx.rs b/src/index/bruteforce_idx.rs index 00b76e7..519c5f3 100644 --- a/src/index/bruteforce_idx.rs +++ b/src/index/bruteforce_idx.rs @@ -23,7 +23,7 @@ pub struct BruteForceIndex { } impl BruteForceIndex { - pub fn new(dimension: usize, _params: BruteForceParams) -> BruteForceIndex { + pub fn new(dimension: usize, _params: &BruteForceParams) -> BruteForceIndex { BruteForceIndex:: { nodes: Vec::new(), mt: metrics::Metric::Unknown, diff --git a/src/index/hnsw_params.rs b/src/index/hnsw_params.rs index bb85f5f..1b450a9 100644 --- a/src/index/hnsw_params.rs +++ b/src/index/hnsw_params.rs @@ -9,7 +9,6 @@ use hashbrown::HashSet; use serde::{Deserialize, Serialize}; -#[cfg(not(feature = "without_std"))] #[cfg(not(feature = "without_std"))] #[derive(Debug, Serialize, Deserialize)] pub struct HNSWParams { From 86629d62d599a39fb24659ed3d6713eed8758423 Mon Sep 17 00:00:00 2001 From: salamer Date: Tue, 22 Jun 2021 01:05:22 +0800 Subject: [PATCH 10/22] chore: format whole project --- .github/workflows/rust.yml | 15 +++++++-------- .gitignore | 5 ++++- Cargo.toml | 12 ++++++------ src/core/simd_metrics.rs | 2 +- 4 files changed, 18 insertions(+), 16 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 7ae98f3..ed65277 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -2,21 +2,20 @@ name: Rust on: push: - branches: [ main ] + branches: [main] pull_request: - branches: [ main ] + branches: [main] env: CARGO_TERM_COLOR: always jobs: build: - runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 - - name: Build - run: cargo build --verbose - - name: Run tests - run: cargo test --verbose + - uses: actions/checkout@v2 + - name: Build + run: cargo build --verbose + - name: Run tests + run: cargo test --verbose diff --git a/.gitignore 
b/.gitignore index 7292a07..e61fe5a 100644 --- a/.gitignore +++ b/.gitignore @@ -18,4 +18,7 @@ Cargo.lock #Cargo.lock examples/Cargo.lock examples/target -examples/target/ \ No newline at end of file +examples/target/ +*.so +.DS_STORE +build/ \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index 1a5fb82..72bb432 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,16 +1,16 @@ [package] -name = "hora" -version = "0.1.0" -authors = ["salamer "] +authors = ["aljun "] edition = "2018" license = "Apache-2.0" +name = "hora" +version = "0.1.0" description = "Hora Search Everywhere" documentation = "https://docs.rs/hora/" repository = "https://github.com/hora-search/hora" -keywords = ["approximate nearest neighbor search", "artificial intelligence", "SIMD", "nearest-neighbor-search", "no_std"] categories = ["artificial intelligence"] +keywords = ["approximate nearest neighbor search", "artificial intelligence", "SIMD", "nearest-neighbor-search", "no_std"] [profile.dev] codegen-units = 4 @@ -35,8 +35,8 @@ panic = 'unwind' rpath = false [features] -simd = ["packed_simd"] no_std = ["hashbrown"] +simd = ["packed_simd"] [dependencies] bincode = "1.3.2" @@ -54,5 +54,5 @@ smallvec = {version = "1.6.1", features = ["serde"], optional = true} criterion = "0.3.4" [[bench]] +harness = false name = "bench_metrics" -harness = false \ No newline at end of file diff --git a/src/core/simd_metrics.rs b/src/core/simd_metrics.rs index 127aacf..bf54cc1 100644 --- a/src/core/simd_metrics.rs +++ b/src/core/simd_metrics.rs @@ -1,6 +1,6 @@ use crate::core::calc::same_dimension; #[cfg(feature = "simd")] -use packed_simd::{f32x16, f32x4, f32x8, f64x4}; +use packed_simd::{f32x16, f32x4, f32x8, f64x4, f64x8}; pub trait SIMDOptmized { fn dot_product(a: &[T], b: &[T]) -> Result; From fca4516211730b83f27718989b17cb53fc280625 Mon Sep 17 00:00:00 2001 From: salamer Date: Fri, 25 Jun 2021 00:59:10 +0800 Subject: [PATCH 11/22] chore: fix dot product --- src/core/simd_metrics.rs | 4 ++-- 1 file 
changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core/simd_metrics.rs b/src/core/simd_metrics.rs index bf54cc1..00ee57e 100644 --- a/src/core/simd_metrics.rs +++ b/src/core/simd_metrics.rs @@ -25,11 +25,11 @@ macro_rules! simd_optimized_impl { .sum::<$simd_type>() .sum(); let d: $type_id = a[size..].iter().zip(&b[size..]).map(|(p, q)| p * q).sum(); - Ok(c + d) + Ok(-(c + d)) } #[cfg(not(feature = $simd_size))] { - Ok(a.iter().zip(b).map(|(p, q)| p * q).sum::<$type_id>()) + Ok(-(a.iter().zip(b).map(|(p, q)| p * q).sum::<$type_id>())) } } From e78025b1eb87b4f494c97a2ef1fbcef965e6748f Mon Sep 17 00:00:00 2001 From: salamer Date: Fri, 25 Jun 2021 02:04:03 +0800 Subject: [PATCH 12/22] chore: update gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index e61fe5a..89a87c5 100644 --- a/.gitignore +++ b/.gitignore @@ -21,4 +21,5 @@ examples/target examples/target/ *.so .DS_STORE -build/ \ No newline at end of file +build/ +examples/src/*.hdf5 \ No newline at end of file From b66330b253acc533941d7a2224aef3b1867ee71a Mon Sep 17 00:00:00 2001 From: salamer Date: Fri, 25 Jun 2021 02:04:14 +0800 Subject: [PATCH 13/22] chore: add examples --- examples/Cargo.toml | 5 +- examples/src/ann_bench.rs | 251 +++++++++++++++++++++++++++++++++++ examples/src/load_dataset.sh | 4 + examples/src/main.rs | 4 +- examples/src/mod.rs | 2 + 5 files changed, 264 insertions(+), 2 deletions(-) create mode 100644 examples/src/ann_bench.rs create mode 100644 examples/src/load_dataset.sh create mode 100644 examples/src/mod.rs diff --git a/examples/Cargo.toml b/examples/Cargo.toml index 08d8075..8499a29 100644 --- a/examples/Cargo.toml +++ b/examples/Cargo.toml @@ -5,4 +5,7 @@ authors = ["salamer "] edition = "2018" [dependencies] -hora = { package = "hora", path = "../../hora", features=["simd"]} \ No newline at end of file +real_hora = { package = "hora", path = "../", features=["simd"]} +hdf5 = {version = "0.7.1"} +rayon = 
"^1.5" +rand = "0.7.3" \ No newline at end of file diff --git a/examples/src/ann_bench.rs b/examples/src/ann_bench.rs new file mode 100644 index 0000000..6dad8cd --- /dev/null +++ b/examples/src/ann_bench.rs @@ -0,0 +1,251 @@ +#![deny(clippy::all)] +use real_hora::core; +use real_hora::core::ann_index::ANNIndex; + +use std::collections::HashSet; + +use std::time::SystemTime; + +struct StatMetrics { + QPS: f64, + Accuracy: usize, + Cost: f64, + BuildCost: f64, + TestSize: usize, +} + +const data_path: &str = + "lastfm-64-dot.hdf5"; +const dimension: usize = 65; +const K: usize = 10; + +pub fn ann_bench() { + let file = hdf5::File::open(&data_path).unwrap(); + let train: Vec> = file + .dataset("train") + .unwrap() + .read_raw::() + .unwrap() + .chunks(dimension) + .map(|s| s.to_vec()) + .collect(); + let test: Vec> = file + .dataset("test") + .unwrap() + .read_raw::() + .unwrap() + .chunks(dimension) + .map(|s| s.to_vec()) + .collect(); + let neighbors: Vec> = file + .dataset("neighbors") + .unwrap() + .read_raw::() + .unwrap() + .chunks(100) + .map(|s| s[..K].iter().cloned().collect::>()) + .collect(); + + println!("train len: {:?}", train.len()); + println!("test len: {:?}", test.len()); + // bench_hnsw(&train, &test, &neighbors); + bench_ssg(&train, &test, &neighbors); + // bench_ivfpq(&train, &test, &neighbors); +} + +fn bench_ssg( + train: &Vec>, + test: &Vec>, + neighbors: &Vec>, +) { + let params_set = vec![ + real_hora::index::ssg_params::SSGParams::::default() + .angle(60.0) + .init_k(20) + .index_size(20) + .neighbor_neighbor_size(30) + .root_size(256), + real_hora::index::ssg_params::SSGParams::default() + .angle(60.0) + .init_k(50) + .index_size(50) + .neighbor_neighbor_size(50) + .root_size(256), + real_hora::index::ssg_params::SSGParams::default() + .angle(60.0) + .init_k(50) + .index_size(50) + .neighbor_neighbor_size(50) + .root_size(256), + ]; + + let mut metrics_stats: Vec = Vec::new(); + for params in params_set.iter() { + println!("start params 
{:?}", params); + let mut ssg_idx = Box::new(real_hora::index::ssg_idx::SSGIndex::::new( + dimension, params, + )); + make_idx_baseline(train, &mut ssg_idx); + metrics_stats.push(bench_calc(ssg_idx, test, neighbors)); + println!("finish params {:?}", params); + } + + for i in 0..metrics_stats.len() { + println!( + "idx ssg params {:?} result {:?}/{:?} {:?}ms qps {:?}", + params_set[i], + metrics_stats[i].Accuracy, + metrics_stats[i].TestSize, + metrics_stats[i].Cost, + metrics_stats[i].QPS, + ); + } +} + +fn bench_hnsw( + train: &Vec>, + test: &Vec>, + neighbors: &Vec>, +) { + let params_set = vec![ + real_hora::index::hnsw_params::HNSWParams::::default() + .max_item(10000000) + .n_neighbor(16) + .n_neighbor0(32) + .ef_build(500) + .ef_search(16) + .has_deletion(false), + real_hora::index::hnsw_params::HNSWParams::::default() + .max_item(10000000) + .n_neighbor(8) + .n_neighbor0(16) + .ef_build(500) + .ef_search(16) + .has_deletion(false), + real_hora::index::hnsw_params::HNSWParams::::default() + .max_item(10000000) + .n_neighbor(16) + .n_neighbor0(32) + .ef_build(500) + .ef_search(16) + .has_deletion(false), + ]; + + let mut metrics_stats: Vec = Vec::new(); + for params in params_set.iter() { + let mut hnsw_idx = Box::new(real_hora::index::hnsw_idx::HNSWIndex::::new( + dimension, params, + )); + make_idx_baseline(train, &mut hnsw_idx); + metrics_stats.push(bench_calc(hnsw_idx, test, neighbors)); + println!("finish params {:?}", params); + } + + for i in 0..metrics_stats.len() { + println!( + "idx hnsw params {:?} result {:?}/{:?} {:?}ms qps {:?}", + params_set[i], + metrics_stats[i].Accuracy, + metrics_stats[i].TestSize, + metrics_stats[i].Cost, + metrics_stats[i].QPS, + ); + } +} + +fn bench_ivfpq( + train: &Vec>, + test: &Vec>, + neighbors: &Vec>, +) { + let params_set = vec![real_hora::index::pq_params::IVFPQParams::::default() + .n_sub(16) + .sub_bits(4) + .n_kmeans_center(256) + .search_n_center(4) + .train_epoch(100)]; + + let mut metrics_stats: Vec = 
Vec::new(); + for params in params_set.iter() { + let mut ivfpq_idx = Box::new(real_hora::index::pq_idx::IVFPQIndex::::new( + dimension, params, + )); + make_idx_baseline(train, &mut ivfpq_idx); + metrics_stats.push(bench_calc(ivfpq_idx, test, neighbors)); + println!("finish params {:?}", params); + } + + for i in 0..metrics_stats.len() { + println!( + "idx ivfpq params {:?} result {:?}/{:?} {:?}ms qps {:?}", + params_set[i], + metrics_stats[i].Accuracy, + metrics_stats[i].TestSize, + metrics_stats[i].Cost, + metrics_stats[i].QPS, + ); + } +} + +fn bench_calc + ?Sized>( + ann_idx: Box, + test: &Vec>, + neighbors: &Vec>, +) -> StatMetrics { + let mut accuracy = 0; + let mut cost = 0.0; + + for idx in 0..test.len() { + let start = SystemTime::now(); + let result = ann_idx.search(test[idx].as_slice(), K); + let since_start = SystemTime::now().duration_since(start).expect("error"); + cost += (since_start.as_micros() as f64) / 1000.0; + let true_set = &neighbors[idx]; + result.iter().for_each(|candidate| { + if true_set.contains(candidate) { + accuracy += 1; + } + }); + } + println!("cost: {:?}", cost); + println!("cost: {:?}", cost); + println!( + "{:?} result {:?}/{:?} {:?}ms qps {:?}", + ann_idx.name(), + accuracy, + test.len() * K, + cost, + 1.0 / (((cost as f32) / 1000.0) / test.len() as f32) + ); + StatMetrics { + QPS: 1.0 / (((cost as f64) / 1000.0) / test.len() as f64), + Accuracy: accuracy, + TestSize: test.len() * K, + Cost: cost, + BuildCost: 0.0, + } +} + +fn make_idx_baseline + ?Sized>( + embs: &Vec>, + idx: &mut Box, +) { + let start = SystemTime::now(); + for i in 0..embs.len() { + idx.add_node(&core::node::Node::::new_with_idx( + embs[i].as_slice(), + i, + )) + .unwrap(); + } + idx.build(core::metrics::Metric::DotProduct).unwrap(); + let since_start = SystemTime::now() + .duration_since(start) + .expect("Time went backwards"); + + println!( + "index {:?} build time {:?} ms", + idx.name(), + since_start.as_millis() as f64 + ); +} diff --git 
a/examples/src/load_dataset.sh b/examples/src/load_dataset.sh new file mode 100644 index 0000000..5c178c0 --- /dev/null +++ b/examples/src/load_dataset.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +#TODO: use real embedding data as test data +wget http://ann-benchmarks.com/lastfm-64-dot.hdf5 \ No newline at end of file diff --git a/examples/src/main.rs b/examples/src/main.rs index e7a11a9..3722b3d 100644 --- a/examples/src/main.rs +++ b/examples/src/main.rs @@ -1,3 +1,5 @@ +mod ann_bench; + fn main() { - println!("Hello, world!"); + ann_bench::ann_bench(); } diff --git a/examples/src/mod.rs b/examples/src/mod.rs new file mode 100644 index 0000000..019a183 --- /dev/null +++ b/examples/src/mod.rs @@ -0,0 +1,2 @@ +pub mod ann_bench; +pub mod bench; From 7a1c280b191d418be28459f6d003de4620e888c9 Mon Sep 17 00:00:00 2001 From: salamer Date: Fri, 25 Jun 2021 02:08:15 +0800 Subject: [PATCH 14/22] chore: clippy code --- src/core/calc.rs | 8 ++++---- src/core/kmeans.rs | 2 +- src/core/knn.rs | 14 +++++++------- src/index/rpt_idx.rs | 26 +++++++++++++------------- src/index/ssg_idx.rs | 6 +++--- 5 files changed, 28 insertions(+), 28 deletions(-) diff --git a/src/core/calc.rs b/src/core/calc.rs index d0f6a31..e00d502 100644 --- a/src/core/calc.rs +++ b/src/core/calc.rs @@ -4,7 +4,7 @@ pub fn get_norm(vec1: &[T]) -> Result where T: FloatElement, { - match dot(&vec1, &vec1) { + match dot(vec1, vec1) { Ok(val) => Ok(val.sqrt()), Err(err) => Err(err), } @@ -14,7 +14,7 @@ pub fn dot(vec1: &[T], vec2: &[T]) -> Result where T: FloatElement, { - T::dot_product(&vec1, &vec2) + T::dot_product(vec1, vec2) } #[inline(always)] @@ -47,7 +47,7 @@ mod tests { use rand::distributions::Standard; use rand::Rng; - use std::time::{Duration, SystemTime, UNIX_EPOCH}; + use std::time::SystemTime; fn make_normal_distribution_clustering( clustering_n: usize, node_n: usize, @@ -135,7 +135,7 @@ mod tests { let base_start = SystemTime::now(); let sumsimd = ns .iter() - .map(|nsx| f32::euclidean_distance(&nsx, 
&nsx).unwrap()) + .map(|nsx| f32::euclidean_distance(nsx, nsx).unwrap()) .sum::(); let base_since_the_epoch = SystemTime::now() .duration_since(base_start) diff --git a/src/core/kmeans.rs b/src/core/kmeans.rs index 18b2308..90947ad 100644 --- a/src/core/kmeans.rs +++ b/src/core/kmeans.rs @@ -277,7 +277,7 @@ pub fn general_kmeans( let mut mean_idx = 0; let mut mean_distance = E::max_value(); nodes.iter().zip(0..nodes.len()).for_each(|(node, i)| { - let distance = node.metric(&mean, mt).unwrap(); + let distance = node.metric(mean, mt).unwrap(); if distance < mean_distance { mean_idx = i; mean_distance = distance; diff --git a/src/core/knn.rs b/src/core/knn.rs index 1ff2c9a..ad0da2b 100644 --- a/src/core/knn.rs +++ b/src/core/knn.rs @@ -167,7 +167,7 @@ impl<'a, E: FloatElement, T: IdxType> NNDescentHandler<'a, E, T> { let mut ccc: usize = 0; for j in 0..nn_new_neighbors.len() { for k in j..nn_new_neighbors.len() { - if self.update(nn_new_neighbors[j], nn_new_neighbors[k], &my_graph) { + if self.update(nn_new_neighbors[j], nn_new_neighbors[k], my_graph) { ccc += 1; } flags.insert(nn_new_neighbors[j] * length + nn_new_neighbors[k]); @@ -177,7 +177,7 @@ impl<'a, E: FloatElement, T: IdxType> NNDescentHandler<'a, E, T> { nn_new_neighbors.iter().for_each(|j| { nn_old_neighbors.iter().for_each(|k| { - if self.update(*j, *k, &my_graph) { + if self.update(*j, *k, my_graph) { ccc += 1; } flags.insert(j * length + k); @@ -193,7 +193,7 @@ impl<'a, E: FloatElement, T: IdxType> NNDescentHandler<'a, E, T> { if self.update( reversed_new_neighbors[j], reversed_new_neighbors[k], - &my_graph, + my_graph, ) { ccc += 1; } @@ -207,7 +207,7 @@ impl<'a, E: FloatElement, T: IdxType> NNDescentHandler<'a, E, T> { } reversed_new_neighbors.iter().for_each(|j| { reversed_old_neighbors.iter().for_each(|k| { - if self.update(*j, *k, &my_graph) { + if self.update(*j, *k, my_graph) { ccc += 1; } flags.insert(j * length + k); @@ -217,7 +217,7 @@ impl<'a, E: FloatElement, T: IdxType> 
NNDescentHandler<'a, E, T> { nn_new_neighbors.iter().for_each(|j| { reversed_old_neighbors.iter().for_each(|k| { - if self.update(*j, *k, &my_graph) { + if self.update(*j, *k, my_graph) { ccc += 1; } flags.insert(j * length + k); @@ -227,7 +227,7 @@ impl<'a, E: FloatElement, T: IdxType> NNDescentHandler<'a, E, T> { nn_new_neighbors.iter().for_each(|j| { reversed_new_neighbors.iter().for_each(|k| { - if self.update(*j, *k, &my_graph) { + if self.update(*j, *k, my_graph) { ccc += 1; } flags.insert(j * length + k); @@ -237,7 +237,7 @@ impl<'a, E: FloatElement, T: IdxType> NNDescentHandler<'a, E, T> { nn_old_neighbors.iter().for_each(|j| { reversed_new_neighbors.iter().for_each(|k| { - if self.update(*j, *k, &my_graph) { + if self.update(*j, *k, my_graph) { ccc += 1; } flags.insert(j * length + k); diff --git a/src/index/rpt_idx.rs b/src/index/rpt_idx.rs index df5ca0c..0b00c13 100644 --- a/src/index/rpt_idx.rs +++ b/src/index/rpt_idx.rs @@ -66,7 +66,7 @@ impl Leaf { } fn normalize(&mut self) { - let norm = calc::get_norm(&self.node.vectors()).unwrap(); + let norm = calc::get_norm(self.node.vectors()).unwrap(); if norm > E::float_zero() { for i in 0..self.node.len() { self.node.mut_vectors()[i] /= norm; @@ -135,15 +135,15 @@ fn two_means( // produce two mean point. 
for _z in 0..ITERATION_STEPS { let rand_k = random::index(count); - let di = ic - * metrics::metric(&first.node.vectors(), &leaves[rand_k].node.vectors(), mt).unwrap(); - let dj = jc - * metrics::metric(&second.node.vectors(), &leaves[rand_k].node.vectors(), mt).unwrap(); + let di = + ic * metrics::metric(first.node.vectors(), leaves[rand_k].node.vectors(), mt).unwrap(); + let dj = + jc * metrics::metric(second.node.vectors(), leaves[rand_k].node.vectors(), mt).unwrap(); // let mut norm = one; if mt == metrics::Metric::CosineSimilarity { - norm = calc::get_norm(&leaves[rand_k].node.vectors()).unwrap(); + norm = calc::get_norm(leaves[rand_k].node.vectors()).unwrap(); match norm.partial_cmp(&zero) { Some(Ordering::Equal) | Some(Ordering::Less) => continue, _ => {} @@ -245,7 +245,7 @@ impl BPTIndex { fn get_distance(&self, i: i32, j: i32) -> E { let ni = self.get_leaf(i).unwrap(); let nj = self.get_leaf(j).unwrap(); - return metrics::metric(&ni.node.vectors(), &nj.node.vectors(), self.mt).unwrap(); + return metrics::metric(ni.node.vectors(), nj.node.vectors(), self.mt).unwrap(); } fn get_tot_items_cnt(&self) -> i32 { @@ -382,7 +382,7 @@ impl BPTIndex { for leaf_idx in indices.iter().skip(1) { let leaf = self.get_leaf(*leaf_idx as i32).unwrap(); - let side = self.side(&new_parent_leaf, &leaf.node.vectors()); + let side = self.side(&new_parent_leaf, leaf.node.vectors()); children_indices[(side as usize)].push(*leaf_idx); } @@ -477,7 +477,7 @@ impl BPTIndex { } else if nd.n_descendants <= self._leaf_max_items { nns.extend_from_slice(&nd.children); // push all of its children } else { - let margin = self.margin(&nd, vectors)?; + let margin = self.margin(nd, vectors)?; // put two children into heap, and use distance to sort the order for poping up. 
heap.push(neighbor::Neighbor { _distance: self.pq_distance(top_distance, margin, 1), @@ -502,7 +502,7 @@ impl BPTIndex { if leaf.n_descendants == 1 { nns_vec.push(neighbor::Neighbor::new( *j as usize, - metrics::metric(&v_leaf.node.vectors(), &leaf.node.vectors(), self.mt).unwrap(), + metrics::metric(v_leaf.node.vectors(), leaf.node.vectors(), self.mt).unwrap(), )) } } @@ -543,11 +543,11 @@ impl BPTIndex { // means same side? fn margin(&self, src: &Leaf, dst: &[E]) -> Result { - calc::dot(&src.node.vectors(), &dst) + calc::dot(src.node.vectors(), dst) } fn side(&self, src: &Leaf, dst: &[E]) -> bool { - match self.margin(&src, &dst) { + match self.margin(src, dst) { Ok(x) => x > E::float_zero(), Err(_e) => random::flip(), } @@ -559,7 +559,7 @@ impl BPTIndex { new_mean_leaf: &mut Leaf, mt: metrics::Metric, ) -> Result<(), &'static str> { - let (p, q) = two_means(&leaves, mt)?; + let (p, q) = two_means(leaves, mt)?; // TODO: remove if new_mean_leaf.node.len() != 0 && new_mean_leaf.node.len() != p.node.len() { diff --git a/src/index/ssg_idx.rs b/src/index/ssg_idx.rs index 77c0119..bc1aef4 100644 --- a/src/index/ssg_idx.rs +++ b/src/index/ssg_idx.rs @@ -82,7 +82,7 @@ impl SSGIndex { } heap.push(neighbor::Neighbor::new( i, - item.metric(&node, self.mt).unwrap(), + item.metric(node, self.mt).unwrap(), )); if heap.len() > self.init_k { heap.pop(); @@ -119,7 +119,7 @@ impl SSGIndex { if *neighbor_id == *nn_id { continue; } - if flags.contains(&nn_id) { + if flags.contains(nn_id) { continue; } flags.insert(*nn_id); @@ -544,7 +544,7 @@ impl ann_index::ANNIndex for SSGI k: usize, args: &arguments::Args, ) -> Vec<(node::Node, E)> { - self.search(&item, k, &args) + self.search(item, k, args) } fn name(&self) -> &'static str { From 096ccdaf28d88a9adb880e550efdc573415a2125 Mon Sep 17 00:00:00 2001 From: salamer Date: Fri, 25 Jun 2021 02:08:32 +0800 Subject: [PATCH 15/22] chore: format code --- examples/src/ann_bench.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff 
--git a/examples/src/ann_bench.rs b/examples/src/ann_bench.rs index 6dad8cd..6320d57 100644 --- a/examples/src/ann_bench.rs +++ b/examples/src/ann_bench.rs @@ -14,8 +14,7 @@ struct StatMetrics { TestSize: usize, } -const data_path: &str = - "lastfm-64-dot.hdf5"; +const data_path: &str = "lastfm-64-dot.hdf5"; const dimension: usize = 65; const K: usize = 10; From 737ab656793238a6070573a92382519711a7900f Mon Sep 17 00:00:00 2001 From: salamer Date: Sun, 27 Jun 2021 13:50:31 +0800 Subject: [PATCH 16/22] chore: update readme --- README.md | 54 ++++++++++++------ ...shion-mnist-784-euclidean_10_euclidean.png | Bin 0 -> 51037 bytes asset/logo.png | Bin 0 -> 14042 bytes 3 files changed, 37 insertions(+), 17 deletions(-) create mode 100644 asset/fashion-mnist-784-euclidean_10_euclidean.png create mode 100644 asset/logo.png diff --git a/README.md b/README.md index 50fbcbc..04fdf15 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,15 @@ +
+ +
# Hora a approximate nearest neighbor search library, written in Rust -## Introduction +# Introduction -Hora, `ほら` in Japanese, sound like `hōlə`, means `You see!` or `Look at that!`. +Hora, `ほら` in Japanese, sound like `[hōlə]`, means `You see!` or `Look at that!`. -## Key Features +# Key Features * **Performant** ⚡️ * SIMD acceleration @@ -36,6 +39,8 @@ Hora, `ほら` in Japanese, sound like `hōlə`, means `You see!` or `Look at th * `Windows`, `Linux` and `OS X` support * `IOS` and `Android` Support (WIP) * thanks for `LLVM`, Hora can be used in `x66` and `ARM` CPUs + * the whole library did not dependent any heavy library, such as `BLAS` + * Hora support some features, such as `SIMD`, `no_std` * **Security** 🔒 * thanks for rust strict compiler @@ -52,17 +57,32 @@ Hora, `ほら` in Japanese, sound like `hōlə`, means `You see!` or `Look at th * `cosine distance` * ![equation](https://latex.codecogs.com/gif.latex?D%28x%2Cy%29%20%3D%20%5Cfrac%7Bx%20*y%7D%7B%7C%7Cx%7C%7C*%7C%7Cy%7C%7C%7D) -* **Light** 💡 - * the whole library did not dependent any heavy library, such as `BLAS` - -* **Configurable** 📕 - * Hora support some features, such as `SIMD`, `no_std` - * **Productive** ⭐ * well documented * elegant and simple API, which is extremely easy to learn -## Installation +# Contents + +- [Hora](#hora) +- [Introduction](#introduction) +- [Key Features](#key-features) +- [Contents](#contents) +- [Installation](#installation) + - [rust](#rust) + - [Python](#python) + - [Building from source](#building-from-source) +- [Benchmark](#benchmark) + - [Interface](#interface) +- [Example](#example) +- [Roadmap](#roadmap) +- [Related Project and Comparison](#related-project-and-comparison) +- [Contribute](#contribute) + - [clone the repo](#clone-the-repo) + - [build](#build) + - [try the changes](#try-the-changes) +- [License](#license) + +# Installation ### rust @@ -86,9 +106,9 @@ $ git clone https://github.com/hora-search/hora $ cargo build ``` -## Benchmark +# Benchmark -pic here + ## 
Interface @@ -220,7 +240,7 @@ pub trait SerializableIndex< -## Example +# Example Rust usage example @@ -234,7 +254,7 @@ Python usage exmaple ``` -## Roadmap +# Roadmap - [ ] Full Coverage Test - [ ] implement a [EFANNA](http://arxiv.org/abs/1609.07228) to achieve faster KNN buiding @@ -242,14 +262,14 @@ Python usage exmaple - [ ] R Support - [ ] mmap file support -## Related Project and Comparison +# Related Project and Comparison * [Faiss](https://github.com/facebookresearch/faiss): Facebook AI Similarity Search, which is the most popular ANN library currently * Diffrences: Faiss more focus on the GPU scene, and Hora is more light than Faiss * Annoy -### Contribute +# Contribute we are pretty gald to have you to participate, any contributions is welcome, including the documentations and tests. you can do the `Pull Requests`, `Issue` on the github, and we will review it as soon as possible. @@ -277,6 +297,6 @@ cd exmaples cargo run ``` -## License +# License The entire repo is under Apache License. 
diff --git a/asset/fashion-mnist-784-euclidean_10_euclidean.png b/asset/fashion-mnist-784-euclidean_10_euclidean.png new file mode 100644 index 0000000000000000000000000000000000000000..58f660950b7b5a6f11b6a52ef36956bc928960e1 GIT binary patch literal 51037 zcmb^Zby!u~7d8xUQo5z2OQb8h9 z&-tC_dB5xX{`%H+dEjvEwbq%gA?A5Y-&4!wo~O5kyA9&5g{P~Ni>H&lCBp+7cMp4)``38{c?7r^>^wbP zJ;Zr=o&Wa%9v638Ue3^EeRv4A>um!M1cJ~4`46Q)D&HP~=;~9Fm(hNhx;5wRucdQw zX?JkI=~*6H1A8$>AG=K`yPPn44~e{6B@KJe3c^i7ij0D=C#QhS4dscEYbFDOT*8vo zE%dCOs^?R;4prPe0)w7+<=vfrb2laLtkJBz+3wTL!+Dp_GtUEkKfOyNk`0df=c2wD zo{!7G^!Lg>4v``L_d+pOm;d+D<7fZpBTP&PyeI$O{EY+i*1sP?5B~2~lg`Y4KN5xD z``@kE#gt^k8l}%zum>eiO7`~}yfzt>NIs?C+^kHVwLku~9u*Vw%adxAP3rtNGmBDt zywv$O?F*a!7g<)_aTWSpwMP0>dS%8Trsdr)xQ#ZpQpbp?s2;D6CD_9Q6qO&~nJ{^zF~ z#I&^dN{M_AcNdl8tt$12NJt*Rq1d+lhSm4* zO#_c?RMI7=c3LhZ>3mj+5v_N4o5H*{$CpAb(X}4$Z@NvGc>C0A?mSrk%qnrRRsDM< zUTW5NHp1UoyqK{GGs)yCeb6btZi&9enh-H@!lrgQPjJ-i%Dc66Ic;sSCr_U2_Vc!w z&oy{WzW3&hqq~ViPfxG*QjhuMV7uena1ILMaMpv+($ey2YHAF;)}l2SB`P|)1C3B( zvGRD{XDv;f_O9%Hce0S9xo`@<*M@e=fyjbC=_Nus`3%Vb*DrlZ9PvLl%UhUk-lVna zig7!ht+pNPf(GArlEWMblN&5SAtso(3mGW_1#1;J{e?CQ$*g2&{^-4tPqzUSBI%F2!f zu3zW(`CBxPQ2t3jU0ugtVlZ9c)FXe+qDzp_RdE~M?;(Y>cwMT zzVi*1!YLM&`kBLa5+=$`wGXxpW3Gsl$x^-#Jo6QFnWq7VtgNimp|kCO!EHC!ASCYn zee2h|8(te>lau--Ms;OEL!Fh5s zbIjqUi7E!TBFE(mT*BuuU4qoHrWcp^nX-=~v<9Q73qsH?p&~w|idl`EPnZQ+{8;U^ z`}_vm-`{@-=Ib5jDKB4*!q5 zA!}mOf{&8#-bYHB8xKC_4Z?0&T3d^}Zu`mn*SBFlO1%1&xEm!ieHWJ}Ei27CpIrVn z`|di7j)`VVkD|izuBzKJ= zHwg&|&G;?^0(0w|nRlX_Pah)iH$MLRoo4TOPAj!mAzakflxCO z!{&a_K*?{3-h6gogvCfh=Lg%f{>uv^1VSzXuToWu+a$E`m*MnV=PSO4-|@}N%o_K{ zjS$L@DzfAvnFIyNU<&;F{11N)h;XLLVTDkbK1AU*tO-`l?|8~6$Kg+yu zH=W2kS90{v$Hcl+;Nm{z~{-s!5e>UK(h$)a>yMdicSft1qi@5$pUb~iJAI8Hrp zUVy-N>2M)PDp}YWCnzXrHsEmX^8BSOG51xhJhhBCVdv@A?(R!VOG~fBJnn@PJLK+o z=c{FOfkO~eQbq*_BR;<3N5jCt5Dtirj^68Gj%Q$DLB|Y5S?P-H2*;%;D*4h)AA}4Y z2*mY!A1j*kHpfe=4~~|jd54PicwuugiHcHs&->UWjbs!z2;bSBdrvW8?17JT^}M@R z{CC?fU5@**5BNn4{(yk+_iJMe*Y9AfaA*yp5=?`wy}ccF;P5*S+Y*E7u7;+nP3zsow#i1H 
z>xqepCEGSZ_r*`Dvpw(T#!DXe99NsekX5DBn3&uNE@)&%rIXHWh1l79rXOrg^8ead zphv<9shR{zlWF{*)CCQAMQc~=6@Is6c?4^8D*=rtQFlCj_s7@QhpTPX-sI<7jpW^} zcU?k#6dIcF>{+PyLI8zUzG`RF;e7P$ta0PvydR~I16knNzFytkyT1=+T$WZ=R$R5= zBE!U^45q#DAhV#L=SvGmnOO@RJgcKXBd5Z1U6YKAY;izj;X^PAn(MbA#*~Ym%VhnC7=}#3phEDDb-~b61ogN zEPwkpR{FzbY;;V_W$=*)9dv%GY2x8VE=`w_0l&t6qyl2&tb0HAWP=x{LJTDef}Nec zHB%TBsIqq|n^4H>C9N$|m0%!epngBP^f)lTKrrq%F zf*EIO7Vd7>EiAWPq7MTbq>6czobVdfU?II-A)YRFI7e{>Hf?uL0<(o6wX=iUfiIb? z`E0%0WRsuBy@BNL?{n|1X5KpMSD2&lm^Mb~nFU;CWMtHD3A}0Qzjq7BltphM>-btN zfSSaQ9%j4YYz#WTEq;Xb?c2A#nk9sUy}uhHR~`&oYg(6qd7{oN&a z0OS3Zvj@L6CkWp^*mielu*z3WL#~>|aQ3Zw_+qTbHwk^$7uuBvsx{^fI?7K9&|Bl&8I<)%#~%)7<`8P(mxhtfV2 zyrwmn;%bsdEWdnj&IcTtL^8)06)n7;$mICcNU!%bKO?Z+HX|ScB09L8B>+JEQpA;A zATx#wdYYP=NP1uxn*8)Bk^jZn;VU6Wa(8$4UBC}L-!vo_n3|e`ef-EPxd6ZiF%hVX zU>#AGx;XXtn0A8#P;cM8kBK;8f>eHW3}O@~A_}%{R85Vb+xe+ar9}rOF$oD%jbW*1 z(0OC+PK8AW((mLoG!kkDEGwe|&yF8|`)21jlYJ{19kKW2rJ1g1MyDmQES80E$}Kmy z%7M4wy~O0?Y=ElvBLvbCKR;f-?4YWmQu5;n0~7Pf{kaC6Y1{cmAJkw`_thvO*(lp| zAB9OjKhB_eh%*qr*}pd(nx+H-C17G;Xw3_luCYfAez^SjDhG`>{GJW`aJ(DlQ;lGd zJpRU<5@=L+zt;X@qqrO$K}=6?w*Ip^A3lQ`JX!CSQ_FqhhLND-xC#)2j_HHbpM$tf zro5`@686;>5FHtxE9Z)Oe@}&BeE1-bBK`a1Bzkmolw%P2I2;5MH}`-=#{T96Zn!LC z;Q5hl%E4H%9xeh&-DI&CA@s14;e@{ub<2KgGnMDEzW*XQnFB}@rI`DhyBSHGdN>Hn zfn=faw=$ua*ofs&LdlA^mnkTs!B6?^KeIT_Jh{vXRyP=k9021;wpAQ$L_7)EnHXs78gth~G&f*O%*{R5M&g%&deQAWUlLGD@e?`7P` zK_p~1csYhU=L#hWIgGLo16*ycd@dh&#cx$&#>*In-0$cJe(+Dnle+rC;;!a zyoJ@?XW=YL$UygJ#E(Cv-Ixh51wLIkWUBRLVX!Z~e9MD7pk;lm_$428?DzrX7BH<8 zm*&D>5(lOy;M4L5A^^3@w@J+lNkI!~U>Q50h5LJu9xx)JF3|&gE44l`^JxBp!T47oqS4PShMtS{S%)nru=BM)#_ENnp z^(cUYcc;dWqk4Q&eHBKf@#rOKAadDFRk5jS&f{O@_iNV{NuEj+Lo%{EB zK97tfq^HMm>Xn&~7HEiiZ9Ml%M#AuLsgcMgR!)`j#gkHPj#9@7eJ*lqYwK1g;T-Rs z`QK-U3v&mJ?j9aMcR9AT8g7b+OypW!z#i}E>uVn^)G94k^z{{guFQITuj*blCjgIvWAc7IFJ?qV#Mu9B6{i~z4c#rO1{sL)OW16w=rF1gByWIWobg%leC%b z`N4es`OpUBk?p`LIBH&+c4g zK2w_m33MrE^Z+DSjJsgjiEl^axd8Tq_iAP!!9*k}hXWPYZZMmgXf7zgf`^WKO>Q-H 
zgK9AIbq~@Gw=aE{S65dN!TbJS%tn%VxK3wWS{Q+l5K8{e9EkeFZ`G9v#<|=qVse?3 z)Qa{s>hl=9&7rK&fy1ie9ghRiV>)&hU~Bc?8#bL3288WDe-OX?PJDmj`SqRdq<8RF zBipc#Bla*GCVHG~Him}w;TU%SyXS3c1wzmo)p17Ss zUS`-;Xe*B|Z){96UFQ@8fJQv{vuzZ~GL(a_M~8B0$~i*0Y0-L#5TuuA)3%|}^;7pzd0(6eEsV7Ox-1Qfqgy+2^v7yb|6AD5mLFe&)p9ky*Eqimgl?|I&U7%hC+Ic+7W@@ z0kkou_t{mW&{TN&((k<$o%9o!fI2%_(s{SkwQJZ24wXy8c7k>zd9hx8h2-3Z+7F!{ zJ$mH2+C!XTHSLE_dsnOwpY0j8)ah=h*EHGUv=?Jp*@V^<8Jn%>al_6`9pRwFPdwnl z>K08YU_EqnbbYfEz_*lb*xtfgW+-5OB)tIf5hs z@X5KG_o4x|T9!feJ)N@G&l3_tklb%XO{#P^z>xZL65uyyN_TE~z`c8Hr)Osg&!3}* z%W7DZ)+L+n#5IecAYBJk5trCFvvVI5*1^&(akaAM!KsxkKQm2DlCt;T8*sL_r)#Tc z7G57a_Pst_>3iOy5R#a|3i-|LJ9k25v!r!($>#%4oEy&$W*)>ZeRSN4s+QtZes>&% z2k8NAC{s4(al@KTZq(HE@&T0{_|@R-~VrGd<3*P)Y=n zfqD9)pBI@`R65THe@qo4y*S$m+5^#O^x+$2dv0chm{!P@?|b{%S7`T5-7o>`TMnk* zL^4ZZm-(1dqq^nlLCFpXYn1^f57?8GA)tD$f5s?p{PASY3N*XVwO3bAb2aVjOth=M zRTm=@%j&7E(E<_0gYR?Esi`zEbT_A`zxH`)9QWUIb#;wRk8UbId41iM516*+)+8}J z3CHlQ6LzdZD1?heAk(GeJs5iB-3_jg0^#g)gRXIdmakSBACxAqrhOKHS+E4YC8)YiR7ZUHcH=W^b>X z9u$}FtuXl~K&A6PpD8OoP6{|vOP9b#jCbAyvLD^pC{ZWbq6rEWa!bQf@Ah$)FXp~9 zZHGM)JNMq(3Umpid_q7k8Dr5Ap(DPW3$Fw*7FfS)J(e#-PL$_dqDH>lXsq4~cYsNb z|8xY63uHvYB_Rp7+a)6cRm7*YBJ^=G9qtJ{G_vaE-7)Orl=EG>L5-{P%wA+?gxA@7B-%qUERc2 zF7HN6ohEv{u6Zx7AFE2=JjFNZh4C=atPRS3=qKHzP@G9UGmw#&_lk5RBNkE*y+qiB z#MaB4x{!loWo2bif<83wyUG2&W#~m4PEMpG?MBoOL7HlV1^)#OU_kLktYY-f1kY3<|5IU8j)o3l1WNx;oc|- z0-3{~pPxTm&waU7hM{#JCG00y`oJBMnQtuDCQ>j7-d@<)t>!!DhoZcv0#8ML9(hjLN~HR5kzZ6Rq$5 zhRx4&t+DB=;KHR9Uk89GY`e-2!w=`5=RdxtxU@S7>mAcO^h1kuKAtqPZWEp zC{C#F%r^lEegX-`jBy5JsyE~4rA)sZ-t#IO_~Eb_@F^WB5re6a+IxvJ6%`eM;#2XQ zyz&qvix6{N3?87CTN}*i2HF-1&bga5AyGVlJqDl zYJ)%#+{YZ2xs|(a_7PHeKiHZ=3hyA+4CUUTK#BkWuK38TbFmf07}Ui`Li)Gg>mXnw zMHZxN1yc?O(hKS@cClW$Lf!p2*O9v-OAxw0fQR#gSj)o3X1TvH7WMeCjHzk*OuC%9 zI+3u`WH9IjIkto8^=s*VMBsg^tJa`Y6CiVW-7@3H%>gHdhg4$jEQYlX;Q+1{ZI3Y2 zIg<3DTp=qf>-v3G%ux!Y!A1~C>Nm^Y4}s`0`(Va-ToQPv!o~R+=V|sv4Dh5Sq_mXC zdS@3DKXU*n5Y&N(lOeOE#)E0cR}ww~^FbE@C5E*)q*qntA!H?_q(lQ}y#&sk6er9+ 
z)*RsXFG`tjtKyz;NZXk6fYcT3Q`?L1Ou|;W(1v!-5X3t#=)GRi}R;8RKUk5 zBq3XOnt6-gk`Bsf`c>B0V2$0?XKI$9Cn1Fb zzWcMHNN6wCEtLgJWWW@C?)0>3cyD)02L=EE(M_Z|LH!6V4#vlC#P#t7j0nS;W`&H zscBn1V`z>710y~+-1U7ZJMNHrN`04lnGUmB3VUjkwWRxcTm;*5$aAQ;=eJ08_D zr|i<^w4*y+#TEV5dFBcH0}8|yMzjo^8tEG3pr^}1tq}k;I^}+Y5;APwTY%g(>Yc|L zGJC*{RbYmy4kHv3?==aPf$YQE^eYkSO|?S~wb>97mmzU4b}fF$$T6?laO%8~<_1L) zw}pVi$Kd$bNy;`&xjP`tZ5>EG+|B$pQ&$eok*!m#yPTwF>h^FRHbFE5xyPX2gSgBL zr3X$>`|o?aU$h^|!)`KlIW#^tIx7QLzRV6 z)#9T0aJiXOrT>xprt^DJQc?`W9`I8=pZknqfbaQ~Qh|+7ekP)hRe<7jRH0VBZvD;L zVLLJy!DUcO$qh?&A*+52s_SD|!f3#wlxD~A65^Y_zbX!{2--b`iW0rkaf$IeP2iRq z1!E3K$qXq4C%k@bKhizC{dqee4Rrqxpu2{t=97YMw`_KC{1RmC;*q+dz%NO9P+Kw! zH+ubYzIl3Has99zc;PK+>0rLsD#9n?ekL?9#?Lz;KL9#uks?r*HivV%nN3~7LXG`1 zqy={O_J|V_5|En~DNq%wxjJzD1GF2U&~9 zsx{=!&dKS75YNR$9zC!}6VSxzXDY~M3?!*5MZNr~9w>qlZp2FM(04Z|WMqk+9X z2BB#tFC%A@g_l=t7KjEDAD=p+!bj=>r0w=i^77j6Sr{ArljN8jo=*Yu(UDXbOV!+E(RDjBZ%do3DNPGG=#zCSO)Rv z&=cuNLS3CDkCAO0se%ywk!22~a?he93z^;Cyx$H#3-E5tQ0D)2gJ2;h{_j?ge>+JK zWg6(fvSItRne**_;)jRBrM($O1Oh5Fkf4*{#4ThV=MG=G1!}e|5Wkxyn162;5&+Vq z7){PCVS?6qzOl{6Ncluf!+*RO#l+pujjl^V@- zZIp6XJim*9nHdcLVSGXH@y0W_KS@>OIwc-J&7rH1d~JudfK2{K+rmP|#ZoqA^n{oY zX5*(q{zWg<*pOapR0L0u>^{j=Igq+%@JUo!_%CEB7SH+A$HA711ayWT(e*fPg zxxelk!5|{`c`smPmL; z`S-ILa@}Cl0vx=*b}gtxb|bAK(Ed)_!3jCjq?NefZ?10qzQ?~?)v{b2o&P?!_Axw2 zLoD%1T&2W~D?OhSaC1p$1*tvNi-%=$WH^&>lzVAy|Gv~+3TdS;=$uKfXo-S*0|O`3 zI}&bDSW{8p;8s>w5Fr+sm7;WuOWQOq)l!Ej=bk*d`q%6z$omNDuk&%}u%Mp{Qt#yt zW38^CA=;CaQx@4ZOCyqV;{J@h2Msw=ohEd2t392}?!J46jPDNU!H zAl7mL^WUeUA$=ruRoVR413PvB8>*`+Y^i+IZtd2xRa`f`=$M07*hu42R1m=@r)O@@ zOn1_+LYfJeHa2oNJnn3KnrKnuc0{Ga`f}-923<{q#~V-hSIJ zYkpUiAiF$H0!NR7(XEgA|6Ce(FJ;3_&Bu<Fy;xcL&{~B;_}zX!5s3p+J`AG)FelvaV<^-Y3HIFe;&kDAEz+rId1pVrE)Q>H_Ry- zgI0(-GT@F}u0Nw|K|N*Uljzc!WX_~-`qXsxf_Ax@GV9B{t9Vfb!midxn`zAb*#)@@ z8=jPfBQjVaWCjtR+IkqOi9~hoCmR}2*icd6{^vwI*Ov=I4P&J*wPapr$ewQ9M25{C z52SPWh$$r!mARijNayM2V<)S(tQ7TXWMbUSAC8iWRbY^!M-b&{XIs5q>`}{V6*)RO zy_G)u1O>Tl32Vr{Jcq@09gY%wT)74=BZ^*03&G`@UUaRo_h<-BYpQEiF&R&uIMnJo 
zq&iR&x#9ouv_lKnOry8K?XeE_Zv~l|A%c~#QiWcZv<+iF!|tsz=3ILb-uhbtT zjDVbAK}S7Uma*i-bHa}3UcPBGk*L1$Rs~OP=wScEmv0^z$_lYU^53zd>LdQl#P82c z-i#)O*~MZGQ{&(NyorJ^sET>UMta^rO_L@_tzq2hmdc1$VSI)_V4)}uD^?QKYtOZE zu%NGT8pIb$q{%Mvu^$Wa4zh{*aj^H?S7$+cD0zM4i>8SNH6{L!{mvFtwCw|f*Gb{b z3OPbC)cF%u&sA;fc8U!QdJ=8&+XW*T^=xL!0_h6W^+S8BK3h|5y1Q4j6kv@VDSJHm zv*g=IyUFW@D6^JnaJa?B6CxNoow_c)dpD`4jlQNUS~bWdA1-`y$K_>=WD#ea144c2 zNSwM@lVaacoyV@4xx=y&`|S@7gJ&DBhNDIesc-7)8YT6yvAeHfF=77MHMOCzYly|j z@-!`E)QRr(2ndfZQh53$gtfPM7A2!=mK6!Ib5J0}HjG?0ymDP=!`*Vtth~`BHr+k; zv@96^gvV^dal6p$`g+k1ot-jep)Ca@5RT1rNvhFM`KAWT4MP&*S(a)S7ZWx3Wu+B* zYgqi4v^gY*DY0ncH!J&sbFCvUR>8+P|L1o^aiVf$k=oKdvFN*s~+6?*lF6v55&)NxFd-mL&?~ra{^n0`P)+MF%aWG5u{o{XX)S=Fr1WYwpKw$)y;uM@ z^ik?Yz83qgsa=6qKzI&c;jERS%ri6AXv2m3eCEH#%==NbkA30y%vQ@I+wFql6K$+z zj>68@<|Tqr#r8u;0wnL1Ka32K+5xAXExvDxqvbg@s$SLK@EWyC{eI(f8`DH>>;yho zUgJz}s>c&hN;&C#`2wP_!lvixCYo68sn<$e%5?ds-ZntBQ$y{r{(By2JWo{N*`rT) zFMh$Zt!S>vj%oPL;X#kwm8q%Q0(uukQ+nEA0k>|AcT*zcd2_KUldJI3GYIJwW?BRzXYv6_ zHD=45%vX(i=h~zCieg&}B&b?_s;SocY2T+A#_Bx^oePA1MRIa5Z<>J^jNZ|S0;PPi8f%=AV)@?^=vam&_E$rwvAW|-nB}$ zb7V#UW)b4BLCc{H51-WNoz}aKkBQEnL?p{FV_6wT&2f@oUnwe~*RT92BS(SxT;R29 zMLi{gufx1mUP|hHz)x=SZ=1dB4M;vvtR+1GC5w*57cj2WzocoS_JA6H+EMV;@(0WH zL>5N%LU}+S9>dasx3k44pTmpWhQ(lRciG+;aS?KRfFD%?mc!T4} zP^(=?^hRB}C{*L74H^>^pYx)_)OTO@T915ZR$x0Y%Z;RC|Vj1 zt1W zB9cXEShfG5r9Jr16T`5QaELiBAHT4#!wT^P zdvY=;HHqkVUlXQu2X}1STiwm>qrSmIEmQ)L^zdCC%2q>ZIhI8?0io(Ppn zZ|8&V!Afdx*z5-SZJq{m;g$SsraD?Rd~LksvcTDuo%jb1Uf}^x!ja%LWICtEtVFrE ziCdaB)XReYgPVL)EY+Z;u;K5JyZ#Ot4e|`FT#~y<%AOMr&TnnZXhS^bow}5Ah=V9& z0U_)6PZdNWHQ0>tJ+^4-dStdS$Z;h<4%FIjuB=#zupjr^eHfvl8t+zaeehiCsQW-c zsV(r8&T+l;i^{qJtQK+#k;&jFZwF?k?4rN@=|#4v&y~TgqsAQEa!d&kw_n=$r>4cG zg@+T}*KeGelhLr*!jM92mCb3p%_f{R3jnyO>qicDq+7TkIS~!v!x1sPWm>_xW%_EE&z^l^$9L7=Qm+6<{2Dj|St`K|6;c zv&n|Qq@-~5^ZZy?I6x>#7JiN^5G z&1+{+vdh!VdaY@&WEK>Ts>yv9#R|E|V}CpMdvG!SM~Q`SVvJc$n)+W)FQoGkO?0l|NRKRSnE6X zyb_bvESVVQnoC0bK!I6EgFTbo+Y||wt2}=*D(c`a63Q_)@8*b?<#FsqWlW5&<4#=$ 
zHy9H!S#v6ruO(rrv#T?p2No_R>%eWfe{&=uhJ$tj_t+b#IYwuvd>=^C8d+GuC`xoKT4P1IyPkH(U z5(n;4&YcKCbTj_D_X<#O%f5I8P@{RPZE7i#v}<#O6>5Jo{r<^=obf3%X@ut7`SR$^ zbi;{JNGB>TLMq}3Y&5#RreV4|+F2tR&cdi)C|>~~@ylCm#;5j`i&b1JZ-%38hKJu4 zOtEn5@CoAJ`j8?fhNZ_rT&kM`2w+!B?L6x6jS@6yA&$RBiZWIYG&DDLm!gOt-@5? z9(4c~KO|m11Fr7#CvgJf@Q}#1Quy#01h3pWaVxPaA2_&uSI_guI|4+e)^E9@T=Pzo zk!4jg{3m^+h)*d4qAtQ#-WH4qX_UxZSvEWA_;`6K6_ zUHe8S--EQ5+HgTae5$(Im;Xih?}&SzU3Y7=YZ*BseL`UVJQZ8YMx9&NrhlyUHn);pv2ISOVXTHY zEuxhjouoWM3K{tUGuy&Dt78#J2+@1JTTrbK%6d0N_rxa&nJjDj{qsmc6`kIY@E|_Hq8ZODoZkO(gu|HexvKNnmtu7Wg&Y}` z*SqY4mTj-ynd3ES!;p>>*?lg^OK3a5x)=Y}Osa;1rz^Plj0!MnjP_rvZLEn98@O{U z;|l9dm#6Hg|N5DtpJjp-q9WFr$9#nGo_4%dVukycd`@%%|XM?Dx?cTpP_IBmw z`mvR-o|&Q>ec!OOmb=@s(qIQUPfl{&Y{*wTWfxPAGO!Btb3$5V+ky+IAFhco6v(e~>`*yRraF>YnX;8lp}> zWtCjnOxX~Ia8w!P+bP z(KUo-N$QDCezbfjnD`%-Rr^fnN8n|sq7}61)kMN=~`1I(Cf8Qoa$H9^HcJ7o`Cr7m%xQEU`hNaD( z`Ky(k6&zIC=$~z%6UuUpuhm7su;2zm??XJaQ#KUD!B=N_$GcoiwUzh9)UJCONzgW< zOu17Fe6e2bH&DNQptxq8{LI-gtlgqO+pbqGp$84|%0fQ*h(<86yhB`a;310mEyN>j z3oK1xxf{-ezu8ZVj!L$0H41lmLs4X;&ZCwYy$ruAVCWr&g8H z^~M-PtL6>Hd0#RvC!q%$M`MUzLK(_XU^eahh>j>Ir>ky%E)tF!Vk1eY$&tn5ErB$( z-an=mA_^%|&krr~7sFL9VC+CgV5wuuU7Kp*G&oYnep;fn-N*TsZ1F56AA!IrAS%FX ztI6YSH>28`ea6nLU-)4w7@>0T(?gu^bN}X$Hb)nljK^Zm5E|m6EW?R|-ScbxJzOW{ zosUY2Gi(jG*vT&Bl~D-Buk}5*N$F0yE5$i8fzF`wBEj%}kl!f_wu~~XXJQ!emupo{ z?-xagHMswz1>7ei(y|=p&#ZZ}8;&bQW;VjIm?RInL=vd*t5aaf%-qUqZmv9@Z}8GEg^iZuzbm zuGf*{6rI~%kj`|6uDRbdyyeU7+usO75*d1j=VlQz!-{1+q^LeRB{^WXEC1R}Gx1Gu zi$Hzfuw>T8IOY9U5h`LBufO*1YGI0P8A)pY&=U+-{ye$Ed3EK{Iy%oQRs9)q!L!Nx zX4QO*Bc7Dc_Lp8|5_TP@_D>p>gaw~1+IoEAaQb9`fq3;7{GdzW&XH-;iou7mZa6un z?~XupM<1-{51S(VhSBKT@T83MTPR9}=AQD}%H{8tS72%c3Lza)?H`-TC*kPUN>`ZS z<{y(kMzv!xqrBA{`#n@UZl~eJwfsp5VfJKMZk@({N`&<*-ldg~Yh{v&nJ)yR!g&+j z&_46GT9~6FTE4iriL-O;r0!9Uo1`i0y+B71a^EvL=ls3B5&N|dQ!^K3WIIs0WSP)E z<&f&*ZX`B}p44f1AbP7BRjlPwBv+dmYg$JzH=M9%viK9~+ND%ZFi4J*nN z@pT6{k1LVzJ+`z{kJ0^2wlwL0AG0N9`0Kht@8KjZcG5lZ_)rX^tQY>7FKQq}4c!sx zYq;6Gc%qykXJCU?_ygq`c`!NEfp=qrwFW1m4)g3%V4q#8Pzo~JiKC1)PJv#Ff7mFK 
zJEu}&rXHqrIbK1H*%fE+n8fcgd9ZRnY{H6XM7L5bcv>8?EIfWJ9go!n-bsxGx-3YECTYoZNfQDO94K_)R3@Fkb zUVeCo8DCxfNNv!4f5%Ce3`%JX6ow8-e-&aXmThF-?8W>25@*G{oBp^cj zT|6_J23~0UdD<6cw46FQIex?Z;vc*hW?$Egy5(uFQ}01N7B5h|2E^T>ct)E6~Av&vdOX!^@6R{gv_JmqqC%lSmX93WRw%Rp*C z9&)EEKvcJ8;wvR9uFIvxih2s-&*cxZ*ICeuU+127So!uwllF7oyQOkKKM{{j9 z*4*Zfv2X?^f@NClenL;T4W29=ldGmmYw9oKn29THP4UuUMPg(^sYNgT<_$}BtW4$q z7UFua%P=($Q1hxg6)Q!8&@jKb0Q6Sh$;lv7Up`=qb{+qHl_WZTF>zhc(2-{6rM^Ep zn}!qqW157lTR0I0^c1p@YCl8pkHjboZbqG;`N`rlrNNTi&hTM2C45HKz(i;|SyJS{ki<`ZNgZI1t`++~>uDvPUccr_X4XHHNeN}UTknGN^}2U_w6(S(z8>dY;C47r!HHc*OMl8&C~=ssg}Ctz2P_{ z5Ub`5Vhhq8iDZVM?_w8u4jH;&M16l1{97RWA)3}spXPs(^~^=esW%XJ4my@yPL-YtJ4QWz-)lT)G6e~d>vvyf*5-M5Wsne$ zp~S1*oT;`YL^jSwMPb2#Uu2VjYzAoG@(K#=a0nI7Cw9W$9I6A(PmiDP-Z`^z-abEJ>7UId}Fgn&bW0t2~L$2_~^8&)|^Dt_tj5(6q?_$VYClBlfCQy?6A) z?;^WHv!V0Kb8{Tu&CTsW9rBw49{w6u+=FJ`Ki$^QVaEptd*0;bp(2`(zFw(uS)hZ? z`tts&KehCz#t6gD4UKB)tfZx*4ie;KVZaDkH?$1wLW>8on?~Bylum_>6k6RY?S`4&cXRUaC_sbwz8k3Wh*oO>XuNv2 z_nLT&K6gU6+|8lqVVagz>)(2Mt<|G*jj(IYU1}H^(ygf2B25GpCc z_;B|7H}q^e9hlYKr-gpwCve*PhD0}Xnny!hkDgu7;hZ~U~#TnBF+GQa^C z=uk}OiR_8!RwUD5gRefQ;$r6F`j~f$W@@729;Iq`<{Ypbi7(Dw#M zoA99d-7v@Z@dhUt7LJ`_u5`ykf9rv#n0)P{{~8ob6^{|E+NXxQ+^VkwJGdjRA`oV) z=a+_)CF|K%*)l`7dGxHQ%uH!tTRk`myXUE;Xx>*FuQTZmO>u+n$8wQBhZwPaS+cRu z*Id8koL1ba&l=(TeeKcKecl_73jEWc`KpwH`AM1ayXAF~D4L=PnS^5K_A<4hFV)2@ zeY<>WTx&Q$FF(4t2R#@}lQ}snp z^VR77irYsITJj=sKQj)YAK8Wa*T@iv=Uh8q+tnw3mB90f@iwi)NIH~&^>3LtB;h#Z ze(o)DBrJC>8r<+wC!?Xk%hi1R{5e_50sIF#d;AJG{xFK{F3$fm<-qA09OC%(pw6UH z5Tmc;<(m$J$nJMBSF&^nx-L|>pN25W7mqy+;Xs2!7^Y1GPXwv)k=Z|S7s%A|U+Mn8EqMctd~7x3QU4fwF6Q?uE53stps}%|x2?k?e2r@?u1f zeQYMKlLU7>b^zfp{?aWUA0N?xc1vC zR#pip9;UyNxFAsq42S|XvQRsYU z6ceL@ZocmEQX@_-*Sr6`kF-SFQs$ZPZEq+NhsuV~8FH%@l4d3lif+A(K6t9()?~os z&I=QQ&Z7vT?9Y-OYoESoL+?HE;x%`{XSV%+DF{{8l^afCmr2O%IIbly#W7vt4-UO{ zUtZe-M)=pJ1xweiP#%f9-SJh@G!I^|u2J`vPTZr!{CyV{u{AIgHJ+@G&LOwwJKe;t zfcTSlAW$3qC==cbO=q(6@DM-g7^CWR4StxjyOJQtffW0ISIzh+SR4^TROtqA{ZY-;$_W>Ye{`@|Di(it3m)Z z4`%u?6<+cY=nWADTaTGP+F0 
z@ZhLNFiBJ4Af9Hn@^uK}?QKTdHR4=$cRm=D;UaM(lz(Ry&5aZ8n(?UP$=AZ0ftWkb?TPRWYm19@?bP*~x}(YtBiaWQ%`>D!iD- zS95T-p59&==>Ghd13?Y#MGbtllxhy`vMaU65RDl#-i^`#7IxQ~U0WK?X~$}pf3=7n z3Sa-%8S&c~7|3RrSWN?OwMPX6&<^%@2SriG8=+EInw`I#Ogdpa-J>YeYQ&_!c4nhJQ_}_z7p>#5Zg7WZ&AX$!=I6K38{}}q8Qf)z6J6%6U-@AT#u8yliGIsO7 zc_v{htkhm1el+*gylC7FXyEeKz9@+t=pTS^gY52xzI};*y~AhnS7sz%N2UzdIW$&i z$PS~d_P(ywrq!6U|9=>J4{)si_ig;v&{QcQp)w+&NysRn5Fw+CsH_NC*{h|Hl~G3a zo)NMlQX-~PcuIs$c>w2B%%hNRwVep0s z%+!?izga%U{gN9u{|xrnJ^wCpr?5EZ!mxstw#%+S89jqssmRtQ{Ux>#ebzB@+WiCz zzeqN`nc5J4jUwu>63Z5vk&*`=bpS}4WL}76o$C4TN<2Yj?86$z^jhoK$De1YZAo&< zc^BNKITb$Yk&%3YQv15i8Azw%e!8;cb}<@?SuW0+5UX6C9|Q-os4^rv0(s#ChE^oUvuxn|H$@9lpoi>aZ zac1c{qVKVPPIsxSzUk2xyDw$f5|*Du6uQrpp>_n4O*2iwk^Kff9B6kYuaOY62= z+T(LSrVDjc{mzwbi<@MK4s%;?d7LII);0x8jTn>>6^c z96i2ea_aqs=`byu0gYE$!t!c8lns{9oJ8#{ChwG*y@{&1ANF zWtEF@>!&J2n|hhOKnGLNQBj^UXy3X}K}8Cm+5awq zywziJE+vlOY{kvva+u-zcah7*eOZ)fmHh5rqT2VdQ&?inn^pd$!(tQvu27@_?f1b= zceV4mDKJWT9(~pgQ5FKh*#}>3(e-=jPw}w! zFzqS>(y6V(4+YZv%P-T9FT*57nWzaOLT;^Wcu%eG<3X8z%GKg`&Vw^LI%*1o+2@#O z{(bPoSPVhUJJ|m;hg=5N+7qk1=|>g&d6}xe*Du4ENIKfSi;nFP6?emnWSr_%xx{$N zD2bsB;?dv@MEW$)6`}Y!_x*CCm9R&d8-DYY_N2uTG2^A-1>A!24Hy(W@(t76OApy^ z+w1FT5w!`_0$rxQx;U?unx5&i=@aevUA3tn+RFMY%f~(@{7_yql zJFA`wX>69KSZ=C<+9M|ytKVjN#Bd25MjCjhE*un8UF)+r1QZbr)e+#IMSArpMTwZPgZ7;B3j!*>d=#}%z|IO*6GOa8s?!Zpxl1vi3D>8MtXgp z?WigL_i~A@vY}F?@Av$D09C53^18-PSjdFzvfmEp{E@J*c-Xr!dLsw9#QwQsryrM} zTnRh!LYYoZOw`B{HqK^}Da9^}s`e z==-rk9G;B6;idc+{LXY!{rmi4j*h0GBWJS?f+OE}vHH~&$_A@{pA-jq)Z%94hd(Sf zu|+Dp*tc(X1EKTsf16BSL+bm&Pw(r_e}{$a#fvI~xV?$lwPnc%u>vR55R86&e0KNV zy)|#&zqiYyT3QUGQ`WCUY$tE_*9sTfW{F*T8WF$|Ip8+g{}_E{Z@FJvZtq{y=x|z0 z7F%FBJ2+U(4i`o!@c&I}?fxKS@FM8JArY=3)wYnDY7?eb1KZY@yC|L4%;ihGXHAs( z_eFgA`nG)x6ZzPmuHXANw!Dbu5$jsyT5nz^9Y=EVXgb| zVy*4^cteZxk*$jpcGxEI&-oE3bbF5icIPkwkdcD`YYJ4Q#H)UFv^;gn}ku*oAWeepzg0f9B{)a?)b{M6}% zYBONg39LbcKHL7MZ3_-?=FHQCgym2c$>!|vghlN9lmy?6ykrZCJ z{6eLxu-It*uR85ryT`}U3&TmeFK=C_5wpCva;b!7RyayIq}+;upd>z&u= 
zwYq@k&%FXQJ2qTSwGQM;jF-&c|CwArX(*23b0q%lKPmY{C;8K-ck>Ae%_ZIs3+Pu8 z;i2OWwwiC>@n+sQY1X~HG0p3&nc0@xBZ7})f*0O?3T5*N3F(U97|cm5VbTMg=}n?i z<@9d9wA&|cWpDb>C~)kA5)~~@%pFwE9ji9_QZzSq`1Raax{RjeDy*MJZ6>3S`}(}| zxU=4DQsyuXED|9UL+{`J_LO@}mtKA;?smuAxnqH2Pkf4r6;4?@R^cqMAhF|T|FAX6 zTTW=U!xYs)xtg%{ud*CU8`|>%CS@-$9;i~ZOOMcG6fgM7mo3vc@o{!QGpW#vxi@D=3RX~4~Hgm#hFUeI%V~fwle}A8gWkKOzF|9c|Ghukq zy+Q7>Z*=kFg|5R@pIgBb@UK7&pFnrdEpO(gOGCD}L#OJ5wk~zOnbI1P=z4(1zTvvf zUQ-O>Cj{L*t!wIZDEa?|1hT&We@LkCw=b9*lP%APF?KV2`)QZgRp2peu3_V$4|c0L zx=Yt3Jm+>V-^X1-3<{%DeM;=Ct5!|qZXaJ?_o%anmebskBUdlwj_J)!_p%y(#gDJI z=l#9fm^>(W$+VP4^Jih4^j@s6xfGw$1Rwlx;~;I}uXl_gLcyI8iFDbOvB%O+ zP2y}1H%gZO<@k{zTWikNvZ0pMH(TA@s~{*aTwZx_Qcj6N<>dtvkHa71KO2i`wbPvG zj$R(}-mqcALocsC76+F0prvkixhE}uMwE;i!oU>=%M+Z`#2crgTB$+#y=tR&T(GylGKGj zvF>dvmqtP&MfqB12_-_^iyMEtpWJ=uCDMzxdX7CZFm&@KuWZ@Q&A-8<%C2m+`t|qk zP2uo*ryM(rVeq!%gK0x0G14uJ$scF^$;JM}#a@#>O~y%oTI|Zd7|Yy{Ww<6*W)Rpp z%;c>vt4Mf!d3Qu&fy%UV`B49zhl#|&7vdq$%zIUoW;5;Qcd=Xt4y-1OE!n$4^-#t@|QQ>o;(e$2wF$KL(wZD8|H!lgksBOHIwYI8{ZdwEEu{{nVuL zIPN*!hEQ3fy4Si@htOg1bxj<*65~?yge;IKUW;$ocMGlT3rZ%YRX_Wp&KR|FQoF=R zTYbm6-Kx~nKGSC#={4Z=u&ihotxwZG!Do`QVd{;?NJls70zx$&5*g&_`3&OqZOr!9 ze(ViDO|9B_kzAsC=gJ%SG#KSfh^1Wtq>~LfS9WrA2=}UEkQi5+V$(VZM(t<8x+JE= ze>oxp0UnK$ee1WLXntwZZVP3=ke_l&9S2(DPDs$|UfL5EwMXH|n#+BiZxSDJ5NE*_ zA2(??3tNwUz`1ol-eHs1xioPX;%wqMM(Y6uX8I&nlen*~`qag%{#I9lBVDR*-`;C`$slY`xyO-qG{UW*TC3FlQhjxa z2MI;LBI^q5>h_k3DK5ERS`+=~Q>;VRyVB;Ozb3c`3_uG>nVCmUaxu7`uOi`yCLvV3 z^x)V@6Dg$zM=IyELNsQs_fOng5{AnFz=F=o|29CZoP{GCfW!0Irmgr zG4b=izw6(yHO6qKkZI4xPtS4{C~asYf|r8HWABh)_xkB^qgK(tL$0-2w+#M5Xy)6e zld}*lam>o@`9TfMf;|8~*7{kGN1Lj!^NC}Ky`qiXWqgs#*ELb6Hrn{rWb3)$?6+?Z zUdkHKTVFFpo%7>bWBs<3auM|X7xnVo-1P1r%FVjDc{16VRg99pkaD~W;mL5B*^%}* z2$pRaVQ)&hb;QtMy>FE4`*sr7{WghGsq{Go|m$)6TB_0y8bvf?V4%r>qt&hl$Nd2+0Brsi7p3vX=n zCva^5ce{&+$Ep)5KZLM*uLZoJc`aPyN|OU+69;Ow6H?Fk7s zjS#mx`Jk^)Zu73~8dZsf7lSyjU%$ebO&3?PDh6EWe_xY_N$M^c{82kscla-I&dJ@`kPC& zPq5mWeGB~uB~Mj9=Q2eR0s(i0B+G{VA8^Dq_w@gui;WACMi+T(`7Y&tTX8AYJ5b1? 
zlaUYR85EeC|Fd&$Ld7kMi!Cl*RAis?-F4MnKReyg*QcGCa8I1%7dr5-Z&mt$SgfcW zMIYZDX!$?+!){9dR27y=_L<=vy2pYFN&W=5ott|pJ8>eIizrG7Y&yw5co((n66~TI zT~9G41rllrrA1cs{GDRvEGy#?kf@8b#o=qd_OCayzdgC&m{h|*=sxBGL93NSP+xw4 zg8aDMAlDj^aSxgN^6ImluN`;e32@YiI`#I0ZN)WFfk_UZW+LY_KC9dBdMLOu{6u-V|E%Xr z8&1vKJCR?V(k6e6jVGP3Ov#&VSl5G-J^C^oD`}YW@{P!-2v3ifH!9m+Gn6!3J+0C| z@rg@&ICWkS2Ml=DtRV4nDPNXgd2zSmQ&W@ut1odj2wds?)ak9R9&*;#xw%=lK_HCH zlmgMwI&ksO6cbZMXI<2~mm2vcae5bs=SV8eYciCU&O?e>nqJVlqkjd9W>xneh_T`i z9_n=^x|_;!&I4&9V%@v`z4e1=x$qAm{Kh9cz(l<3ETy^1t0Kf98Ka;zP~TyEGKcLa zw_bM@--O3fTryAsg!@swhulALO!Kr7$6EXw+s9iV@nhjDiU0AP;u|LhY3|2JzlR=W zz|KZaUzTwE1EME_SWsi6L0o2 zl~hT^E7Lq~+2)Fn2l$t&sLZ7{j#-U9xH@OeZtsxf5CY(Rcl`tUt6E|m$*QNJg5B%w z=XdYUomKbl-3vz}5PBUWsgIYw#Q@p$YfNKMwJba91vTW`qvWJ}+?rcrB09;1d6a|? z&%6#V&il+G@6cl*X8{EgLL+Dj8ak$(C2QCJs_sdt(LW5`OG0%LVu;(RHgY>G#r}G^ zZiuc^JSw*Y%Hbt0jAcNa+Pd>T4Z*uZ*n{@51PdG6bw~;j z!g>(*xtFSu4|$%~*@OERH$h?JHqP8!?=(lb2I4d%741WnRja|h`kaw{ z%w9VP4D@Oe4(ipJ<2?L z8wQo0Q0}Kye2VHGI4%pW(yTmz$E|{co!$A37KwE8>uUx`SskO=xHc7ca(rxbxhauRS1ce7VgP+8U2`u;pd(Eq&U^ z2YY1#ez7&}XFBxFLarWiknnEULMYKgB#Dxd(KvG+;*6$trb}y3tb-F{U`GyZZQHa_ zi{N}^ZwaugLPimy_xI53+snm85hd}s1i~=cW_^r+fnAa^^Dd0gj|k+Qj%pO+vbPmYWxgT6m-Ye-zBFgf!44lA)Fn| zYg7W2GZ)BhMQ9c|Hr7ZOLA|%icIlSKdn(;Nk?DLnGu-HF$JMYq8u@Y|-p+TzoE|`M zxQ~3xJuFbUsi}J9TtJmmxB@iug7=hvmv{6EaoSE7LEvs&+JUCP;l*84@`b%vSx~^2%AujAZhW=8 zrTs~`D2_v%qu1?-qP*}MZ*w5n_8p$ zOZP&)ROzZ;O-CVnb55GJpN}Y+25Z!#vD-t=*$#GtsuE_#+zP$bIsH5G?D_r3&!h;i zi+oVtAI{FpOHGW!K~-1FY8H~uE|5`s|L--s!Z&Zcb9Ly_)Z<-uzekRu&xRzmc|t-Z$?PBt375u>GezzK?DdiO+ z+CehZ>hY%KfR}OhX%)NRt?=*UQ7=R<&U`OxtPU=mP&Nwa@dK>iQ&#qz`)<5c!QgTm z(Pr&c-ZLlk-a$kFfs?xl za$wULrZFbjwc-vlzu!RE$_t%dSBH$u%wG@--HYoDkzflom*vo*BPqpdpJ!=qBO;T3 zo7bynvAp0N+~uF4-oC%BMR^ywSod&9kZk}`7i66&p_%6F<+YWVO2Tj>VS94y*ba~@ ze7A>aZ6}8wVP5PigwwM$PAn}89&sSPJHYelm zL=r|rQ1g0241&Uf;yP3f&q+y%UcI^x#uBFQ8LS8)Y;66M;pM3Ekfi8%BGa2j^Ik~M z#;*Rl!_S2&XmxPYwFWcL^pPuo@J}hvTcE3Ce@mEA$NcWi;HNgIW zcqm@?Ro-NDe)G#CM@Xc7Vxb{{?kZ1mx-^z|a(U7LiU%BVu96Dv=QnoM9~-;90-@sN 
z6Ad0m#crYz-L?ID?Gfp?=Q5Myr4I}|IWeLIm2>V9@$RP*Q7)pL_46NBpr?G7Vhi0_ zl9sHb3%;-cLw=r%PmR792Cz7(rL#UsYkP@CoPz1VaXJ})hh59FJ(Mk1XJ$>lV^DR!r)ZCeE+iU$e_V!#lgjAWvUyOyMk2vJekRVew_9bS0}`6%VQck<9M zE1w^eHq8GuTGW2q6Fo;r7~pWI^;iA3$|oIQRD~HqOe~Rw%JycP@3?>O-nZHq>BGTQ z&_48^wB%rijQ*24wmm0|L+Tus)A_(AnYL_RuPT~DB?_0;-%m?x`$UPRVP4ptA8jrB zE6pSsWY2G9G86CTVXe%tyAugDq-MbYmJdG~@94O4`Cn;Xx(#9-V(p1L!{taZgR3`? z)wN`PzCEBAXIbCC&>H{iR!@+bK}l5EwHJ;=b|(hJAVTGpoXiPRo2&TteJz=6*~VEX zte2E+FaZ7Jc#v;;{AMdzkt7)m!`v)a*BzJF{_w}*a`%1pD^Y!f{?$N3(h;phpK z7iVIG6zX<{+DlP)H6EDT{pDN?=6dM}4XsZB)3)+ex%5<_{O05?E-pJ;SFc(%RkpmF z(_(Mbk+03aZIib-F60xoKSM@YWyDJ zoOmi0qrSzhG5PY{%D_WW(B0E-N@al5uq)L0I-%TcjYMh?^-c;3ScCzO)7-Ch3xq%{ zA=V4Awt<2z%lOMUVZ67iLi$ikd%$b!BS-~j2kh|u|Y0$6{f`G4}|Lzm2vHcF*7_Os-2+cZ) zQ>REUAb^7e1ZFjUV6L12s`mHZXq~dKcxg$YyYHT*nwHjHXqw&3?3RT)7}Vt-VAP$O z{4N=Rp`c|+RxI!|7P#r>_;{)GmeiD)cRk_pkQ9W`#d^YJiEy@oVM{6M4p$gh?0~9w z(qp=VF^U`*A}0CSci!K|Yo*P+bde48xr09yuj(BYFkT$q`Eq*!&!ZuH)}T-a+~|oV zu>%eUCGZpQ2@KR5#$dPrq_gX0rU)r)+(BXq<&WegiKf4Vr3$B7_Gai4_qFGrV6^Sq zH<%4kS_WwSoiQ^ziUvj0#$#^q?05t%S9*T4H7IhRB>42C6*uV{^3HomYnnn)9vhXA zb`^u;9Z3teYnNVJTT8fbKp$TVP>=8)BQ4tf;C{J6&5asz`inj`w}&pFQW~^@6#E%_ z3?-j~aye|k2tPV965%RDDu!4#zK3hvb2CAibYW9|_El0pnjh2i*V0ivracnb>V9n) zs}gg=fBeEh(Qn7w2m6qR#c<_ejzf69pnBg%y0tK8GY6{?!qND0J$)O@LjETAI>MKVa_7#qFuC}GT-z8_=ud#~ zVi;&VE*I$7w9PN%w9QZKxmy3DyIjPlZz2j zZvlVZyApbm4$fLoLIxYNmZ-~MeBqIvo<1PRvs~s#ZE#=fTQXrvQ)(#yRO-Vj**x$- zfFJ_{LfoeARQXR1E*-6EzxB?^&+`?Tb9(w=SgNTPSW`lI_AO!mgq==k09$=AT?f05 zER%kAMiNZ$NY1c^Id0TiKQdWcBZ)9^$IHu$XC1p{DM|`x9BkXUQ|DqFQZAM<>=d}w zU)%2@{GhNWk3jg9Q}Z=F1T+a8g1y|{YXeT;3lyaxC)~&jxlfoY@!8Af@?3hKD13{b zUA4ga2)q*ce~+~9#{Nk9H9TQl@JnLQUfk}AP0vmu6g@o>#d1KS(3Xb~pXV`b{Pu|L zd3yhD_{^aM?CcG5dIv9+VaBvquV~B4${OGE^Rwn`KmP1IwiJ&ox7dVIe^!-Y2;I>Y z4MEJe+bBF?l_rq_3Iw$ary{GeA*xQ!vZ2M~^&^iHU+-OQJ$_DS)i%8~tJeFmyvj=) znfpA!>*cW1v9bDCPo>gG*a5aSqp%$riLKH(D3vdZT#%8$G~tn;6DFRJVC;PHp79I- zzwqqPv7wH_eAV}d4xwQqE-rpfMkeclIJR!4u?g%y&rp4W=fgUqI4+fw^+RuBst1Kla%pHjn}-R|@H%+Fq(*m7CqH 
zWQl!GYzw%D+Ke;T?z#f>&FTTqG`+Z>zr8Uo#`M+aqEhe|4?@`02r~ik&Q6s%who?1P18 zXD04SMkS0*!6bpZAxeduFI6c~Ala<`Y}`+)1kVgY9NlA&2qQ5kUvhtwDfEq_7X4cw z9@+^Tg0lc`w!i!2ht2&O;QU7zs$CHLwFbkxgz_}ugx5Gs*w4Y^0wetH*sS~D7zfWH z;+yT44S9&LAA#3G>b`#^%@rV9G12#2H=#w1L`x{v5=;`IJn85tj9pDs_UPJ*KXh{w zb=|_?4F@(vqU&(rf~}!aBLh|z_DDFufUyn3z@5s|1l)wVm_by)Mwra(t2DcRL;LF0 zt5e`RNPf{zaWEy!l5)_M(RRH?o`tFPN5bz6#&PkB=WyU-@CdFYmt6BGd5*&$3Q8*bVdKJRgUsIpLeN4GQRpend{@-K3g^SU@mfgzFY`i1*@L3)~_P z64w_8Gay^M4p-1ME$z^Qt$RG-BSlU+434fm!-G(j2joQ&UQF;A*a25ZzT{%R6S;^m zy27g4OnaCF?=H*4AK0&qL|!JmM<7S5{B)Pn9?6n2Q(XiGLao+vr%4Vqv*Z+vO%7iz zSwmtq-v*I!PZ%k*Z6>pu`}O`8tPEN>8I?4kZ4VLKQ}E`w%Vdy_V^D<7IV?=ZMbzuH z{^voK*Brv8tDYsPoKZYV$3FE;FZr}jhHr5nG<*pta{D&#-2C5N4Hscc_x7#WU`cr@ zpV}Ooug80c5^N`FnRH0JY<$b4uzRr9G=y-dh4TL;C#yrFUvkYKdV0Qt(EgO&ZK2MJ zte&xlk&%&1gkmyb;*bYD`_8IaLT-7&d&}B&>!8}r#L8M0D}?{keX+vew?+?tCVR7q zhG}?!oyxAZlGD64`Qyv@JRG;Yb8c89(~x|ZaZ=rQps&s@NUjpH|Fqj}4uElkMc<{~ zMg}3?C!DQVb=ZFIJ_nfM|L)ItE67>=c+7sKdw>CxbU?Fd7> zO@|7c{{k?VS5~ejVSx!wHea`%>5q$$V^Rj>H7Ui za9%)PrCj0-_X%zizQT~n$MEVh{pC@sb?S-Jd}XW*)lQpuI<^1NU_*P&9Ulf+{{doxp1gx}B2Tp!yYWnda#*I=utMI6t##%9`RTIurhW&j?Pjq%6!j?fkd2sHF zFH0w;A4#Nw-?h?D1uPTWje2^1{2+`=K<1_0Q?QRBAV3G^ zk};(#0Vp*u?TuD3q}(49R0>QW47ik)%(=bT=@8XqL-<@k0y7;vxCwZ4G3o$DZKmf- z=QUV)FDn*J&Ejh=tS4y9q&A+{t1eNH`!+eQU76ADRrFY*Ai%$;c6}thd)df~0!g4; zsLglwA5{kpd^}krpL%W9t;esPRGy|J5vG4cB}G8X$GcB_zfW@ol0NtH4VNp4#G-*d z-f>8Bwa!~lRqPz%z&3(6L=a~}I{*P{a%D(BJrCWb}U<)HiciTcn zcxnCBYoCM_!*w`X!8nkfTm6+*sKEsCIvTE}IEHsTBO~J{j>9s+KLr4RsIUp+KvFTf zLZ4xB0`%23V~xB{a)t-_YG54KH(KcldLXY~p=UyPjS;2`geiu%+7H8@Wi*=Ke9khS zhQ$++1%GwDCp^4}UePb&z%B7BDXEPlEnHK=W&!U`IGu7{N-yqj(wU!`0Nr^4PG;_K zWfYwIS<2gX{n|D9)a#0xz-e5UGu8kX?`%B-Kc-sgL%A_431Ei9a=||OQ<&H9GWUZw zr8Nth^9qw8+h}iUs^NcWkO&swjo;w-GXd4NFbe9oPa-BW!xk9y2%2iXmcgni5^%tS ziOH6IYY2BTR9CA=@N~NaU;~*_E%a81=r~L+j;a)L*=ceY00IfaQ`k3sv24-+`36JP zXv3t4H7c~ zv`caEDM5&(PvP^SR7h@vff+>h`+xrueA5%&M`TPlx5UpK=wZN6^%Ix)a#ZJ8qcbXY zyHFI=dsO-ACZ%AK{TBX$WTQnw^z#ibVygVhr 
z#UiM-7LMmLL(M&V`iuT?!iKUn0FW>|d~cjr?*{I#6P&oxXFcgpYimc#%%V0&?EEQa4_`VZ1%s0ZW~Il}8Da zT&MX#O@i+M9i>W1L*x4NDJ?Gf`;~*3@~gXtLdz4khby+uKdjY1eCY5t6W0gVUdr~? zpW6oQtC1H2CD?F;=LoH|coTR|ccln-hmKiSt3hl=Ko5)$^HD>>nM3(LseR4R9KqoY~io}7Q& zziJ248N82dh}s$5(7V8;_6`n5%(~B^lUtwE$R2b|Zwnm4x9!<8`tOjAJJA;f44|BY zPinR#v&6SqargAk_t$zGaxBek1g>Jfz|J5}h4MQ-o}F0l&yJ>b!ed(w=#XqRIf+E5 z`NP31ID9?(w1Xp9r7Y!Mtf9{^nS?VP;VDQk5rngO++^YSr;|qjab6m?(}B5yomY@$ z8@j#DAl-1CniP6Q_HEcFW_eo>UdFJD+6+r;7^#)%GCn_n@&;!NNvJpYEj{TtaeAC$ zH?J0$c<{}Nq&TN@YNHdbW`2B`ipxRHAxCDy3>KznlXu-3ZDF+WO!fTwz*T9 z9t}QmEu2M_lZ8AQylTBUl_kcvesSgoWw1~*xR;98fX<$PhYaX7OV-$fmZKdSD2I}3 zZ!JDoQ7@3@Om;~|??|(olgH1tl6cS47X;5#b>5121u7>6CL=^bJiq_R{FKaz_Gr{C zDmVBRpPHCe`DgaP72De%K14DVUX%%Ny5ID(@X>~`J8YPh_gnIM05i4D@F#^&zd{sx%{+VA4Xpw=&vMGjy0k_sw1u9yQ7eWDpdRdM z96kk?MxG*Afy9BEORrT;KOy_Y#Ul8aw9;d(3cznewk+H9_cR zKeru7;Mm&7i7HG=S~_#<;wLB3-)^ZrrY0}{EbzXat--bSJllPo=Z|a=j(r^;9$)Vj zmln5ba9u!Nqgk^;B8xAJ`ODn;E8>l!N%P|#axiAAuim<8Qym=L;zB}dE5}%F0J+Qz zYW__m@_>NdWHS8Vj-&L3?W2@0ir7RHW-V!7VQdDM4u$W{paEfl$cvv=J0#x@B5se}Q{w&052pZD6>+76(Q^o<53CV^LT zd;FMdVj5iA#dGH#qC$;Ep2Uvw57Hurm1c8Mf&|QX0PBh}2w2GBZ;bQ$_d9?k_KgPh zaX?T|9R_5%EC)nHG?Bs^;F`qncemv(S9hl1s{Kg~23xbO71B;Dj0Yt3jDe0&N)l?edF zL6H47MzE67)8(+ya?ok9gi9-o$?R+~OZ5_WXLip* zwqX!)R0rprNtlSW+|1_qfLil>)pH>=#3)Y7R2X+*K&BADD-C6s6t(n+XO~ch3YdPs z2Q0zbf9+QtiV@tjwoTDkA_s(puP`#|G##XN7r$}iMr*!R8fcnuB=T81oB@)S3E`9f zJ6-K$0se3HYnb%yq2>sY`Th$*N3K5rlN5ulf_}O8zScs!9QdfbgukcI@6Tt5cO#sM zGFy_t`^|S0>3Gr*=ebdV1i^bg61_xKH1T~{7zC~5P{|gZ+_rc%dG7yZu!>(uO5+&t zSD~61t4us;7R}PNiAitwk!$zpP8f5Fh=@=y3SLS`NPsJ{J7j1QA-M6Oy7~g}VIy2u zrH3{^vLqSzy5jx&v(SXWzORMn8P&FJd+}VEPyr?Eh8lFlQm5w8zfi_HnD4FS46n3= z_Y_h}9lBMV9# zMNNkZ|7dsd8vlUMmYck@U%e24l5jVqWMuqVV2pI0c(?bS3_V?4QK015;g`G6L0mr6 zKLd>^!GeN<^;quhaE6*eyN_9N;A>G4EUlB!A~l4coob#%f*^x><0WcWcS7huA1j>> z-s&I#<3agI`J!!%p5_ z)qf{N^Uxx&nEH8jb`ZcR;HiT-_&_w6L?6@fD9Fiq)A1OvNKWpQMjRr2h%tURQEB_(fS}~@O-~{d(?*$@QA&EIWL2z z)KGhS`=MiQD8!N=)szIzrN_|gwtctLL`g 
zZ$_!787W3A2FJNyXYno0p*hv_3w`iGwR{x!Tw@H%OUjbwYl*9HN=C=5L{H+S zC;Pbd$cHZ*hc8;(FdwW83~$I!xZ=pTcSRHma|J8z0U zZ+ZcOSAde|Q7@tS^5Vf3hD2;!C5}oHrG_Wd5UtVMvwragzq-rhE`D~8-N%KChp*jR z80w5@Vq&Uq`}*})%PyUtzb82-F8Zk`>Pl*R@>bFtaa(>dYmS@?ncMzeut0DqOl7Bl zMPE29`iSpPQCSJrc0L+>B*(se5nuj*<*3A_LU*|0!-pF9A1AU@_Qall7KwPJ0>W$Z zI#`iLI7tpkSE1sK2j}m9iOUirE$jymN`wA@OLqS?5ym}x(qJ2$h+6dMGY*xlgu65n zZym~Czp$`0^z)R7ZcCUGyzGC&Q?!rRR*G9s6*fjZSle?rlkbt*Tm{#Fxq5#E@rY4+fAOkD6<3Gx6AcjVa$w$Eojbq^1x zV0J+cVFSMD3aGYuV6uhf6aVE8ypB}?Rbe%zh>zBQ{$!lCyQin}wQG@B+(shGImQ|w z2gG2ECWlXg?-dDB-VnBylZI%gTflW~5G+JqPk`Fc`>w7o#E};VxliT?Vc!m-r+E?E zK&i;l;c|*{7Oa;Y7a*bq8nrM6kud?Lcm~bs5rnTJDBTDq0|twe{mURb)bDxA!KD^ zY9T2q=RGqsJ6JVmpOO$s|ASn2ntVEdUyc3nQ8{5UVc9Urd#z;=F?v|_2DQ zUm>0rpYZb+bs@#FO?Ih@Yvi}Tk>66qj{Vq(4n&Cm;!bu}9jraiEbX|Ul)A>F>!)rb zFjo-2dF;kbn?ilw>%bnc9{d=}jE+9`#aAUY+$puABH|E=z?q@UYGT?HNWzcrFG9KU zWj;Nr^XChO$MZGpa6HL#E2z)GK@t1PU)=QnqwwCHGeF_}O&c5hprsdy5hE z=c`uPTh__QySZ&NtFVfgG<(H917@v0N#&4^wl+Hq_F$*z-Ey-LOVK|tFb>_`lgv;l z!-FSi;tPMA{YXb48-U-R*1QSto7sOEPCSdwHoAUXUoHe~-Cn;@iQ%rF7psXdgqD1$ zu!E}QK>dq7f4lbljn6V{QeG~S445utUXEav@DsG|6!mwij+I%*b9-o3N`~w5c1ho= z{V}nOwaqU^4eMg>DoutWi_~HwDQRfjAQLGo9&te8{jG;6;$|iWd2ZCQOixYu@pZ(@ zGtYykQlj2Wd|v0Ek6jsY_tebI5`T~X84*gjoY|O|{@J!{aO0*FIL@X_jtE_j`D)^a zq#BRxv5|s830WMMIi2DIa33hh&bBJGg@KYtwa;V^RQPDco*tYO!t_Q(| zr2vLcl&A)gspCpYL?soL0BcuC{;}t!;=o)=@Ulzjg3WKhGmWo{MV~i|&6;(U`I0-P zvyD>(LIpmE2w&kH^p_5mPF4=aHgYNpJMs04SpzDR644~;LklB`VK=f`Qx~o(MTdtM zcm9DFJBlM|fQrumDHdbhl7*l;AtaKnIgXy10nh*)@6D%<(>*lAP1xf&6@h&D8)GMJ zX@ptkVkAV*N?unt7Q99*W*++CNQmh&y`*kNLnhZh?0KLycwo_32uojImM0&g!H38^ z7rYEUCw0u3xWuG-u`3e;2j&vuI3}*R_uP4~`LaTcjpI~jQ03OZ(+!~1h)yUcxh>Ay2QXcNLh)$;S!zNBoEKF}BNtzS*-T?l@d0?!5 zBi0$fI+LhJ5!0!5?@l683@>l?$I*5`MzGs)XZJWLGBVQizJ_*VhhxXJ7y4gbnX0C1 zxsY$6*B9w{WqMua+0n8ETV!`zcyVJba`PB}X?7Ik=9Yy+C-&&1&mA*OOb#qHKdk};z&`XF%J<*5WINBNB4lOO%WT8Hki&s*S8H>7si04n( zE^zGKdkT5FEz~M29KsOEeA8*{Ur4cj4;TR6o1bRGZ>R1;mQ&Cmb0Gw!5HU~aG0f}5L 
zl4VBd54VQBs6CR~gsaHhX|!5`U^$8puI&_7rF#)S-??*V3ZAB55-JL+m{PjC|NPT%w&&KOMB#znO9Toe|EB|sfth3w zhl?L?+Vu{#d>xX9e^5{&wq_sN76jg(o0}uH8)la>-Y%R*ofgDr8i^&{Jvy3(oI^o- z_!Jo#8Sz|HTv;$&1{{n2I7$KBj(w?w?2Az&Ra~Rb@a2vK%$M>-{k4;kk=w+?WY!Md z?o@B@ZP?9y=uN!TY(kl|J_0X-MrCmk3PFqyIVlZD|@Pa)DU|4oM=>JA8P`5-Q?! z1)u5b1Zb~8eCJ7XaQ?(EK_?G4Ibt3x4;|_Tkl0l)cJ;MDX($9g-T}y(qSS~6iqz57 zO++xcLQRAYTa0V)UUfU3#uu=bh!>o_TKxVMmKe zK6(B;75_2IQ^fTs+P$p}BUlE&&2FBajR1*MbaV!qwgKk;Y-PA`rvUuwF!PHWcz~$P zh#QF+zo>PL*5P0NO!U>9fo79DXdhGv{^e}0r zh9a5(QN#*Hj8u{2J$BZxP}s;qkA=V2uQbM!f=cI`^QyoChb2rZnj%jjvrUVCn>^e``1;0TJIJ7(LIG3{ zXl;QJL&;iL^IE`eezq4gGBT){<%#_b-|j)QYco_+;0qn>@W4BYAY&R*)Knmb8iKu) zg?RyRowbf=K}@5+BX`4!JkdZG{YLibaF3dZJ+YGfOH z)b{=U{n~N!ABBEm@rPYcWRBnlg5gQR39@$E!4!~iMf1u8y%;MSnz&VXD*SC0u|P<# zk}BPq3}rnziZ;o$0BFYPJ#XGxti?>3gSbRc(o~yV8^1a+-YV+*8&GHf zlmXXx2+Oncu?K~rmyCyV>nG;D;cpEGbIr#rYOV0B0}p=Y<4KoOdeZC`H!bP!#YPV% zy`AExlx|sjF%i!q$sLKAWUr`vk`i7Ve!RFro@x=wePaov5%<+j*>_=+t6e1f;x#Ci z1dEq9XR4iT-n&_9wK{|NQkIv!Hum$?CF*(dV{USx`~}Z=7GF1>&#U;<&UsyGpJTdI zJ1=tz9ML`h=sIPl?uWXM7e9Za1wU`po~MM~cGUa?sEahm`59sZVcIlpxnEIl!AM5N zdPVKB?E{04osoT*O{@dzVv$69=QHyDUW|hmA{~Hej$7%E@LWz{3*&$FJGuPYfj@EI z`s8=g3ni9Ze=k*-_OtoOP92>D{9@cZg`a=2nvC1@6+6M!f{107w8gzAigf+yu^uT| z*+ycJzF~UlyVOWku~&RB?VD$N=W*ZL43JGG$HVN$ z9-%`Pi^c|KACwXPGs7}3ae}$8!@N`$q^{A1xB;BGL@ECr{W~whbt(N6h}C+atvG~_ zeoT@NKur!kEFuWEi3~nI(eW9iL{D#TZ-!mkFg-UhHiY)gae^}-Iyh*FGfN6s{|v`$ zP||XTi|3h&qF+JqaUYYL$pc?*2NwkM@(9$9Hlnd#_V#VL_R$2?H3Kb~afL%Z(L`?1 z&Abc*B$sxj&@U*+KtwOIg?I~Omjq;H?A0gpbGYok(RLny+q-v8PR_z)P7#rM_Bj0Z zDJUrL^Y#wM#Fk_&V%JNv9w{NrKcp5+s8)BzJ%?53Sp<_tRBAk?-^F;i7{4LVDS!g~ zx8j~T^?n)~{nE0K$9}M~__)&bmJA&N8l(NGlcnk(9Grwm+E#rQ=jZ;?Z`&4*#MZ32 zZtdC+U%wW;wUPcMEQumY8NLA2A@{J6iUaPcTX!MZiqyo`rCL zUINI0D$XRf$ucoVP=?3M7Gs%Ql)CLuK~{i}<7>3m2-Jg+W z!;z1~m3XG;Z!=`CV-r-BSfu4r`Fa#XO%3?6YCm$px?-ihU+6_~?$?}l&R03@ipFug%%Bv53D7Z$U1in?^?e5pND@@s?Bwv{ zrMyMr&_~MW;NYm$uxi(_@KgQJxl8=@>(|)-+?Mtjqv0{APyx>aK;R!5n)2{!V+bd;njD1`L>PC9rrayVscoj8bLa`mEN6eQ;0c_0ILXYs_(nXkZ<8;(J>`+^e8RSyVJ#A<9u?+I 
z2hnP1LPmZG*wEA69gm`F0Q2a_#+KU4%=|OW)6(2uH`7fO((k$mOcrQ?ZZ8^TsY7jx ze^P8>ZD&`heY79LVu>ng&(Zd*@sZoqe3Lg+&_-eZ6UmN_4wb^!qT)3N(+VPqLfg^N z(YD85CDnRZ^Y2WP8|z{PS4?3YR5iAF(?xmZM*p<&--@8SG3}82`?p2Yy}N3d(+5OP z{Y?ICCx_b=$7vm+0YkhHVr<$}8I}0rcT|tPbD?kJ=FN&2UQF@s#_4O*P>vLOW(3ksVf`0M|EdHMU^sl>gB@vP3ZEb2a z-0HWKsTY52z_!)Zah`m0*Uxh@=g-$;TPj_;)aN0%_{(|$PfnrL9*Pk{ZjJ|8S;Y}O z8$d3{q9t-*F95?d(PpbCS-P$V4@a%%xrbW~(rD(-=%@$h(rZY4WEYj`yd_T%(R|E{ zMGh&8Mx4^%p;VQ_K~BzkM4|{ZY|5{_IOpq0&mS2Q60#fyP8k(@Fx685YW+WxaseuL z$ul%wp^I5@@=$dj->F=rj%D_39+P%}h?z_58&+bq65TTX+1r<6H?ONBxnn;q4f(58 zU?Yjz9ZPQP*RKPn1*rOFke8A;X&1i}NxiaD$Sqkp39PpigNXf9`bS-J|9gw!f#f_0 z;M9(z9~i~)oX`58tspDwiIXhJNQNppyh|_e%6oZ@1C%5pjO5+wyhBbw7fk?MbWBYO zCjyp76iCHu@eLxQq85iUup^1#5A=JFBe)PN1{IvCgv9czIwPC2pkV4JBHO^q4d~4= zR$3To_if;izMWlT>THG&$aSS)%B^>gZC`|x>*DO}Mh!(qjdRI1k;rkAbdaJF>GWM|?`Q8CubFq= z@Av!u`TF_!Flh4*$P%1y-yxZ@Ej&+PY=N12>RtLV1(Y9_AHJNX6%283Cp zO=sum2Q3Tn*IztJsseLL_w_xmtZITnCa;W##Qc<=s+jBAFO+EqvFM**O=wl=LuHk;-zFTw)< z(O<)2nIz5*-tPQ`3mQmFQL+N&=?&TwFr$D4^6?i;@=`y1MEvSz7PhohcsH^cqrg|+ z8LfQeB8rNMIfAUerki&PLfl9cgqmM|mknfzF4l^g9pY8E_h{&k2;aVX_3He}y0eY8 zYFZO#9SUlWg+M4;Z-8vuMjdXP?s(IlystI%L*e(2V;x25g>XiL7hrJq)wT_9>y8sm z0L7=-Gv&KiNDX?J+rsRK~kXI5Y}UJ@ER&#D1AYt zv%!IdxS5djxpax*Ths_j5aQ_`xPAm2>uRJV4sLGong(vQMaN`M!1yJq47NZB zK^UiH_>jXu@veDgaYb;(5X~H>=!RNaIP>JEn>cO{8^IMk$T=~+<=G&DctKx}<{)k4 zqZ}VFTh2wIXRs*N0e)QrZiNFHiqns9tDb{AcM%>9!*(L#2iESh)#%g`OLRkJXZQ`A zmsLjZX%;Rw7e-jKm93SHLa4H|q0!K^kDmkUv~#uc$5BEu6b0-bmP+l*d`~eyBfo$rO{u>PQqZf6-lNmAW zX$*(!W+`{N`PzXyb`*L}EcgbFQP=$RgHyV*fiKLTFM=CmlU2W7Pmu74%(60n+Pm1; zM2%;h?Ybdm-##&dd(?b}Xv?w51ZThd?CUc+Y~I)PXiTW`kZ9P9H?ryj<5>7G$F4N) ziwftlmR3CqE!a0zjcaXNtpUD4V@nGizjPCy{=tt6zTsoqLph?%o>}Z4te|a1@pkFv zc0UdkvQOfe&^3Y#!&Qto<(7)!llC0>pQplT-=ADmJ}s{%$~RP|F8^7D&iakRw0q@@ zVCU22E1#z2T+zG*F=z(SC#6T>W%hNH6U~@U_;BvKA9|W9-_3JoJxX`gGYxVcEQvQV zRpIF?Z%~ z@V3>e-Xvr8d@k-lp2mUzLds zYog-$dyZVWGL{p&X%oyfDlAz(R!ZAogzoI5cR%FgrA7uNR8{x%6j#+cRUDe)u|_?? 
zMPVQ2lFCCb&aZo1AUd>?jw9ls^BEyWF;VJC@|DO4e{$xjgXLrmy_;Q_<~Rc22}Q89 zlKIvIb$sk_%_q_{L7P`8k|GKKh^U2~AQfJYa%)hO!gt9>l9yESE>R5VB5iTrIhn^M zBr2drUA}S?HY~A(qqwj)J_yB&GvMl%fZhPs7JTZN5+g2`D<~01pbD_&+PLg0AYEe` zv*z2|JMj5L&>EE@zlk`g-yu7Mw^Im;h1&RiUT16G{7T1kmsFsGE6{?73ha|m)p}D~ zW}myRegTrH_A}VOst*SbzlqXJ#=9T{6ksxB)SuDo1 z?&ZttdH>?Qm>_x5e@La@r{Sw}dR?9(_-4R>3)y}9q^IG!8fZ|qtaQE1vKanWf$Opz z!NG**UNJbZe_h)}QL-?ANm!d!Y7v2e0qoQil=ANFM*WMx7_=9dG+s+)&;2LH zhTAeO(N91XUbPBU!Rf6pu$>E#@eyQ8Faaf@;Kx9QV-EnHtO8k)Ev2PfQ)LU+Zw{}s zL#6hXReZ`kn}Pz%D?JK51MR4aN_?Z{JA31;(f5TA5vo5Cz+-sBOlJSUxSg}S=BBEp z%Pu6AnDy*c@%a$ld0v`_hZ>%hVW~dG_cb!K5b{~>`l32V*kFtQ z-DHGZKPGz*zaGGuGvjAuB;HB^)({oO?%#t=vezD8HT}IcR(4VnQ{i4MH(Z9J*s2nS zD5IJ`KO`NBbiJ?eC>vRQ)Q@u}>xX`n+5*#&laB4tE`_B`GiklE$H~oz;VHteZV$C@ zZ?WvX7{5=^m0-`hB+Hv+Ss!j(ZV?d;q2W=842tc9s}v*8h*(2pc;Sk8s@oyufGt{b z*vG;mv!X&vySFZKXW6vr?`f<~DQuDUg6+!2@vp7X!VZxhFt@{?SphAPT|ySe=}wqn z2aYOu2NO)??!^40^pxTzODAd_WR!8m--}~hgvXEB4f?l`wR>FnZ0Q==6EF|9u6$A2 z@LAsoqp6EnTT{d^`BP1u9~Bpy9*SQ2$BmgaM(IF=Gq&Mcpf8=HQJs(^XRk4$eQZEt zkLrPlv*I-}8O8XHLNy^A#;)R_G_TYCc2q|~NnZ^7j$MO{Sd#+e534y0Z`{vTZ1F;U z!ipHq=*q}>QMqf6|K=?m!^R2uA!gUTxFSePX_cLRsfYoi6fvx7VD})ln<**mJN>(l zk9xk&vGwaMt(f#srL}MjlH~CU?||%OhjkBkXQAjY0qNnAzY1zN&1|BPI>PI^h~qR) zbE)k?f!|PRp~6dooP_Mhe0!JKDhAjF5Emhmn`HDX?37Ct2~BTFP~%R?ai^qO)xol8 zO|i;f$WE``u(DRca`#Kl5dh9b%-B{ z2>+`?Il^`2msN5aZCe7W-N_8Q(y4cDTYlY?k>}Ri$w`*Fusg=zTO*8hSFB%`{aEkq zHG}A&jhQAxwoGs=r#_AK8HZ4-N@!0*#g@m}*Zg9HlRzee^7w4-2Qh zT2VHC^SDK6x?^mHPPqx-Eta(rd zy>nRfuYo7VvI!77=q?FWP2Nf0Ad67!+hW?c%NiQ`+4k(JH?DCn8iKPl{xUCfXX~4B zGUlS_w!0m>Su1@Ah=en6DmltJ zIcrPjX4D=65NXf$Yt^yu5L|Xb6e?%>_cp3EDTd@@7=tc@=(EC8Io!w zD6t0YPkC4qsCKTe-)BGGya;62B%*kMTpI+DvHoloe*8D(o1UECPAn!3urmNbLZsK8 z)|tzs^|I<*zqtTiDLf~}0{EO!+2@lqVN!V;;C$M_kbYIXcySbcTjDz(_r4vURK5w6 z!ZS~H4CANRYEvGD-a@~lgy|Bc=~slRP144?(D(~S4=82eNdQo!wV04g3G z7gyU9XTDw#WJgs`${Rq_D}{nX16)L4+BN8hyVNbu`_@P`aMP*gh6*t~8DMoV`I;ib zB7136R+-?V#|J@36qBX~{Z1A@1w>FM(N^_s`}5=Q4kFY`c4`?~VpMX&L_MSNS2u3l 
zaNIq7S`>&+AZUBPt$RqU74SQ&o<1|bG{sbcW{2ru&Ur&Oii~Kl+m`kKk|?6Vu%G{` z1b5;`f}RAQ`=~cJ%b++q_@A4VSm55i`e)o9xqclkLmLt8^zouq#zSw{BX%q;z@7Mv#)Ouj< z!ExGy#ZiUfjX3%cU9^ar3MNSg>bv}^yqX@C{%hiHfTadiRU%3I*KgdIYpTy1y=zw^ zBsLIxxH==`XutcovtylaS*8WV6n;+MCrku5iXI-nj*$q4PhESAI<5_j95g&o+5^%O zOQ^s7|2aG1C+BHNx)5W5v`@)#`@Z1d;4|y$(~*|%ZHcGDN5$f+ z4kvm9^S$EM)s%2?%jODiqF{h3N%fFij-k}2t$%3EA~t7TH0!iAem+d6(E%6O*gj%hM-y7myUcsKej$3G;~$fhWzDt-@Qfqjcg3P zihifk85Z4C!;!CYO-vYL-<%S1@#4jtiPL#sN4HeA_+~#mg4nz) zjK?L`4d@2*xIYR~CzhK{o}A6sT9f+4@roN2TMtOr@%U3uITKmU7+4Fw)ss06oei2X z#P^^o@j;b83%k>ONZWltn8;cTILYz=7^l=K>J&h_GF0k(ppZI6PhTblNTcI93hYwMpMV$tc(R^{(XDmO!`8ZYAmzF-^v@)C_W zn!(Py2id<5+@@iTrts*hHF5Le&C2K;D)YSr%jFdG#YLrPf^Z*5%F2>)j3Y>eJ`ieB z(}U|H3!1?SDuV;{YK1nz$N0c?P^V4k9cu5u&hwww*RR|NeycBNsF`qTWoFZ+`C_$C zsL_-@7reP?SpBrT4)>JNDKP@}5Umn-qt$B(G)Foesum>6gFJHX{z<5S&f%ff{0mgw zq^l7xV$utX{W~(YwA;W;&eWf4oxMqd!nhHdXgFGas#jF(-@jxQ1_`Jm(f>9)4H-qa zsFBZI9(ssFwcxRn9jeHYb4A!tz?p2qYE46Jh+xeoRP=`hsOD<~fj>@OxE(aqBpoh* zDB@B9q>GM-STX}bT#ulpVk^3S6WAH{s0|T=1fHt8p;pL=^)Y_FLY@aC#MCP`;7}!W zs|+Q((A@vxkklPze`vE}h~*1vfH(Zf4lmp;--5DrQx%JKy*MatJK)PsJqw^>a%zV-};-Vr*RnTB69dvXX5lTDMQGR=HF!3&- zRwXjnwR*~4qi14-gu_~~$>DPVBcT_u7InPdy=6J*>icJGIY$CaBBm4N@Lb3H72KP| z;G9~QGZW87e1Bs2g5e3R6i)Y2~fut9b$P0CJFO4gd06!=S{&KSz zjZ(nJ(SxoA*eumbrMCCy>hx1{9Pnue3y6dSjB0SLd zv(I=~&^lad#zip?H3|jKCyGNhN=SRi+js9FBnOFX^`#rO*4DfQ3t~E;_^p8kGdd$9 z19V(5u{AK|7RXfl|%rEZU@)PTlvz`Au;FDM2r z>yOQi6kNd=yQbtwkXJH^Ft)KuW;hwCODT)9oX3 zhdo?--&gb#jU4SRbTWJhF||E_S~N7{ZKRAm`ob;g=W-Wj^XK><^S4mK|8N$!^UGxa W&(+#om(rE02JaY`YKtqZ3pP{$esLbQVej*w`V6-E$JStHdDLW(>LA*4i2 zEepZ1s1-yHhEr~R2UJxzh_OZ*WwE~=nlH=e~ zFL4lTuV64&Uiq?(LXx%zLjTV0jq;Z`K(m6|ormY-DB~micR9%qUV*W_`_U5X$!9QY z!C{N1U#zGCG!WV133%YvBv-S!*Ap1P{vnfLdP7tz+F0=PZLn?$v>m3#cAo!-$!D-O z>)VA-zc>fH+SmsJo=rH-&pga49hiC3rpMBnWHFa>qyR*o-H!LY7bkV|wQ|ap$WfTX z%~6IQ>y<&Iq^w3IA*|fzlgRC7!wB9dpqD3FQTnmhH|(aJFyt`J5d7vOu^1xTToT6A z-yN|Tf@X*E7P?0h1VvexPKKseKG#*#pax)sUsYxpxIwEX;7d5!qkV;ZK8#bii5N)- 
zUq1}NA+W@&I1s|yI4)JZ#?#ZYbPe9vBRgKTz#QeC;|u2SGd7HM?56NiEgtPhW#m>sqJ;# zIShS%gwSRA<8&PV2OcQgG}fnlFm|%1@l1up?q~wbAZyT^&p-HkBS1^GBEe^l9OZf` z&q=g@O`ekp=+Fn{ff-|Oz}Za|W5#pq(@z|Mq>?iLSb+8#*U-&nji@~}e@z<9IPyR~R|2EA zS;^#UeXI4?c&b09W4^KexpYvW-bb%y>Ic{P7jR^CF zZk^INsW$afHz_tb=GdZ$36aC&(P20AGr6PnRppuS?yBv3>MChM^uXGF0lG*8i!=fK zV;Io2xw|tSjqZ_H8qpqtbAMjgdFu~*$_&b^NX@*gn_YdcEcIOi#_P}nN8YXPf-}k~ z@7d@1oN8WQkX5k7f3v;gzRz+_Eu^2$abEWQ=etj>vJTz!35OGqyx-aOV{7+WUfBT! z0X7s>&swzgX07_;bx)G3#2PA}AE`+HBB?3VomnqCw)n^xl4)RFE&1CQsSpxwKS3{r{6WMMU9H4!YrL!mCv+)6VezUa=vQrr;X3r;?!(lot&A822g+vN zo|>j+R$$Gt^-Epe$?g@rh;8w8mBm<9pM7C5IoLNYhmgn;8tDjZLgYV%y#b=NO_4GX zo65L$q0{GlswY$&$>{Z$*d@qy{kkU0O#R@86;2kRD!G_NXVjDihckCDYW}4tGaQ!B zE}4DNFxEk@@8$?$(Bf!;?nWfAsdO{BdVK}k9B)^csMzZJ57l7humJ7%fupMLY)kM! zb&}G;Wxuli(g;fws;X*Rq;&1_i$!2%lSA0_1wjljL@E|t$d{vZ5lq6ep(?K*DvQ~> zXir$B2!hN}Im#hn7emxAd}wci_VF`Zv%f&tU1+S5!ySX_mMOYOY!y_vW3)Iub@b-3!2?Z1^sK(Kx|$aVQ9^z(F;nW@qE~-aOA~2{w%n zKw%s*uhMz|%hIIa4(-*QYT@?%M6Yho(aF5|ULBgO6{FX1tO^1YTk}a*_UwdETQAPT zAD043@*_&bah}M=CuO`wZ58r!w`*|_iJX#1G?gyN?MT3SLk&W%x(LzHq932T(GZ>f zCaR#&T;Rq=`AaMV+H;)1FQfB#fmK-1s!mK-!2slPbyvKvA&}TEz~v^WwSHS6Ko1<+ z11XRq>X2i{qKSU>fM}~Mqz-Bo2aq`5m(sWw)DW{%nz>F1$~a-`Na$*rgg{BW+w03a zC;!F((RFo&LO5f+7>W(^NgO8cxPH{!aYp1F6zH=ctBi$qN7rPWFFX`<=O#YGOXdHH;rQB1h%kfJo362)F{wevHjS(E}6t)>9OTu?_- zi!pXrSndTICYy>x!kM{SKTS>oKH=6%t;90n(U=;n?j1JfSn$@ieu^xBg5!g~vQu!j1A7FI~P|8P#MrG;) zyKu0Q!^OEPNHq3EJY#scQEdapHG{pG6o~^nPt_=kmZCwAOlw)Xj2UtDUS;N7bN%J5 z4-(T9i)Pj^uudYfafC`ce@+@RQVmdmL;xHIjiK}JWwajM1^2RHJ<;#%XXLd10yBRL AegFUf literal 0 HcmV?d00001 From 66440d3e1b3b4f8f97daaacd236f170aa6a61df7 Mon Sep 17 00:00:00 2001 From: salamer Date: Sun, 27 Jun 2021 13:51:08 +0800 Subject: [PATCH 17/22] chore: update readme --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 04fdf15..d50bf7a 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@
+ # Hora a approximate nearest neighbor search library, written in Rust From 386579187fe35a4bd799d4a3aca9d24782e7777a Mon Sep 17 00:00:00 2001 From: Mongkii Date: Sun, 27 Jun 2021 13:53:39 +0800 Subject: [PATCH 18/22] chore: update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d50bf7a..aa07059 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -
+
From 971f86a955cb4853612a5f6307395343cd6ff5a8 Mon Sep 17 00:00:00 2001 From: salamer Date: Sun, 27 Jun 2021 14:40:47 +0800 Subject: [PATCH 19/22] chore: update readme --- README.md | 7 ++----- asset/logo.png | Bin 14042 -> 0 bytes asset/logo.svg | 1 + 3 files changed, 3 insertions(+), 5 deletions(-) delete mode 100644 asset/logo.png create mode 100644 asset/logo.svg diff --git a/README.md b/README.md index aa07059..8978752 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,11 @@ -
- +
+
# Hora a approximate nearest neighbor search library, written in Rust -# Introduction - Hora, `ほら` in Japanese, sound like `[hōlə]`, means `You see!` or `Look at that!`. # Key Features @@ -65,7 +63,6 @@ Hora, `ほら` in Japanese, sound like `[hōlə]`, means `You see!` or `Look at # Contents - [Hora](#hora) -- [Introduction](#introduction) - [Key Features](#key-features) - [Contents](#contents) - [Installation](#installation) diff --git a/asset/logo.png b/asset/logo.png deleted file mode 100644 index d760c13cf2a843222198ab98b66f9791c2085aac..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 14042 zcmeHNYgAKL7Czw>rE02JaY`YKtqZ3pP{$esLbQVej*w`V6-E$JStHdDLW(>LA*4i2 zEepZ1s1-yHhEr~R2UJxzh_OZ*WwE~=nlH=e~ zFL4lTuV64&Uiq?(LXx%zLjTV0jq;Z`K(m6|ormY-DB~micR9%qUV*W_`_U5X$!9QY z!C{N1U#zGCG!WV133%YvBv-S!*Ap1P{vnfLdP7tz+F0=PZLn?$v>m3#cAo!-$!D-O z>)VA-zc>fH+SmsJo=rH-&pga49hiC3rpMBnWHFa>qyR*o-H!LY7bkV|wQ|ap$WfTX z%~6IQ>y<&Iq^w3IA*|fzlgRC7!wB9dpqD3FQTnmhH|(aJFyt`J5d7vOu^1xTToT6A z-yN|Tf@X*E7P?0h1VvexPKKseKG#*#pax)sUsYxpxIwEX;7d5!qkV;ZK8#bii5N)- zUq1}NA+W@&I1s|yI4)JZ#?#ZYbPe9vBRgKTz#QeC;|u2SGd7HM?56NiEgtPhW#m>sqJ;# zIShS%gwSRA<8&PV2OcQgG}fnlFm|%1@l1up?q~wbAZyT^&p-HkBS1^GBEe^l9OZf` z&q=g@O`ekp=+Fn{ff-|Oz}Za|W5#pq(@z|Mq>?iLSb+8#*U-&nji@~}e@z<9IPyR~R|2EA zS;^#UeXI4?c&b09W4^KexpYvW-bb%y>Ic{P7jR^CF zZk^INsW$afHz_tb=GdZ$36aC&(P20AGr6PnRppuS?yBv3>MChM^uXGF0lG*8i!=fK zV;Io2xw|tSjqZ_H8qpqtbAMjgdFu~*$_&b^NX@*gn_YdcEcIOi#_P}nN8YXPf-}k~ z@7d@1oN8WQkX5k7f3v;gzRz+_Eu^2$abEWQ=etj>vJTz!35OGqyx-aOV{7+WUfBT! z0X7s>&swzgX07_;bx)G3#2PA}AE`+HBB?3VomnqCw)n^xl4)RFE&1CQsSpxwKS3{r{6WMMU9H4!YrL!mCv+)6VezUa=vQrr;X3r;?!(lot&A822g+vN zo|>j+R$$Gt^-Epe$?g@rh;8w8mBm<9pM7C5IoLNYhmgn;8tDjZLgYV%y#b=NO_4GX zo65L$q0{GlswY$&$>{Z$*d@qy{kkU0O#R@86;2kRD!G_NXVjDihckCDYW}4tGaQ!B zE}4DNFxEk@@8$?$(Bf!;?nWfAsdO{BdVK}k9B)^csMzZJ57l7humJ7%fupMLY)kM! 
zb&}G;Wxuli(g;fws;X*Rq;&1_i$!2%lSA0_1wjljL@E|t$d{vZ5lq6ep(?K*DvQ~> zXir$B2!hN}Im#hn7emxAd}wci_VF`Zv%f&tU1+S5!ySX_mMOYOY!y_vW3)Iub@b-3!2?Z1^sK(Kx|$aVQ9^z(F;nW@qE~-aOA~2{w%n zKw%s*uhMz|%hIIa4(-*QYT@?%M6Yho(aF5|ULBgO6{FX1tO^1YTk}a*_UwdETQAPT zAD043@*_&bah}M=CuO`wZ58r!w`*|_iJX#1G?gyN?MT3SLk&W%x(LzHq932T(GZ>f zCaR#&T;Rq=`AaMV+H;)1FQfB#fmK-1s!mK-!2slPbyvKvA&}TEz~v^WwSHS6Ko1<+ z11XRq>X2i{qKSU>fM}~Mqz-Bo2aq`5m(sWw)DW{%nz>F1$~a-`Na$*rgg{BW+w03a zC;!F((RFo&LO5f+7>W(^NgO8cxPH{!aYp1F6zH=ctBi$qN7rPWFFX`<=O#YGOXdHH;rQB1h%kfJo362)F{wevHjS(E}6t)>9OTu?_- zi!pXrSndTICYy>x!kM{SKTS>oKH=6%t;90n(U=;n?j1JfSn$@ieu^xBg5!g~vQu!j1A7FI~P|8P#MrG;) zyKu0Q!^OEPNHq3EJY#scQEdapHG{pG6o~^nPt_=kmZCwAOlw)Xj2UtDUS;N7bN%J5 z4-(T9i)Pj^uudYfafC`ce@+@RQVmdmL;xHIjiK}JWwajM1^2RHJ<;#%XXLd10yBRL AegFUf diff --git a/asset/logo.svg b/asset/logo.svg new file mode 100644 index 0000000..e71b390 --- /dev/null +++ b/asset/logo.svg @@ -0,0 +1 @@ + \ No newline at end of file From dc48d7b5b18df47efddc01ab88d59a4687176e2f Mon Sep 17 00:00:00 2001 From: salamer Date: Thu, 1 Jul 2021 22:05:08 +0800 Subject: [PATCH 20/22] chore: remove arguments --- src/core/ann_index.rs | 16 ++---- src/core/arguments.rs | 96 -------------------------------- src/core/calc.rs | 108 ------------------------------------ src/core/mod.rs | 1 - src/index/bruteforce_idx.rs | 12 +--- src/index/hnsw_idx.rs | 12 +--- src/index/pq_idx.rs | 23 ++------ src/index/rpt_idx.rs | 12 +--- src/index/ssg_idx.rs | 21 ++----- 9 files changed, 25 insertions(+), 276 deletions(-) delete mode 100644 src/core/arguments.rs diff --git a/src/core/ann_index.rs b/src/core/ann_index.rs index 186a636..fa25a39 100644 --- a/src/core/ann_index.rs +++ b/src/core/ann_index.rs @@ -1,4 +1,3 @@ -use crate::core::arguments; use crate::core::metrics; use crate::core::node; @@ -72,12 +71,7 @@ pub trait ANNIndex: Send + Sync { } /// search for k nearest neighbors node internal method - fn node_search_k( - &self, - item: &node::Node, - k: usize, - args: &arguments::Args, - ) -> 
Vec<(node::Node, E)>; + fn node_search_k(&self, item: &node::Node, k: usize) -> Vec<(node::Node, E)>; /// search for k nearest neighbors and return full info /// @@ -86,7 +80,7 @@ pub trait ANNIndex: Send + Sync { /// it require the item is the slice with the same dimension with index dimension, otherwise it will panic fn search_full(&self, item: &[E], k: usize) -> Vec<(node::Node, E)> { assert_eq!(item.len(), self.dimension()); - self.node_search_k(&node::Node::new(item), k, &arguments::Args::new()) + self.node_search_k(&node::Node::new(item), k) } /// search for k nearest neighbors @@ -96,7 +90,7 @@ pub trait ANNIndex: Send + Sync { /// it require the item is the slice with the same dimension with index dimension, otherwise it will panic fn search(&self, item: &[E], k: usize) -> Vec { assert_eq!(item.len(), self.dimension()); - self.node_search_k(&node::Node::new(item), k, &arguments::Args::new()) + self.node_search_k(&node::Node::new(item), k) .iter() .map(|x| x.0.idx().as_ref().unwrap().clone()) .collect::>() @@ -150,7 +144,7 @@ pub trait SerializableIndex< >: Send + Sync + ANNIndex { /// load file with path - fn load(_path: &str, _args: &arguments::Args) -> Result + fn load(_path: &str) -> Result where Self: Sized, { @@ -158,7 +152,7 @@ pub trait SerializableIndex< } /// dump the file into the path - fn dump(&mut self, _path: &str, _args: &arguments::Args) -> Result<(), &'static str> { + fn dump(&mut self, _path: &str) -> Result<(), &'static str> { Err("empty implementation") } } diff --git a/src/core/arguments.rs b/src/core/arguments.rs deleted file mode 100644 index b2667bd..0000000 --- a/src/core/arguments.rs +++ /dev/null @@ -1,96 +0,0 @@ -#![allow(dead_code)] -#[cfg(feature = "without_std")] -use hashbrown::HashMap; -#[cfg(not(feature = "without_std"))] -use std::collections::HashMap; - -// TODO:L find a way to make the arguments generic; -#[derive(Clone, Debug)] -pub enum ArgsBox { - Float(f32), - Int(i32), - Str(String), - Usize(usize), -} - -// TODO: 
make this optional -pub struct Args { - args: HashMap, -} - -impl Default for Args { - fn default() -> Self { - Self::new() - } -} - -impl Args { - pub fn new() -> Self { - Args { - args: HashMap::new(), - } - } - - pub fn fget(&self, key: &str) -> Option { - let val = self.args.get(key)?; - match val { - ArgsBox::Float(s) => Some(*s), - _ => None, - } - } - - pub fn iget(&self, key: &str) -> Option { - let val = self.args.get(key)?; - match val { - ArgsBox::Int(s) => Some(*s), - _ => None, - } - } - - pub fn sget(&self, key: &str) -> Option { - let val = self.args.get(key)?; - match val { - ArgsBox::Str(s) => Some(s.clone()), - _ => None, - } - } - - pub fn uget(&self, key: &str) -> Option { - let val = self.args.get(key)?; - match val { - ArgsBox::Usize(s) => Some(*s), - _ => None, - } - } - - pub fn get(&self, key: &str) -> Option { - let val = self.args.get(key)?; - Some(val.clone()) - } - - pub fn fset(&mut self, key: &str, value: f32) -> &mut Args { - self.args.insert(key.to_string(), ArgsBox::Float(value)); - self - } - - pub fn iset(&mut self, key: &str, value: i32) -> &mut Args { - self.args.insert(key.to_string(), ArgsBox::Int(value)); - self - } - - pub fn uset(&mut self, key: &str, value: usize) -> &mut Args { - self.args.insert(key.to_string(), ArgsBox::Usize(value)); - self - } - - pub fn sset(&mut self, key: &str, value: &str) -> &mut Args { - self.args - .insert(key.to_string(), ArgsBox::Str(value.to_string())); - self - } - - pub fn set(&mut self, key: &str, value: ArgsBox) -> &mut Args { - self.args.insert(key.to_string(), value); - self - } -} diff --git a/src/core/calc.rs b/src/core/calc.rs index e00d502..fdfa069 100644 --- a/src/core/calc.rs +++ b/src/core/calc.rs @@ -41,118 +41,10 @@ pub fn split_imbalance(vec1: &[T], vec2: &[T]) -> f64 { #[cfg(test)] mod tests { - use super::*; - use crate::core::simd_metrics::SIMDOptmized; - - use rand::distributions::Standard; - - use rand::Rng; - use std::time::SystemTime; - fn 
make_normal_distribution_clustering( - clustering_n: usize, - node_n: usize, - dimension: usize, - range: f64, - ) -> ( - Vec>, // center of cluster - Vec>, // cluster data - ) { - let _rng = rand::thread_rng(); - - let mut bases: Vec> = Vec::new(); - let mut ns: Vec> = Vec::new(); - for _i in 0..clustering_n { - let mut rng = rand::thread_rng(); - let mut base: Vec = Vec::with_capacity(dimension); - for _i in 0..dimension { - let n: f64 = rng.gen::() * range; // base number - base.push(n); - } - - let v_iter: Vec = rng - .sample_iter(&Standard) - .take(dimension * node_n) - .collect::>() - .clone(); - for _i in 0..node_n { - let mut vec_item = Vec::with_capacity(dimension); - for i in 0..dimension { - let vv = v_iter[_i * dimension..(_i + 1) * dimension][i] + base[i]; // add normal distribution noise - vec_item.push(vv); - } - ns.push(vec_item); - } - bases.push(base); - } - - (bases, ns) - } #[test] fn test_dot() { let a = [1., 2., 3.]; let b = [1., 2., 3.]; assert_eq!(dot(&a, &b).unwrap(), 14.0); } - - #[test] - fn bench_dot() { - let dimension = 8024; - let nodes_every_cluster = 600; - let node_n = 50; - let (_, nso) = - make_normal_distribution_clustering(node_n, nodes_every_cluster, dimension, 100000.0); - println!("hello world {:?}", nso.len()); - let ns: Vec> = nso - .iter() - .map(|x| x.iter().map(|p| *p as f32).collect()) - .collect(); - - { - let base_start = SystemTime::now(); - let sumbase = ns - .iter() - .map(|nsx| { - // dot(&nsx, &nsx); - // nsx.iter().zip(nsx).map(|(p, q)| p * q).sum::() - nsx.iter() - .zip(nsx) - .map(|(p, q)| (p - q).powi(2)) - .sum::() - }) - .sum::(); - let base_since_the_epoch = SystemTime::now() - .duration_since(base_start) - .expect("Time went backwards"); - println!( - "test for {:?} times, base use {:?} millisecond {:?}", - ns.len(), - base_since_the_epoch.as_millis(), - sumbase - ); - } - - { - let base_start = SystemTime::now(); - let sumsimd = ns - .iter() - .map(|nsx| f32::euclidean_distance(nsx, nsx).unwrap()) - 
.sum::(); - let base_since_the_epoch = SystemTime::now() - .duration_since(base_start) - .expect("Time went backwards"); - println!( - "test for {:?} times, simd use {:?} millisecond, {:?}", - ns.len(), - base_since_the_epoch.as_millis(), - sumsimd - ); - } - - let b = 25; - println!( - "{:?}, {:?}", - f32::dot_product(&ns[b], &ns[b]), - dot(&ns[b], &ns[b]).unwrap() - ); - } } diff --git a/src/core/mod.rs b/src/core/mod.rs index ce2693a..169b4a4 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -1,5 +1,4 @@ pub mod ann_index; -pub mod arguments; pub mod calc; pub mod kmeans; pub mod knn; diff --git a/src/index/bruteforce_idx.rs b/src/index/bruteforce_idx.rs index 519c5f3..795460d 100644 --- a/src/index/bruteforce_idx.rs +++ b/src/index/bruteforce_idx.rs @@ -1,6 +1,5 @@ #![allow(dead_code)] use crate::core::ann_index; -use crate::core::arguments; use crate::core::metrics; use crate::core::neighbor; use crate::core::node; @@ -45,12 +44,7 @@ impl ann_index::ANNIndex for Brut fn built(&self) -> bool { true } - fn node_search_k( - &self, - item: &node::Node, - k: usize, - _args: &arguments::Args, - ) -> Vec<(node::Node, E)> { + fn node_search_k(&self, item: &node::Node, k: usize) -> Vec<(node::Node, E)> { let mut heap = BinaryHeap::with_capacity(k + 1); self.nodes .iter() @@ -90,7 +84,7 @@ impl ann_index::ANNIndex for Brut impl ann_index::SerializableIndex for BruteForceIndex { - fn load(path: &str, _args: &arguments::Args) -> Result { + fn load(path: &str) -> Result { let file = File::open(path).unwrap_or_else(|_| panic!("unable to open file {:?}", path)); let mut instance: BruteForceIndex = bincode::deserialize_from(file).unwrap(); instance.nodes = instance @@ -101,7 +95,7 @@ impl Result<(), &'static str> { + fn dump(&mut self, path: &str) -> Result<(), &'static str> { self.tmp_nodes = self.nodes.iter().map(|x| *x.clone()).collect(); let encoded_bytes = bincode::serialize(&self).unwrap(); let mut file = File::create(path).unwrap(); diff --git 
a/src/index/hnsw_idx.rs b/src/index/hnsw_idx.rs index ceaeaff..50161a9 100644 --- a/src/index/hnsw_idx.rs +++ b/src/index/hnsw_idx.rs @@ -1,6 +1,5 @@ #![allow(dead_code)] use crate::core::ann_index; -use crate::core::arguments; use crate::core::metrics; use crate::core::neighbor::Neighbor; use crate::core::node; @@ -629,12 +628,7 @@ impl ann_index::ANNIndex for HNSW true } - fn node_search_k( - &self, - item: &node::Node, - k: usize, - _args: &arguments::Args, - ) -> Vec<(node::Node, E)> { + fn node_search_k(&self, item: &node::Node, k: usize) -> Vec<(node::Node, E)> { let mut ret: BinaryHeap> = self.search_knn(item, k).unwrap(); let mut result: Vec<(node::Node, E)> = Vec::with_capacity(k); let mut result_idx: Vec<(usize, E)> = Vec::with_capacity(k); @@ -667,7 +661,7 @@ impl ann_index::ANNIndex for HNSW impl ann_index::SerializableIndex for HNSWIndex { - fn load(path: &str, _args: &arguments::Args) -> Result { + fn load(path: &str) -> Result { let file = File::open(path).unwrap_or_else(|_| panic!("unable to open file {:?}", path)); let mut instance: HNSWIndex = bincode::deserialize_from(&file).unwrap(); instance._nodes = instance @@ -708,7 +702,7 @@ impl Result<(), &'static str> { + fn dump(&mut self, path: &str) -> Result<(), &'static str> { self._id2neighbor_tmp = Vec::with_capacity(self._id2neighbor.len()); for i in 0..self._id2neighbor.len() { let mut tmp = Vec::with_capacity(self._id2neighbor[i].len()); diff --git a/src/index/pq_idx.rs b/src/index/pq_idx.rs index 361bb38..94cc43b 100644 --- a/src/index/pq_idx.rs +++ b/src/index/pq_idx.rs @@ -1,6 +1,5 @@ #![allow(dead_code)] use crate::core::ann_index; -use crate::core::arguments; use crate::core::kmeans; use crate::core::metrics; use crate::core::neighbor::Neighbor; @@ -208,12 +207,7 @@ impl ann_index::ANNIndex for PQIn true } - fn node_search_k( - &self, - item: &node::Node, - k: usize, - _args: &arguments::Args, - ) -> Vec<(node::Node, E)> { + fn node_search_k(&self, item: &node::Node, k: usize) -> 
Vec<(node::Node, E)> { let mut ret: BinaryHeap> = self.search_knn_adc(item, k).unwrap(); let mut result: Vec<(node::Node, E)> = Vec::new(); let mut result_idx: Vec<(usize, E)> = Vec::new(); @@ -246,7 +240,7 @@ impl ann_index::ANNIndex for PQIn impl ann_index::SerializableIndex for PQIndex { - fn load(path: &str, _args: &arguments::Args) -> Result { + fn load(path: &str) -> Result { let file = File::open(path).unwrap_or_else(|_| panic!("unable to open file {:?}", path)); let mut instance: PQIndex = bincode::deserialize_from(&file).unwrap(); instance._nodes = instance @@ -257,7 +251,7 @@ impl Result<(), &'static str> { + fn dump(&mut self, path: &str) -> Result<(), &'static str> { self._nodes_tmp = self._nodes.iter().map(|x| *x.clone()).collect(); let encoded_bytes = bincode::serialize(&self).unwrap(); let mut file = File::create(path).unwrap(); @@ -462,12 +456,7 @@ impl ann_index::ANNIndex for IVFP true } - fn node_search_k( - &self, - item: &node::Node, - k: usize, - _args: &arguments::Args, - ) -> Vec<(node::Node, E)> { + fn node_search_k(&self, item: &node::Node, k: usize) -> Vec<(node::Node, E)> { let mut ret: BinaryHeap> = self.search_knn_adc(item, k).unwrap(); let mut result: Vec<(node::Node, E)> = Vec::new(); let mut result_idx: Vec<(usize, E)> = Vec::new(); @@ -500,7 +489,7 @@ impl ann_index::ANNIndex for IVFP impl ann_index::SerializableIndex for IVFPQIndex { - fn load(path: &str, _args: &arguments::Args) -> Result { + fn load(path: &str) -> Result { let file = File::open(path).unwrap_or_else(|_| panic!("unable to open file {:?}", path)); let mut instance: IVFPQIndex = bincode::deserialize_from(&file).unwrap(); instance._nodes = instance @@ -520,7 +509,7 @@ impl Result<(), &'static str> { + fn dump(&mut self, path: &str) -> Result<(), &'static str> { self._nodes_tmp = self._nodes.iter().map(|x| *x.clone()).collect(); for i in 0..self._n_kmeans_center { self._pq_list[i]._nodes_tmp = diff --git a/src/index/rpt_idx.rs b/src/index/rpt_idx.rs index 
0b00c13..dbc3845 100644 --- a/src/index/rpt_idx.rs +++ b/src/index/rpt_idx.rs @@ -1,6 +1,5 @@ #![allow(dead_code)] use crate::core::ann_index; -use crate::core::arguments; use crate::core::calc; use crate::core::metrics; use crate::core::neighbor; @@ -603,12 +602,7 @@ impl ann_index::ANNIndex for BPTI self._built } - fn node_search_k( - &self, - item: &node::Node, - k: usize, - _args: &arguments::Args, - ) -> Vec<(node::Node, E)> { + fn node_search_k(&self, item: &node::Node, k: usize) -> Vec<(node::Node, E)> { self._search_k(item.vectors(), k).unwrap() } @@ -624,7 +618,7 @@ impl ann_index::ANNIndex for BPTI impl ann_index::SerializableIndex for BPTIndex { - fn load(path: &str, _args: &arguments::Args) -> Result { + fn load(path: &str) -> Result { let file = File::open(path).unwrap_or_else(|_| panic!("unable to open file {:?}", path)); let mut instance: BPTIndex = bincode::deserialize_from(&file).unwrap(); @@ -637,7 +631,7 @@ impl Result<(), &'static str> { + fn dump(&mut self, path: &str) -> Result<(), &'static str> { self.leaves .iter_mut() .for_each(|x| x.tmp_node = Some(*x.node.clone())); diff --git a/src/index/ssg_idx.rs b/src/index/ssg_idx.rs index bc1aef4..92a068f 100644 --- a/src/index/ssg_idx.rs +++ b/src/index/ssg_idx.rs @@ -1,6 +1,5 @@ #![allow(dead_code)] use crate::core::ann_index; -use crate::core::arguments; use crate::core::kmeans; use crate::core::metrics; use crate::core::neighbor; @@ -405,12 +404,7 @@ impl SSGIndex { ); } - fn search( - &self, - query: &node::Node, - k: usize, - _args: &arguments::Args, - ) -> Vec<(node::Node, E)> { + fn search(&self, query: &node::Node, k: usize) -> Vec<(node::Node, E)> { // let mut search_flags = HashSet::with_capacity(self.nodes.len()); let mut search_flags = FixedBitSet::with_capacity(self.nodes.len()); let mut heap: BinaryHeap> = BinaryHeap::new(); // max-heap @@ -503,7 +497,7 @@ impl SSGIndex { impl ann_index::SerializableIndex for SSGIndex { - fn load(path: &str, _args: &arguments::Args) -> Result { + fn 
load(path: &str) -> Result { let file = File::open(path).unwrap_or_else(|_| panic!("unable to open file {:?}", path)); let mut instance: SSGIndex = bincode::deserialize_from(&file).unwrap(); instance.nodes = instance @@ -514,7 +508,7 @@ impl Result<(), &'static str> { + fn dump(&mut self, path: &str) -> Result<(), &'static str> { self.tmp_nodes = self.nodes.iter().map(|x| *x.clone()).collect(); let encoded_bytes = bincode::serialize(&self).unwrap(); let mut file = File::create(path).unwrap(); @@ -538,13 +532,8 @@ impl ann_index::ANNIndex for SSGI fn built(&self) -> bool { true } - fn node_search_k( - &self, - item: &node::Node, - k: usize, - args: &arguments::Args, - ) -> Vec<(node::Node, E)> { - self.search(item, k, args) + fn node_search_k(&self, item: &node::Node, k: usize) -> Vec<(node::Node, E)> { + self.search(item, k) } fn name(&self) -> &'static str { From 87ae74cbba6c8cfe4e425c91a87f97d365963608 Mon Sep 17 00:00:00 2001 From: salamer Date: Thu, 1 Jul 2021 23:41:26 +0800 Subject: [PATCH 21/22] chore: format whole project --- examples/Cargo.toml | 2 +- examples/src/ann_bench.rs | 26 +-- src/core/ann_index.rs | 4 +- src/core/calc.rs | 3 +- src/core/kmeans.rs | 4 +- src/core/knn.rs | 253 ++++++++++----------- src/core/metrics.rs | 11 +- src/core/node.rs | 6 +- src/core/simd_metrics.rs | 2 +- src/index/{rpt_idx.rs => bpt_idx.rs} | 4 +- src/index/{rpt_params.rs => bpt_params.rs} | 0 src/index/mod.rs | 4 +- src/index/ssg_params.rs | 4 +- src/lib.rs | 126 ++++++++++ 14 files changed, 286 insertions(+), 163 deletions(-) rename src/index/{rpt_idx.rs => bpt_idx.rs} (99%) rename src/index/{rpt_params.rs => bpt_params.rs} (100%) diff --git a/examples/Cargo.toml b/examples/Cargo.toml index 8499a29..27207a4 100644 --- a/examples/Cargo.toml +++ b/examples/Cargo.toml @@ -5,7 +5,7 @@ authors = ["salamer "] edition = "2018" [dependencies] -real_hora = { package = "hora", path = "../", features=["simd"]} +hora = { package = "hora", path = "../", features=["simd"]} hdf5 = 
{version = "0.7.1"} rayon = "^1.5" rand = "0.7.3" \ No newline at end of file diff --git a/examples/src/ann_bench.rs b/examples/src/ann_bench.rs index 6320d57..5b5028f 100644 --- a/examples/src/ann_bench.rs +++ b/examples/src/ann_bench.rs @@ -1,9 +1,7 @@ #![deny(clippy::all)] -use real_hora::core; -use real_hora::core::ann_index::ANNIndex; - +use hora::core; +use hora::core::ann_index::ANNIndex; use std::collections::HashSet; - use std::time::SystemTime; struct StatMetrics { @@ -58,19 +56,19 @@ fn bench_ssg( neighbors: &Vec>, ) { let params_set = vec![ - real_hora::index::ssg_params::SSGParams::::default() + hora::index::ssg_params::SSGParams::::default() .angle(60.0) .init_k(20) .index_size(20) .neighbor_neighbor_size(30) .root_size(256), - real_hora::index::ssg_params::SSGParams::default() + hora::index::ssg_params::SSGParams::default() .angle(60.0) .init_k(50) .index_size(50) .neighbor_neighbor_size(50) .root_size(256), - real_hora::index::ssg_params::SSGParams::default() + hora::index::ssg_params::SSGParams::default() .angle(60.0) .init_k(50) .index_size(50) @@ -81,7 +79,7 @@ fn bench_ssg( let mut metrics_stats: Vec = Vec::new(); for params in params_set.iter() { println!("start params {:?}", params); - let mut ssg_idx = Box::new(real_hora::index::ssg_idx::SSGIndex::::new( + let mut ssg_idx = Box::new(hora::index::ssg_idx::SSGIndex::::new( dimension, params, )); make_idx_baseline(train, &mut ssg_idx); @@ -107,21 +105,21 @@ fn bench_hnsw( neighbors: &Vec>, ) { let params_set = vec![ - real_hora::index::hnsw_params::HNSWParams::::default() + hora::index::hnsw_params::HNSWParams::::default() .max_item(10000000) .n_neighbor(16) .n_neighbor0(32) .ef_build(500) .ef_search(16) .has_deletion(false), - real_hora::index::hnsw_params::HNSWParams::::default() + hora::index::hnsw_params::HNSWParams::::default() .max_item(10000000) .n_neighbor(8) .n_neighbor0(16) .ef_build(500) .ef_search(16) .has_deletion(false), - real_hora::index::hnsw_params::HNSWParams::::default() + 
hora::index::hnsw_params::HNSWParams::::default() .max_item(10000000) .n_neighbor(16) .n_neighbor0(32) @@ -132,7 +130,7 @@ fn bench_hnsw( let mut metrics_stats: Vec = Vec::new(); for params in params_set.iter() { - let mut hnsw_idx = Box::new(real_hora::index::hnsw_idx::HNSWIndex::::new( + let mut hnsw_idx = Box::new(hora::index::hnsw_idx::HNSWIndex::::new( dimension, params, )); make_idx_baseline(train, &mut hnsw_idx); @@ -157,7 +155,7 @@ fn bench_ivfpq( test: &Vec>, neighbors: &Vec>, ) { - let params_set = vec![real_hora::index::pq_params::IVFPQParams::::default() + let params_set = vec![hora::index::pq_params::IVFPQParams::::default() .n_sub(16) .sub_bits(4) .n_kmeans_center(256) @@ -166,7 +164,7 @@ fn bench_ivfpq( let mut metrics_stats: Vec = Vec::new(); for params in params_set.iter() { - let mut ivfpq_idx = Box::new(real_hora::index::pq_idx::IVFPQIndex::::new( + let mut ivfpq_idx = Box::new(hora::index::pq_idx::IVFPQIndex::::new( dimension, params, )); make_idx_baseline(train, &mut ivfpq_idx); diff --git a/src/core/ann_index.rs b/src/core/ann_index.rs index fa25a39..f5c2f7d 100644 --- a/src/core/ann_index.rs +++ b/src/core/ann_index.rs @@ -45,7 +45,7 @@ pub trait ANNIndex: Send + Sync { /// add multiple node one time /// /// return `Err(&'static str)` if there is something wrong with the adding process, and the `static str` is the debug reason - fn add_batch(&mut self, vss: &[&[E]], indices: &[T]) -> Result<(), &'static str> { + fn madd(&mut self, vss: &[&[E]], indices: &[T]) -> Result<(), &'static str> { if vss.len() != indices.len() { return Err("vector's size is different with index"); } @@ -78,7 +78,7 @@ pub trait ANNIndex: Send + Sync { /// it will return the all node's info including the original vectors, and the metric distance /// /// it require the item is the slice with the same dimension with index dimension, otherwise it will panic - fn search_full(&self, item: &[E], k: usize) -> Vec<(node::Node, E)> { + fn search_nodes(&self, item: &[E], k: 
usize) -> Vec<(node::Node, E)> { assert_eq!(item.len(), self.dimension()); self.node_search_k(&node::Node::new(item), k) } diff --git a/src/core/calc.rs b/src/core/calc.rs index fdfa069..38c4332 100644 --- a/src/core/calc.rs +++ b/src/core/calc.rs @@ -41,10 +41,11 @@ pub fn split_imbalance(vec1: &[T], vec2: &[T]) -> f64 { #[cfg(test)] mod tests { + use crate::core::calc::dot; #[test] fn test_dot() { let a = [1., 2., 3.]; let b = [1., 2., 3.]; - assert_eq!(dot(&a, &b).unwrap(), 14.0); + assert_eq!(dot::(&a, &b).unwrap(), -14.0); } } diff --git a/src/core/kmeans.rs b/src/core/kmeans.rs index 90947ad..73f4a19 100644 --- a/src/core/kmeans.rs +++ b/src/core/kmeans.rs @@ -339,8 +339,8 @@ mod tests { #[test] fn test_general_kmeans() { let dimension = 2; - let nodes_every_cluster = 10; - let node_n = 10; + let nodes_every_cluster = 5; + let node_n = 5; let (_, nso) = make_normal_distribution_clustering(node_n, nodes_every_cluster, dimension, 100000.0); println!("{:?}", nso); diff --git a/src/core/knn.rs b/src/core/knn.rs index ad0da2b..c87003a 100644 --- a/src/core/knn.rs +++ b/src/core/knn.rs @@ -422,130 +422,129 @@ impl<'a, E: FloatElement, T: IdxType> NNDescentHandler<'a, E, T> { } } -#[cfg(test)] -mod tests { - use super::*; - - use crate::core::node; - use rand::distributions::{Distribution, Standard}; - use rand::Rng; - use std::collections::HashMap; - use std::collections::HashSet; - - use std::iter::FromIterator; - use std::time::SystemTime; - fn make_normal_distribution_clustering( - clustering_n: usize, - node_n: usize, - dimension: usize, - range: f64, - ) -> ( - Vec>, // center of cluster - Vec>, // cluster data - ) { - let mut bases: Vec> = Vec::new(); - let mut ns: Vec> = Vec::new(); - for _i in 0..clustering_n { - let mut rng = rand::thread_rng(); - let mut base: Vec = Vec::with_capacity(dimension); - for _i in 0..dimension { - let n: f64 = rng.gen::() * range; // base number - base.push(n); - } - - let v_iter: Vec = rng - .sample_iter(&Standard) - 
.take(dimension * node_n) - .collect::>() - .clone(); - for _i in 0..node_n { - let mut vec_item = Vec::with_capacity(dimension); - for i in 0..dimension { - let vv = v_iter[_i * dimension..(_i + 1) * dimension][i] + base[i]; // add normal distribution noise - vec_item.push(vv); - } - ns.push(vec_item); - } - bases.push(base); - } - - (bases, ns) - } - - #[test] - fn knn_nn_descent() { - let dimension = 2; - let nodes_every_cluster = 10; - let node_n = 1000; - let (_, ns) = - make_normal_distribution_clustering(node_n, nodes_every_cluster, dimension, 10000000.0); - println!("hello world {:?}", ns.len()); - - let mut data = Vec::new(); - for i in 0..ns.len() { - data.push(Box::new(node::Node::new_with_idx(&ns[i], i))); - } - - let mut graph: Vec>> = vec![Vec::new(); data.len()]; - let base_start = SystemTime::now(); - naive_build_knn_graph::(&data, metrics::Metric::Euclidean, 100, &mut graph); - let base_since_the_epoch = SystemTime::now() - .duration_since(base_start) - .expect("Time went backwards"); - println!( - "test for {:?} times, base use {:?} millisecond", - ns.len(), - base_since_the_epoch.as_millis() - ); - - let base_start = SystemTime::now(); - let mut nn_descent_handler = - NNDescentHandler::new(&data, metrics::Metric::Euclidean, 100, 0.2); - nn_descent_handler.init(); - - let try_times = 8; - let mut ground_truth: HashMap> = HashMap::new(); - for i in 0..graph.len() { - ground_truth.insert(i, HashSet::from_iter(graph[i].iter().map(|x| x.idx()))); - } - // let guard = pprof::ProfilerGuard::new(100).unwrap(); - for _p in 0..try_times { - let cc = nn_descent_handler.iterate(); - let mut error = 0; - for i in 0..nn_descent_handler.graph.len() { - let nn_descent_handler_val: Vec> = nn_descent_handler.graph[i] - .lock() - .unwrap() - .iter() - .cloned() - .collect(); - for j in 0..nn_descent_handler_val.len() { - if !ground_truth[&i].contains(&nn_descent_handler_val[j].idx()) { - error += 1; - } - } - } - println!( - "error {} /{:?} cc {:?} cost {:?} 
update_cnt {:?}", - error, - data.len() * 10, - cc, - nn_descent_handler.cost(), - nn_descent_handler.ths_update_cnt(), - ); - } - // if let Ok(report) = guard.report().build() { - // let file = File::create("flamegraph.svg").unwrap(); - // report.flamegraph(file).unwrap(); - // }; - - let base_since_the_epoch = SystemTime::now() - .duration_since(base_start) - .expect("Time went backwards"); - println!( - "test for {:?} times, base use {:?} millisecond", - ns.len(), - base_since_the_epoch.as_millis() - ); - } -} +// #[cfg(test)] +// mod tests { +// use super::*; + +// use crate::core::node; +// use rand::distributions::{Distribution, Standard}; +// use rand::Rng; +// use std::collections::HashMap; +// use std::collections::HashSet; + +// use std::iter::FromIterator; +// use std::time::SystemTime; +// fn make_normal_distribution_clustering( +// clustering_n: usize, +// node_n: usize, +// dimension: usize, +// range: f64, +// ) -> ( +// Vec>, // center of cluster +// Vec>, // cluster data +// ) { +// let mut bases: Vec> = Vec::new(); +// let mut ns: Vec> = Vec::new(); +// for _i in 0..clustering_n { +// let mut rng = rand::thread_rng(); +// let mut base: Vec = Vec::with_capacity(dimension); +// for _i in 0..dimension { +// let n: f64 = rng.gen::() * range; // base number +// base.push(n); +// } + +// let v_iter: Vec = rng +// .sample_iter(&Standard) +// .take(dimension * node_n) +// .collect::>() +// .clone(); +// for _i in 0..node_n { +// let mut vec_item = Vec::with_capacity(dimension); +// for i in 0..dimension { +// let vv = v_iter[_i * dimension..(_i + 1) * dimension][i] + base[i]; // add normal distribution noise +// vec_item.push(vv); +// } +// ns.push(vec_item); +// } +// bases.push(base); +// } + +// (bases, ns) +// } + +// #[test] +// fn knn_nn_descent() { +// let dimension = 2; +// let nodes_every_cluster = 10; +// let node_n = 1000; +// let (_, ns) = +// make_normal_distribution_clustering(node_n, nodes_every_cluster, dimension, 10000000.0); + +// let 
mut data = Vec::new(); +// for i in 0..ns.len() { +// data.push(Box::new(node::Node::new_with_idx(&ns[i], i))); +// } + +// let mut graph: Vec>> = vec![Vec::new(); data.len()]; +// let base_start = SystemTime::now(); +// naive_build_knn_graph::(&data, metrics::Metric::Euclidean, 100, &mut graph); +// let base_since_the_epoch = SystemTime::now() +// .duration_since(base_start) +// .expect("Time went backwards"); +// println!( +// "test for {:?} times, base use {:?} millisecond", +// ns.len(), +// base_since_the_epoch.as_millis() +// ); + +// let base_start = SystemTime::now(); +// let mut nn_descent_handler = +// NNDescentHandler::new(&data, metrics::Metric::Euclidean, 100, 0.2); +// nn_descent_handler.init(); + +// let try_times = 8; +// let mut ground_truth: HashMap> = HashMap::new(); +// for i in 0..graph.len() { +// ground_truth.insert(i, HashSet::from_iter(graph[i].iter().map(|x| x.idx()))); +// } +// // let guard = pprof::ProfilerGuard::new(100).unwrap(); +// for _p in 0..try_times { +// let cc = nn_descent_handler.iterate(); +// let mut error = 0; +// for i in 0..nn_descent_handler.graph.len() { +// let nn_descent_handler_val: Vec> = nn_descent_handler.graph[i] +// .lock() +// .unwrap() +// .iter() +// .cloned() +// .collect(); +// for j in 0..nn_descent_handler_val.len() { +// if !ground_truth[&i].contains(&nn_descent_handler_val[j].idx()) { +// error += 1; +// } +// } +// } +// println!( +// "error {} /{:?} cc {:?} cost {:?} update_cnt {:?}", +// error, +// data.len() * 10, +// cc, +// nn_descent_handler.cost(), +// nn_descent_handler.ths_update_cnt(), +// ); +// } +// // if let Ok(report) = guard.report().build() { +// // let file = File::create("flamegraph.svg").unwrap(); +// // report.flamegraph(file).unwrap(); +// // }; + +// let base_since_the_epoch = SystemTime::now() +// .duration_since(base_start) +// .expect("Time went backwards"); +// println!( +// "test for {:?} times, base use {:?} millisecond", +// ns.len(), +// base_since_the_epoch.as_millis() 
+// ); +// } +// } diff --git a/src/core/metrics.rs b/src/core/metrics.rs index bff2577..0941b2b 100644 --- a/src/core/metrics.rs +++ b/src/core/metrics.rs @@ -20,7 +20,6 @@ impl Default for Metric { } } -// TODO: SIMD support // TODO: make these func private pub fn metric(vec1: &[T], vec2: &[T], mt: Metric) -> Result where @@ -50,7 +49,7 @@ where metric(&vec1[begin..end], &vec2[begin..end], mt) } -pub fn dot_product(vec1: &[T], vec2: &[T]) -> Result +fn dot_product(vec1: &[T], vec2: &[T]) -> Result where T: FloatElement, { @@ -62,21 +61,21 @@ where } } -pub fn manhattan_distance(vec1: &[T], vec2: &[T]) -> Result +fn manhattan_distance(vec1: &[T], vec2: &[T]) -> Result where T: FloatElement, { T::manhattan_distance(vec1, vec2) } -pub fn euclidean_distance(vec1: &[T], vec2: &[T]) -> Result +fn euclidean_distance(vec1: &[T], vec2: &[T]) -> Result where T: FloatElement, { T::euclidean_distance(vec1, vec2) } -pub fn cosine_similarity(vec1: &[T], vec2: &[T]) -> Result +fn cosine_similarity(vec1: &[T], vec2: &[T]) -> Result where T: FloatElement, { @@ -91,7 +90,7 @@ where // (a/|a| - b/|b|)^2 // = a^2 / a^2 + b^2 / b^2 - 2ab/|a||b| // = 2 - 2cos -pub fn angular_distance(vec1: &[T], vec2: &[T]) -> Result +fn angular_distance(vec1: &[T], vec2: &[T]) -> Result where T: FloatElement, { diff --git a/src/core/node.rs b/src/core/node.rs index 9e9f443..02bc256 100644 --- a/src/core/node.rs +++ b/src/core/node.rs @@ -206,9 +206,9 @@ impl core::fmt::Display for Node { #[test] fn node_test() { // f64 - let v = vec![0.1, 0.2]; - let v2 = vec![0.2, 0.1]; + let v = vec![1.0, 1.0]; + let v2 = vec![2.0, 2.0]; let n = Node::::new(&v); let n2 = Node::::new(&v2); - n.metric(&n2, metrics::Metric::Manhattan).unwrap(); + assert_eq!(n.metric(&n2, metrics::Metric::Manhattan).unwrap(), 2.0); } diff --git a/src/core/simd_metrics.rs b/src/core/simd_metrics.rs index 00ee57e..ea7ea7c 100644 --- a/src/core/simd_metrics.rs +++ b/src/core/simd_metrics.rs @@ -1,6 +1,6 @@ use 
crate::core::calc::same_dimension; #[cfg(feature = "simd")] -use packed_simd::{f32x16, f32x4, f32x8, f64x4, f64x8}; +use packed_simd::{f32x16, f64x8}; pub trait SIMDOptmized { fn dot_product(a: &[T], b: &[T]) -> Result; diff --git a/src/index/rpt_idx.rs b/src/index/bpt_idx.rs similarity index 99% rename from src/index/rpt_idx.rs rename to src/index/bpt_idx.rs index dbc3845..5318d66 100644 --- a/src/index/rpt_idx.rs +++ b/src/index/bpt_idx.rs @@ -5,7 +5,7 @@ use crate::core::metrics; use crate::core::neighbor; use crate::core::node; use crate::core::random; -use crate::index::rpt_params::BPTParams; +use crate::index::bpt_params::BPTParams; use serde::de::DeserializeOwned; use serde::{Deserialize, Serialize}; use std::cmp::Ordering; @@ -185,7 +185,7 @@ pub struct BPTIndex { } impl BPTIndex { - pub fn new(dimension: usize, params: BPTParams) -> BPTIndex { + pub fn new(dimension: usize, params: &BPTParams) -> BPTIndex { BPTIndex { _built: false, _dimension: dimension, diff --git a/src/index/rpt_params.rs b/src/index/bpt_params.rs similarity index 100% rename from src/index/rpt_params.rs rename to src/index/bpt_params.rs diff --git a/src/index/mod.rs b/src/index/mod.rs index 8cfe04b..196d094 100644 --- a/src/index/mod.rs +++ b/src/index/mod.rs @@ -1,10 +1,10 @@ +// pub mod bpt_idx; +// pub mod bpt_params; pub mod bruteforce_idx; pub mod bruteforce_params; pub mod hnsw_idx; pub mod hnsw_params; pub mod pq_idx; pub mod pq_params; -pub mod rpt_idx; -pub mod rpt_params; pub mod ssg_idx; pub mod ssg_params; diff --git a/src/index/ssg_params.rs b/src/index/ssg_params.rs index 51f3aaa..f270ece 100644 --- a/src/index/ssg_params.rs +++ b/src/index/ssg_params.rs @@ -43,11 +43,11 @@ impl SSGParams { impl Default for SSGParams { fn default() -> Self { SSGParams { - angle: E::from_f32(30.0).unwrap(), + angle: E::from_f32(60.0).unwrap(), init_k: 100, index_size: 100, neighbor_neighbor_size: 100, - root_size: 30, + root_size: 100, } } } diff --git a/src/lib.rs b/src/lib.rs index 
da01de0..5d693c8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,2 +1,128 @@ pub mod core; pub mod index; + +#[cfg(test)] +mod tests { + use super::*; + + use crate::core::ann_index::ANNIndex; + use rand::distributions::Standard; + use rand::Rng; + use std::collections::HashSet; + + use std::sync::Arc; + use std::sync::Mutex; + fn make_normal_distribution_clustering( + clustering_n: usize, + node_n: usize, + dimension: usize, + range: f64, + ) -> ( + Vec>, // center of cluster + Vec>, // cluster data + ) { + let mut bases: Vec> = Vec::new(); + let mut ns: Vec> = Vec::new(); + for _i in 0..clustering_n { + let mut rng = rand::thread_rng(); + let mut base: Vec = Vec::with_capacity(dimension); + for _i in 0..dimension { + let n: f64 = rng.gen::() * range; // base number + base.push(n); + } + + let v_iter: Vec = rng + .sample_iter(&Standard) + .take(dimension * node_n) + .collect::>() + .clone(); + for _i in 0..node_n { + let mut vec_item = Vec::with_capacity(dimension); + for i in 0..dimension { + let vv = v_iter[_i * dimension..(_i + 1) * dimension][i] + base[i]; // add normal distribution noise + vec_item.push(vv); + } + ns.push(vec_item); + } + bases.push(base); + } + + (bases, ns) + } + + #[test] + fn test_all_index() { + let dimension = 10; + let nodes_every_cluster = 3; + let node_n = 5000; + + let (_, ns) = + make_normal_distribution_clustering(node_n, nodes_every_cluster, dimension, 100.0); + let mut bf_idx = Box::new(index::bruteforce_idx::BruteForceIndex::::new( + dimension, + &index::bruteforce_params::BruteForceParams::default(), + )); + // let bpt_idx = Box::new( + // index::bpt_idx::BPTIndex::::new(dimension, &index::bpt_params::BPTParams::default()), + // ); + let hnsw_idx = Box::new(index::hnsw_idx::HNSWIndex::::new( + dimension, + &index::hnsw_params::HNSWParams::::default(), + )); + + let pq_idx = Box::new(index::pq_idx::PQIndex::::new( + dimension, + &index::pq_params::PQParams::::default(), + )); + let ssg_idx = 
Box::new(index::ssg_idx::SSGIndex::::new( + dimension, + &index::ssg_params::SSGParams::default(), + )); + + let mut indices: Vec>> = + vec![pq_idx, ssg_idx, hnsw_idx]; + let accuracy = Arc::new(Mutex::new(Vec::new())); + for i in 0..indices.len() { + make_idx_baseline(ns.clone(), &mut indices[i]); + accuracy.lock().unwrap().push(0.); + } + make_idx_baseline(ns.clone(), &mut bf_idx); + let test_time = 10; + for _i in 0..test_time { + let mut rng = rand::thread_rng(); + + let target: usize = rng.gen_range(0..ns.len()); + let w = ns.get(target).unwrap(); + + let base_set: HashSet = bf_idx + .search_full(&w, 100) + .iter() + .map(|(n, _dist)| n.idx().unwrap()) + .collect(); + + for j in 0..indices.len() { + accuracy.lock().unwrap()[j] = 0.0; + let result = indices[j].search_full(&w, 100); + for (n, _dist) in result.iter() { + if base_set.contains(&n.idx().unwrap()) { + accuracy.lock().unwrap()[j] += 1.0; + } + } + } + } + } + + fn make_idx_baseline< + E: core::node::FloatElement, + T: core::ann_index::ANNIndex + ?Sized, + >( + embs: Vec>, + idx: &mut Box, + ) { + for i in 0..embs.len() { + idx.add_node(&core::node::Node::::new_with_idx(&embs[i], i)) + .unwrap(); + } + idx.build(core::metrics::Metric::Euclidean).unwrap(); + } +} From 7cae5c636e461d1a492f416282105fe2a76c7b79 Mon Sep 17 00:00:00 2001 From: salamer Date: Fri, 2 Jul 2021 00:05:11 +0800 Subject: [PATCH 22/22] add: add no_std branch --- Cargo.toml | 6 +++++- src/core/ann_index.rs | 5 +++++ src/core/heap.rs | 2 ++ src/core/kmeans.rs | 9 ++++++++- src/core/knn.rs | 9 ++++++--- src/core/mod.rs | 1 + src/core/node.rs | 7 +++++-- src/index/bpt_idx.rs | 11 ++++++++--- src/index/bruteforce_idx.rs | 11 ++++++++--- src/index/hnsw_idx.rs | 11 ++++++++--- src/index/pq_idx.rs | 11 ++++++++--- src/index/ssg_idx.rs | 14 ++++++++++---- src/lib.rs | 3 ++- 13 files changed, 76 insertions(+), 24 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 72bb432..9d12b97 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -43,7 +43,7 @@ 
bincode = "1.3.2" fixedbitset = "0.4.0" hashbrown = {version = "0.11.2", optional = true} log = "^0.4" -num = "0.4.0" +# num = "0.4.0" packed_simd = {version = "0.3.5", package = "packed_simd_2", optional = true} rand = "0.8.3" rayon = "^1.5" @@ -53,6 +53,10 @@ smallvec = {version = "1.6.1", features = ["serde"], optional = true} [dev-dependencies] criterion = "0.3.4" +[dependencies.num] +version = "0.4" +default-features = false + [[bench]] harness = false name = "bench_metrics" diff --git a/src/core/ann_index.rs b/src/core/ann_index.rs index f5c2f7d..c926367 100644 --- a/src/core/ann_index.rs +++ b/src/core/ann_index.rs @@ -1,5 +1,10 @@ use crate::core::metrics; use crate::core::node; +extern crate alloc; +use alloc::vec::Vec; +use alloc::boxed::Box; +use core::cmp::Reverse; +use crate::core::heap::BinaryHeap; use serde::de::DeserializeOwned; diff --git a/src/core/heap.rs b/src/core/heap.rs index 0936aa5..f0cf23c 100644 --- a/src/core/heap.rs +++ b/src/core/heap.rs @@ -2,6 +2,8 @@ use core::mem::{swap, ManuallyDrop}; use core::ptr; +extern crate alloc; +use alloc::vec::Vec; pub struct BinaryHeap { data: Vec, diff --git a/src/core/kmeans.rs b/src/core/kmeans.rs index 73f4a19..af288d4 100644 --- a/src/core/kmeans.rs +++ b/src/core/kmeans.rs @@ -4,7 +4,14 @@ use crate::core::node; use metrics::metric; use rand::prelude::*; use rayon::prelude::*; -use std::sync::Mutex; +use core::sync::Mutex; + +extern crate alloc; +use alloc::vec::Vec; +use alloc::vec; +use alloc::boxed::Box; +use core::cmp::Reverse; +use crate::core::heap::BinaryHeap; #[derive(Default, Debug)] pub struct Kmeans { diff --git a/src/core/knn.rs b/src/core/knn.rs index c87003a..4706057 100644 --- a/src/core/knn.rs +++ b/src/core/knn.rs @@ -6,10 +6,13 @@ use fixedbitset::FixedBitSet; use rand::seq::SliceRandom; use rand::Rng; use rayon::prelude::*; -use std::collections::BinaryHeap; -use std::sync::mpsc; -use std::sync::{Arc, Mutex}; +use core::sync::mpsc; + +use core::sync::{Arc, Mutex}; +extern crate 
alloc; +use alloc::vec::Vec; +use alloc::boxed::Box; pub fn naive_build_knn_graph( nodes: &[Box>], diff --git a/src/core/mod.rs b/src/core/mod.rs index 169b4a4..5ea8dd9 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -7,3 +7,4 @@ pub mod neighbor; pub mod node; pub mod random; pub mod simd_metrics; +pub mod heap; diff --git a/src/core/node.rs b/src/core/node.rs index 02bc256..225f313 100644 --- a/src/core/node.rs +++ b/src/core/node.rs @@ -7,6 +7,9 @@ use core::hash::Hash; use core::iter::Sum; use num::traits::{FromPrimitive, NumAssign}; use serde::{Deserialize, Serialize}; +extern crate alloc; +use alloc::vec::Vec; +use alloc::boxed::Box; #[cfg(feature = "use_smallvec")] use smallvec; @@ -27,8 +30,8 @@ pub trait FloatElement: + PartialEq + PartialOrd + NumAssign - + num::Signed - + num::Float + + num::r#trait::Signed + + num::r#trait::Float + Sync + Send + Sum diff --git a/src/index/bpt_idx.rs b/src/index/bpt_idx.rs index 5318d66..805807c 100644 --- a/src/index/bpt_idx.rs +++ b/src/index/bpt_idx.rs @@ -9,10 +9,15 @@ use crate::index::bpt_params::BPTParams; use serde::de::DeserializeOwned; use serde::{Deserialize, Serialize}; use std::cmp::Ordering; -use std::collections::BinaryHeap; -use std::fs::File; -use std::io::Write; +use core::fs::File; + +use core::io::Write; + +extern crate alloc; +use alloc::vec::Vec; +use alloc::boxed::Box; +use crate::core::heap::BinaryHeap; // TODO: leaf as a trait with getter setter function #[derive(Default, Clone, Debug, Serialize, Deserialize)] diff --git a/src/index/bruteforce_idx.rs b/src/index/bruteforce_idx.rs index 795460d..c6a0de8 100644 --- a/src/index/bruteforce_idx.rs +++ b/src/index/bruteforce_idx.rs @@ -6,11 +6,16 @@ use crate::core::node; use crate::index::bruteforce_params::BruteForceParams; use serde::de::DeserializeOwned; use serde::{Deserialize, Serialize}; -use std::collections::BinaryHeap; -use std::fs::File; -use std::io::Write; +use core::fs::File; + +use core::io::Write; + +extern crate alloc; +use 
alloc::vec::Vec; +use alloc::boxed::Box; +use crate::core::heap::BinaryHeap; #[derive(Debug, Serialize, Deserialize)] pub struct BruteForceIndex { diff --git a/src/index/hnsw_idx.rs b/src/index/hnsw_idx.rs index 50161a9..3eb65ae 100644 --- a/src/index/hnsw_idx.rs +++ b/src/index/hnsw_idx.rs @@ -13,17 +13,22 @@ use rand::prelude::*; use rayon::{iter::IntoParallelIterator, prelude::*}; use serde::de::DeserializeOwned; use serde::{Deserialize, Serialize}; -use std::collections::BinaryHeap; + #[cfg(not(feature = "without_std"))] use std::collections::HashMap; #[cfg(not(feature = "without_std"))] use std::collections::HashSet; -use std::fs::File; -use std::io::Write; +use core::fs::File; +use core::io::Write; use std::sync::RwLock; +extern crate alloc; +use alloc::vec::Vec; +use alloc::boxed::Box; +use crate::core::heap::BinaryHeap; + #[derive(Default, Debug, Serialize, Deserialize)] pub struct HNSWIndex { _dimension: usize, // dimension diff --git a/src/index/pq_idx.rs b/src/index/pq_idx.rs index 94cc43b..58b72b4 100644 --- a/src/index/pq_idx.rs +++ b/src/index/pq_idx.rs @@ -8,13 +8,18 @@ use crate::index::pq_params::IVFPQParams; use crate::index::pq_params::PQParams; use rayon::prelude::*; use serde::de::DeserializeOwned; -use std::collections::BinaryHeap; + use serde::{Deserialize, Serialize}; -use std::fs::File; +use core::fs::File; + +use core::io::Write; -use std::io::Write; +extern crate alloc; +use alloc::vec::Vec; +use alloc::boxed::Box; +use crate::core::heap::BinaryHeap; #[derive(Default, Debug, Serialize, Deserialize)] pub struct PQIndex { diff --git a/src/index/ssg_idx.rs b/src/index/ssg_idx.rs index 92a068f..dbdf649 100644 --- a/src/index/ssg_idx.rs +++ b/src/index/ssg_idx.rs @@ -13,15 +13,21 @@ use rayon::prelude::*; use serde::de::DeserializeOwned; use serde::{Deserialize, Serialize}; use std::cmp::Reverse; -use std::collections::BinaryHeap; + #[cfg(not(feature = "without_std"))] use std::collections::HashSet; use std::collections::LinkedList; use 
std::collections::VecDeque; -use std::fs::File; -use std::io::Write; -use std::sync::{Arc, Mutex}; +use core::fs::File; +use core::io::Write; +use core::sync::{Arc, Mutex}; + +extern crate alloc; +use alloc::vec::Vec; +use alloc::boxed::Box; +use core::cmp::Reverse; +use crate::core::heap::BinaryHeap; #[derive(Debug, Serialize, Deserialize)] pub struct SSGIndex { diff --git a/src/lib.rs b/src/lib.rs index 5d693c8..e13bb13 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,4 @@ +#![no_std] pub mod core; pub mod index; @@ -11,7 +12,7 @@ mod tests { use std::collections::HashSet; use std::sync::Arc; - use std::sync::Mutex; + use core::sync::Mutex; fn make_normal_distribution_clustering( clustering_n: usize, node_n: usize,