Document how to use the library

odespesse · Nov 27, 2020 · 088b91e · 088b91e
1 parent dbb7dd7
commit 088b91e
Show file tree

Hide file tree

Showing 5 changed files with 147 additions and 1 deletion.
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,8 +1,12 @@
 [package]
 name = "index-bloom"
+description = "A lightweight search index based solely on bloom filters."
 version = "0.3.0"
 authors = ["Olivier Despesse <[email protected]>"]
 edition = "2018"
+repository="https://github.com/odespesse/index-bloom"
+readme="README.md"
+keywords=["bloom-filter", "search-engine", "library"]
 
 [dependencies]
 blake2 = "0.9.0"

diff --git a/README.md b/README.md
@@ -1 +1,44 @@
 # index-bloom
+
+A fast and lightweight full-text search engine that aims to provide basic search functionality.
+
+`index-bloom` is an alternative to a heavy full-text search engine when you need to retrieve only ids instead of documents.
+It is best used in a scenario where the memory footprint is a strong constraint and a low percentage of false positives is acceptable.
+At its core `index-bloom` uses Bloom filters to store a reduced representation of the document while minimizing search time.
+Such a filter ensures that a negative response (absence of the document) is certain, while a positive response (presence of the document) is accurate according to a customizable probability.
+
+## Example
+An example of how to create a new index, ingest content and search for keywords:
+
+```rust
+    use index_bloom::Index;
+    use index_bloom::Error;
+
+    fn main() -> Result<(), Error>{
+        // Create a new index
+        let mut index = Index::new(0.00001);
+        // Index contents
+        let first_content = "A very very long content...";
+        index.ingest("foo".to_string(), first_content)?;
+        let second_content = "Another content !";
+        index.ingest("bar".to_string(), second_content)?;
+        // Search for various words
+        let hits = index.search("content")?;
+        println!("{:?}", hits.unwrap()); // ["bar", "foo"]
+        let hits = index.search("very")?;
+        println!("{:?}", hits.unwrap()); // ["foo"]
+        let hits = index.search("unknown")?;
+        println!("{:?}", hits); // None
+        Ok(())
+    }
+```
+
+## License
+
+`index-bloom` is released under the MIT license ([LICENSE](https://github.com/odespesse/index-bloom/blob/master/LICENSE)).
+
+## Resources
+
+- Inspired by the article [Writing a full-text search engine using Bloom filters](https://www.stavros.io/posts/bloom-filter-search-engine/)
+- Bloom filters originally published in this paper [Burton H. Bloom, Space/Time Trade-offs in Hash Coding with Allowable Errors](https://dl.acm.org/doi/10.1145/362686.362692)
+- If you want to evaluate a filter's theoretical properties you can use the online tool [Bloom Filter Calculator](https://hur.st/bloomfilter/)
diff --git a/src/index.rs b/src/index.rs
@@ -5,25 +5,72 @@ use crate::bloom_filter::BloomFilter;
 use crate::tokens::Tokens;
 use crate::errors::Error;
 
+/// A full-text search index.
 #[derive(Serialize, Deserialize)]
 pub struct Index {
     // Probability of a false positive when searching, as supplied to `Index::new`.
     error_rate: f32,
     // Bloom filters stored by name; `search` iterates this map as `(name, filter)` pairs.
     bloom_filters: HashMap<String, BloomFilter>
 }
 
 impl Index {
+    /// Constructs a new, empty `Index` with the specified error_rate.
+    ///
+    /// The `error_rate` is the probability of a false positive when searching for keywords.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// # use index_bloom::Index;
+    /// let mut index = Index::new(0.00001);
+    /// ```
     pub fn new(error_rate: f32) -> Self {
         Index {
             error_rate,
             bloom_filters: HashMap::new()
         }
     }
 
+    /// Restore an `Index` from a previous dump.
+    ///
+    /// A dump is an `Index` serialized in JSON format.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the content is not a valid `Index` representation.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// # use index_bloom::Index;
+    /// let index_dump = "{\"error_rate\":0.1,\"bloom_filters\":{\"file1.txt\":{\"key_size\":4,\"bitfield\":[8,130,65,18,131,164],\"bitfield_size\":48}}}";
+    /// let index = Index::restore(&index_dump);
+    /// ```
     pub fn restore(content: &str) -> Self {
         // `content` is already a `&str`, so it is handed to serde_json without an extra borrow.
         // Panics with the message below when `content` cannot be deserialized into an `Index`.
         serde_json::from_str(content).expect("Unable to parse dump file")
     }
 
+    /// Ingest a new document.
+    ///
+    /// Inserts each word of `content` into the index and identifies them under the given `name`.
+    /// Ingesting the same `name` twice replaces its previous content in the `Index`.
+    ///
+    /// # Errors
+    ///
+    /// If a word in the content cannot be hashed then an error is returned.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// # use index_bloom::Index;
+    /// # use index_bloom::Error;
+    /// # fn search_index() -> Result<(), Error> {
+    /// let mut index = Index::new(0.00001);
+    /// let first_content = "A very very long content...";
+    /// index.ingest("foo".to_string(), first_content)?;
+    /// # Ok(())
+    /// # }
+    /// ```
     pub fn ingest(&mut self, name: String, content: &str) -> Result<(), Error> {
         let tokens_agg = self.aggregate_tokens(content);
         let capacity = tokens_agg.len();
@@ -35,6 +82,34 @@ impl Index {
         Ok(())
     }
 
+    /// Searches for keywords in every document.
+    ///
+    /// Splits `keywords` and searches for each word in all documents with a boolean AND.
+    /// The result may contain false positives (documents not containing all the keywords) according to an error rate set at the creation of the `Index` (see [`Index::new`]).
+    ///
+    /// # Errors
+    ///
+    /// If a word in the keywords cannot be hashed then an error is returned.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// # use index_bloom::Index;
+    /// # use index_bloom::Error;
+    /// # fn search_index() -> Result<(), Error> {
+    /// # let mut index = Index::new(0.00001);
+    /// let hits = index.search("content")?;
+    /// match hits {
+    ///      Some(documents) => {
+    ///          for doc in documents {
+    ///             println!("Found at {}", doc);
+    ///          }
+    ///      },
+    ///      None => println!("Not found")
+    /// }
+    /// # Ok(())
+    /// # }
+    /// ```
     pub fn search(&self, keywords: &str) -> Result<Option<Vec<&String>>, Error> {
         let mut result :Vec<&String> = Vec::new();
         for (name, filter) in &self.bloom_filters {

diff --git a/src/lib.rs b/src/lib.rs
@@ -1,3 +1,27 @@
+//! # index-bloom
+//!
+//! The `index-bloom` crate provides a lightweight full-text search index focused on speed and space efficiency.
+//!
+//! It is able to ingest UTF-8 text content and search for matching words at the expense of a customizable probability of false positives (documents not containing all the keywords).
+//! By its very nature, the original words are lost in the ingestion process. Therefore, it is not possible to estimate a relevance score for each document based on the index content.
+//!
+//! _Note_: When the same `name` is used to identify ingested content, the last one replaces the previous one in the `Index`.
+//!
+//! # Quick start
+//!
+//! ```rust
+//! use index_bloom::Index;
+//! #   use index_bloom::Error;
+//!
+//! # fn main() -> Result<(), Error> {
+//! let mut index = Index::new(0.00001);
+//! index.ingest("foo".to_string(), "A very very long content...")?;
+//! let hits = index.search("content")?;
+//! println!("{:?}", hits.unwrap());
+//! # Ok(())
+//! # }
+//! ```
+
 mod index;
 pub use crate::index::Index;
 mod errors;

diff --git a/test/data/test_restore.json b/test/data/test_restore.json
@@ -1 +1 @@
-{"capacity":10,"error_rate":0.1,"bloom_filters":{"file1.txt":{"key_size":4,"bitfield":[8,130,65,18,131,164],"bitfield_size":48}}}
+{"error_rate":0.1,"bloom_filters":{"file1.txt":{"key_size":4,"bitfield":[8,130,65,18,131,164],"bitfield_size":48}}}
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		{"capacity":10,"error_rate":0.1,"bloom_filters":{"file1.txt":{"key_size":4,"bitfield":[8,130,65,18,131,164],"bitfield_size":48}}}
		{"error_rate":0.1,"bloom_filters":{"file1.txt":{"key_size":4,"bitfield":[8,130,65,18,131,164],"bitfield_size":48}}}