From 587381d424dfb1f5d6c5feaf00361474b43bdf8d Mon Sep 17 00:00:00 2001 From: Liam Bigelow <40188355+bglw@users.noreply.github.com> Date: Thu, 26 May 2022 20:54:16 +1200 Subject: [PATCH] Initial support for filtering --- .github/workflows/release.yml | 9 ++ .github/workflows/test.yml | 4 + pagefind/features/base.feature | 1 + pagefind/features/build_options.feature | 2 + pagefind/features/exact_phrase.feature | 3 + pagefind/features/exclusions.feature | 2 + pagefind/features/filtering.feature | 76 ++++++++++- pagefind/features/fragments.feature | 22 +++- pagefind/features/partial_matching.feature | 1 + pagefind/features/scoring.feature | 3 + pagefind/features/stemming.feature | 2 + pagefind/src/fossick/mod.rs | 33 +++-- pagefind/src/fossick/parser.rs | 141 +++++++++++++++++---- pagefind/src/fragments/mod.rs | 4 +- pagefind/src/index/index_filter.rs | 21 +++ pagefind/src/index/index_metadata.rs | 12 +- pagefind/src/index/mod.rs | 93 +++++++++++--- pagefind/src/output/mod.rs | 8 ++ pagefind/src/output/stubs/search.js | 45 ++++++- pagefind_web/src/excerpt.rs | 2 +- pagefind_web/src/filter.rs | 56 ++++++++ pagefind_web/src/filter_index.rs | 58 +++++++++ pagefind_web/src/lib.rs | 81 ++++++++++-- pagefind_web/src/metadata.rs | 24 ++-- pagefind_web/src/search.rs | 21 +-- 25 files changed, 627 insertions(+), 97 deletions(-) create mode 100644 pagefind/src/index/index_filter.rs create mode 100644 pagefind_web/src/filter.rs create mode 100644 pagefind_web/src/filter_index.rs diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index ce3797ae..fae3b646 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -185,9 +185,18 @@ jobs: ls -lh ./vendor/ cargo package --allow-dirty + - name: Test Web + working-directory: ./pagefind_web + run: cargo test + + - name: Test Lib + working-directory: ./pagefind + run: cargo test --lib + - name: Test CLI working-directory: ./pagefind run: TEST_BINARY=../target/release/pagefind cargo test --release --test cucumber -- -c 16 --tags "not @skip" + - name: Build working-directory: ./pagefind run: RELEASE_VERSION=${GITHUB_REF#refs/tags/} cargo build --release --target ${{ matrix.target }} diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f19a123a..db5c02ac 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -68,6 +68,10 @@ jobs: working-directory: ./pagefind_web run: cargo test + - name: Test Lib + working-directory: ./pagefind + run: cargo test --lib + - name: Test CLI working-directory: ./pagefind run: TEST_BINARY=../target/release/pagefind cargo test --release --test cucumber -- -c 16 --tags "not @skip" diff --git a/pagefind/features/base.feature b/pagefind/features/base.feature index 5402f074..a795c48e 100644 --- a/pagefind/features/base.feature +++ b/pagefind/features/base.feature @@ -29,4 +29,5 @@ Feature: Base Tests document.querySelector('[data-url]').innerText = data.url; } """ + Then There should be no logs Then The selector "[data-url]" should contain "/cat/" diff --git a/pagefind/features/build_options.feature b/pagefind/features/build_options.feature index cee483f2..fc25e48f 100644 --- a/pagefind/features/build_options.feature +++ b/pagefind/features/build_options.feature @@ -38,6 +38,7 @@ Feature: Build Options document.querySelector('[data-url]').innerText = data.url; } """ + Then There should be no logs Then The selector "[data-url]" should contain "/cat/" Scenario: Output path can be configured @@ -69,6 +70,7 @@ Feature: Build Options 
document.querySelector('[data-url]').innerText = data.url; } """ + Then There should be no logs Then The selector "[data-url]" should contain "/cat/" @skip diff --git a/pagefind/features/exact_phrase.feature b/pagefind/features/exact_phrase.feature index cd4ac4c1..3190cbe8 100644 --- a/pagefind/features/exact_phrase.feature +++ b/pagefind/features/exact_phrase.feature @@ -36,6 +36,7 @@ Feature: Exact Phrase Matching document.querySelector('[data-result]').innerText = data.url; } """ + Then There should be no logs Then The selector "[data-count]" should contain "1 result(s)" Then The selector "[data-result]" should contain "/cat/" @@ -69,6 +70,7 @@ Feature: Exact Phrase Matching document.querySelector('[data-result]').innerText = data.url; } """ + Then There should be no logs Then The selector "[data-count]" should contain "1 result(s)" Then The selector "[data-result]" should contain "/cattwo/" @@ -102,6 +104,7 @@ Feature: Exact Phrase Matching document.querySelector('[data-result]').innerText = data.map(d => d.url).join(', '); } """ + Then There should be no logs Then The selector "[data-count]" should contain "2 result(s)" Then The selector "[data-result]" should contain "/cat/, /dog/" diff --git a/pagefind/features/exclusions.feature b/pagefind/features/exclusions.feature index 161b3d00..3a7e5b85 100644 --- a/pagefind/features/exclusions.feature +++ b/pagefind/features/exclusions.feature @@ -34,6 +34,7 @@ Feature: Exclusions document.querySelector('[data-search-two]').innerText = `${searchtwo.length} result(s)`; } """ + Then There should be no logs Then The selector "[data-search-one]" should contain "Hello World, from Pagefind. Huzzah!" Then The selector "[data-search-two]" should contain "0 result(s)" @@ -75,5 +76,6 @@ Feature: Exclusions document.querySelector('[data-search-two]').innerText = `${searchtwo.length} result(s)`; } """ + Then There should be no logs Then The selector "[data-search-one]" should contain "Hello World, from Pagefind. Hooray!" 
Then The selector "[data-search-two]" should contain "0 result(s)" diff --git a/pagefind/features/filtering.feature b/pagefind/features/filtering.feature index eebcb65d..26436a78 100644 --- a/pagefind/features/filtering.feature +++ b/pagefind/features/filtering.feature @@ -1,4 +1,3 @@ -@skip Feature: Filtering Background: Given I have a "public/index.html" file with the content: @@ -44,16 +43,17 @@ Feature: Filtering document.querySelector('[data-results]').innerText = data.map(d => d.url).sort().join(', '); } """ + Then There should be no logs Then The selector "[data-results]" should contain "/ali/, /cheeka/, /theodore/" Scenario: Filtering on tagged elements When I evaluate: - """js + """ async function() { let pagefind = await import("/_pagefind/pagefind.js"); let results = await pagefind.search("Cat", { - filter: { + filters: { color: "Orange" } }); @@ -62,16 +62,17 @@ Feature: Filtering document.querySelector('[data-results]').innerText = data.map(d => d.url).sort().join(', '); } """ + Then There should be no logs Then The selector "[data-results]" should contain "/theodore/" Scenario: Filtering on tagged values When I evaluate: - """js + """ async function() { let pagefind = await import("/_pagefind/pagefind.js"); let results = await pagefind.search("Cat", { - filter: { + filters: { color: "Tabby" } }); @@ -80,16 +81,17 @@ Feature: Filtering document.querySelector('[data-results]').innerText = data.map(d => d.url).sort().join(', '); } """ + Then There should be no logs Then The selector "[data-results]" should contain "/ali/" Scenario: Filtering returns multiple results When I evaluate: - """js + """ async function() { let pagefind = await import("/_pagefind/pagefind.js"); let results = await pagefind.search("Cat", { - filter: { + filters: { color: "White" } }); @@ -98,4 +100,64 @@ Feature: Filtering document.querySelector('[data-results]').innerText = data.map(d => d.url).sort().join(', '); } """ + Then There should be no logs Then The selector "[data-results]" should contain "/cheeka/, /theodore/" + + @skip + # Currently only an AND filtering is supported. 
Need to restructure to support boolean logic + Scenario: Filtering to multiple values + When I evaluate: + """ + async function() { + let pagefind = await import("/_pagefind/pagefind.js"); + + let results = await pagefind.search("Cat", { + filters: { + color: ["Tabby", "Orange"] + } + }); + let data = await Promise.all(results.map(result => result.data())); + + document.querySelector('[data-results]').innerText = data.map(d => d.url).sort().join(', '); + } + """ + Then There should be no logs + Then The selector "[data-results]" should contain "/ali/, /theodore/" + + @skip + Scenario: Non-existent filters return no results + When I evaluate: + """ + async function() { + let pagefind = await import("/_pagefind/pagefind.js"); + + let results = await pagefind.search("Cat", { + filters: { + name: "Ali" + } + }); + + document.querySelector('[data-results]').innerText = results.length; + } + """ + Then There should be no logs + Then The selector "[data-results]" should contain "0" + + @skip + Scenario: Non-existent values return no results + When I evaluate: + """ + async function() { + let pagefind = await import("/_pagefind/pagefind.js"); + + let results = await pagefind.search("Cat", { + filters: { + color: "Green" + } + }); + + document.querySelector('[data-results]').innerText = results.length; + } + """ + Then There should be no logs + Then The selector "[data-results]" should contain "0" \ No newline at end of file diff --git a/pagefind/features/fragments.feature b/pagefind/features/fragments.feature index d0c91116..fb86f42c 100644 --- a/pagefind/features/fragments.feature +++ b/pagefind/features/fragments.feature @@ -8,7 +8,10 @@ Feature: Fragments Given I have a "public/cat/index.html" file with the content: """ -

            <h1>Cat Post.</h1>
+            <h1 data-pagefind-filter="title">
+                Cat Post.
+            </h1>
+            <span data-pagefind-filter="animal">cats</span>
             <p>A post about the 'felines'</p>
             <p>This post has some gnarly things to test the fragment formatting.</p>
         </body>
         """
@@ -31,6 +34,7 @@ Feature: Fragments document.querySelector('[data-result]').innerText = data.title; } """ + Then There should be no logs Then The selector "[data-result]" should contain "Cat Post." Scenario: Search results return nicely formatted content @@ -45,6 +49,7 @@ Feature: Fragments document.querySelector('[data-result]').innerText = data.content; } """ + Then There should be no logs Then The selector "[data-result]" should contain "Cat Post. A post about the 'felines'. This post has some gnarly things to test the fragment formatting." Scenario: Search results return highlighted search exerpt @@ -59,11 +64,24 @@ Feature: Fragments document.querySelector('[data-result]').innerText = data.excerpt; } """ + Then There should be no logs # NB: The HTML encoding below is a test artifact Then The selector "[data-result]" should contain "Cat Post. A post about the <mark>'felines'.</mark> This post has some gnarly things to test the fragment formatting." - @skip Scenario: Search results return tagged filters + When I evaluate: + """ + async function() { + let pagefind = await import("/_pagefind/pagefind.js"); + + let results = await pagefind.search("cat"); + + let data = await results[0].data(); + document.querySelector('[data-result]').innerText = Object.entries(data.filters).map(([f, v]) => `${f}: ${v}`).sort().join(', '); + } + """ + Then There should be no logs + Then The selector "[data-result]" should contain "animal: cats, title: Cat Post." @skip Scenario: Search results return tagged metadata diff --git a/pagefind/features/partial_matching.feature b/pagefind/features/partial_matching.feature index 7bc148d4..f6612cfc 100644 --- a/pagefind/features/partial_matching.feature +++ b/pagefind/features/partial_matching.feature @@ -24,5 +24,6 @@ Feature: Partial Matching document.querySelector('[data-url]').innerText = data.url; } """ + Then There should be no logs Then The selector "[data-url]" should contain "/cat/" diff --git a/pagefind/features/scoring.feature b/pagefind/features/scoring.feature index 90e12a9f..b5256e1f 100644 --- a/pagefind/features/scoring.feature +++ b/pagefind/features/scoring.feature @@ -52,6 +52,7 @@ Feature: Result Scoring document.querySelector('[data-result]').innerText = data.map(d => d.url).join(', '); } """ + Then There should be no logs Then The selector "[data-count]" should contain "2 result(s)" Then The selector "[data-result]" should contain "/dog/, /cat/" @@ -69,6 +70,7 @@ Feature: Result Scoring document.querySelector('[data-result]').innerText = data.map(d => d.url).join(', '); } """ + Then There should be no logs Then The selector "[data-count]" should contain "2 result(s)" Then The selector "[data-result]" should contain "/dog/, /cat/" When I evaluate: @@ -83,5 +85,6 @@ Feature: Result Scoring document.querySelector('[data-result]').innerText = data.map(d => d.url).join(', '); } """ + Then There should be no logs Then The selector "[data-count]" should contain "2 result(s)" Then The selector "[data-result]" should contain "/cat/, /dog/" diff --git a/pagefind/features/stemming.feature b/pagefind/features/stemming.feature index 06f45f63..54e9c399 100644 --- a/pagefind/features/stemming.feature +++ b/pagefind/features/stemming.feature @@ -30,6 +30,7 @@ Feature: Word Stemming document.querySelector('[data-result]').innerText = data.url; } """ + Then There should be no logs Then The selector "[data-result]" should contain "/cat/" Scenario: Search is case independent @@ -55,5 +56,6 @@ Feature: Word Stemming 
document.querySelector('[data-result]').innerText = data.url; } """ + Then There should be no logs Then The selector "[data-result]" should contain "/cat/" diff --git a/pagefind/src/fossick/mod.rs b/pagefind/src/fossick/mod.rs index c92c74db..f12f4aec 100644 --- a/pagefind/src/fossick/mod.rs +++ b/pagefind/src/fossick/mod.rs @@ -12,26 +12,28 @@ use crate::utils::full_hash; use crate::SearchOptions; use parser::DomParser; +use self::parser::DomParserResult; + mod parser; +#[derive(Debug)] pub struct FossickedData { pub file_path: PathBuf, pub fragment: PageFragment, pub word_data: HashMap>, } +#[derive(Debug)] pub struct Fossicker { file_path: PathBuf, - title: String, - digest: String, + data: Option, } impl Fossicker { pub fn new(file_path: PathBuf) -> Self { Self { file_path, - title: String::new(), - digest: String::new(), + data: None, } } @@ -51,9 +53,7 @@ impl Fossicker { } } - let data = rewriter.wrap(); - self.digest = data.digest; - self.title = data.title; + self.data = Some(rewriter.wrap()); Ok(()) } @@ -71,7 +71,15 @@ impl Fossicker { // so that separate bodies of text don't return exact string // matches across the boundaries. - for (word_index, word) in self.digest.to_lowercase().split_whitespace().enumerate() { + for (word_index, word) in self + .data + .as_ref() + .unwrap() + .digest + .to_lowercase() + .split_whitespace() + .enumerate() + { let mut word = special_chars.replace_all(word, "").into_owned(); word = en_stemmer.stem(&word).into_owned(); // if words_to_remove.contains(&word) { @@ -95,7 +103,9 @@ impl Fossicker { } let word_data = self.retrieve_words_from_digest(); - let hash = full_hash(self.digest.as_bytes()); + + let data = self.data.as_ref().unwrap(); + let hash = full_hash(data.digest.as_bytes()); Ok(FossickedData { file_path: self.file_path.clone(), @@ -104,8 +114,9 @@ impl Fossicker { page_number: 0, data: PageFragmentData { url: build_url(&self.file_path, options), - title: self.title.clone(), - content: self.digest.clone(), + title: data.title.clone(), + content: data.digest.clone(), + filters: data.filters.clone(), attributes: HashMap::new(), word_count: word_data.len(), }, diff --git a/pagefind/src/fossick/parser.rs b/pagefind/src/fossick/parser.rs index bf24911e..faf3f0d8 100644 --- a/pagefind/src/fossick/parser.rs +++ b/pagefind/src/fossick/parser.rs @@ -1,3 +1,4 @@ +use hashbrown::HashMap; use lazy_static::lazy_static; use lol_html::{element, text, HtmlRewriter, Settings}; use regex::Regex; @@ -17,36 +18,50 @@ lazy_static! { vec!("script", "noscript", "label", "form", "svg", "footer", "header", "nav", "iframe"); } +// We aren't transforming HTML, just parsing, so we dump the output. +#[derive(Default)] struct EmptySink; impl lol_html::OutputSink for EmptySink { fn handle_chunk(&mut self, _: &[u8]) {} } +/// Houses the HTML parsing instance and the internal data while parsing pub struct DomParser<'a> { rewriter: HtmlRewriter<'a, EmptySink>, data: Rc>, } -// TODO: Store digest as a tree so that we can drop nodes correctly -// i.e. when we reach the end of a
, we can drop everything within. +// The internal state while parsing, +// with a reference to the deepest HTML element +// that we're currently reading #[derive(Default, Debug)] struct DomParserData { current_node: Rc>, title: Option, + filters: HashMap>, } +// A single HTML element that we're reading into. +// Contains a reference to the parent element, +// and since we collapse this tree upwards while we parse, +// we don't need to store tree structure. #[derive(Default, Debug)] struct DomParsingNode { current_value: String, parent: Option>>, + filter: Option, ignore: bool, } +/// The search-relevant data that was retrieved from the given input +#[derive(Debug)] pub struct DomParserResult { pub digest: String, pub title: String, + pub filters: HashMap>, } +// Some shorthand to clean up our use of Rc> in the lol_html macros // From https://github.com/rust-lang/rfcs/issues/2407#issuecomment-385291238 macro_rules! enclose { ( ($( $x:ident ),*) $y:expr ) => { @@ -60,65 +75,94 @@ macro_rules! enclose { impl<'a> DomParser<'a> { pub fn new() -> Self { let data = Rc::new(RefCell::new(DomParserData::default())); - let empty = EmptySink {}; let rewriter = HtmlRewriter::new( Settings { element_content_handlers: vec![ enclose! { (data) element!("body *", move |el| { - let data = Rc::clone(&data); + let should_ignore_el = el.has_attribute("data-pagefind-ignore") || REMOVE_SELECTORS.contains(&el.tag_name().as_str()); + let filter = el.get_attribute("data-pagefind-filter"); - - let mut node = DomParsingNode{ + let node = Rc::new(RefCell::new(DomParsingNode{ parent: Some(Rc::clone(&data.borrow().current_node)), + ignore: should_ignore_el, + filter, ..DomParsingNode::default() - }; - if el.has_attribute("data-pagefind-ignore") || REMOVE_SELECTORS.contains(&el.tag_name().as_str()) { - node.ignore = true; - } - let node = Rc::new(RefCell::new(node)); + })); + { let mut data = data.borrow_mut(); data.current_node = Rc::clone(&node); } - let tail_data = Rc::clone(&data); - let tail_node = Rc::clone(&node); - - let can_have_content = el.on_end_tag(move |end| { + let can_have_content = el.on_end_tag(enclose! { (data, node) move |end| { let mut data = data.borrow_mut(); let mut node = node.borrow_mut(); + // When we reach an end tag, we need to + // make sure to move focus back to the parent node. if let Some(parent) = &node.parent { data.current_node = Rc::clone(parent); } + // Process filters before we continue + // (Filters are valid on ignored elements) + if let Some((filter, value)) = node.get_filter() { + match data.filters.get_mut(&filter) { + Some(filter_arr) => filter_arr.push(normalize_content(&value)), + None => { + data.filters.insert(filter, vec![ + normalize_content(&value) + ]); + } + } + } + + // If we bail out now, the content won't be persisted anywhere + // and the node + children will be dropped. if node.ignore { return Ok(()); } let tag_name = end.name(); if SENTENCE_SELECTORS.contains(&tag_name.as_str()) { + // For block elements, we want to make sure sentences + // don't hug each other without whitespace. + // We normalize repeated whitespace later, so we + // can add this indiscriminately. let mut padded = " ".to_owned(); padded.push_str(&node.current_value); node.current_value = padded; + // Similarly, we want to separate block elements + // with punctuation, so that the excerpts read nicely. + // (As long as it doesn't already end with, say, a . or ?) 
if node.current_value.chars() .last() .filter(|c| SENTENCE_CHARS.is_match(&c.to_string())) .is_some() { - node.current_value.push_str(". "); + node.current_value.push('.'); } + node.current_value.push(' '); } + // Huck all of the content we have onto the end of the + // content that the parent node has (so far) + // This will include all of our children's content, + // and the order of tree traversal will mean that it + // is inserted in the correct position in the parent's content. let mut parent = data.current_node.borrow_mut(); parent.current_value.push_str(&node.current_value); + Ok(()) - }); + }}); + // Try to handle tags like which have no end tag, + // and thus will never hit the logic to reset the current node. + // TODO: This could still be missed for tags with implied ends? if can_have_content.is_err() { - let mut data = tail_data.borrow_mut(); - let node = tail_node.borrow(); + let mut data = data.borrow_mut(); + let node = node.borrow(); if let Some(parent) = &node.parent { data.current_node = Rc::clone(parent); } @@ -134,7 +178,7 @@ impl<'a> DomParser<'a> { })}, // Track the first h1 on the page as the title to return in search // TODO: This doesn't handle a chunk boundary, - // we can instead handle this by marking the node as a title and handling it in end_node + // we can instead handle this by marking the node as a title and handling it in end_node enclose! { (data) text!("h1", move |el| { let mut data = data.borrow_mut(); let text = normalize_content(el.as_str()); @@ -146,23 +190,42 @@ impl<'a> DomParser<'a> { ], ..Settings::default() }, - empty, + EmptySink::default(), ); Self { rewriter, data } } + /// Writes a chunk of data to the underlying HTML parser pub fn write(&mut self, data: &[u8]) -> Result<(), lol_html::errors::RewritingError> { self.rewriter.write(data) } + /// Performs any post-processing and returns the summated search results pub fn wrap(self) -> DomParserResult { drop(self.rewriter); // Clears the extra Rcs on and within data let data = Rc::try_unwrap(self.data).unwrap().into_inner(); - let node = data.current_node.borrow(); + let mut node = data.current_node; + + // Fallback: If we are left with a tree, collapse it up into the parents + // until we get to the root node. 
+ while node.borrow().parent.is_some() { + { + let node = node.borrow(); + let mut parent_node = node.parent.as_ref().unwrap().borrow_mut(); + parent_node.current_value.push_str(&node.current_value); + } + let old_node = node.borrow(); + let new_node = Rc::clone(old_node.parent.as_ref().unwrap()); + drop(old_node); + node = new_node; + } + + let node = node.borrow(); DomParserResult { digest: normalize_content(&node.current_value), title: data.title.unwrap_or_default(), + filters: data.filters, } } } @@ -175,6 +238,22 @@ fn normalize_content(content: &str) -> String { content.to_string() } +impl DomParsingNode { + fn get_filter(&self) -> Option<(String, String)> { + if self.current_value.is_empty() { + return None; + } + if let Some(filter) = &self.filter { + match filter.split_once(":") { + Some((filter, value)) => Some((filter.to_owned(), value.to_owned())), + None => Some((filter.to_owned(), self.current_value.to_owned())), + } + } else { + None + } + } +} + #[cfg(test)] mod tests { use super::*; @@ -187,6 +266,24 @@ mod tests { assert_eq!(&output, "Hello Wor ld?"); } + #[test] + fn get_filter_from_node() { + let mut node = DomParsingNode::default(); + assert_eq!(node.get_filter(), None); + + node.filter = Some("color".into()); + assert_eq!(node.get_filter(), None); + + node.current_value = "White".into(); + assert_eq!(node.get_filter(), Some(("color".into(), "White".into()))); + + node.filter = Some("color:auburn".into()); + assert_eq!(node.get_filter(), Some(("color".into(), "auburn".into()))); + + node.filter = Some("color:ye:llow".into()); + assert_eq!(node.get_filter(), Some(("color".into(), "ye:llow".into()))); + } + fn test_parse(input: Vec<&'static str>) -> DomParserResult { let mut rewriter = DomParser::new(); let _ = rewriter.write(b""); diff --git a/pagefind/src/fragments/mod.rs b/pagefind/src/fragments/mod.rs index 3b0a2688..aab7cd31 100644 --- a/pagefind/src/fragments/mod.rs +++ b/pagefind/src/fragments/mod.rs @@ -2,15 +2,17 @@ use hashbrown::HashMap; use serde::Serialize; -#[derive(Serialize)] +#[derive(Serialize, Debug)] pub struct PageFragmentData { pub url: String, pub title: String, pub content: String, pub word_count: usize, + pub filters: HashMap>, pub attributes: HashMap, } +#[derive(Debug)] pub struct PageFragment { pub hash: String, pub page_number: usize, diff --git a/pagefind/src/index/index_filter.rs b/pagefind/src/index/index_filter.rs new file mode 100644 index 00000000..d10ed38c --- /dev/null +++ b/pagefind/src/index/index_filter.rs @@ -0,0 +1,21 @@ +use minicbor::Encode; + +/// The filter index chunks in `_pagefind/filter/` + +/// A single filter index chunk: `_pagefind/filter/*.pf_filter` +#[derive(Encode)] +pub struct FilterIndex { + #[n(0)] + pub filter: String, + #[n(1)] + pub values: Vec, +} + +/// A single filter value as an inverse index of all locations on the site +#[derive(Encode, Clone, Debug)] +pub struct PackedValue { + #[n(0)] + pub value: String, + #[n(1)] + pub pages: Vec, // Won't exceed u32 but saves us some into()s +} diff --git a/pagefind/src/index/index_metadata.rs b/pagefind/src/index/index_metadata.rs index 16be6d71..7ad308ca 100644 --- a/pagefind/src/index/index_metadata.rs +++ b/pagefind/src/index/index_metadata.rs @@ -10,9 +10,9 @@ pub struct MetaIndex { #[n(1)] pub pages: Vec, #[n(2)] - pub stops: Vec, - #[n(3)] pub index_chunks: Vec, + #[n(3)] + pub filters: Vec, } /// Communicates the _pagefind/index/*.pf_index file we need to load @@ -34,3 +34,11 @@ pub struct MetaPage { #[n(1)] pub word_count: u32, } + +#[derive(Encode)] +pub 
struct MetaFilter { + #[n(0)] + pub filter: String, + #[n(1)] + pub hash: String, +} diff --git a/pagefind/src/index/mod.rs b/pagefind/src/index/mod.rs index 6c007911..e23ca2c8 100644 --- a/pagefind/src/index/mod.rs +++ b/pagefind/src/index/mod.rs @@ -1,14 +1,20 @@ use hashbrown::HashMap; -use crate::{fossick::FossickedData, fragments::PageFragment, utils::full_hash, SearchOptions}; +use crate::{ + fossick::FossickedData, fragments::PageFragment, index::index_metadata::MetaFilter, + utils::full_hash, SearchOptions, +}; +use index_filter::{FilterIndex, PackedValue}; use index_metadata::{MetaChunk, MetaIndex, MetaPage}; use index_words::{PackedPage, PackedWord, WordIndex}; +mod index_filter; mod index_metadata; mod index_words; pub struct PagefindIndexes { pub word_indexes: HashMap>, + pub filter_indexes: HashMap>, pub meta_index: Vec, pub fragments: HashMap, } @@ -20,30 +26,17 @@ where let mut meta = MetaIndex { version: options.version.into(), pages: Vec::new(), - stops: stop_words::get(stop_words::LANGUAGE::English), // TODO: i18n index_chunks: Vec::new(), + filters: Vec::new(), }; let mut word_map: HashMap = HashMap::new(); + let mut filter_map: HashMap>> = HashMap::new(); let mut fragments: HashMap = HashMap::new(); for (page_number, mut page) in pages.enumerate() { page.fragment.page_number = page_number; - let mut short_hash = &page.fragment.hash[0..=6]; - // If we hit a collision, extend both hashes until we stop colliding - while let Some(collision) = fragments.remove(short_hash) { - let new_length = short_hash.len() + 1; - - fragments.insert(collision.hash[0..=new_length].to_string(), collision); - short_hash = &page.fragment.hash[0..=new_length]; - - if short_hash.len() == page.fragment.hash.len() { - break; - } - } - fragments.insert(short_hash.to_string(), page.fragment); - for (word, positions) in page.word_data { let packed_page = PackedPage { page_number, @@ -63,6 +56,38 @@ where } } } + + for (filter, values) in &page.fragment.data.filters { + for value in values { + match filter_map.get_mut(filter) { + Some(value_map) => match value_map.get_mut(value) { + Some(page_array) => page_array.push(page_number), + None => { + value_map.insert(value.clone(), vec![page_number]); + } + }, + None => { + let mut value_map = HashMap::new(); + value_map.insert(value.clone(), vec![page_number]); + filter_map.insert(filter.clone(), value_map); + } + } + } + } + + let mut short_hash = &page.fragment.hash[0..=6]; + // If we hit a collision, extend both hashes until we stop colliding + while let Some(collision) = fragments.remove(short_hash) { + let new_length = short_hash.len() + 1; + + fragments.insert(collision.hash[0..=new_length].to_string(), collision); + short_hash = &page.fragment.hash[0..=new_length]; + + if short_hash.len() == page.fragment.hash.len() { + break; + } + } + fragments.insert(short_hash.to_string(), page.fragment); } meta.pages = fragments @@ -76,12 +101,47 @@ where meta.pages .sort_by_cached_key(|p| fragments.get(&p.hash).unwrap().page_number); + // TODO: Change filter indexes to BTree to give them a stable hash. 
+ let mut filter_indexes = HashMap::new(); + for (filter, values) in filter_map { + let mut filter_index: Vec = Vec::new(); + let _ = minicbor::encode::>( + FilterIndex { + filter: filter.clone(), + values: values + .into_iter() + .map(|(value, pages)| PackedValue { value, pages }) + .collect(), + }, + filter_index.as_mut(), + ); + let hash = full_hash(&filter_index); + let mut short_hash = &hash[0..=6]; + + // If we hit a collision, extend one hash until we stop colliding + // TODO: DRY + while filter_indexes.contains_key(short_hash) { + let new_length = short_hash.len() + 1; + short_hash = &hash[0..=new_length]; + + if short_hash.len() == hash.len() { + break; + } + } + filter_indexes.insert(short_hash.to_string(), filter_index); + meta.filters.push(MetaFilter { + filter, + hash: short_hash.to_string(), + }) + } + if TryInto::::try_into(meta.pages.len()).is_err() { panic!("Too many documents to index"); } println!("Indexed {:?} pages", meta.pages.len()); println!("Indexed {:?} words", word_map.len()); + println!("Indexed {:?} filters", meta.filters.len()); // TODO: Parameterize these chunk sizes via options let chunks = chunk_index(word_map, 20000); @@ -118,6 +178,7 @@ where PagefindIndexes { word_indexes, + filter_indexes, meta_index, fragments, } diff --git a/pagefind/src/output/mod.rs b/pagefind/src/output/mod.rs index 5017f4fc..cea2a4fe 100644 --- a/pagefind/src/output/mod.rs +++ b/pagefind/src/output/mod.rs @@ -67,6 +67,14 @@ impl PagefindIndexes { ) })); + files.extend(self.filter_indexes.iter().map(|(hash, index)| { + write( + outdir.join(format!("filter/{}.pf_filter", hash)), + vec![index], + Compress::GZ, + ) + })); + join_all(files).await; } } diff --git a/pagefind/src/output/stubs/search.js b/pagefind/src/output/stubs/search.js index 74cb1b6e..ba462be2 100644 --- a/pagefind/src/output/stubs/search.js +++ b/pagefind/src/output/stubs/search.js @@ -7,6 +7,7 @@ class Pagefind { this.searchMeta = null; this.raw_ptr = null; this.loaded_chunks = []; + this.loaded_filters = []; this.base_path = "/_pagefind/"; this.init(); } @@ -49,6 +50,18 @@ class Pagefind { this.loaded_chunks.push(hash); } + async loadFilterChunk(hash) { + if (this.loaded_filters.includes(hash)) return; + + let compressed_chunk = await fetch(`${this.base_path}filter/${hash}.pf_filter`); + compressed_chunk = await compressed_chunk.arrayBuffer(); + let chunk = gunzip(new Uint8Array(compressed_chunk)); + + let ptr = await this.getPtr(); + this.raw_ptr = this.backend.load_filter_chunk(ptr, chunk); + this.loaded_filters.push(hash); + } + // TODO: Due for a rework (chunking) // TODO: Large test "fishing" has the wrong mark // TODO: Large test "hades" returns some strange results @@ -78,18 +91,38 @@ class Pagefind { return this.raw_ptr; } - async search(term) { + async search(term, options) { + options = { + verbose: false, + filters: {}, + ...options, + }; + const log = str => { if (options.verbose) console.log(str) }; let start = Date.now(); let ptr = await this.getPtr(); term = term.toLowerCase(); - let chunks = this.backend.request_indexes(ptr, term); - await Promise.all(chunks.split(' ').map(chunk => this.loadChunk(chunk))); + let filter_list = []; + for (let [filter, values] of Object.entries(options.filters)) { + if (Array.isArray(values)) { + for (let value of values) { + filter_list.push(`${filter}:${value}`); + } + } else { + filter_list.push(`${filter}:${values}`); + } + } + + filter_list = filter_list.join("__PF_FILTER_DELIM__"); + + let chunks = this.backend.request_indexes(ptr, term).split(' ').filter(v => 
v).map(chunk => this.loadChunk(chunk)); + let filter_chunks = this.backend.request_filter_indexes(ptr, filter_list).split(' ').filter(v => v).map(chunk => this.loadFilterChunk(chunk)); + await Promise.all([...chunks, ...filter_chunks]); // pointer may have updated from the loadChunk calls ptr = await this.getPtr(); let searchStart = Date.now(); - let results = this.backend.search(ptr, term); + let results = this.backend.search(ptr, term, filter_list); results = results.length ? results.split(" ") : []; let resultsInterface = results.map(result => { @@ -104,11 +137,11 @@ class Pagefind { } }); - // console.log(`Found ${results.length} result${results.length == 1 ? '' : 's'} for "${term}" in ${Date.now() - searchStart}ms (${Date.now() - start}ms realtime)`); + log(`Found ${results.length} result${results.length == 1 ? '' : 's'} for "${term}" in ${Date.now() - searchStart}ms (${Date.now() - start}ms realtime)`); return resultsInterface; } } const pagefind = new Pagefind(); -export const search = async (term) => await pagefind.search(term); +export const search = async (term, options) => await pagefind.search(term, options); diff --git a/pagefind_web/src/excerpt.rs b/pagefind_web/src/excerpt.rs index 1f471661..2a46f976 100644 --- a/pagefind_web/src/excerpt.rs +++ b/pagefind_web/src/excerpt.rs @@ -1,5 +1,5 @@ // TODO: MVP — Implement something smarter -pub fn calculate_excerpt(word_positions: &Vec, excerpt_length: u32) -> u32 { +pub fn calculate_excerpt(word_positions: &[u32], excerpt_length: u32) -> u32 { let start_distance = excerpt_length / 3; if word_positions.is_empty() { return 0; diff --git a/pagefind_web/src/filter.rs b/pagefind_web/src/filter.rs new file mode 100644 index 00000000..a956e167 --- /dev/null +++ b/pagefind_web/src/filter.rs @@ -0,0 +1,56 @@ +use bit_set::BitSet; + +use crate::util::*; +use crate::SearchIndex; + +impl SearchIndex { + pub fn filter(&self, filter: &str) -> Option { + let filters = filter.split("__PF_FILTER_DELIM__"); + + let mut maps = Vec::new(); + + for filter in filters { + if let Some((filter, value)) = filter.split_once(":") { + debug!({ + format! {"Filtering for {}: {}", filter, value} + }); + if let Some(filter_map) = self.filters.get(filter) { + debug!({ + format! {"Found a map for {}: {:#?}", filter, filter_map} + }); + if let Some(filter_pages) = filter_map.get(value) { + debug!({ + format! {"Found the value {}", value} + }); + let mut set = BitSet::new(); + for page in filter_pages { + set.insert(*page as usize); + } + maps.push(set); + } else { + debug!({ + format! {"No value exists for {}", value} + }); + } + } else { + debug!({ + format! {"No map exists for {}", filter} + }); + } + } else { + debug!({ + format! {"Bad filter (no `:`): {:?}", filter} + }) + } + } + + let mut maps = maps.drain(..); + let mut results = maps.next()?; + + for map in maps { + results.intersect_with(&map); + } + + Some(results) + } +} diff --git a/pagefind_web/src/filter_index.rs b/pagefind_web/src/filter_index.rs new file mode 100644 index 00000000..73552dc6 --- /dev/null +++ b/pagefind_web/src/filter_index.rs @@ -0,0 +1,58 @@ +use std::collections::HashMap; + +use super::SearchIndex; +use crate::util::*; +use minicbor::{decode, Decoder}; + +/* +{} = fixed length array +{ + String, // filter name + [ + { + String, // filter value + [ + u32 // page number + ... + ] + }, + ... 
+ ] +} +*/ + +impl SearchIndex { + pub fn decode_filter_index_chunk(&mut self, filter_bytes: &[u8]) -> Result<(), decode::Error> { + debug!({ format!("Decoding {:#?} filter index bytes", filter_bytes.len()) }); + let mut decoder = Decoder::new(filter_bytes); + + consume_fixed_arr!(decoder); + + debug!({ "Reading filter name" }); + let filter = consume_string!(decoder); + + debug!({ "Reading values array" }); + let values = consume_arr_len!(decoder); + + debug!({ format!("Reading {:#?} values", values) }); + let mut value_map = HashMap::new(); + for _ in 0..values { + consume_fixed_arr!(decoder); + let value = consume_string!(decoder); + + let pages = consume_arr_len!(decoder); + let mut page_arr = Vec::with_capacity(pages as usize); + for _ in 0..pages { + page_arr.push(consume_num!(decoder)); + } + + value_map.insert(value, page_arr); + } + + self.filters.insert(filter, value_map); + + debug!({ "Finished reading values" }); + + Ok(()) + } +} diff --git a/pagefind_web/src/lib.rs b/pagefind_web/src/lib.rs index 2c38451e..0a60f225 100644 --- a/pagefind_web/src/lib.rs +++ b/pagefind_web/src/lib.rs @@ -5,12 +5,13 @@ static ALLOC: wee_alloc::WeeAlloc = wee_alloc::WeeAlloc::INIT; use std::collections::HashMap; -use bit_set::BitSet; use excerpt::calculate_excerpt; -use rust_stemmers::{Algorithm, Stemmer}; // TODO: too big +use util::*; use wasm_bindgen::prelude::*; mod excerpt; +mod filter; +mod filter_index; mod index; mod metadata; mod search; @@ -37,8 +38,9 @@ pub struct SearchIndex { generator_version: Option, pages: Vec, chunks: Vec, - stops: Vec, + filter_chunks: HashMap, words: HashMap>, + filters: HashMap>>, } #[cfg(debug_assertions)] @@ -62,8 +64,9 @@ pub fn init_pagefind(metadata_bytes: &[u8]) -> *mut SearchIndex { generator_version: None, pages: Vec::new(), chunks: Vec::new(), - stops: Vec::new(), + filter_chunks: HashMap::new(), words: HashMap::new(), + filters: HashMap::new(), }; match search_index.decode_metadata(metadata_bytes) { @@ -78,15 +81,27 @@ pub fn init_pagefind(metadata_bytes: &[u8]) -> *mut SearchIndex { #[wasm_bindgen] pub fn load_index_chunk(ptr: *mut SearchIndex, chunk_bytes: &[u8]) -> *mut SearchIndex { - #[cfg(debug_assertions)] - debug_log("Loading Index Chunk"); + debug!({ "Loading Index Chunk" }); let mut search_index = unsafe { Box::from_raw(ptr) }; match search_index.decode_index_chunk(chunk_bytes) { Ok(_) => Box::into_raw(search_index), Err(e) => { - #[cfg(debug_assertions)] - debug_log(&format!("{:#?}", e)); + debug!({ format!("{:#?}", e) }); + std::ptr::null_mut::() + } + } +} + +#[wasm_bindgen] +pub fn load_filter_chunk(ptr: *mut SearchIndex, chunk_bytes: &[u8]) -> *mut SearchIndex { + debug!({ "Loading Filter Chunk" }); + let mut search_index = unsafe { Box::from_raw(ptr) }; + + match search_index.decode_filter_index_chunk(chunk_bytes) { + Ok(_) => Box::into_raw(search_index), + Err(e) => { + debug!({ format!("{:#?}", e) }); std::ptr::null_mut::() } } @@ -94,8 +109,9 @@ pub fn load_index_chunk(ptr: *mut SearchIndex, chunk_bytes: &[u8]) -> *mut Searc #[wasm_bindgen] pub fn request_indexes(ptr: *mut SearchIndex, query: &str) -> String { - #[cfg(debug_assertions)] - debug_log(&format! {"Finding the index chunks needed for {:?}", query}); + debug!({ + format! 
{"Finding the index chunks needed for {:?}", query} + }); let search_index = unsafe { Box::from_raw(ptr) }; let mut indexes = Vec::new(); @@ -107,16 +123,56 @@ pub fn request_indexes(ptr: *mut SearchIndex, query: &str) -> String { .iter() .find(|chunk| term >= &chunk.from && term <= &chunk.to); if let Some(index) = term_index { + debug!({ + format! {"Need {:?} for {:?}", index.hash, term} + }); indexes.push(index.hash.clone()) + } else { + debug!({ + format! {"No hash found for {:?}", term} + }) + } + } + + let _ = Box::into_raw(search_index); + indexes.sort(); + indexes.dedup(); + indexes.join(" ") +} + +#[wasm_bindgen] +pub fn request_filter_indexes(ptr: *mut SearchIndex, filters: &str) -> String { + debug!({ + format! {"Finding the filter chunks needed for {:?}", filters} + }); + + let search_index = unsafe { Box::from_raw(ptr) }; + let mut indexes = Vec::new(); + let filters = filters.split("__PF_FILTER_DELIM__"); + + for filter in filters { + if let Some((filter, _)) = filter.split_once(":") { + if let Some(hash) = search_index.filter_chunks.get(filter) { + debug!({ + format! {"Need {:?} for {:?}", hash, filter} + }); + indexes.push(hash.clone()); + } else { + debug!({ + format! {"No hash found for {:?}", filter} + }) + } } } let _ = Box::into_raw(search_index); + indexes.sort(); + indexes.dedup(); indexes.join(" ") } #[wasm_bindgen] -pub fn search(ptr: *mut SearchIndex, query: &str) -> String { +pub fn search(ptr: *mut SearchIndex, query: &str, filter: &str) -> String { let search_index = unsafe { Box::from_raw(ptr) }; if let Some(generator_version) = search_index.generator_version.as_ref() { @@ -126,7 +182,8 @@ pub fn search(ptr: *mut SearchIndex, query: &str) -> String { } } - let results = search_index.search_term(query); + let filter_set = search_index.filter(filter); + let results = search_index.search_term(query, filter_set); let result_string = results .into_iter() diff --git a/pagefind_web/src/metadata.rs b/pagefind_web/src/metadata.rs index dd05b33c..8cab53b4 100644 --- a/pagefind_web/src/metadata.rs +++ b/pagefind_web/src/metadata.rs @@ -15,6 +15,13 @@ use minicbor::{decode, Decoder}; String, // hash of index chunk }, ... + ], + [ + { + String, // value of filter chunk + String, // hash of filter chunk + }, + ... 
] } */ @@ -41,14 +48,6 @@ impl SearchIndex { }); } - debug!({ "Reading stop words array" }); - let stop_words = consume_arr_len!(decoder); - debug!({ format!("Reading {:#?} stop words", stop_words) }); - self.stops = Vec::with_capacity(stop_words as usize); - for _ in 0..stop_words { - self.stops.push(consume_string!(decoder)); - } - debug!({ "Reading index chunks array" }); let index_chunks = consume_arr_len!(decoder); debug!({ format!("Reading {:#?} index chunks", index_chunks) }); @@ -62,6 +61,15 @@ impl SearchIndex { }) } + debug!({ "Reading filter chunks array" }); + let filter_chunks = consume_arr_len!(decoder); + debug!({ format!("Reading {:#?} filter chunks", filter_chunks) }); + for _ in 0..filter_chunks { + consume_fixed_arr!(decoder); + self.filter_chunks + .insert(consume_string!(decoder), consume_string!(decoder)); + } + debug!({ "Finished decoding metadata" }); Ok(()) diff --git a/pagefind_web/src/search.rs b/pagefind_web/src/search.rs index a804c87f..20cad377 100644 --- a/pagefind_web/src/search.rs +++ b/pagefind_web/src/search.rs @@ -1,8 +1,7 @@ +use crate::util::*; use bit_set::BitSet; use rust_stemmers::{Algorithm, Stemmer}; // TODO: too big, Stemming should be performed on the JS side -#[cfg(debug_assertions)] -use crate::debug_log; use crate::SearchIndex; pub struct PageSearchResult { @@ -12,15 +11,16 @@ pub struct PageSearchResult { } impl SearchIndex { - pub fn search_term(&self, term: &str) -> Vec { + pub fn search_term(&self, term: &str, filter_results: Option) -> Vec { let terms = term.split(' '); // TODO: i18n // TODO: Stemming should be performed on the JS side of the boundary // As the snowball implementation there seems a lot smaller and just as fast. let en_stemmer = Stemmer::create(Algorithm::English); - #[cfg(debug_assertions)] - debug_log(&format! {"Searching {:?}", term}); + debug!({ + format! {"Searching {:?}", term} + }); let mut maps = Vec::new(); let mut words = Vec::new(); @@ -49,6 +49,10 @@ impl SearchIndex { results.intersect_with(&map); } + if let Some(filter) = filter_results { + results.intersect_with(&filter); + } + let mut pages: Vec = vec![]; for page in results.iter() { @@ -71,10 +75,9 @@ impl SearchIndex { word_locations, }; - #[cfg(debug_assertions)] - debug_log( - &format! {"Page {} has {} matching terms (in {} total words), giving the word frequency {:?}", search_result.page, search_result.word_locations.len(), page.word_count, search_result.word_frequency}, - ); + debug!({ + format! {"Page {} has {} matching terms (in {} total words), giving the word frequency {:?}", search_result.page, search_result.word_locations.len(), page.word_count, search_result.word_frequency} + }); pages.push(search_result); }
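
For reference, a minimal usage sketch of the filtering support introduced by this patch, pieced together from the feature tests and the search.js stub above. The filter names, values, and URLs are the ones used in the test fixtures; this is an illustration of the intended flow under those assumptions, not final API documentation.

    // Content is tagged for filtering at index time, e.g.:
    //   <h1 data-pagefind-filter="title">Cat Post.</h1>            (element text becomes the value)
    //   <span data-pagefind-filter="animal">cats</span>
    //   <p data-pagefind-filter="color:White">A white cat</p>      (explicit value after the colon)

    // At search time, a `filters` object is passed alongside the term:
    const pagefind = await import("/_pagefind/pagefind.js");

    const results = await pagefind.search("cat", {
        filters: { color: "Orange" },
        verbose: true, // logs result counts and timings to the console
    });

    // Each result lazily loads its fragment data on request.
    const data = await Promise.all(results.map(result => result.data()));
    console.log(data.map(d => d.url)); // e.g. ["/theodore/"] in the fixture site

Filter values are intersected with the text results as a logical AND; multi-value (OR) filtering and non-existent filter/value handling remain marked @skip in filtering.feature until the boolean logic is restructured.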