From 456bf113009a3b5f7983dda5ded7627f70eecf71 Mon Sep 17 00:00:00 2001
From: Liam Bigelow <40188355+bglw@users.noreply.github.com>
Date: Tue, 24 May 2022 16:19:21 +1200
Subject: [PATCH] Rank search results by crude word frequency

---
 pagefind/features/scoring.feature    | 49 +++++++++++++--
 pagefind/src/fossick/mod.rs          |  1 +
 pagefind/src/fragments/mod.rs        |  1 +
 pagefind/src/index/index_metadata.rs | 10 +++-
 pagefind/src/index/mod.rs            | 13 +++-
 pagefind/src/output/stubs/search.js  |  2 +-
 pagefind/tests/browser.rs            | 69 ++++++++++++++++-----
 pagefind/tests/steps/web_steps.rs    | 12 ++++
 pagefind_web/local_build.sh          |  4 ++
 pagefind_web/local_debug_build.sh    |  3 +
 pagefind_web/src/lib.rs              | 90 +++++++++------------------
 pagefind_web/src/metadata.rs         | 12 ++--
 pagefind_web/src/search.rs           | 86 ++++++++++++++++++++++++++
 13 files changed, 263 insertions(+), 89 deletions(-)
 create mode 100755 pagefind_web/local_debug_build.sh
 create mode 100644 pagefind_web/src/search.rs

diff --git a/pagefind/features/scoring.feature b/pagefind/features/scoring.feature
index 6f52e345..90e12a9f 100644
--- a/pagefind/features/scoring.feature
+++ b/pagefind/features/scoring.feature
@@ -1,23 +1,62 @@
-@skip
 Feature: Result Scoring
-
-    Scenario: Search terms in close proximity rank higher in results
+    Background:
+        Given I have a "public/index.html" file with the content:
+            """
+            <p data-count>Nothing</p>
+            <p data-result>Nothing</p>
+            """
         Given I have a "public/cat/index.html" file with the content:
             """
-            <body>
-                <p>Happy cats post, that later mentions dogs</p>
-            </body>
+            <body>
+                <p>Happy cat post, that later mentions dogs in the context of cats</p>
+            </body>
             """
         Given I have a "public/dog/index.html" file with the content:
             """
-            <body>
-                <p>A post about dogs vs cats</p>
-            </body>
+            <body>
+                <p>A post about dogs vs cats (but mainly dogs)</p>
""" When I run my program Then I should see "Running Pagefind" in stdout When I serve the "public" directory When I load "/" + + Scenario: Search results are ranked by word frequency + When I evaluate: + """ + async function() { + let pagefind = await import("/_pagefind/pagefind.js"); + + let results = await pagefind.search(`cat`); + + document.querySelector('[data-count]').innerText = `${results.length} result(s)`; + let data = await Promise.all(results.map(result => result.data())); + document.querySelector('[data-result]').innerText = data.map(d => d.url).join(', '); + } + """ + Then There should be no logs + Then The selector "[data-count]" should contain "2 result(s)" + Then The selector "[data-result]" should contain "/cat/, /dog/" + When I evaluate: + """ + async function() { + let pagefind = await import("/_pagefind/pagefind.js"); + + let results = await pagefind.search(`dog`); + + document.querySelector('[data-count]').innerText = `${results.length} result(s)`; + let data = await Promise.all(results.map(result => result.data())); + document.querySelector('[data-result]').innerText = data.map(d => d.url).join(', '); + } + """ + Then The selector "[data-count]" should contain "2 result(s)" + Then The selector "[data-result]" should contain "/dog/, /cat/" + + @skip + Scenario: Search terms in close proximity rank higher in results When I evaluate: """ async function() { diff --git a/pagefind/src/fossick/mod.rs b/pagefind/src/fossick/mod.rs index 99a7f877..45fd2428 100644 --- a/pagefind/src/fossick/mod.rs +++ b/pagefind/src/fossick/mod.rs @@ -202,6 +202,7 @@ impl Fossicker { title: self.title.clone(), content: self.digest.clone(), attributes: HashMap::new(), + word_count: word_data.len(), }, }, word_data, diff --git a/pagefind/src/fragments/mod.rs b/pagefind/src/fragments/mod.rs index 6fa5e00a..3b0a2688 100644 --- a/pagefind/src/fragments/mod.rs +++ b/pagefind/src/fragments/mod.rs @@ -7,6 +7,7 @@ pub struct PageFragmentData { pub url: String, pub title: String, pub content: String, + pub word_count: usize, pub attributes: HashMap, } diff --git a/pagefind/src/index/index_metadata.rs b/pagefind/src/index/index_metadata.rs index 95026077..16be6d71 100644 --- a/pagefind/src/index/index_metadata.rs +++ b/pagefind/src/index/index_metadata.rs @@ -8,7 +8,7 @@ pub struct MetaIndex { #[n(0)] pub version: String, #[n(1)] - pub pages: Vec, + pub pages: Vec, #[n(2)] pub stops: Vec, #[n(3)] @@ -26,3 +26,11 @@ pub struct MetaChunk { #[n(2)] pub hash: String, } + +#[derive(Encode)] +pub struct MetaPage { + #[n(0)] + pub hash: String, + #[n(1)] + pub word_count: u32, +} diff --git a/pagefind/src/index/mod.rs b/pagefind/src/index/mod.rs index 54abbd01..6c007911 100644 --- a/pagefind/src/index/mod.rs +++ b/pagefind/src/index/mod.rs @@ -1,7 +1,7 @@ use hashbrown::HashMap; use crate::{fossick::FossickedData, fragments::PageFragment, utils::full_hash, SearchOptions}; -use index_metadata::{MetaChunk, MetaIndex}; +use index_metadata::{MetaChunk, MetaIndex, MetaPage}; use index_words::{PackedPage, PackedWord, WordIndex}; mod index_metadata; @@ -65,9 +65,16 @@ where } } - meta.pages = fragments.keys().cloned().collect(); + meta.pages = fragments + .iter() + .map(|(hash, fragment)| MetaPage { + hash: hash.clone(), + word_count: fragment.data.word_count as u32, + }) + .collect(); + meta.pages - .sort_by_cached_key(|p| fragments.get(p).unwrap().page_number); + .sort_by_cached_key(|p| fragments.get(&p.hash).unwrap().page_number); if TryInto::::try_into(meta.pages.len()).is_err() { panic!("Too many documents to 
index"); diff --git a/pagefind/src/output/stubs/search.js b/pagefind/src/output/stubs/search.js index 22e2f42b..b68dc72d 100644 --- a/pagefind/src/output/stubs/search.js +++ b/pagefind/src/output/stubs/search.js @@ -103,7 +103,7 @@ class Pagefind { } }); - console.log(`Found ${results.length} result${results.length == 1 ? '' : 's'} for "${term}" in ${Date.now() - searchStart}ms (${Date.now() - start}ms realtime)`); + // console.log(`Found ${results.length} result${results.length == 1 ? '' : 's'} for "${term}" in ${Date.now() - searchStart}ms (${Date.now() - start}ms realtime)`); return resultsInterface; } } diff --git a/pagefind/tests/browser.rs b/pagefind/tests/browser.rs index 88349ed3..207b869e 100644 --- a/pagefind/tests/browser.rs +++ b/pagefind/tests/browser.rs @@ -1,3 +1,5 @@ +use std::sync::{Arc, Mutex}; + use chromiumoxide::cdp::browser_protocol::log::EventEntryAdded; use chromiumoxide::listeners::EventStream; use futures::{StreamExt, TryFutureExt}; @@ -9,7 +11,7 @@ use chromiumoxide::page::Page; pub struct BrowserTester { browser: Browser, page: Option, - logs: Option>, + log_events: Arc>>, } impl BrowserTester { @@ -26,15 +28,45 @@ impl BrowserTester { Self { browser, page: None, - logs: None, + log_events: Arc::new(Mutex::new(Vec::new())), } } pub async fn load_page(&mut self, url: &str) -> Result<(), Box> { let page = self.page.insert(self.browser.new_page(url).await?); - let events = page.event_listener::().await?; - self.logs = Some(events); + let console_override = vec![ + "function() {", + "const c = console; c.events = [];", + "let l = [c.log, c.warn, c.error, c.debug].map(e => e.bind(c));", + "let p = (m, a) => c.events.push(`${m}: ${Array.from(a).join(' ')}`)", + "c.log = function(){ l[0].apply(c, arguments); p('LOG', arguments); }", + "c.warn = function(){ l[1].apply(c, arguments); p('WRN', arguments); }", + "c.error = function(){ l[2].apply(c, arguments); p('ERR', arguments); }", + "c.debug = function(){ l[3].apply(c, arguments); p('DBG', arguments); }", + "}", + ] + .join("\n"); + + let _ = page.evaluate_function(console_override).await?; + + // TODO: This block isn't working + // https://github.com/mattsse/chromiumoxide/issues/91 + let mut events = page + .event_listener::() + .await?; + + let event_list = Arc::clone(&self.log_events); + let _handle = tokio::task::spawn(async move { + loop { + let event = events.next().await; + if let Some(event) = event { + event_list.lock().unwrap().push(format!("{:#?}", event)); + } + panic!("This block was broken, but now seems to be working? Remove the console override hack 🙂 "); + } + }); + // END TODO Ok(()) } @@ -77,14 +109,23 @@ impl BrowserTester { .await?; Ok(()) } - // pub async fn eval(&mut self, js: &str) -> Result> { - // let result: String = self - // .page - // .as_mut() - // .expect("No page launched") - // .evaluate_function(js) - // .await? - // .into_value()?; - // Ok(result) - // } + + pub async fn get_logs(&mut self) -> Result, Box> { + let res = self + .page + .as_mut() + .expect("No page launched") + .evaluate_function("() => console.events") + .await? 
+            .into_value::<Vec<String>>();
+
+        if let Ok(logs) = res {
+            Ok(logs)
+        } else {
+            panic!("Couldn't load logs from the browser");
+        }
+
+        // TODO: This is the real method that should be working:
+        // Ok(self.log_events.lock().unwrap().iter().cloned().collect())
+    }
 }
diff --git a/pagefind/tests/steps/web_steps.rs b/pagefind/tests/steps/web_steps.rs
index 66ccfaca..c2bb910a 100644
--- a/pagefind/tests/steps/web_steps.rs
+++ b/pagefind/tests/steps/web_steps.rs
@@ -57,3 +57,15 @@ async fn selector_contains(world: &mut TestWorld, selector: String, contents: St
         .expect("Selector does not exist");
     assert_eq!(found_contents, contents);
 }
+
+#[then(regex = "^There should be no logs$")]
+async fn no_logs(world: &mut TestWorld) {
+    let browser = world.ensure_browser().await;
+    let logs = browser.get_logs().await.expect("Page is loaded");
+    if !logs.is_empty() {
+        panic!(
+            "No logs were expected, but logs were found:\n\n{}",
+            logs.join("\n")
+        );
+    }
+}
diff --git a/pagefind_web/local_build.sh b/pagefind_web/local_build.sh
index 1e671d81..60c6d8e9 100755
--- a/pagefind_web/local_build.sh
+++ b/pagefind_web/local_build.sh
@@ -1,7 +1,11 @@
 #!/usr/bin/env bash

 rm ../pagefind/vendor/*
+if [ "$1" = "debug" ]; then
+wasm-pack build --debug -t no-modules
+else
 wasm-pack build --release -t no-modules
+fi
 mkdir -p ../pagefind/vendor
 cp pkg/pagefind_web_bg.wasm ../pagefind/vendor/pagefind_web_bg.0.0.0.wasm
 cp pkg/pagefind_web.js ../pagefind/vendor/pagefind_web.0.0.0.js
diff --git a/pagefind_web/local_debug_build.sh b/pagefind_web/local_debug_build.sh
new file mode 100755
index 00000000..bc888809
--- /dev/null
+++ b/pagefind_web/local_debug_build.sh
@@ -0,0 +1,3 @@
+#!/usr/bin/env bash
+
+./local_build.sh debug
diff --git a/pagefind_web/src/lib.rs b/pagefind_web/src/lib.rs
index 3dd4220e..2c38451e 100644
--- a/pagefind_web/src/lib.rs
+++ b/pagefind_web/src/lib.rs
@@ -13,6 +13,7 @@ use wasm_bindgen::prelude::*;
 mod excerpt;
 mod index;
 mod metadata;
+mod search;
 mod util;

 pub struct PageWord {
@@ -26,10 +27,15 @@ pub struct IndexChunk {
     hash: String,
 }

+pub struct Page {
+    hash: String,
+    word_count: u32,
+}
+
 pub struct SearchIndex {
     web_version: &'static str,
     generator_version: Option<String>,
-    pages: Vec<String>,
+    pages: Vec<Page>,
     chunks: Vec<IndexChunk>,
     stops: Vec<String>,
     words: HashMap<String, Vec<PageWord>>,
@@ -120,71 +126,33 @@ pub fn search(ptr: *mut SearchIndex, query: &str) -> String {
         }
     }

-    let terms = query.split(' ');
-    // TODO: i18n
-    let en_stemmer = Stemmer::create(Algorithm::English);
+    let results = search_index.search_term(query);
+
+    let result_string = results
+        .into_iter()
+        .map(|result| {
+            format!(
+                "{}@{},{}@{}",
+                &result.page,
+                calculate_excerpt(&result.word_locations, 30),
+                30,
+                result
+                    .word_locations
+                    .iter()
+                    .map(|l| l.to_string())
+                    .collect::<Vec<String>>()
+                    .join(",")
+            )
+        })
+        .collect::<Vec<String>>()
+        .join(" ");

-    #[cfg(debug_assertions)]
-    debug_log(&format! {"Searching {:?}", query});
{"Searching {:?}", query}); - - let mut maps = Vec::new(); - let mut words = Vec::new(); - for term in terms { - let term = en_stemmer.stem(term).into_owned(); - if let Some(word_index) = search_index.words.get(&term) { - words.extend(word_index); - let mut set = BitSet::new(); - for page in word_index { - set.insert(page.page as usize); - } - maps.push(set); - } - } - - let mut maps = maps.drain(..); - let mut results = if let Some(map) = maps.next() { - map - } else { - let _ = Box::into_raw(search_index); - return "".into(); - }; - - for map in maps { - results.intersect_with(&map); - } - - let mut pages: Vec = vec![]; - - for page in results.iter() { - let locs: Vec = words - .iter() - .filter_map(|p| { - if p.page as usize == page { - Some(p.locs.clone()) - } else { - None - } - }) - .flatten() - .collect(); - pages.push(format!( - "{}@{},{}@{}", - &search_index.pages[page], - calculate_excerpt(&locs, 30), - 30, - locs.iter() - .map(|l| l.to_string()) - .collect::>() - .join(",") - )); - } - let o = pages.join(" "); let _ = Box::into_raw(search_index); #[cfg(debug_assertions)] - debug_log(&format! {"{:?}", o}); + debug_log(&format! {"{:?}", result_string}); - o + result_string } #[cfg(test)] diff --git a/pagefind_web/src/metadata.rs b/pagefind_web/src/metadata.rs index bf202d94..dd05b33c 100644 --- a/pagefind_web/src/metadata.rs +++ b/pagefind_web/src/metadata.rs @@ -1,5 +1,5 @@ use super::{IndexChunk, SearchIndex}; -use crate::util::*; +use crate::{util::*, Page}; use minicbor::{decode, Decoder}; /* @@ -29,12 +29,16 @@ impl SearchIndex { debug!({ "Reading version number" }); self.generator_version = Some(consume_string!(decoder)); - debug!({ "Reading page hashes array" }); + debug!({ "Reading pages array" }); let page_hashes = consume_arr_len!(decoder); - debug!({ format!("Reading {:#?} page hashes", page_hashes) }); + debug!({ format!("Reading {:#?} pages", page_hashes) }); self.pages = Vec::with_capacity(page_hashes as usize); for _ in 0..page_hashes { - self.pages.push(consume_string!(decoder)); + consume_fixed_arr!(decoder); + self.pages.push(Page { + hash: consume_string!(decoder), + word_count: consume_num!(decoder), + }); } debug!({ "Reading stop words array" }); diff --git a/pagefind_web/src/search.rs b/pagefind_web/src/search.rs new file mode 100644 index 00000000..a804c87f --- /dev/null +++ b/pagefind_web/src/search.rs @@ -0,0 +1,86 @@ +use bit_set::BitSet; +use rust_stemmers::{Algorithm, Stemmer}; // TODO: too big, Stemming should be performed on the JS side + +#[cfg(debug_assertions)] +use crate::debug_log; +use crate::SearchIndex; + +pub struct PageSearchResult { + pub page: String, + pub word_frequency: f32, // TODO: tf-idf implementation? Paired with the dictionary-in-meta approach + pub word_locations: Vec, +} + +impl SearchIndex { + pub fn search_term(&self, term: &str) -> Vec { + let terms = term.split(' '); + // TODO: i18n + // TODO: Stemming should be performed on the JS side of the boundary + // As the snowball implementation there seems a lot smaller and just as fast. + let en_stemmer = Stemmer::create(Algorithm::English); + + #[cfg(debug_assertions)] + debug_log(&format! 
{"Searching {:?}", term}); + + let mut maps = Vec::new(); + let mut words = Vec::new(); + for term in terms { + let term = en_stemmer.stem(term).into_owned(); // TODO: Remove this once JS stems + if let Some(word_index) = self.words.get(&term) { + words.extend(word_index); + let mut set = BitSet::new(); + for page in word_index { + set.insert(page.page as usize); + } + maps.push(set); + } + } + + let mut maps = maps.drain(..); + let mut results = if let Some(map) = maps.next() { + map + } else { + return vec![]; + // let _ = Box::into_raw(search_index); + // return "".into(); + }; + + for map in maps { + results.intersect_with(&map); + } + + let mut pages: Vec = vec![]; + + for page in results.iter() { + let word_locations: Vec = words + .iter() + .filter_map(|p| { + if p.page as usize == page { + Some(p.locs.clone()) + } else { + None + } + }) + .flatten() + .collect(); + + let page = &self.pages[page]; + let search_result = PageSearchResult { + page: page.hash.clone(), + word_frequency: word_locations.len() as f32 / page.word_count as f32, + word_locations, + }; + + #[cfg(debug_assertions)] + debug_log( + &format! {"Page {} has {} matching terms (in {} total words), giving the word frequency {:?}", search_result.page, search_result.word_locations.len(), page.word_count, search_result.word_frequency}, + ); + + pages.push(search_result); + } + + pages.sort_by(|a, b| b.word_frequency.partial_cmp(&a.word_frequency).unwrap()); + + pages + } +}