diff --git a/pagefind/features/scoring.feature b/pagefind/features/scoring.feature
index 6f52e345..90e12a9f 100644
--- a/pagefind/features/scoring.feature
+++ b/pagefind/features/scoring.feature
@@ -1,23 +1,62 @@
-@skip
Feature: Result Scoring
-
- Scenario: Search terms in close proximity rank higher in results
+ Background:
+ Given I have a "public/index.html" file with the content:
+ """
+ <p data-count></p><p data-result></p>
+ """
Given I have a "public/cat/index.html" file with the content:
"""
- Happy cats post, that later mentions dogs
+ Happy cat post, that later mentions dogs in the context of cats
"""
Given I have a "public/dog/index.html" file with the content:
"""
- A post about dogs vs cats
+ A post about dogs vs cats (but mainly dogs)
"""
When I run my program
Then I should see "Running Pagefind" in stdout
When I serve the "public" directory
When I load "/"
+
+ Scenario: Search results are ranked by word frequency
+ When I evaluate:
+ """
+ async function() {
+ let pagefind = await import("/_pagefind/pagefind.js");
+
+ let results = await pagefind.search(`cat`);
+
+ document.querySelector('[data-count]').innerText = `${results.length} result(s)`;
+ let data = await Promise.all(results.map(result => result.data()));
+ document.querySelector('[data-result]').innerText = data.map(d => d.url).join(', ');
+ }
+ """
+ Then There should be no logs
+ Then The selector "[data-count]" should contain "2 result(s)"
+ Then The selector "[data-result]" should contain "/cat/, /dog/"
+ When I evaluate:
+ """
+ async function() {
+ let pagefind = await import("/_pagefind/pagefind.js");
+
+ let results = await pagefind.search(`dog`);
+
+ document.querySelector('[data-count]').innerText = `${results.length} result(s)`;
+ let data = await Promise.all(results.map(result => result.data()));
+ document.querySelector('[data-result]').innerText = data.map(d => d.url).join(', ');
+ }
+ """
+ Then The selector "[data-count]" should contain "2 result(s)"
+ Then The selector "[data-result]" should contain "/dog/, /cat/"
+
+ @skip
+ Scenario: Search terms in close proximity rank higher in results
When I evaluate:
"""
async function() {
diff --git a/pagefind/src/fossick/mod.rs b/pagefind/src/fossick/mod.rs
index 99a7f877..45fd2428 100644
--- a/pagefind/src/fossick/mod.rs
+++ b/pagefind/src/fossick/mod.rs
@@ -202,6 +202,7 @@ impl Fossicker {
title: self.title.clone(),
content: self.digest.clone(),
attributes: HashMap::new(),
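+ // Total number of indexed words on the page, used as the denominator when scoring by word frequency.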
+ word_count: word_data.len(),
},
},
word_data,
diff --git a/pagefind/src/fragments/mod.rs b/pagefind/src/fragments/mod.rs
index 6fa5e00a..3b0a2688 100644
--- a/pagefind/src/fragments/mod.rs
+++ b/pagefind/src/fragments/mod.rs
@@ -7,6 +7,7 @@ pub struct PageFragmentData {
pub url: String,
pub title: String,
pub content: String,
+ pub word_count: usize,
pub attributes: HashMap<String, String>,
}
diff --git a/pagefind/src/index/index_metadata.rs b/pagefind/src/index/index_metadata.rs
index 95026077..16be6d71 100644
--- a/pagefind/src/index/index_metadata.rs
+++ b/pagefind/src/index/index_metadata.rs
@@ -8,7 +8,7 @@ pub struct MetaIndex {
#[n(0)]
pub version: String,
#[n(1)]
- pub pages: Vec<String>,
+ pub pages: Vec<MetaPage>,
#[n(2)]
pub stops: Vec<String>,
#[n(3)]
@@ -26,3 +26,11 @@ pub struct MetaChunk {
#[n(2)]
pub hash: String,
}
+
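+/// Per-page index metadata: the fragment hash plus the page's total word
+/// count, which the web search uses to turn match counts into a frequency score.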
+#[derive(Encode)]
+pub struct MetaPage {
+ #[n(0)]
+ pub hash: String,
+ #[n(1)]
+ pub word_count: u32,
+}
diff --git a/pagefind/src/index/mod.rs b/pagefind/src/index/mod.rs
index 54abbd01..6c007911 100644
--- a/pagefind/src/index/mod.rs
+++ b/pagefind/src/index/mod.rs
@@ -1,7 +1,7 @@
use hashbrown::HashMap;
use crate::{fossick::FossickedData, fragments::PageFragment, utils::full_hash, SearchOptions};
-use index_metadata::{MetaChunk, MetaIndex};
+use index_metadata::{MetaChunk, MetaIndex, MetaPage};
use index_words::{PackedPage, PackedWord, WordIndex};
mod index_metadata;
@@ -65,9 +65,16 @@ where
}
}
- meta.pages = fragments.keys().cloned().collect();
+ meta.pages = fragments
+ .iter()
+ .map(|(hash, fragment)| MetaPage {
+ hash: hash.clone(),
+ word_count: fragment.data.word_count as u32,
+ })
+ .collect();
+
meta.pages
- .sort_by_cached_key(|p| fragments.get(p).unwrap().page_number);
+ .sort_by_cached_key(|p| fragments.get(&p.hash).unwrap().page_number);
if TryInto::<u32>::try_into(meta.pages.len()).is_err() {
panic!("Too many documents to index");
diff --git a/pagefind/src/output/stubs/search.js b/pagefind/src/output/stubs/search.js
index 22e2f42b..b68dc72d 100644
--- a/pagefind/src/output/stubs/search.js
+++ b/pagefind/src/output/stubs/search.js
@@ -103,7 +103,7 @@ class Pagefind {
}
});
- console.log(`Found ${results.length} result${results.length == 1 ? '' : 's'} for "${term}" in ${Date.now() - searchStart}ms (${Date.now() - start}ms realtime)`);
+ // console.log(`Found ${results.length} result${results.length == 1 ? '' : 's'} for "${term}" in ${Date.now() - searchStart}ms (${Date.now() - start}ms realtime)`);
return resultsInterface;
}
}
diff --git a/pagefind/tests/browser.rs b/pagefind/tests/browser.rs
index 88349ed3..207b869e 100644
--- a/pagefind/tests/browser.rs
+++ b/pagefind/tests/browser.rs
@@ -1,3 +1,5 @@
+use std::sync::{Arc, Mutex};
+
use chromiumoxide::cdp::browser_protocol::log::EventEntryAdded;
use chromiumoxide::listeners::EventStream;
use futures::{StreamExt, TryFutureExt};
@@ -9,7 +11,7 @@ use chromiumoxide::page::Page;
pub struct BrowserTester {
browser: Browser,
page: Option<Page>,
- logs: Option<EventStream<EventEntryAdded>>,
+ log_events: Arc<Mutex<Vec<String>>>,
}
impl BrowserTester {
@@ -26,15 +28,45 @@ impl BrowserTester {
Self {
browser,
page: None,
- logs: None,
+ log_events: Arc::new(Mutex::new(Vec::new())),
}
}
pub async fn load_page(&mut self, url: &str) -> Result<(), Box<dyn std::error::Error>> {
let page = self.page.insert(self.browser.new_page(url).await?);
- let events = page.event_listener::<EventEntryAdded>().await?;
- self.logs = Some(events);
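+ // Patch the page's console so that every log call is also recorded into
+ // `console.events`, which get_logs() below reads back out of the page.
+ // This works around the broken CDP event listener noted in the TODO below.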
+ let console_override = vec![
+ "function() {",
+ "const c = console; c.events = [];",
+ "let l = [c.log, c.warn, c.error, c.debug].map(e => e.bind(c));",
+ "let p = (m, a) => c.events.push(`${m}: ${Array.from(a).join(' ')}`)",
+ "c.log = function(){ l[0].apply(c, arguments); p('LOG', arguments); }",
+ "c.warn = function(){ l[1].apply(c, arguments); p('WRN', arguments); }",
+ "c.error = function(){ l[2].apply(c, arguments); p('ERR', arguments); }",
+ "c.debug = function(){ l[3].apply(c, arguments); p('DBG', arguments); }",
+ "}",
+ ]
+ .join("\n");
+
+ let _ = page.evaluate_function(console_override).await?;
+
+ // TODO: This block isn't working
+ // https://github.com/mattsse/chromiumoxide/issues/91
+ let mut events = page
+ .event_listener::<EventEntryAdded>()
+ .await?;
+
+ let event_list = Arc::clone(&self.log_events);
+ let _handle = tokio::task::spawn(async move {
+ loop {
+ let event = events.next().await;
+ if let Some(event) = event {
+ event_list.lock().unwrap().push(format!("{:#?}", event));
+ }
+ panic!("This block was broken, but now seems to be working? Remove the console override hack 🙂 ");
+ }
+ });
+ // END TODO
Ok(())
}
@@ -77,14 +109,23 @@ impl BrowserTester {
.await?;
Ok(())
}
- // pub async fn eval(&mut self, js: &str) -> Result<String, Box<dyn std::error::Error>> {
- // let result: String = self
- // .page
- // .as_mut()
- // .expect("No page launched")
- // .evaluate_function(js)
- // .await?
- // .into_value()?;
- // Ok(result)
- // }
+
+ pub async fn get_logs(&mut self) -> Result<Vec<String>, Box<dyn std::error::Error>> {
+ let res = self
+ .page
+ .as_mut()
+ .expect("No page launched")
+ .evaluate_function("() => console.events")
+ .await?
+ .into_value::<Vec<String>>();
+
+ if let Ok(logs) = res {
+ Ok(logs)
+ } else {
+ panic!("Couldn't load logs from the browser");
+ }
+
+ // TODO: This is the real method that should be working:
+ // Ok(self.log_events.lock().unwrap().iter().cloned().collect())
+ }
}
diff --git a/pagefind/tests/steps/web_steps.rs b/pagefind/tests/steps/web_steps.rs
index 66ccfaca..c2bb910a 100644
--- a/pagefind/tests/steps/web_steps.rs
+++ b/pagefind/tests/steps/web_steps.rs
@@ -57,3 +57,15 @@ async fn selector_contains(world: &mut TestWorld, selector: String, contents: St
.expect("Selector does not exist");
assert_eq!(found_contents, contents);
}
+
+#[then(regex = "^There should be no logs$")]
+async fn no_logs(world: &mut TestWorld) {
+ let browser = world.ensure_browser().await;
+ let logs = browser.get_logs().await.expect("Page is loaded");
+ if !logs.is_empty() {
+ panic!(
+ "No logs were expected, but logs were found:\n\n{}",
+ logs.join("\n")
+ );
+ }
+}
diff --git a/pagefind_web/local_build.sh b/pagefind_web/local_build.sh
index 1e671d81..60c6d8e9 100755
--- a/pagefind_web/local_build.sh
+++ b/pagefind_web/local_build.sh
@@ -1,7 +1,11 @@
#!/usr/bin/env bash
rm ../pagefind/vendor/*
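+# Pass "debug" as the first argument for an unoptimised debug build (see local_debug_build.sh).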
+if [ "$1" = "debug" ]; then
+wasm-pack build --debug -t no-modules
+else
wasm-pack build --release -t no-modules
+fi
mkdir -p ../pagefind/vendor
cp pkg/pagefind_web_bg.wasm ../pagefind/vendor/pagefind_web_bg.0.0.0.wasm
cp pkg/pagefind_web.js ../pagefind/vendor/pagefind_web.0.0.0.js
diff --git a/pagefind_web/local_debug_build.sh b/pagefind_web/local_debug_build.sh
new file mode 100755
index 00000000..bc888809
--- /dev/null
+++ b/pagefind_web/local_debug_build.sh
@@ -0,0 +1,3 @@
+#!/usr/bin/env bash
+
+./local_build.sh debug
diff --git a/pagefind_web/src/lib.rs b/pagefind_web/src/lib.rs
index 3dd4220e..2c38451e 100644
--- a/pagefind_web/src/lib.rs
+++ b/pagefind_web/src/lib.rs
@@ -13,6 +13,7 @@ use wasm_bindgen::prelude::*;
mod excerpt;
mod index;
mod metadata;
+mod search;
mod util;
pub struct PageWord {
@@ -26,10 +27,15 @@ pub struct IndexChunk {
hash: String,
}
+pub struct Page {
+ hash: String,
+ word_count: u32,
+}
+
pub struct SearchIndex {
web_version: &'static str,
generator_version: Option<String>,
- pages: Vec<String>,
+ pages: Vec<Page>,
chunks: Vec<IndexChunk>,
stops: Vec<String>,
words: HashMap<String, Vec<PageWord>>,
@@ -120,71 +126,33 @@ pub fn search(ptr: *mut SearchIndex, query: &str) -> String {
}
}
- let terms = query.split(' ');
- // TODO: i18n
- let en_stemmer = Stemmer::create(Algorithm::English);
+ let results = search_index.search_term(query);
+
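+ // Flatten each result into a "hash@excerpt_start,excerpt_len@loc,loc,…" string,
+ // space-joined, for the JS side of the boundary to parse.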
+ let result_string = results
+ .into_iter()
+ .map(|result| {
+ format!(
+ "{}@{},{}@{}",
+ &result.page,
+ calculate_excerpt(&result.word_locations, 30),
+ 30,
+ result
+ .word_locations
+ .iter()
+ .map(|l| l.to_string())
+ .collect::<Vec<String>>()
+ .join(",")
+ )
+ })
+ .collect::<Vec<String>>()
+ .join(" ");
- #[cfg(debug_assertions)]
- debug_log(&format! {"Searching {:?}", query});
-
- let mut maps = Vec::new();
- let mut words = Vec::new();
- for term in terms {
- let term = en_stemmer.stem(term).into_owned();
- if let Some(word_index) = search_index.words.get(&term) {
- words.extend(word_index);
- let mut set = BitSet::new();
- for page in word_index {
- set.insert(page.page as usize);
- }
- maps.push(set);
- }
- }
-
- let mut maps = maps.drain(..);
- let mut results = if let Some(map) = maps.next() {
- map
- } else {
- let _ = Box::into_raw(search_index);
- return "".into();
- };
-
- for map in maps {
- results.intersect_with(&map);
- }
-
- let mut pages: Vec<String> = vec![];
-
- for page in results.iter() {
- let locs: Vec<u32> = words
- .iter()
- .filter_map(|p| {
- if p.page as usize == page {
- Some(p.locs.clone())
- } else {
- None
- }
- })
- .flatten()
- .collect();
- pages.push(format!(
- "{}@{},{}@{}",
- &search_index.pages[page],
- calculate_excerpt(&locs, 30),
- 30,
- locs.iter()
- .map(|l| l.to_string())
- .collect::<Vec<String>>()
- .join(",")
- ));
- }
- let o = pages.join(" ");
let _ = Box::into_raw(search_index);
#[cfg(debug_assertions)]
- debug_log(&format! {"{:?}", o});
+ debug_log(&format! {"{:?}", result_string});
- o
+ result_string
}
#[cfg(test)]
diff --git a/pagefind_web/src/metadata.rs b/pagefind_web/src/metadata.rs
index bf202d94..dd05b33c 100644
--- a/pagefind_web/src/metadata.rs
+++ b/pagefind_web/src/metadata.rs
@@ -1,5 +1,5 @@
use super::{IndexChunk, SearchIndex};
-use crate::util::*;
+use crate::{util::*, Page};
use minicbor::{decode, Decoder};
/*
@@ -29,12 +29,16 @@ impl SearchIndex {
debug!({ "Reading version number" });
self.generator_version = Some(consume_string!(decoder));
- debug!({ "Reading page hashes array" });
+ debug!({ "Reading pages array" });
let page_hashes = consume_arr_len!(decoder);
- debug!({ format!("Reading {:#?} page hashes", page_hashes) });
+ debug!({ format!("Reading {:#?} pages", page_hashes) });
self.pages = Vec::with_capacity(page_hashes as usize);
for _ in 0..page_hashes {
- self.pages.push(consume_string!(decoder));
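+ // Each page is now encoded as a fixed-length CBOR array of [hash, word_count].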
+ consume_fixed_arr!(decoder);
+ self.pages.push(Page {
+ hash: consume_string!(decoder),
+ word_count: consume_num!(decoder),
+ });
}
debug!({ "Reading stop words array" });
diff --git a/pagefind_web/src/search.rs b/pagefind_web/src/search.rs
new file mode 100644
index 00000000..a804c87f
--- /dev/null
+++ b/pagefind_web/src/search.rs
@@ -0,0 +1,86 @@
+use bit_set::BitSet;
+use rust_stemmers::{Algorithm, Stemmer}; // TODO: too big, Stemming should be performed on the JS side
+
+#[cfg(debug_assertions)]
+use crate::debug_log;
+use crate::SearchIndex;
+
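+/// A single page matched by a search, scored by how much of the page's
+/// content the matching terms account for.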
+pub struct PageSearchResult {
+ pub page: String,
+ pub word_frequency: f32, // TODO: tf-idf implementation? Paired with the dictionary-in-meta approach
+ pub word_locations: Vec<u32>,
+}
+
+impl SearchIndex {
+ pub fn search_term(&self, term: &str) -> Vec<PageSearchResult> {
+ let terms = term.split(' ');
+ // TODO: i18n
+ // TODO: Stemming should be performed on the JS side of the boundary
+ // As the snowball implementation there seems a lot smaller and just as fast.
+ let en_stemmer = Stemmer::create(Algorithm::English);
+
+ #[cfg(debug_assertions)]
+ debug_log(&format! {"Searching {:?}", term});
+
+ let mut maps = Vec::new();
+ let mut words = Vec::new();
+ for term in terms {
+ let term = en_stemmer.stem(term).into_owned(); // TODO: Remove this once JS stems
+ if let Some(word_index) = self.words.get(&term) {
+ words.extend(word_index);
+ let mut set = BitSet::new();
+ for page in word_index {
+ set.insert(page.page as usize);
+ }
+ maps.push(set);
+ }
+ }
+
+ let mut maps = maps.drain(..);
+ let mut results = if let Some(map) = maps.next() {
+ map
+ } else {
+ return vec![];
+ // let _ = Box::into_raw(search_index);
+ // return "".into();
+ };
+
+ for map in maps {
+ results.intersect_with(&map);
+ }
+
+ let mut pages: Vec<PageSearchResult> = vec![];
+
+ for page in results.iter() {
+ let word_locations: Vec<u32> = words
+ .iter()
+ .filter_map(|p| {
+ if p.page as usize == page {
+ Some(p.locs.clone())
+ } else {
+ None
+ }
+ })
+ .flatten()
+ .collect();
+
+ let page = &self.pages[page];
+ let search_result = PageSearchResult {
+ page: page.hash.clone(),
+ word_frequency: word_locations.len() as f32 / page.word_count as f32,
+ word_locations,
+ };
+
+ #[cfg(debug_assertions)]
+ debug_log(
+ &format! {"Page {} has {} matching terms (in {} total words), giving the word frequency {:?}", search_result.page, search_result.word_locations.len(), page.word_count, search_result.word_frequency},
+ );
+
+ pages.push(search_result);
+ }
+
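+ // Rank matches by descending word frequency, so pages where the search
+ // terms make up a larger share of the content surface first.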
+ pages.sort_by(|a, b| b.word_frequency.partial_cmp(&a.word_frequency).unwrap());
+
+ pages
+ }
+}