Rank search results by crude word frequency
bglw committed May 24, 2022
1 parent d1d4d59 commit 456bf11
Showing 13 changed files with 263 additions and 89 deletions.
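At a glance: each indexed page now records a word count, which travels from the fossicker through the fragment JSON and the binary metadata into the WebAssembly search index, where the old inline search in lib.rs is replaced by a dedicated search module that can rank pages. That module (pagefind_web/src/search.rs) is among the files that didn't load on this page, so the scoring itself has to be inferred — presumably matches on a page divided by its word_count, per the commit title; a hedged reconstruction is sketched at the end of the lib.rs diff below.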
49 changes: 44 additions & 5 deletions pagefind/features/scoring.feature
@@ -1,23 +1,62 @@
@skip
Feature: Result Scoring

Scenario: Search terms in close proximity rank higher in results
Background:
Given I have a "public/index.html" file with the content:
"""
<ul>
<li data-count>
<li data-result>
</ul>
"""
Given I have a "public/cat/index.html" file with the content:
"""
<body>
<h1>Happy cats post, that later mentions dogs</h1>
<h1>Happy cat post, that later mentions dogs in the context of cats</h1>
</body>
"""
Given I have a "public/dog/index.html" file with the content:
"""
<body>
<h1>A post about dogs vs cats</h1>
<h1>A post about dogs vs cats (but mainly dogs)</h1>
</body>
"""
When I run my program
Then I should see "Running Pagefind" in stdout
When I serve the "public" directory
When I load "/"

Scenario: Search results are ranked by word frequency
When I evaluate:
"""
async function() {
let pagefind = await import("/_pagefind/pagefind.js");
let results = await pagefind.search(`cat`);
document.querySelector('[data-count]').innerText = `${results.length} result(s)`;
let data = await Promise.all(results.map(result => result.data()));
document.querySelector('[data-result]').innerText = data.map(d => d.url).join(', ');
}
"""
Then There should be no logs
Then The selector "[data-count]" should contain "2 result(s)"
Then The selector "[data-result]" should contain "/cat/, /dog/"
When I evaluate:
"""
async function() {
let pagefind = await import("/_pagefind/pagefind.js");
let results = await pagefind.search(`dog`);
document.querySelector('[data-count]').innerText = `${results.length} result(s)`;
let data = await Promise.all(results.map(result => result.data()));
document.querySelector('[data-result]').innerText = data.map(d => d.url).join(', ');
}
"""
Then The selector "[data-count]" should contain "2 result(s)"
Then The selector "[data-result]" should contain "/dog/, /cat/"

@skip
Scenario: Search terms in close proximity rank higher in results
When I evaluate:
"""
async function() {
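
The fixtures above are tuned so that a length-normalized frequency score yields exactly the asserted ordering: for `cat`, the cat page matches about 2 of its ~12 words while the dog page matches 1 of its ~9 (give or take stemming and stopwords), and the `dog` query flips the ratios. A minimal sketch of such a score — a hypothetical helper for intuition, not code from this commit:

```rust
/// Crude frequency score: occurrences of a term on a page, normalized by
/// the page's total word count (the new `word_count` field added below).
/// Hypothetical helper; the real ranking lives in pagefind_web/src/search.rs,
/// which didn't load on this page.
fn crude_score(term_locations: &[u32], page_word_count: u32) -> f32 {
    term_locations.len() as f32 / page_word_count as f32
}
```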
1 change: 1 addition & 0 deletions pagefind/src/fossick/mod.rs
@@ -202,6 +202,7 @@ impl Fossicker {
title: self.title.clone(),
content: self.digest.clone(),
attributes: HashMap::new(),
word_count: word_data.len(),
},
},
word_data,
1 change: 1 addition & 0 deletions pagefind/src/fragments/mod.rs
@@ -7,6 +7,7 @@ pub struct PageFragmentData {
pub url: String,
pub title: String,
pub content: String,
pub word_count: usize,
pub attributes: HashMap<String, String>,
}

10 changes: 9 additions & 1 deletion pagefind/src/index/index_metadata.rs
@@ -8,7 +8,7 @@ pub struct MetaIndex {
#[n(0)]
pub version: String,
#[n(1)]
pub pages: Vec<String>,
pub pages: Vec<MetaPage>,
#[n(2)]
pub stops: Vec<String>,
#[n(3)]
@@ -26,3 +26,11 @@ pub struct MetaChunk {
#[n(2)]
pub hash: String,
}

#[derive(Encode)]
pub struct MetaPage {
#[n(0)]
pub hash: String,
#[n(1)]
pub word_count: u32,
}
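
The `#[n(…)]` attributes are minicbor-style numeric field indexes, so the WebAssembly side needs a decoder whose indexes line up. The decode half is in one of the files that didn't load here, so this is only a guess at its shape — a derive-based sketch mirroring the `Page` struct added to pagefind_web/src/lib.rs below:

```rust
use minicbor::Decode;

// Hypothetical decode counterpart to MetaPage. The #[n(…)] indexes must
// match the Encode side exactly, or hash and word_count would transpose.
#[derive(Decode)]
pub struct MetaPage {
    #[n(0)]
    pub hash: String,
    #[n(1)]
    pub word_count: u32,
}
```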
13 changes: 10 additions & 3 deletions pagefind/src/index/mod.rs
@@ -1,7 +1,7 @@
use hashbrown::HashMap;

use crate::{fossick::FossickedData, fragments::PageFragment, utils::full_hash, SearchOptions};
use index_metadata::{MetaChunk, MetaIndex};
use index_metadata::{MetaChunk, MetaIndex, MetaPage};
use index_words::{PackedPage, PackedWord, WordIndex};

mod index_metadata;
@@ -65,9 +65,16 @@
}
}

meta.pages = fragments.keys().cloned().collect();
meta.pages = fragments
.iter()
.map(|(hash, fragment)| MetaPage {
hash: hash.clone(),
word_count: fragment.data.word_count as u32,
})
.collect();

meta.pages
.sort_by_cached_key(|p| fragments.get(p).unwrap().page_number);
.sort_by_cached_key(|p| fragments.get(&p.hash).unwrap().page_number);

if TryInto::<u32>::try_into(meta.pages.len()).is_err() {
panic!("Too many documents to index");
2 changes: 1 addition & 1 deletion pagefind/src/output/stubs/search.js
@@ -103,7 +103,7 @@ class Pagefind {
}
});

console.log(`Found ${results.length} result${results.length == 1 ? '' : 's'} for "${term}" in ${Date.now() - searchStart}ms (${Date.now() - start}ms realtime)`);
// console.log(`Found ${results.length} result${results.length == 1 ? '' : 's'} for "${term}" in ${Date.now() - searchStart}ms (${Date.now() - start}ms realtime)`);
return resultsInterface;
}
}
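
Silencing rather than deleting this log looks deliberate: the browser harness below now captures every console call, and scoring.feature asserts `Then There should be no logs`, so any surviving `console.log` in the search stub would fail the suite.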
69 changes: 55 additions & 14 deletions pagefind/tests/browser.rs
@@ -1,3 +1,5 @@
use std::sync::{Arc, Mutex};

use chromiumoxide::cdp::browser_protocol::log::EventEntryAdded;
use chromiumoxide::listeners::EventStream;
use futures::{StreamExt, TryFutureExt};
@@ -9,7 +11,7 @@ use chromiumoxide::page::Page;
pub struct BrowserTester {
browser: Browser,
page: Option<Page>,
logs: Option<EventStream<EventEntryAdded>>,
log_events: Arc<Mutex<Vec<String>>>,
}

impl BrowserTester {
@@ -26,15 +28,45 @@ impl BrowserTester {
Self {
browser,
page: None,
logs: None,
log_events: Arc::new(Mutex::new(Vec::new())),
}
}

pub async fn load_page(&mut self, url: &str) -> Result<(), Box<dyn std::error::Error>> {
let page = self.page.insert(self.browser.new_page(url).await?);

let events = page.event_listener::<EventEntryAdded>().await?;
self.logs = Some(events);
let console_override = vec![
"function() {",
"const c = console; c.events = [];",
"let l = [c.log, c.warn, c.error, c.debug].map(e => e.bind(c));",
"let p = (m, a) => c.events.push(`${m}: ${Array.from(a).join(' ')}`)",
"c.log = function(){ l[0].apply(c, arguments); p('LOG', arguments); }",
"c.warn = function(){ l[1].apply(c, arguments); p('WRN', arguments); }",
"c.error = function(){ l[2].apply(c, arguments); p('ERR', arguments); }",
"c.debug = function(){ l[3].apply(c, arguments); p('DBG', arguments); }",
"}",
]
.join("\n");

let _ = page.evaluate_function(console_override).await?;

// TODO: This block isn't working
// https://github.com/mattsse/chromiumoxide/issues/91
let mut events = page
.event_listener::<chromiumoxide::cdp::browser_protocol::log::EventEntryAdded>()
.await?;

let event_list = Arc::clone(&self.log_events);
let _handle = tokio::task::spawn(async move {
loop {
let event = events.next().await;
if let Some(event) = event {
event_list.lock().unwrap().push(format!("{:#?}", event));
}
panic!("This block was broken, but now seems to be working? Remove the console override hack 🙂 ");
}
});
// END TODO

Ok(())
}
@@ -77,14 +109,23 @@ impl BrowserTester {
.await?;
Ok(())
}
// pub async fn eval(&mut self, js: &str) -> Result<String, Box<dyn std::error::Error>> {
// let result: String = self
// .page
// .as_mut()
// .expect("No page launched")
// .evaluate_function(js)
// .await?
// .into_value()?;
// Ok(result)
// }

pub async fn get_logs(&mut self) -> Result<Vec<String>, Box<dyn std::error::Error>> {
let res = self
.page
.as_mut()
.expect("No page launched")
.evaluate_function("() => console.events")
.await?
.into_value::<Vec<String>>();

if let Ok(logs) = res {
Ok(logs)
} else {
panic!("Couldn't load logs from the browser");
}

// TODO: This is the real method that should be working:
// Ok(self.log_events.lock().unwrap().iter().cloned().collect())
}
}
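
In short, the design here: chromiumoxide's CDP log listener (EventEntryAdded) wasn't delivering events (see the chromiumoxide issue 91 referenced above), so load_page injects a console override that mirrors log/warn/error/debug calls into an in-page `console.events` array, and get_logs() later reads that array back out with a second evaluate_function call. The Arc<Mutex<Vec<String>>> plumbing for the listener is kept in place so the hack can be deleted once the upstream issue is fixed.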
12 changes: 12 additions & 0 deletions pagefind/tests/steps/web_steps.rs
@@ -57,3 +57,15 @@ async fn selector_contains(world: &mut TestWorld, selector: String, contents: St
.expect("Selector does not exist");
assert_eq!(found_contents, contents);
}

#[then(regex = "^There should be no logs$")]
async fn no_logs(world: &mut TestWorld) {
let browser = world.ensure_browser().await;
let logs = browser.get_logs().await.expect("Page is loaded");
if !logs.is_empty() {
panic!(
"No logs were expected, but logs were found:\n\n{}",
logs.join("\n")
);
}
}
4 changes: 4 additions & 0 deletions pagefind_web/local_build.sh
@@ -1,7 +1,11 @@
#!/usr/bin/env bash

rm ../pagefind/vendor/*
if [ $1 = "debug" ]; then
wasm-pack build --debug -t no-modules
else
wasm-pack build --release -t no-modules
fi
mkdir -p ../pagefind/vendor
cp pkg/pagefind_web_bg.wasm ../pagefind/vendor/pagefind_web_bg.0.0.0.wasm
cp pkg/pagefind_web.js ../pagefind/vendor/pagefind_web.0.0.0.js
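
Two small notes on this script: `$1` is unquoted, so a bare `./local_build.sh` expands the test to `[ = "debug" ]`, prints a "unary operator expected" error, and falls through to the release branch — writing it as `[ "${1:-}" = "debug" ]` would make that intent explicit. Also, the `rm ../pagefind/vendor/*` runs before the `mkdir -p`, so the very first build emits a harmless "No such file or directory" complaint.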
3 changes: 3 additions & 0 deletions pagefind_web/local_debug_build.sh
@@ -0,0 +1,3 @@
#!/usr/bin/env bash

./local_build.sh debug
90 changes: 29 additions & 61 deletions pagefind_web/src/lib.rs
@@ -13,6 +13,7 @@ use wasm_bindgen::prelude::*;
mod excerpt;
mod index;
mod metadata;
mod search;
mod util;

pub struct PageWord {
@@ -26,10 +27,15 @@ pub struct IndexChunk {
hash: String,
}

pub struct Page {
hash: String,
word_count: u32,
}

pub struct SearchIndex {
web_version: &'static str,
generator_version: Option<String>,
pages: Vec<String>,
pages: Vec<Page>,
chunks: Vec<IndexChunk>,
stops: Vec<String>,
words: HashMap<String, Vec<PageWord>>,
@@ -120,71 +126,33 @@ pub fn search(ptr: *mut SearchIndex, query: &str) -> String {
}
}

let terms = query.split(' ');
// TODO: i18n
let en_stemmer = Stemmer::create(Algorithm::English);
let results = search_index.search_term(query);

let result_string = results
.into_iter()
.map(|result| {
format!(
"{}@{},{}@{}",
&result.page,
calculate_excerpt(&result.word_locations, 30),
30,
result
.word_locations
.iter()
.map(|l| l.to_string())
.collect::<Vec<String>>()
.join(",")
)
})
.collect::<Vec<String>>()
.join(" ");

#[cfg(debug_assertions)]
debug_log(&format! {"Searching {:?}", query});

let mut maps = Vec::new();
let mut words = Vec::new();
for term in terms {
let term = en_stemmer.stem(term).into_owned();
if let Some(word_index) = search_index.words.get(&term) {
words.extend(word_index);
let mut set = BitSet::new();
for page in word_index {
set.insert(page.page as usize);
}
maps.push(set);
}
}

let mut maps = maps.drain(..);
let mut results = if let Some(map) = maps.next() {
map
} else {
let _ = Box::into_raw(search_index);
return "".into();
};

for map in maps {
results.intersect_with(&map);
}

let mut pages: Vec<String> = vec![];

for page in results.iter() {
let locs: Vec<u32> = words
.iter()
.filter_map(|p| {
if p.page as usize == page {
Some(p.locs.clone())
} else {
None
}
})
.flatten()
.collect();
pages.push(format!(
"{}@{},{}@{}",
&search_index.pages[page],
calculate_excerpt(&locs, 30),
30,
locs.iter()
.map(|l| l.to_string())
.collect::<Vec<String>>()
.join(",")
));
}
let o = pages.join(" ");
let _ = Box::into_raw(search_index);

#[cfg(debug_assertions)]
debug_log(&format! {"{:?}", o});
debug_log(&format! {"{:?}", result_string});

o
result_string
}

#[cfg(test)]
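
`search_term` and its result type come from the new `mod search` — the other file that didn't load on this page — so the committed ranking isn't visible here. Piecing together what the caller above uses (`result.page`, `result.word_locations`), the BitSet-intersection logic deleted from this file, and the per-page `word_count` now carried in the metadata, a plausible reconstruction of the crude ranking might look like the following. The `PageSearchResult` name and the exact score are assumptions; everything above the scoring step is carried over from the removed code:

```rust
// Sketch of pagefind_web/src/search.rs; would need
// `use crate::{PageWord, SearchIndex};` in the real module.
use bit_set::BitSet;
use rust_stemmers::{Algorithm, Stemmer};

// Assumed result shape, inferred from the caller above
// (`result.page`, `result.word_locations`).
pub struct PageSearchResult {
    pub page: String, // fragment hash, as `search_index.pages[page]` was before
    pub word_locations: Vec<u32>,
}

impl SearchIndex {
    pub fn search_term(&self, query: &str) -> Vec<PageSearchResult> {
        // TODO: i18n — carried over from the removed code.
        let en_stemmer = Stemmer::create(Algorithm::English);

        // Same BitSet intersection the removed inline search performed:
        // a page must contain every stemmed term to survive.
        let mut maps = Vec::new();
        let mut words = Vec::new();
        for term in query.split(' ') {
            let term = en_stemmer.stem(term).into_owned();
            if let Some(word_index) = self.words.get(&term) {
                words.extend(word_index);
                let mut set = BitSet::new();
                for page in word_index {
                    set.insert(page.page as usize);
                }
                maps.push(set);
            }
        }
        let mut maps = maps.drain(..);
        let mut results = match maps.next() {
            Some(map) => map,
            None => return Vec::new(),
        };
        for map in maps {
            results.intersect_with(&map);
        }

        // The new part: score each surviving page by crude word frequency,
        // i.e. matched locations divided by the page's total word count.
        let mut scored: Vec<(f32, PageSearchResult)> = results
            .iter()
            .map(|page_index| {
                let locs: Vec<u32> = words
                    .iter()
                    .filter(|w| w.page as usize == page_index)
                    .flat_map(|w| w.locs.clone())
                    .collect();
                let page = &self.pages[page_index];
                let score = locs.len() as f32 / page.word_count as f32;
                (
                    score,
                    PageSearchResult {
                        page: page.hash.clone(),
                        word_locations: locs,
                    },
                )
            })
            .collect();

        // Highest frequency first.
        scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap());
        scored.into_iter().map(|(_, result)| result).collect()
    }
}
```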
(2 more changed files not shown)