From a4e50c97539d126f1d66141b405fa90f9edb3fa5 Mon Sep 17 00:00:00 2001 From: Frank Elsinga Date: Tue, 16 Apr 2024 22:04:00 +0200 Subject: [PATCH] Migrated the testcase to use `_vectors` instead --- src/search.rs | 112 +++++++++++++++++++++++++++++--------------------- 1 file changed, 65 insertions(+), 47 deletions(-) diff --git a/src/search.rs b/src/search.rs index 079ae080..83d6e191 100644 --- a/src/search.rs +++ b/src/search.rs @@ -350,6 +350,12 @@ pub struct SearchQuery<'a, Http: HttpClient> { #[cfg(feature = "experimental-vector-search")] #[serde(skip_serializing_if = "Option::is_none")] pub hybrid: Option>, + + /// EXPERIMENTAL + /// Defines which vectors a user-provided embedder has been given for semantic searching + #[cfg(feature = "experimental-vector-search")] + #[serde(skip_serializing_if = "Option::is_none")] + pub vector: Option<&'a [f32]>, } #[allow(missing_docs)] @@ -380,6 +386,8 @@ impl<'a, Http: HttpClient> SearchQuery<'a, Http> { index_uid: None, #[cfg(feature = "experimental-vector-search")] hybrid: None, + #[cfg(feature = "experimental-vector-search")] + vector: None, } } pub fn with_query<'b>(&'b mut self, query: &'a str) -> &'b mut SearchQuery<'a, Http> { @@ -563,6 +571,8 @@ impl<'a, Http: HttpClient> SearchQuery<'a, Http> { self.index_uid = Some(&self.index.uid); self } + /// EXPERIMENTAL + /// Defines whether to utilise previously defined embedders for semantic searching #[cfg(feature = "experimental-vector-search")] pub fn with_hybrid<'b>( &'b mut self, @@ -575,6 +585,13 @@ impl<'a, Http: HttpClient> SearchQuery<'a, Http> { }); self } + /// EXPERIMENTAL + /// Defines which vectors a user-provided embedder has been given for semantic searching + #[cfg(feature = "experimental-vector-search")] + pub fn with_vector<'b>(&'b mut self, vector: &'a [f32]) -> &'b mut SearchQuery<'a, Http> { + self.vector = Some(vector); + self + } #[must_use] pub fn build(&mut self) -> SearchQuery<'a, Http> { @@ -650,7 +667,6 @@ mod tests { use 
meilisearch_test_macro::meilisearch_test; use serde::{Deserialize, Serialize}; use serde_json::{json, Map, Value}; - use std::time::Duration; #[derive(Debug, Serialize, Deserialize, PartialEq)] struct Nested { @@ -664,6 +680,7 @@ mod tests { kind: String, number: i32, nested: Nested, + _vectors: HashMap<String, Vec<f32>>, } impl PartialEq<Map<String, Value>> for Document { @@ -677,31 +694,25 @@ mod tests { async fn setup_test_index(client: &Client, index: &Index) -> Result<(), Error> { let t0 = index.add_documents(&[ - Document { id: 0, kind: "text".into(), number: 0, value: S("Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."), nested: Nested { child: S("first") } }, - Document { id: 1, kind: "text".into(), number: 10, value: S("dolor sit amet, consectetur adipiscing elit"), nested: Nested { child: S("second") } }, - Document { id: 2, kind: "title".into(), number: 20, value: S("The Social Network"), nested: Nested { child: S("third") } }, - Document { id: 3, kind: "title".into(), number: 30, value: S("Harry Potter and the Sorcerer's Stone"), nested: Nested { child: S("fourth") } }, - Document { id: 4, kind: "title".into(), number: 40, value: S("Harry Potter and the Chamber of Secrets"), nested: Nested { child: S("fift") } }, - Document { id: 5, kind: "title".into(), number: 50, value: S("Harry Potter and the Prisoner of Azkaban"), nested: Nested { child: S("sixth") } }, - Document { id: 6, kind: "title".into(), number: 60, value: S("Harry Potter and the Goblet of Fire"), nested: Nested { child: S("seventh") } }, - Document { id: 7, kind: "title".into(), number: 70, value: S("Harry Potter and the Order of the 
Phoenix"), nested: Nested { child: S("eighth") } }, - Document { id: 8, kind: "title".into(), number: 80, value: S("Harry Potter and the Half-Blood Prince"), nested: Nested { child: S("ninth") } }, - Document { id: 9, kind: "title".into(), number: 90, value: S("Harry Potter and the Deathly Hallows"), nested: Nested { child: S("tenth") } }, + Document { id: 0, kind: "text".into(), number: 0, value: S("Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."), nested: Nested { child: S("first") }, _vectors: HashMap::from([(S("default"), vec![1000.0])])}, + Document { id: 1, kind: "text".into(), number: 10, value: S("dolor sit amet, consectetur adipiscing elit"), nested: Nested { child: S("second") }, _vectors: HashMap::from([(S("default"), vec![2000.0])]) }, + Document { id: 2, kind: "title".into(), number: 20, value: S("The Social Network"), nested: Nested { child: S("third") }, _vectors: HashMap::from([(S("default"), vec![3000.0])]) }, + Document { id: 3, kind: "title".into(), number: 30, value: S("Harry Potter and the Sorcerer's Stone"), nested: Nested { child: S("fourth") }, _vectors: HashMap::from([(S("default"), vec![4000.0])]) }, + Document { id: 4, kind: "title".into(), number: 40, value: S("Harry Potter and the Chamber of Secrets"), nested: Nested { child: S("fift") }, _vectors: HashMap::from([(S("default"), vec![5000.0])]) }, + Document { id: 5, kind: "title".into(), number: 50, value: S("Harry Potter and the Prisoner of Azkaban"), nested: Nested { child: S("sixth") }, _vectors: HashMap::from([(S("default"), vec![6000.0])]) }, + Document { id: 6, kind: "title".into(), 
number: 60, value: S("Harry Potter and the Goblet of Fire"), nested: Nested { child: S("seventh") }, _vectors: HashMap::from([(S("default"), vec![7000.0])]) }, + Document { id: 7, kind: "title".into(), number: 70, value: S("Harry Potter and the Order of the Phoenix"), nested: Nested { child: S("eighth") }, _vectors: HashMap::from([(S("default"), vec![8000.0])]) }, + Document { id: 8, kind: "title".into(), number: 80, value: S("Harry Potter and the Half-Blood Prince"), nested: Nested { child: S("ninth") }, _vectors: HashMap::from([(S("default"), vec![9000.0])]) }, + Document { id: 9, kind: "title".into(), number: 90, value: S("Harry Potter and the Deathly Hallows"), nested: Nested { child: S("tenth") }, _vectors: HashMap::from([(S("default"), vec![10000.0])]) }, ], None).await?; let t1 = index .set_filterable_attributes(["kind", "value", "number"]) .await?; let t2 = index.set_sortable_attributes(["title"]).await?; - // the vector search has longer indexing times leading to the timeout being triggered - let timeout = if cfg!(feature = "experimental-vector-search") { - Some(Duration::from_secs(120)) - } else { - None - }; - t2.wait_for_completion(client, None, timeout).await?; - t1.wait_for_completion(client, None, timeout).await?; - t0.wait_for_completion(client, None, timeout).await?; + t2.wait_for_completion(client, None, None).await?; + t1.wait_for_completion(client, None, None).await?; + t0.wait_for_completion(client, None, None).await?; Ok(()) } @@ -780,7 +791,8 @@ mod tests { value: S("dolor sit amet, consectetur adipiscing elit"), kind: S("text"), number: 10, - nested: Nested { child: S("second") } + nested: Nested { child: S("second") }, + _vectors: HashMap::from([(S("default"), vec![2000.0])]), }, &results.hits[0].result ); @@ -952,7 +964,8 @@ mod tests { value: S("Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do…"), kind: S("text"), number: 0, - nested: Nested { child: S("first") } + nested: Nested { child: S("first") }, + _vectors: 
HashMap::from([(S("default"), vec![1000.0])]) }, results.hits[0].formatted_result.as_ref().unwrap() ); @@ -967,7 +980,8 @@ mod tests { value: S("Lorem ipsum dolor sit amet…"), kind: S("text"), number: 0, - nested: Nested { child: S("first") } + nested: Nested { child: S("first") }, + _vectors: HashMap::from([(S("default"), vec![1000.0])]) }, results.hits[0].formatted_result.as_ref().unwrap() ); @@ -988,7 +1002,8 @@ mod tests { value: S("Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."), kind: S("text"), number: 0, - nested: Nested { child: S("first") } + nested: Nested { child: S("first") }, + _vectors: HashMap::from([(S("default"), vec![1000.0])]) }, results.hits[0].formatted_result.as_ref().unwrap()); @@ -1003,7 +1018,8 @@ mod tests { value: S("Lorem ipsum dolor sit amet…"), kind: S("text"), number: 0, - nested: Nested { child: S("first") } + nested: Nested { child: S("first") }, + _vectors: HashMap::from([(S("default"), vec![1000.0])]) }, results.hits[0].formatted_result.as_ref().unwrap() ); @@ -1028,7 +1044,8 @@ mod tests { value: S("(ꈍᴗꈍ)sed do eiusmod tempor incididunt ut(ꈍᴗꈍ)"), kind: S("text"), number: 0, - nested: Nested { child: S("first") } + nested: Nested { child: S("first") }, + _vectors: HashMap::from([(S("default"), vec![1000.0])]), }, results.hits[0].formatted_result.as_ref().unwrap() ); @@ -1055,7 +1072,8 @@ mod tests { value: S("The (⊃。•́‿•̀。)⊃ Social ⊂(´• ω •`⊂) Network"), kind: S("title"), number: 20, - nested: Nested { child: S("third") } + nested: Nested { child: S("third") }, + _vectors: HashMap::from([(S("default"), vec![3000.0])]) }, 
results.hits[0].formatted_result.as_ref().unwrap() ); @@ -1077,7 +1095,8 @@ mod tests { value: S("dolor sit amet, consectetur adipiscing elit"), kind: S("text"), number: 10, - nested: Nested { child: S("first") } + nested: Nested { child: S("second") }, + _vectors: HashMap::from([(S("default"), vec![2000.0])]), }, results.hits[0].formatted_result.as_ref().unwrap(), ); @@ -1092,7 +1111,8 @@ mod tests { value: S("dolor sit amet, consectetur adipiscing elit"), kind: S("text"), number: 10, - nested: Nested { child: S("first") } + nested: Nested { child: S("second") }, + _vectors: HashMap::from([(S("default"), vec![2000.0])]) }, results.hits[0].formatted_result.as_ref().unwrap() ); @@ -1223,8 +1243,7 @@ mod tests { #[cfg(feature = "experimental-vector-search")] #[meilisearch_test] async fn test_hybrid(client: Client, index: Index) -> Result<(), Error> { - use crate::settings::{Embedder, HuggingFaceEmbedderSettings}; - log::warn!("You are executing the vector search test. This WILL take a while and might lead to timeouts in other tests. 
You can disable this testcase by not enabling the `experimental-vector-search`-feature and running this "); + use crate::settings::{Embedder, UserProvidedEmbedderSettings}; // enable vector searching and configure an embedder let features = crate::features::ExperimentalFeatures::new(&client) .set_vector_store(true) @@ -1232,11 +1251,8 @@ mod tests { .await .expect("could not enable the vector store"); assert_eq!(features.vector_store, true); - let embedder_setting = Embedder::HuggingFace(HuggingFaceEmbedderSettings { - model: Some("BAAI/bge-base-en-v1.5".into()), - revision: None, - document_template: Some("{{ doc.value }}".into()), - }); + let embedder_setting = + Embedder::UserProvided(UserProvidedEmbedderSettings { dimensions: 1 }); let t3 = index .set_settings(&crate::settings::Settings { embedders: Some(HashMap::from([("default".to_string(), embedder_setting)])), @@ -1247,11 +1263,16 @@ mod tests { setup_test_index(&client, &index).await?; - // "zweite" = "second" in german - // => an embedding should be able to detect that this is equivalent, but not the regular search + // "2nd" = "second" + // no semantic searching => no matches + let results: SearchResults<Document> = index.search().with_query("2nd").execute().await?; + assert_eq!(results.hits.len(), 0); + + // an embedding should be able to detect that this is equivalent, but not the regular search let results: SearchResults<Document> = index .search() - .with_query("Facebook") + .with_query("2nd") + .with_vector(&[2000.0]) .with_hybrid("default", 1.0) // entirely rely on semantic searching .execute() .await?; @@ -1263,16 +1284,10 @@ mod tests { kind: S("text"), number: 10, nested: Nested { child: S("second") }, + _vectors: HashMap::from([(S("default"), vec![2000.0])]) }, &results.hits[0].result ); - let results: SearchResults<Document> = index - .search() - .with_query("zweite") - .with_hybrid("default", 0.0) // no semantic searching => no matches - .execute() - .await?; - assert_eq!(results.hits.len(), 0); // word that has a typo => 
would have been found via traditional means // if entirely relying on semantic searching, no result is found @@ -1280,6 +1295,7 @@ mod tests { .search() .with_query("lohrem") .with_hybrid("default", 1.0) + .with_vector(&[1000.0]) .execute() .await?; assert_eq!(results.hits.len(), 0); @@ -1287,6 +1303,7 @@ mod tests { .search() .with_query("lohrem") .with_hybrid("default", 0.0) + .with_vector(&[1000.0]) .execute() .await?; assert_eq!(results.hits.len(), 1); @@ -1296,7 +1313,8 @@ mod tests { value: S("Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."), kind: S("text"), number: 0, - nested: Nested { child: S("first") } + nested: Nested { child: S("first") }, + _vectors: HashMap::from([(S("default"), vec![1000.0])]), }, &results.hits[0].result );