add sparse embeddings

StarlightSearch · Oct 17, 2024 · 38f945c · 38f945c
1 parent 49c6097
commit 38f945c
Show file tree

Hide file tree

Showing 31 changed files with 619 additions and 259 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Dockerfile b/Dockerfile
@@ -11,11 +11,14 @@ COPY --from=planner /app/recipe.json recipe.json
 RUN cargo chef cook --release --recipe-path recipe.json
 # Build application
 
-RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null \          
-    && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list \
-    && apt-get update \
+# Download Intel GPG key and add repository
+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null \
+    && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list
+
+# Install Intel MKL and extract libiomp5.so
+RUN apt-get update \
     && apt-get install -y intel-oneapi-mkl-devel \
-    && export LD_LIBRARY_PATH="/opt/intel/oneapi/compiler/2024.2/lib:$LD_LIBRARY_PATH"
+    && cp /opt/intel/oneapi/compiler/2024.2/lib/libiomp5.so /app/libiomp5.so
 
 RUN apt-get install libssl-dev pkg-config python3-full python3-pip -y 
 RUN pip3 install maturin[patchelf] --break-system-packages
@@ -27,11 +30,15 @@ FROM python:3.11-slim
 
 WORKDIR /app
 
-COPY --from=builder /app/target/wheels .
+# Copy the extracted libiomp5.so from the builder stage
+COPY --from=builder /app/libiomp5.so /usr/lib/
+
+# Set the library path
+ENV LD_LIBRARY_PATH="/usr/lib:$LD_LIBRARY_PATH"
 
 COPY . .
 
-RUN pip install *.whl
+RUN pip install target/wheels/*.whl
 
 RUN pip install numpy pillow pytest
 

diff --git a/examples/splade.py b/examples/splade.py
@@ -0,0 +1,49 @@
+# Example: rank sentence pairs by the dot product of their sparse (SPLADE)
+# embeddings and print the five highest-scoring pairs.
+
+import embed_anything
+from embed_anything import EmbedData, EmbeddingModel, WhichModel, embed_query
+from embed_anything.vectordb import Adapter
+import os
+from time import time
+import numpy as np
+import heapq
+
+
+model = EmbeddingModel.from_pretrained_hf(
+    WhichModel.SparseBert, "prithivida/Splade_PP_en_v1"
+)
+
+sentences = [
+        "The cat sits outside",
+        "A man is playing guitar",
+        "I love pasta",
+        "The new movie is awesome",
+        "The cat plays in the garden",
+        "A woman watches TV",
+        "The new movie is so great",
+        "Do you like pizza?",
+]
+
+embeddings = embed_query(sentences, embeder=model)
+
+embed_vector = np.array([e.embedding for e in embeddings])
+
+# Pairwise dot products between every pair of sentence embeddings.
+similarities = np.matmul(embed_vector, embed_vector.T)
+
+# Keep only the strict upper triangle (i < j): each unordered pair appears once
+# and the diagonal (self-similarity) is excluded.
+similarity_scores = [
+    (similarities[i, j], i, j)
+    for i in range(len(sentences))
+    for j in range(i + 1, len(sentences))
+]
+
+# Get the top 5 similarity scores
+top_5_similarities = heapq.nlargest(5, similarity_scores, key=lambda x: x[0])
+
+# Print the top 5 similarities with sentences
+for score, i, j in top_5_similarities:
+    # ".2f" pins two decimal places; a bare ".2" would mean two significant digits
+    print(f"Score: {score:.2f} | {sentences[i]} | {sentences[j]}")
diff --git a/python/Cargo.toml b/python/Cargo.toml
@@ -8,7 +8,7 @@ name = "_embed_anything"
 crate-type = ["cdylib"]
 
 [dependencies]
-embed_anything = {path = "../rust"}
+embed_anything = {path = "../rust", features = ["load-dynamic"]}
 pyo3 = { version = "0.22.3"}
 tokio = { version = "1.39.0", features = ["rt-multi-thread"]}
 
@@ -18,5 +18,4 @@ mkl = ["embed_anything/mkl"]
 accelerate = ["embed_anything/accelerate"]
 cuda = ["embed_anything/cuda"]
 cudnn = ["embed_anything/cudnn"]
-load-dynamic = ["embed_anything/load-dynamic"]
 
diff --git a/python/python/embed_anything/_embed_anything.pyi b/python/python/embed_anything/_embed_anything.pyi
@@ -471,3 +471,4 @@ class WhichModel(Enum):
     Jina = ("Jina",)
     Clip = ("Clip",)
     Colpali = ("Colpali",)
+    SparseBert = ("SparseBert",)
diff --git a/python/src/lib.rs b/python/src/lib.rs
@@ -6,7 +6,7 @@ use embed_anything::{
     self,
     config::TextEmbedConfig,
     emb_audio,
-    embeddings::embed::{EmbeddingResult, Embedder},
+    embeddings::embed::{Embedder, EmbeddingResult},
     file_processor::audio::audio_processor,
     text_loader::FileLoadingError,
 };
@@ -36,8 +36,8 @@ impl EmbedData {
         Python::with_gil(|py| {
             let embedding = self.inner.embedding.clone();
             match embedding {
-                EmbeddingResult::Dense(x) => PyList::new_bound(py, x).into(),
-                EmbeddingResult::Sparse(x) => {
+                EmbeddingResult::DenseVector(x) => PyList::new_bound(py, x).into(),
+                EmbeddingResult::MultiVector(x) => {
                     PyList::new_bound(py, x.iter().map(|inner| PyList::new_bound(py, inner))).into()
                 }
             }
@@ -84,6 +84,7 @@ pub enum WhichModel {
     OpenAI,
     Cohere,
     Bert,
+    SparseBert,
     Clip,
     Jina,
     Colpali,
@@ -99,6 +100,7 @@ impl From<&str> for WhichModel {
             "openai" | "OpenAI" => WhichModel::OpenAI,
             "cohere" | "Cohere" => WhichModel::Cohere,
             "bert" | "Bert" => WhichModel::Bert,
+            "sparse-bert" | "SparseBert" => WhichModel::SparseBert,
             "clip" | "Clip" => WhichModel::Clip,
             "jina" | "Jina" => WhichModel::Jina,
             "colpali" | "Colpali" => WhichModel::Colpali,
@@ -113,6 +115,7 @@ impl From<String> for WhichModel {
             "openai" | "OpenAI" => WhichModel::OpenAI,
             "cohere" | "Cohere" => WhichModel::Cohere,
             "bert" | "Bert" => WhichModel::Bert,
+            "sparse-bert" | "SparseBert" => WhichModel::SparseBert,
             "clip" | "Clip" => WhichModel::Clip,
             "jina" | "Jina" => WhichModel::Jina,
             "colpali" | "Colpali" => WhichModel::Colpali,
@@ -150,12 +153,25 @@ impl EmbeddingModel {
                     inner: Arc::new(model),
                 })
             }
+            WhichModel::SparseBert => {
+                let model_id = model_id.unwrap_or("prithivida/Splade_PP_en_v1");
+                let model = Embedder::Text(TextEmbedder::Bert(Box::new(
+                    embed_anything::embeddings::local::bert::SparseBertEmbedder::new(
+                        model_id.to_string(),
+                        revision.map(|s| s.to_string()),
+                    )
+                    .unwrap(),
+                )));
+                Ok(EmbeddingModel {
+                    inner: Arc::new(model),
+                })
+            }
             WhichModel::Clip => {
                 let model_id = model_id.unwrap_or("openai/clip-vit-base-patch32");
                 let model = Embedder::Vision(VisionEmbedder::Clip(
                     embed_anything::embeddings::local::clip::ClipEmbedder::new(
                         model_id.to_string(),
-                        revision.map(|s| s.to_string()),
+                        revision,
                     )
                     .map_err(|e| PyValueError::new_err(e.to_string()))?,
                 ));
@@ -166,29 +182,26 @@ impl EmbeddingModel {
             WhichModel::Jina => {
                 let model_id = model_id.unwrap_or("jinaai/jina-embeddings-v2-small-en");
                 let model = Embedder::Text(TextEmbedder::Jina(
-                    embed_anything::embeddings::local::jina::JinaEmbedder::new(
-                        model_id.to_string(),
-                        revision.map(|s| s.to_string()),
-                    )
-                    .unwrap(),
+                    embed_anything::embeddings::local::jina::JinaEmbedder::new(model_id, revision)
+                        .unwrap(),
                 ));
                 Ok(EmbeddingModel {
                     inner: Arc::new(model),
                 })
             }
             WhichModel::Colpali => {
                 let model_id = model_id.unwrap_or("vidore/colpali-v1.2-merged");
-                let model = Embedder::Vision(VisionEmbedder::ColPali(embed_anything::embeddings::local::colpali::ColPaliEmbedder::new(
-                    model_id,
-                        revision.map(|s| s),
+                let model = Embedder::Vision(VisionEmbedder::ColPali(
+                    embed_anything::embeddings::local::colpali::ColPaliEmbedder::new(
+                        model_id, revision,
                     )
                     .unwrap(),
                 ));
                 Ok(EmbeddingModel {
                     inner: Arc::new(model),
                 })
             }
-        
+
             _ => panic!("Invalid model"),
         }
     }
@@ -358,7 +371,7 @@ pub fn embed_file(
 
     let embeddings = rt
         .block_on(async {
-            embed_anything::embed_file(file_name, &embedding_model, config, adapter).await
+            embed_anything::embed_file(file_name, embedding_model, config, adapter).await
         })
         .map_err(|e| match e.downcast_ref::<FileLoadingError>() {
             Some(FileLoadingError::FileNotFound(file)) => {
@@ -390,7 +403,7 @@ pub fn embed_audio_file(
     let audio_decoder = &mut audio_decoder.inner;
     let rt = Builder::new_multi_thread().enable_all().build().unwrap();
     let data = rt.block_on(async {
-        emb_audio(audio_file, audio_decoder, &embedding_model, config)
+        emb_audio(audio_file, audio_decoder, embedding_model, config)
             .await
             .map_err(|e| PyValueError::new_err(e.to_string()))
             .unwrap()

diff --git a/python/src/models/colpali.rs b/python/src/models/colpali.rs
@@ -11,7 +11,6 @@ pub struct ColpaliModel {
 
 #[pymethods]
 impl ColpaliModel {
-
     #[new]
     #[pyo3(signature = (model_id, revision=None))]
     pub fn new(model_id: &str, revision: Option<&str>) -> PyResult<Self> {
@@ -40,7 +39,10 @@ impl ColpaliModel {
     }
 
     pub fn embed_query(&self, query: &str) -> PyResult<Vec<EmbedData>> {
-        let embed_data = self.model.embed_query(query).map_err(|e| PyValueError::new_err(e.to_string()))?;
+        let embed_data = self
+            .model
+            .embed_query(query)
+            .map_err(|e| PyValueError::new_err(e.to_string()))?;
         Ok(embed_data
             .into_iter()
             .map(|data| EmbedData { inner: data })

diff --git a/python/src/models/mod.rs b/python/src/models/mod.rs
@@ -1 +1 @@
-pub mod colpali;
+pub mod colpali;
diff --git a/rust/Cargo.toml b/rust/Cargo.toml
@@ -69,7 +69,6 @@ byteorder = "1.5.0"
 futures = "0.3.30"
 
 pdf-extract = {workspace = true}
-cudarc = { version = "0.12.1", features = ["std", "cublas", "cublaslt", "curand", "driver", "nvrtc", "f16", "cuda-version-from-build-system", "dynamic-linking"], default-features=false }
 ort = {workspace = true}
 ndarray = "0.16.1"
 ndarray-linalg = {version = "0.16.0"}
@@ -93,5 +92,5 @@ default = []
 mkl = ["dep:intel-mkl-src", "candle-nn/mkl", "candle-transformers/mkl", "candle-core/mkl"]
 accelerate = ["dep:accelerate-src", "candle-core/accelerate", "candle-nn/accelerate", "candle-transformers/accelerate"]
 cuda = ["candle-core/cuda", "candle-nn/cuda", "candle-core/cuda"]
-cudnn = ["candle-core/cuda", "cudarc/cudnn"]
+cudnn = ["candle-core/cudnn"]
 load-dynamic = ["ort/load-dynamic"]
diff --git a/rust/examples/bert_ort.rs b/rust/examples/bert_ort.rs
@@ -11,7 +11,6 @@ use std::time::Instant;
 #[tokio::main]
 async fn main() -> Result<(), anyhow::Error> {
     let model =
-
         Arc::new(Embedder::from_pretrained_onnx("bert", ONNXModel::AllMiniLML6V2, None).unwrap());
     let config = TextEmbedConfig::new(
         Some(1000),
@@ -29,17 +28,13 @@ async fn main() -> Result<(), anyhow::Error> {
 
     let now = Instant::now();
 
-
     let futures = files
         .par_iter()
-        .map(|file| {
-            embed_file(file, &model, Some(&config), None::<fn(Vec<EmbedData>)>)
-        })
+        .map(|file| embed_file(file, &model, Some(&config), None::<fn(Vec<EmbedData>)>))
         .collect::<Vec<_>>();
 
     let _data = futures.into_iter().next().unwrap().await;
 
-
     let elapsed_time = now.elapsed();
     println!("Elapsed Time: {}", elapsed_time.as_secs_f32());
 
@@ -52,7 +47,6 @@ async fn main() -> Result<(), anyhow::Error> {
         "The dog is sitting in the park",
         "The window is broken",
         "pizza is the best",
-
     ]
     .iter()
     .map(|s| s.to_string())
@@ -71,7 +65,10 @@ async fn main() -> Result<(), anyhow::Error> {
             .map(|x| x.to_dense().unwrap())
             .flatten()
             .collect::<Vec<_>>(),
-        (n_vectors, doc_embeddings[0].embedding.to_dense().unwrap().len()),
+        (
+            n_vectors,
+            doc_embeddings[0].embedding.to_dense().unwrap().len(),
+        ),
         &Device::Cpu,
     )
     .unwrap();
@@ -94,5 +91,4 @@ async fn main() -> Result<(), anyhow::Error> {
     }
 
     Ok(())
-
-}
+}
diff --git a/rust/examples/cloud.rs b/rust/examples/cloud.rs
@@ -4,22 +4,13 @@ use embed_anything::{
     config::TextEmbedConfig,
     embed_directory_stream, embed_file,
     embeddings::embed::{EmbedData, Embedder},
-    text_loader::SplittingStrategy,
 };
 
 use anyhow::Result;
 
 #[tokio::main]
 async fn main() -> Result<()> {
-    let semantic_encoder =
-       Embedder::from_pretrained_cloud("openai", "text-embedding-3-small", None).unwrap();
-    let text_embed_config = TextEmbedConfig::new(
-        Some(1000),
-        Some(256),
-        Some(32),
-        None,
-        None,
-    );
+    let text_embed_config = TextEmbedConfig::new(Some(1000), Some(512), Some(512), None, None);
     let cohere_model =
         Embedder::from_pretrained_cloud("cohere", "embed-english-v3.0", None).unwrap();
     let openai_model =
@@ -44,7 +35,6 @@ async fn main() -> Result<()> {
     .await?
     .unwrap();
 
-
     let _cohere_embedding = embed_file(
         "test_files/attention.pdf",
         &cohere_model,

diff --git a/rust/examples/colpali.rs b/rust/examples/colpali.rs
@@ -1,8 +1,7 @@
 use embed_anything::embeddings::local::colpali::ColPaliEmbedder;
 
 fn main() -> Result<(), anyhow::Error> {
-
-    let colpali_model = ColPaliEmbedder::new( "vidore/colpali-v1.2-merged", None)?;
+    let colpali_model = ColPaliEmbedder::new("vidore/colpali-v1.2-merged", None)?;
     let file_path = "test_files/attention.pdf";
     let batch_size = 1;
     let embed_data = colpali_model.embed_file(file_path, batch_size)?;
@@ -12,5 +11,4 @@ fn main() -> Result<(), anyhow::Error> {
     let query_embeddings = colpali_model.embed_query(prompt)?;
     println!("{:?}", query_embeddings.len());
     Ok(())
-
 }