Skip to content

Commit

Permalink
add sparse embeddings
Browse files Browse the repository at this point in the history
  • Loading branch information
akshayballal95 committed Oct 17, 2024
1 parent 49c6097 commit 38f945c
Show file tree
Hide file tree
Showing 31 changed files with 619 additions and 259 deletions.
1 change: 0 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

19 changes: 13 additions & 6 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,14 @@ COPY --from=planner /app/recipe.json recipe.json
RUN cargo chef cook --release --recipe-path recipe.json
# Build application

RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null \
&& echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list \
&& apt-get update \
# Download Intel GPG key and add repository
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null \
&& echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list

# Install Intel MKL and extract libiomp5.so
RUN apt-get update \
&& apt-get install -y intel-oneapi-mkl-devel \
&& export LD_LIBRARY_PATH="/opt/intel/oneapi/compiler/2024.2/lib:$LD_LIBRARY_PATH"
&& cp /opt/intel/oneapi/compiler/2024.2/lib/libiomp5.so /app/libiomp5.so

RUN apt-get install libssl-dev pkg-config python3-full python3-pip -y
RUN pip3 install maturin[patchelf] --break-system-packages
Expand All @@ -27,11 +30,15 @@ FROM python:3.11-slim

WORKDIR /app

COPY --from=builder /app/target/wheels .
# Copy the extracted libiomp5.so from the builder stage
COPY --from=builder /app/libiomp5.so /usr/lib/

# Set the library path
ENV LD_LIBRARY_PATH="/usr/lib:$LD_LIBRARY_PATH"

COPY . .

RUN pip install *.whl
RUN pip install target/wheels/*.whl

RUN pip install numpy pillow pytest

Expand Down
49 changes: 49 additions & 0 deletions examples/splade.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import embed_anything
from embed_anything import EmbedData, EmbeddingModel, WhichModel, embed_query
from embed_anything.vectordb import Adapter
import os
from time import time
import numpy as np
import heapq


# Build the sparse (SPLADE) embedding model used for the similarity demo.
model = EmbeddingModel.from_pretrained_hf(
    WhichModel.SparseBert, "prithivida/Splade_PP_en_v1"
)

sentences = [
    "The cat sits outside",
    "A man is playing guitar",
    "I love pasta",
    "The new movie is awesome",
    "The cat plays in the garden",
    "A woman watches TV",
    "The new movie is so great",
    "Do you like pizza?",
]

# Embed every sentence; each returned item exposes its vector as `.embedding`.
embeddings = embed_query(sentences, embeder=model)

# Stack the per-sentence vectors into one 2-D array.
# NOTE(review): assumes every embedding is a flat vector of the same length -- confirm.
embed_vector = np.array([e.embedding for e in embeddings])

# Pairwise dot products: similarities[i, j] scores sentence i against sentence j.
similarities = np.matmul(embed_vector, embed_vector.T)

# Collect (score, i, j) for the strict upper triangle only, so each unordered
# pair appears exactly once and self-similarity (the diagonal) is excluded.
similarity_scores = [
    (similarities[i, j], i, j)
    for i in range(len(sentences))
    for j in range(i + 1, len(sentences))
]

# Keep the five highest-scoring pairs, ordered from best to worst.
top_5_similarities = heapq.nlargest(5, similarity_scores, key=lambda x: x[0])

# Print each pair with its score. Use fixed-point formatting (`.2f`): a bare
# `.2` means two significant digits in general format, which renders scores
# of 10 or more in scientific notation (e.g. `2.1e+01`).
for score, i, j in top_5_similarities:
    print(f"Score: {score:.2f} | {sentences[i]} | {sentences[j]}")





3 changes: 1 addition & 2 deletions python/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ name = "_embed_anything"
crate-type = ["cdylib"]

[dependencies]
embed_anything = {path = "../rust"}
embed_anything = {path = "../rust", features = ["load-dynamic"]}
pyo3 = { version = "0.22.3"}
tokio = { version = "1.39.0", features = ["rt-multi-thread"]}

Expand All @@ -18,5 +18,4 @@ mkl = ["embed_anything/mkl"]
accelerate = ["embed_anything/accelerate"]
cuda = ["embed_anything/cuda"]
cudnn = ["embed_anything/cudnn"]
load-dynamic = ["embed_anything/load-dynamic"]

1 change: 1 addition & 0 deletions python/python/embed_anything/_embed_anything.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -471,3 +471,4 @@ class WhichModel(Enum):
Jina = ("Jina",)
Clip = ("Clip",)
Colpali = ("Colpali",)
SparseBert = ("SparseBert",)
43 changes: 28 additions & 15 deletions python/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use embed_anything::{
self,
config::TextEmbedConfig,
emb_audio,
embeddings::embed::{EmbeddingResult, Embedder},
embeddings::embed::{Embedder, EmbeddingResult},
file_processor::audio::audio_processor,
text_loader::FileLoadingError,
};
Expand Down Expand Up @@ -36,8 +36,8 @@ impl EmbedData {
Python::with_gil(|py| {
let embedding = self.inner.embedding.clone();
match embedding {
EmbeddingResult::Dense(x) => PyList::new_bound(py, x).into(),
EmbeddingResult::Sparse(x) => {
EmbeddingResult::DenseVector(x) => PyList::new_bound(py, x).into(),
EmbeddingResult::MultiVector(x) => {
PyList::new_bound(py, x.iter().map(|inner| PyList::new_bound(py, inner))).into()
}
}
Expand Down Expand Up @@ -84,6 +84,7 @@ pub enum WhichModel {
OpenAI,
Cohere,
Bert,
SparseBert,
Clip,
Jina,
Colpali,
Expand All @@ -99,6 +100,7 @@ impl From<&str> for WhichModel {
"openai" | "OpenAI" => WhichModel::OpenAI,
"cohere" | "Cohere" => WhichModel::Cohere,
"bert" | "Bert" => WhichModel::Bert,
"sparse-bert" | "SparseBert" => WhichModel::SparseBert,
"clip" | "Clip" => WhichModel::Clip,
"jina" | "Jina" => WhichModel::Jina,
"colpali" | "Colpali" => WhichModel::Colpali,
Expand All @@ -113,6 +115,7 @@ impl From<String> for WhichModel {
"openai" | "OpenAI" => WhichModel::OpenAI,
"cohere" | "Cohere" => WhichModel::Cohere,
"bert" | "Bert" => WhichModel::Bert,
"sparse-bert" | "SparseBert" => WhichModel::SparseBert,
"clip" | "Clip" => WhichModel::Clip,
"jina" | "Jina" => WhichModel::Jina,
"colpali" | "Colpali" => WhichModel::Colpali,
Expand Down Expand Up @@ -150,12 +153,25 @@ impl EmbeddingModel {
inner: Arc::new(model),
})
}
WhichModel::SparseBert => {
let model_id = model_id.unwrap_or("prithivida/Splade_PP_en_v1");
let model = Embedder::Text(TextEmbedder::Bert(Box::new(
embed_anything::embeddings::local::bert::SparseBertEmbedder::new(
model_id.to_string(),
revision.map(|s| s.to_string()),
)
.unwrap(),
)));
Ok(EmbeddingModel {
inner: Arc::new(model),
})
}
WhichModel::Clip => {
let model_id = model_id.unwrap_or("openai/clip-vit-base-patch32");
let model = Embedder::Vision(VisionEmbedder::Clip(
embed_anything::embeddings::local::clip::ClipEmbedder::new(
model_id.to_string(),
revision.map(|s| s.to_string()),
revision,
)
.map_err(|e| PyValueError::new_err(e.to_string()))?,
));
Expand All @@ -166,29 +182,26 @@ impl EmbeddingModel {
WhichModel::Jina => {
let model_id = model_id.unwrap_or("jinaai/jina-embeddings-v2-small-en");
let model = Embedder::Text(TextEmbedder::Jina(
embed_anything::embeddings::local::jina::JinaEmbedder::new(
model_id.to_string(),
revision.map(|s| s.to_string()),
)
.unwrap(),
embed_anything::embeddings::local::jina::JinaEmbedder::new(model_id, revision)
.unwrap(),
));
Ok(EmbeddingModel {
inner: Arc::new(model),
})
}
WhichModel::Colpali => {
let model_id = model_id.unwrap_or("vidore/colpali-v1.2-merged");
let model = Embedder::Vision(VisionEmbedder::ColPali(embed_anything::embeddings::local::colpali::ColPaliEmbedder::new(
model_id,
revision.map(|s| s),
let model = Embedder::Vision(VisionEmbedder::ColPali(
embed_anything::embeddings::local::colpali::ColPaliEmbedder::new(
model_id, revision,
)
.unwrap(),
));
Ok(EmbeddingModel {
inner: Arc::new(model),
})
}

_ => panic!("Invalid model"),
}
}
Expand Down Expand Up @@ -358,7 +371,7 @@ pub fn embed_file(

let embeddings = rt
.block_on(async {
embed_anything::embed_file(file_name, &embedding_model, config, adapter).await
embed_anything::embed_file(file_name, embedding_model, config, adapter).await
})
.map_err(|e| match e.downcast_ref::<FileLoadingError>() {
Some(FileLoadingError::FileNotFound(file)) => {
Expand Down Expand Up @@ -390,7 +403,7 @@ pub fn embed_audio_file(
let audio_decoder = &mut audio_decoder.inner;
let rt = Builder::new_multi_thread().enable_all().build().unwrap();
let data = rt.block_on(async {
emb_audio(audio_file, audio_decoder, &embedding_model, config)
emb_audio(audio_file, audio_decoder, embedding_model, config)
.await
.map_err(|e| PyValueError::new_err(e.to_string()))
.unwrap()
Expand Down
6 changes: 4 additions & 2 deletions python/src/models/colpali.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ pub struct ColpaliModel {

#[pymethods]
impl ColpaliModel {

#[new]
#[pyo3(signature = (model_id, revision=None))]
pub fn new(model_id: &str, revision: Option<&str>) -> PyResult<Self> {
Expand Down Expand Up @@ -40,7 +39,10 @@ impl ColpaliModel {
}

pub fn embed_query(&self, query: &str) -> PyResult<Vec<EmbedData>> {
let embed_data = self.model.embed_query(query).map_err(|e| PyValueError::new_err(e.to_string()))?;
let embed_data = self
.model
.embed_query(query)
.map_err(|e| PyValueError::new_err(e.to_string()))?;
Ok(embed_data
.into_iter()
.map(|data| EmbedData { inner: data })
Expand Down
2 changes: 1 addition & 1 deletion python/src/models/mod.rs
Original file line number Diff line number Diff line change
@@ -1 +1 @@
pub mod colpali;
pub mod colpali;
3 changes: 1 addition & 2 deletions rust/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,6 @@ byteorder = "1.5.0"
futures = "0.3.30"

pdf-extract = {workspace = true}
cudarc = { version = "0.12.1", features = ["std", "cublas", "cublaslt", "curand", "driver", "nvrtc", "f16", "cuda-version-from-build-system", "dynamic-linking"], default-features=false }
ort = {workspace = true}
ndarray = "0.16.1"
ndarray-linalg = {version = "0.16.0"}
Expand All @@ -93,5 +92,5 @@ default = []
mkl = ["dep:intel-mkl-src", "candle-nn/mkl", "candle-transformers/mkl", "candle-core/mkl"]
accelerate = ["dep:accelerate-src", "candle-core/accelerate", "candle-nn/accelerate", "candle-transformers/accelerate"]
cuda = ["candle-core/cuda", "candle-nn/cuda", "candle-core/cuda"]
cudnn = ["candle-core/cuda", "cudarc/cudnn"]
cudnn = ["candle-core/cudnn"]
load-dynamic = ["ort/load-dynamic"]
16 changes: 6 additions & 10 deletions rust/examples/bert_ort.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ use std::time::Instant;
#[tokio::main]
async fn main() -> Result<(), anyhow::Error> {
let model =

Arc::new(Embedder::from_pretrained_onnx("bert", ONNXModel::AllMiniLML6V2, None).unwrap());
let config = TextEmbedConfig::new(
Some(1000),
Expand All @@ -29,17 +28,13 @@ async fn main() -> Result<(), anyhow::Error> {

let now = Instant::now();


let futures = files
.par_iter()
.map(|file| {
embed_file(file, &model, Some(&config), None::<fn(Vec<EmbedData>)>)
})
.map(|file| embed_file(file, &model, Some(&config), None::<fn(Vec<EmbedData>)>))
.collect::<Vec<_>>();

let _data = futures.into_iter().next().unwrap().await;


let elapsed_time = now.elapsed();
println!("Elapsed Time: {}", elapsed_time.as_secs_f32());

Expand All @@ -52,7 +47,6 @@ async fn main() -> Result<(), anyhow::Error> {
"The dog is sitting in the park",
"The window is broken",
"pizza is the best",

]
.iter()
.map(|s| s.to_string())
Expand All @@ -71,7 +65,10 @@ async fn main() -> Result<(), anyhow::Error> {
.map(|x| x.to_dense().unwrap())
.flatten()
.collect::<Vec<_>>(),
(n_vectors, doc_embeddings[0].embedding.to_dense().unwrap().len()),
(
n_vectors,
doc_embeddings[0].embedding.to_dense().unwrap().len(),
),
&Device::Cpu,
)
.unwrap();
Expand All @@ -94,5 +91,4 @@ async fn main() -> Result<(), anyhow::Error> {
}

Ok(())

}
}
12 changes: 1 addition & 11 deletions rust/examples/cloud.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,13 @@ use embed_anything::{
config::TextEmbedConfig,
embed_directory_stream, embed_file,
embeddings::embed::{EmbedData, Embedder},
text_loader::SplittingStrategy,
};

use anyhow::Result;

#[tokio::main]
async fn main() -> Result<()> {
let semantic_encoder =
Embedder::from_pretrained_cloud("openai", "text-embedding-3-small", None).unwrap();
let text_embed_config = TextEmbedConfig::new(
Some(1000),
Some(256),
Some(32),
None,
None,
);
let text_embed_config = TextEmbedConfig::new(Some(1000), Some(512), Some(512), None, None);
let cohere_model =
Embedder::from_pretrained_cloud("cohere", "embed-english-v3.0", None).unwrap();
let openai_model =
Expand All @@ -44,7 +35,6 @@ async fn main() -> Result<()> {
.await?
.unwrap();


let _cohere_embedding = embed_file(
"test_files/attention.pdf",
&cohere_model,
Expand Down
4 changes: 1 addition & 3 deletions rust/examples/colpali.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
use embed_anything::embeddings::local::colpali::ColPaliEmbedder;

fn main() -> Result<(), anyhow::Error> {

let colpali_model = ColPaliEmbedder::new( "vidore/colpali-v1.2-merged", None)?;
let colpali_model = ColPaliEmbedder::new("vidore/colpali-v1.2-merged", None)?;
let file_path = "test_files/attention.pdf";
let batch_size = 1;
let embed_data = colpali_model.embed_file(file_path, batch_size)?;
Expand All @@ -12,5 +11,4 @@ fn main() -> Result<(), anyhow::Error> {
let query_embeddings = colpali_model.embed_query(prompt)?;
println!("{:?}", query_embeddings.len());
Ok(())

}
Loading

0 comments on commit 38f945c

Please sign in to comment.