Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 8 additions & 5 deletions clients/new-js/packages/ai-embeddings/chroma-bm25/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -190,18 +190,20 @@ export class ChromaBm25EmbeddingFunction implements SparseEmbeddingFunction {
}

private encode(text: string): SparseVector {
const tokens = this.tokenizer.tokenize(text);
const tokenList = this.tokenizer.tokenize(text);

if (tokens.length === 0) {
if (tokenList.length === 0) {
return { indices: [], values: [] };
}

const docLen = tokens.length;
const docLen = tokenList.length;
const counts = new Map<number, number>();
const tokenMap = new Map<number, string>();

for (const token of tokens) {
for (const token of tokenList) {
const tokenId = this.hasher.hash(token);
counts.set(tokenId, (counts.get(tokenId) ?? 0) + 1);
tokenMap.set(tokenId, token);
}

const indices = Array.from(counts.keys()).sort((a, b) => a - b);
Expand All @@ -213,8 +215,9 @@ export class ChromaBm25EmbeddingFunction implements SparseEmbeddingFunction {
(1 - this.b + (this.b * docLen) / this.avgDocLength);
return (tf * (this.k + 1)) / denominator;
});
const tokens = indices.map((idx) => tokenMap.get(idx)!);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[BestPractice]

Unsafe non-null assertion: tokenMap.get(idx)! does not throw at runtime — the ! operator only suppresses the compile-time check, so a missing index would silently produce undefined in the tokens array. Every id in counts is also inserted into tokenMap in the same loop, so a miss would indicate a logic error; it should fail loudly rather than propagate undefined:

// If tokenMap.get(idx) returns undefined, ! silently passes undefined through
const tokens = indices.map((idx) => tokenMap.get(idx)!);

Add safety check:

const tokens = indices.map((idx) => {
    const token = tokenMap.get(idx);
    if (!token) throw new Error(`Token not found for index ${idx}`);
    return token;
});
Context for Agents
[**BestPractice**]

Unsafe non-null assertion: `tokenMap.get(idx)!` does not throw at runtime — the `!` operator only suppresses the compile-time check, so a missing index would silently produce `undefined` in the `tokens` array. Every id in `counts` is also inserted into `tokenMap` in the same loop, so a miss would indicate a logic error; it should fail loudly rather than propagate `undefined`:

```typescript
// If tokenMap.get(idx) returns undefined, ! silently passes undefined through
const tokens = indices.map((idx) => tokenMap.get(idx)!);
```

Add safety check:
```typescript
const tokens = indices.map((idx) => {
    const token = tokenMap.get(idx);
    if (!token) throw new Error(`Token not found for index ${idx}`);
    return token;
});
```

File: clients/new-js/packages/ai-embeddings/chroma-bm25/src/index.ts
Line: 218


return { indices, values };
return { indices, values, tokens };
}

public async generate(texts: string[]): Promise<SparseVector[]> {
Expand Down
4 changes: 4 additions & 0 deletions clients/new-js/packages/chromadb/src/api/types.gen.ts
Original file line number Diff line number Diff line change
Expand Up @@ -452,6 +452,10 @@ export type SparseVector = {
* Dimension indices
*/
indices: Array<number>;
/**
* Tokens corresponding to each index
*/
tokens?: Array<string> | null;
/**
* Values corresponding to each index
*/
Expand Down
50 changes: 25 additions & 25 deletions clients/new-js/pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions idl/chromadb/proto/chroma.proto
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ message Vector {
message SparseVector {
repeated uint32 indices = 1;
repeated float values = 2;
repeated string tokens = 3;
}

enum SegmentScope {
Expand Down
11 changes: 6 additions & 5 deletions rust/chroma/src/embed/bm25.rs
Original file line number Diff line number Diff line change
Expand Up @@ -78,23 +78,24 @@ where
let mut token_ids = Vec::with_capacity(tokens.len());
for token in tokens {
let id = self.hasher.hash(&token);
token_ids.push(id);
token_ids.push((id, token));
}

token_ids.sort_unstable();

let sparse_pairs = token_ids.chunk_by(|a, b| a == b).map(|chunk| {
let id = chunk[0];
let sparse_triples = token_ids.chunk_by(|a, b| a.0 == b.0).map(|chunk| {
let id = chunk[0].0;
let tk = chunk[0].1.clone();
let tf = chunk.len() as f32;

// BM25 formula
let score = tf * (self.k + 1.0)
/ (tf + self.k * (1.0 - self.b + self.b * doc_len / self.avg_len));

(id, score)
(tk, id, score)
});

Ok(SparseVector::from_pairs(sparse_pairs))
Ok(SparseVector::from_triples(sparse_triples))
}
}

Expand Down
12 changes: 7 additions & 5 deletions rust/frontend/src/server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1702,11 +1702,13 @@ async fn collection_delete(
r#where,
)?;

server
.frontend
.delete(request)
.meter(metering_context_container)
.await?;
Box::pin(
server
.frontend
.delete(request)
.meter(metering_context_container),
)
.await?;

Ok(Json(DeleteCollectionRecordsResponse {}))
}
Expand Down
4 changes: 3 additions & 1 deletion rust/frontend/tests/proptest_helpers/frontend_under_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,9 @@ impl StateMachineTest for FrontendUnderTest {
}
}

state.frontend.delete(request.clone()).await.unwrap();
Box::pin(state.frontend.delete(request.clone()))
.await
.unwrap();
}
CollectionRequest::Get(mut request) => {
let expected_result = {
Expand Down
2 changes: 1 addition & 1 deletion rust/python_bindings/src/bindings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -561,7 +561,7 @@ impl Bindings {

let mut frontend_clone = self.frontend.clone();
self.runtime
.block_on(async { frontend_clone.delete(request).await })?;
.block_on(async { Box::pin(frontend_clone.delete(request)).await })?;
Ok(())
}

Expand Down
1 change: 1 addition & 0 deletions rust/segment/src/blockfile_metadata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2669,6 +2669,7 @@ mod test {
UpdateMetadataValue::SparseVector(chroma_types::SparseVector::new(
vec![0, 5, 10],
vec![0.1, 0.5, 0.9],
None,
)),
);
update_metadata1.insert(
Expand Down
8 changes: 7 additions & 1 deletion rust/types/src/api_types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2419,7 +2419,11 @@ mod test {
// Add unsorted sparse vector - should fail validation
metadata.insert(
"sparse".to_string(),
MetadataValue::SparseVector(SparseVector::new(vec![3, 1, 2], vec![0.3, 0.1, 0.2])),
MetadataValue::SparseVector(SparseVector::new(
vec![3, 1, 2],
vec![0.3, 0.1, 0.2],
None,
)),
);

let result = AddCollectionRecordsRequest::try_new(
Expand All @@ -2446,6 +2450,7 @@ mod test {
UpdateMetadataValue::SparseVector(SparseVector::new(
vec![3, 1, 2],
vec![0.3, 0.1, 0.2],
None,
)),
);

Expand Down Expand Up @@ -2473,6 +2478,7 @@ mod test {
UpdateMetadataValue::SparseVector(SparseVector::new(
vec![3, 1, 2],
vec![0.3, 0.1, 0.2],
None,
)),
);

Expand Down
8 changes: 4 additions & 4 deletions rust/types/src/collection_schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1988,7 +1988,7 @@ impl Schema {
///
/// # Examples
/// ```
/// use chroma_types::{Schema, VectorIndexConfig, StringInvertedIndexConfig, Space};
/// use chroma_types::{Schema, VectorIndexConfig, StringInvertedIndexConfig, Space, SchemaBuilderError};
///
/// # fn main() -> Result<(), SchemaBuilderError> {
/// let schema = Schema::default()
Expand Down Expand Up @@ -2069,7 +2069,7 @@ impl Schema {
///
/// # Examples
/// ```
/// use chroma_types::{Schema, StringInvertedIndexConfig};
/// use chroma_types::{Schema, StringInvertedIndexConfig, SchemaBuilderError};
///
/// # fn main() -> Result<(), SchemaBuilderError> {
/// let schema = Schema::default()
Expand Down Expand Up @@ -3053,7 +3053,7 @@ mod tests {
let schema = Schema::new_default(KnnIndex::Spann);
let result = schema.is_knn_key_indexing_enabled(
"custom_sparse",
&QueryVector::Sparse(SparseVector::new(vec![0_u32], vec![1.0_f32])),
&QueryVector::Sparse(SparseVector::new(vec![0_u32], vec![1.0_f32], None)),
);

let err = result.expect_err("expected indexing disabled error");
Expand Down Expand Up @@ -3088,7 +3088,7 @@ mod tests {

let result = schema.is_knn_key_indexing_enabled(
"sparse_enabled",
&QueryVector::Sparse(SparseVector::new(vec![0_u32], vec![1.0_f32])),
&QueryVector::Sparse(SparseVector::new(vec![0_u32], vec![1.0_f32], None)),
);

assert!(result.is_ok());
Expand Down
11 changes: 7 additions & 4 deletions rust/types/src/execution/operator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -800,7 +800,8 @@ impl TryFrom<KnnBatchResult> for chroma_proto::KnnBatchResult {
///
/// let sparse = QueryVector::Sparse(SparseVector::new(
/// vec![0, 5, 10, 50], // indices
/// vec![0.5, 0.3, 0.8, 0.2] // values
/// vec![0.5, 0.3, 0.8, 0.2], // values
/// None,
/// ));
/// ```
///
Expand Down Expand Up @@ -829,7 +830,8 @@ impl TryFrom<KnnBatchResult> for chroma_proto::KnnBatchResult {
/// let rank = RankExpr::Knn {
/// query: QueryVector::Sparse(SparseVector::new(
/// vec![1, 5, 10],
/// vec![0.5, 0.3, 0.8]
/// vec![0.5, 0.3, 0.8],
/// None,
/// )),
/// key: Key::field("sparse_embedding"),
/// limit: 100,
Expand Down Expand Up @@ -2691,7 +2693,7 @@ mod tests {

#[test]
fn test_query_vector_sparse_proto_conversion() {
let sparse = SparseVector::new(vec![0, 5, 10], vec![0.1, 0.5, 0.9]);
let sparse = SparseVector::new(vec![0, 5, 10], vec![0.1, 0.5, 0.9], None);
let query_vector = QueryVector::Sparse(sparse.clone());

// Convert to proto
Expand Down Expand Up @@ -2977,7 +2979,8 @@ mod tests {
assert_eq!(deserialized, dense);

// Test sparse vector
let sparse = QueryVector::Sparse(SparseVector::new(vec![0, 5, 10], vec![0.1, 0.5, 0.9]));
let sparse =
QueryVector::Sparse(SparseVector::new(vec![0, 5, 10], vec![0.1, 0.5, 0.9], None));
let json = serde_json::to_string(&sparse).unwrap();
let deserialized: QueryVector = serde_json::from_str(&json).unwrap();
assert_eq!(deserialized, sparse);
Expand Down
Loading
Loading