Skip to content

Commit

Permalink
Update python interface for match_text, match_sparse and match_dense (#…
Browse files Browse the repository at this point in the history
…1667)

### What problem does this PR solve?

When using the Python SDK:
- Use match_dense instead of knn
- Use match_text instead of match

Function parameters for match_text and match_sparse have changed

fix #1625

### Type of change

- [x] Breaking Change (fix or feature that could cause existing
functionality not to work as expected)
- [x] Documentation Update
- [x] Refactoring
- [x] Test cases
- [x] Python SDK impacted, Need to update PyPI
  • Loading branch information
yangzq50 authored Aug 17, 2024
1 parent 7c6b829 commit 9da9455
Show file tree
Hide file tree
Showing 40 changed files with 392 additions and 379 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ Infinity, also available as a Python module, eliminates the need for a separate
table = db.create_table("my_table", {"num": {"type": "integer"}, "body": {"type": "varchar"}, "vec": {"type": "vector, 4, float"}})
table.insert([{"num": 1, "body": "unnecessary and harmful", "vec": [1.0, 1.2, 0.8, 0.9]}])
table.insert([{"num": 2, "body": "Office for Harmful Blooms", "vec": [4.0, 4.2, 4.3, 4.5]}])
res = table.output(["*"]).knn("vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 2).to_pl()
res = table.output(["*"]).match_dense("vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 2).to_pl()
print(res)
```

Expand Down
2 changes: 1 addition & 1 deletion docs/getstarted/quickstart.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ Infinity, also available as a Python module, eliminates the need for a separate
table_object.insert([{"num": 1, "body": "unnecessary and harmful", "vec": [1.0, 1.2, 0.8, 0.9]}])
table_object.insert([{"num": 2, "body": "Office for Harmful Blooms", "vec": [4.0, 4.2, 4.3, 4.5]}])
res = table_object.output(["*"])
.knn("vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 2)
.match_dense("vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 2)
.to_pl()
print(res)
```
Expand Down
129 changes: 80 additions & 49 deletions docs/references/pysdk_api_reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -1443,10 +1443,10 @@ table_object.output(["*"]).filter("c2 = 3").to_pl()

---

## knn
## match_dense

```python
table_object.knn(vector_column_name, embedding_data, embedding_data_type, distance_type, topn, knn_params = None)
table_object.match_dense(vector_column_name, embedding_data, embedding_data_type, distance_type, topn, knn_params = None)
```

Creates a dense vector search expression to identify the top n closest rows to the given dense vector. Suitable for working with dense vectors (dense embeddings).
Expand Down Expand Up @@ -1503,7 +1503,7 @@ A dictionary representing additional parameters for the KNN or ANN search.
# Find the 100 nearest neighbors using Euclidean distance
# If no vector index is created on the column being queried, then the vector search defaults to a brute-force search.
# In such a case, set `knn_params` to `None` or leave it blank.
table_object.knn("vec", [0.1,0.2,0.3], "float", "l2", 100)
table_object.match_dense("vec", [0.1,0.2,0.3], "float", "l2", 100)
```

:::caution NOTE
Expand All @@ -1522,7 +1522,7 @@ table_object.create_index("my_index", IndexInfo("vec", IndexType.Hnsw, [InitPara
# If an HNSW index is successfully built on the column being queried, then the vector search uses this index,
# regardless of whether `knn_params` is set.
# If you leave `knn_params` blank, the search takes the `"ef"` value set in `create_index()`.
table_object.knn("vec", [1, 2, 3], "uint8", "cosine", 2)
table_object.match_dense("vec", [1, 2, 3], "uint8", "cosine", 2)
```

```python
Expand All @@ -1532,7 +1532,7 @@ table_object.create_index("my_index", IndexInfo("vec", IndexType.Hnsw, [InitPara
# If an HNSW index is successfully built on the column being queried, then the vector search uses this index,
# regardless of whether `knn_params` is set.
# You can specify the value of `"ef"` in `knn_params`, which overrides the value set in `create_index()`
table_object.knn("vec", [0.1,0.2,0.3], "float", "ip", 2, {"ef": "100"})
table_object.match_dense("vec", [0.1,0.2,0.3], "float", "ip", 2, {"ef": "100"})
```

:::tip NOTE
Expand All @@ -1559,12 +1559,23 @@ To display your query results, you must chain this method with `output(columns)`

A non-empty string indicating the name of the column to query on.

#### sparse_data: `dict[str, list[int | float]]`, *Required*
#### sparse_data: `SparseVector(list[int], list[int] | list[float])`, *Required*

The query sparse vector data to compare against. The `sparse_data` parameter should be provided as a dictionary like `{"indices": list[int], "values": list[int | float]}`:
The query sparse vector data to compare against. The `sparse_data` parameter should be provided as a SparseVector object, which has two members:

- `"indices"`: A list of the indices, each corresponding to a non-zero value in the sparse vector.
- `"values"`: A list of non-zero values in the sparse vector.
- `indices`: A list of the indices, each corresponding to a non-zero value in the sparse vector.
- `values`: A list of the corresponding values for each index in the `indices` list.

:::tip NOTE
If you have a dictionary of indices and values, you can create a SparseVector object using the `SparseVector` class. For example:

```python
from infinity.common import SparseVector
dic_sparse_vector = {"indices": [0, 10, 20], "values": [0.1, 0.2, 0.3]}
sparse_vector = SparseVector(**dic_sparse_vector)
```

:::

#### distance_type: `str`, *Required*

Expand Down Expand Up @@ -1597,13 +1608,14 @@ A dictionary representing additional parameters for the sparse vector search. Fo
```python
# As demonstrated in the following example:
# The sparse vector search is performed on column "sparse" to find the 100 nearest neighbors using inner product
# {"indices": [0, 10, 20], "values": [0.1, 0.2, 0.3]} represents the sparse vector to compare against:
# SparseVector([0, 10, 20], [0.1, 0.2, 0.3]) represents the sparse vector to compare against:
# - 0: the index of 0.1
# - 10: the index of 0.2
# - 20: the index of 0.3
# If no sparse vector index is created on the column being queried, then the search defaults to a brute-force search.
# In such a case, set `opt_params` to `None` or leave it blank.
table_object.match_sparse('sparse', {"indices": [0, 10, 20], "values": [0.1, 0.2, 0.3]}, 'ip', 100)
from infinity.common import SparseVector
table_object.match_sparse('sparse', SparseVector([0, 10, 20], [0.1, 0.2, 0.3]), 'ip', 100)
```

:::caution NOTE
Expand All @@ -1614,30 +1626,32 @@ table_object.match_sparse('sparse', {"indices": [0, 10, 20], "values": [0.1, 0.2

```python
from infinity.index import IndexInfo, IndexType, InitParameter
table_object.create_index("my_index", [IndexInfo("sparse", IndexType.BMP, [])])
table_object.create_index("my_index", [IndexInfo("sparse", IndexType.BMP)])
# Find the 100 nearest neighbors using inner product
# If a BMP index is successfully built on the column being queried, then the sparse vector search uses this index,
# regardless of whether `opt_params` is set.
# If you leave `opt_params` blank, the search takes the default settings for `"alpha"` and `"beta"`.
table_object.match_sparse('sparse', {"indices": [0, 10, 20], "values": [0.1, 0.2, 0.3]}, 'ip', 100, {"alpha": "1.0", "beta": "1.0"})
from infinity.common import SparseVector
table_object.match_sparse('sparse', SparseVector([0, 10, 20], [0.1, 0.2, 0.3]), 'ip', 100, {"alpha": "1.0", "beta": "1.0"})
```

```python
from infinity.index import IndexInfo, IndexType, InitParameter
table_object.create_index("my_index", IndexInfo("sparse", IndexType.BMP, []))
table_object.create_index("my_index", IndexInfo("sparse", IndexType.BMP))
# Find the 100 nearest neighbors using inner product
# If a BMP index is successfully built on the column being queried, then the sparse vector search uses this index,
# regardless of whether `opt_params` is set.
# You can set the values of `"alpha"` or `"beta"` in `opt_params`, which overrides the default settings.
table_object.match_sparse('sparse', {"indices": [0, 10, 20], "values": [8, 10, 66]}, 'ip', 100, {"alpha": "1.0", "beta": "1.0"})
from infinity.common import SparseVector
table_object.match_sparse('sparse', SparseVector([0, 10, 20], [8, 10, 66]), 'ip', 100, {"alpha": "1.0", "beta": "1.0"})
```

---

## match
## match_text

```python
table_object.match(fields, matching_text, distance_type, options_text)
table_object.match_text(fields, matching_text, topn, extra_options)
```

Creates a full-text search expression on the specified field(s)/column(s) to identify the most relevant rows.
Expand Down Expand Up @@ -1665,30 +1679,42 @@ To display your query results, you must chain this method with `output(columns)`
A non-empty text string to search for. You can use various search options within the matching text, including:

- Single terms: `"blooms"`
- OR multiple terms: `"Bloom filter"`
- OR multiple terms: `"Bloom OR filter"`, `"Bloom || filter"` or just `"Bloom filter"`
- Phrase search: `'"Bloom filter"'`
- AND multiple terms: "space efficient"
- Escaping reserved characters: "space\-efficient"
- Sloppy phrase search: "harmful chemical"~10
- Field-specific search: title:(quick OR brown) AND body:foobar
- AND multiple terms: `"space AND efficient"`, `"space && efficient"` or `"space + efficient"`
- Escaping reserved characters: `"space\-efficient"`
- Sloppy phrase search: `'"harmful chemical"~10'`
- Field-specific search: `"title:(quick OR brown) AND body:foobar"`

#### options_text: `str`, *Required*
#### topn: `int`, *Required*

Specifies the number of the most relevant rows to retrieve, e.g., assign `10` to obtain the ten most relevant rows.

#### extra_options: `dict`, *Optional*

A non-empty string specifying the following search options:
An optional dictionary specifying the following search options:

- **"topn"**: `str`, *Required*
Specifies the number of the most relevant rows to retrieve, e.g., `"topn=10"` to obtain the ten most relevant rows.
- **"default_field"**: `str`, *Optional*
- If `"fields"` is an empty string, this parameter specifies the default field to search on.
- **"operator"**: `str`, *Optional*
- If not specified, the search follows Infinity's full-text search syntax, meaning that logical and arithmetic operators and escape characters will function as full-text search operators, such as:
- `&&`, `+`, `||`, `!`, `NOT`, `AND`, `OR` `-`, `(`, `)`, `~`, `^`, `:`, `"`.
- Escape characters like `\`, `\t`, and more.
- If not specified, the search follows Infinity's full-text search syntax, meaning that logical and arithmetic operators, quotation marks and escape characters will function as full-text search operators, such as:
- AND operator: `AND`, `&&`, `+`
- OR operator: `OR`, `||`
- NOT operator: `NOT`, `!`, `-`
- PAREN operator: `(`, `)`: Parentheses must appear in pairs and can be nested.
- COLON operator: `:`: Used to specify field-specific search, e.g., `body:foobar` searches for `foobar` in the `body` field.
- CARAT operator: `^`: Used to boost the importance of a term, e.g., `quick^2 brown` boosts the importance of `quick` by a factor of 2, making it twice as important as `brown`.
- TILDE operator: `~`: Used for sloppy phrase search, e.g., `"harmful chemical"~10` searches for the phrase `"harmful chemical"` within a tolerable distance of 10 words.
- SINGLE_QUOTED_STRING: Used to search for a phrase, e.g., `'Bloom filter'`.
- DOUBLE_QUOTED_STRING: Used to search for a phrase, e.g., `"Bloom filter"`.
- Escape characters: Used to escape reserved characters, e.g., `space\-efficient`. Starting with a backslash `\` will escape the following characters:
`' '`, `'+'`, `'-'`, `'='`, `'&'`, `'|'`, `'!'`, `'('`, `')'`, `'{'`, `'}'`, `'['`, `']'`, `'^'`, `'"'`, `'~'`, `'*'`, `'?'`, `':'`, `'\'`, `'/'`
- If specified, Infinity's full-text search syntax will not take effect, and the specified operator will be interpolated into `matching_text`.
- `"operator=OR"`/`"operator=or"`: Interpolates the `OR` operator between words in `matching_text` to create a new search text.
- `"operator=AND"`/`"operator=and"`: Interpolates the `AND` operator between words in `matching_text` to create a new search text. Useful for searching text including code numbers like `"A01-233:BC"`, resulting in `"A01" AND "-233" AND "BC"`.

:::tip NOTE
If both `"topn"` and `"operator"` options are specified, separate them with a semicolon, e.g., `"topn=100;operator=OR"`
:::
Useful for searching text including code numbers like `"A01-233:BC"`.
- `{"operator": "or"}`: Interpolates the `OR` operator between words in `matching_text` to create a new search text.
For example, reinterprets `"A01-233:BC"` as `'"A01" OR "-233" OR "BC"'`.
- `{"operator": "and"}`: Interpolates the `AND` operator between words in `matching_text` to create a new search text.
For example, reinterprets `"A01-233:BC"` as `'"A01" AND "-233" AND "BC"'`.

### Returns

Expand All @@ -1712,7 +1738,8 @@ questions = [
r'title:(quick OR brown) AND body:foobar', # search `(quick OR brown)` in the `title` field. keep fields empty.
]
for question in questions:
table_object.match('body', question, 'topn=2')
table_object.match_text('body', question, 2)
table_object.match_text('', question, 2, {'default_field': 'body'})
```

---
Expand Down Expand Up @@ -1796,20 +1823,22 @@ The following code snippets illustrate the use of fused reranking in a three-way
#### Use RRF for reranking

```python {6}
from infinity.common import SparseVector
table_object.output(["num", "body", "vec", "sparse", "year", "tensor", "_score"])
.knn("vec", [3.0, 2.8, 2.7, 3.1], "float", "cosine", 3)
.match_sparse("sparse", {"indices": [0, 20, 80], "values": [1.0, 2.0, 3.0]}, "ip", 3)
.match("body", "blooms", "topn=10")
.match_dense("vec", [3.0, 2.8, 2.7, 3.1], "float", "cosine", 3)
.match_sparse("sparse", SparseVector([0, 20, 80], [1.0, 2.0, 3.0]), "ip", 3)
.match_text("body", "blooms", 10)
.filter("year < 2024")
.fusion("rrf", 2)
.to_pl()
```

```python {6}
from infinity.common import SparseVector
table_object.output(["num", "body", "vec", "sparse", "year", "tensor", "_score"])
.knn("vec", [3.0, 2.8, 2.7, 3.1], "float", "cosine", 3)
.match_sparse("sparse", {"indices": [0, 20, 80], "values": [1.0, 2.0, 3.0]}, "ip", 3)
.match("body", "blooms", "topn=10")
.match_dense("vec", [3.0, 2.8, 2.7, 3.1], "float", "cosine", 3)
.match_sparse("sparse", SparseVector([0, 20, 80], [1.0, 2.0, 3.0]), "ip", 3)
.match_text("body", "blooms", 10)
.filter("year < 2024")
.fusion("rrf", 2, {"rank_constant": 30})
.to_pl()
Expand All @@ -1818,10 +1847,11 @@ table_object.output(["num", "body", "vec", "sparse", "year", "tensor", "_score"]
#### Use Weighted Sum for reranking

```python {6}
from infinity.common import SparseVector
table_object.output(["num", "body", "vec", "sparse", "year", "tensor", "_score"])
.knn("vec", [3.0, 2.8, 2.7, 3.1], "float", "cosine", 3)
.match_sparse("sparse", {"indices": [0, 20, 80], "values": [1.0, 2.0, 3.0]}, "ip", 3)
.match("body", "blooms", "topn=10")
.match_dense("vec", [3.0, 2.8, 2.7, 3.1], "float", "cosine", 3)
.match_sparse("sparse", SparseVector([0, 20, 80], [1.0, 2.0, 3.0]), "ip", 3)
.match_text("body", "blooms", 10)
.filter("year < 2024")
.fusion("weighted_sum", 2, {"weights": "1,2,0.5"})
.to_pl()
Expand All @@ -1830,10 +1860,11 @@ table_object.output(["num", "body", "vec", "sparse", "year", "tensor", "_score"]
#### Use tensor reranking

```python {8}
from infinity.common import SparseVector
table_object.output(["num", "body", "vec", "sparse", "year", "tensor", "_score"])
.knn("vec", [3.0, 2.8, 2.7, 3.1], "float", "cosine", 3)
.match_sparse("sparse", {"indices": [0, 20, 80], "values": [1.0, 2.0, 3.0]}, "ip", 3)
.match("body", "blooms", "topn=10")
.match_dense("vec", [3.0, 2.8, 2.7, 3.1], "float", "cosine", 3)
.match_sparse("sparse", SparseVector([0, 20, 80], [1.0, 2.0, 3.0]), "ip", 3)
.match_text("body", "blooms", 10)
.filter("year < 2024")
.fusion("match_tensor", 2, {"field": "tensor", "data_type": "float", "data": [[0.0, -10.0, 0.0, 0.7], [9.2, 45.6, -55.8, 3.5]]})
.to_pl()
Expand Down Expand Up @@ -1906,7 +1937,7 @@ A `polas.DataFrame` object.
```python
# Format a vector search result into a Polars DataFrame.
res = table_object.output(["*"])
.knn("vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 10)
.match_dense("vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 10)
.to_pl()
```

Expand Down
8 changes: 4 additions & 4 deletions example/ColBERT_reranker_example/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,8 @@ def query_bm25(self, query_str: str, output_columns: list[str], top_n: int):
output_columns.append('_row_id')
if '_score' not in output_columns:
output_columns.append('_score')
query_result = self.colbert_test_table.output(output_columns).match(self.inner_col_txt, query_str,
f'topn={top_n}').to_pl()
query_result = self.colbert_test_table.output(output_columns).match_text(self.inner_col_txt, query_str,
top_n).to_pl()
print(query_result)
return query_result

Expand Down Expand Up @@ -161,8 +161,8 @@ def query_fusion(self, query_str: str, output_columns: list[str], final_top_n: i
query_tensor = self.ckpt.queryFromText([query_str])[0]
if query_tensor.dim() != 2 or query_tensor.size(1) != 128:
raise ValueError("Dimension error.")
query_result = self.colbert_test_table.output(output_columns).match(self.inner_col_txt, query_str,
f'topn={first_stage_top_n}').fusion(
query_result = self.colbert_test_table.output(output_columns).match_text(self.inner_col_txt, query_str,
first_stage_top_n).fusion(
method='match_tensor', topn=final_top_n,
fusion_params={"field": target_col_name, "data": query_tensor.numpy(force=True),
"data_type": "float"}).to_pl()
Expand Down
2 changes: 1 addition & 1 deletion example/fulltext_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@
for question in questions:
qb_result = (
table_instance.output(["num", "body", "_score"])
.match("body", question, "topn=10")
.match_text("body", question, 10)
.to_pl()
)
print(f"question: {question}")
Expand Down
2 changes: 1 addition & 1 deletion example/fulltext_search_zh.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@
r'"Bloom filter"', # phrase: adjacent multiple terms
]
for question in questions:
qb_result = table_instance.output(["num", "body", "_score"]).match("body", question, "topn=10").to_pl()
qb_result = table_instance.output(["num", "body", "_score"]).match_text("body", question, 10).to_pl()
print(f"question: {question}")
print(qb_result)

Expand Down
Loading

0 comments on commit 9da9455

Please sign in to comment.