From 9da9455d3c983e24eb7f5231924bd3db3487106f Mon Sep 17 00:00:00 2001 From: yangzq50 <58433399+yangzq50@users.noreply.github.com> Date: Sat, 17 Aug 2024 11:59:26 +0800 Subject: [PATCH] Update python interface for match_text, match_sparse and match_dense (#1667) ### What problem does this PR solve? When using Python sdk: - Use match_dense instead of knn - Use match_text instead of match Function parameters for match_text and match_sparse have changed fix #1625 ### Type of change - [x] Breaking Change (fix or feature that could cause existing functionality not to work as expected) - [x] Documentation Update - [x] Refactoring - [x] Test cases - [x] Python SDK impacted, Need to update PyPI --- README.md | 2 +- docs/getstarted/quickstart.md | 2 +- docs/references/pysdk_api_reference.md | 129 ++++++---- example/ColBERT_reranker_example/helper.py | 8 +- example/fulltext_search.py | 2 +- example/fulltext_search_zh.py | 2 +- example/hybrid_search.py | 15 +- example/tensor_search.py | 7 +- example/vector_search.py | 4 +- python/README.md | 2 +- python/benchmark/clients/infinity_client.py | 10 +- .../legacy_benchmark/remote_benchmark_knn.py | 8 +- .../remote_benchmark_milvus_knn.py | 2 +- .../remote_benchmark_sparse.py | 6 +- .../remote_benchmark_sparse_import.py | 7 +- .../benchmark/mldr_benchmark/insert_data.py | 4 +- .../mldr_benchmark/insert_data_50000.py | 4 +- .../insert_data_with_colbert.py | 12 +- .../insert_data_with_colbert_50000.py | 12 +- .../mldr_benchmark/mldr_common_tools.py | 7 +- python/infinity/common.py | 32 ++- .../infinity/local_infinity/query_builder.py | 42 ++-- python/infinity/local_infinity/table.py | 24 +- python/infinity/local_infinity/types.py | 4 +- python/infinity/local_infinity/utils.py | 14 +- .../infinity/remote_thrift/query_builder.py | 16 +- python/infinity/remote_thrift/table.py | 24 +- python/infinity/remote_thrift/types.py | 35 +-- python/infinity/remote_thrift/utils.py | 14 +- python/infinity/utils.py | 4 + python/infinity_http.py | 25 
+- python/parallel_test/test_chaos.py | 6 +- python/parallel_test/test_index_parallel.py | 8 +- python/restart_test/test_memidx.py | 17 +- .../test_pysdk/doc/pysdk_api_test_document.md | 4 +- python/test_pysdk/test_index.py | 4 +- python/test_pysdk/test_insert.py | 13 +- python/test_pysdk/test_knn.py | 234 ++++++------------ python/test_pysdk/test_query.py | 4 +- python/tmp_test.py | 2 +- 40 files changed, 392 insertions(+), 379 deletions(-) diff --git a/README.md b/README.md index fb6f4277af..5ae319e431 100644 --- a/README.md +++ b/README.md @@ -72,7 +72,7 @@ Infinity, also available as a Python module, eliminates the need for a separate table = db.create_table("my_table", {"num": {"type": "integer"}, "body": {"type": "varchar"}, "vec": {"type": "vector, 4, float"}}) table.insert([{"num": 1, "body": "unnecessary and harmful", "vec": [1.0, 1.2, 0.8, 0.9]}]) table.insert([{"num": 2, "body": "Office for Harmful Blooms", "vec": [4.0, 4.2, 4.3, 4.5]}]) - res = table.output(["*"]).knn("vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 2).to_pl() + res = table.output(["*"]).match_dense("vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 2).to_pl() print(res) ``` diff --git a/docs/getstarted/quickstart.md b/docs/getstarted/quickstart.md index 22982ca7e8..eb0057a21b 100644 --- a/docs/getstarted/quickstart.md +++ b/docs/getstarted/quickstart.md @@ -30,7 +30,7 @@ Infinity, also available as a Python module, eliminates the need for a separate table_object.insert([{"num": 1, "body": "unnecessary and harmful", "vec": [1.0, 1.2, 0.8, 0.9]}]) table_object.insert([{"num": 2, "body": "Office for Harmful Blooms", "vec": [4.0, 4.2, 4.3, 4.5]}]) res = table_object.output(["*"]) - .knn("vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 2) + .match_dense("vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 2) .to_pl() print(res) ``` diff --git a/docs/references/pysdk_api_reference.md b/docs/references/pysdk_api_reference.md index 58da5b764b..8d7b291191 100644 --- a/docs/references/pysdk_api_reference.md +++ 
b/docs/references/pysdk_api_reference.md @@ -1443,10 +1443,10 @@ table_object.output(["*"]).filter("c2 = 3").to_pl() --- -## knn +## match_dense ```python -table_object.knn(vector_column_name, embedding_data, embedding_data_type, distance_type, topn, knn_params = None) +table_object.match_dense(vector_column_name, embedding_data, embedding_data_type, distance_type, topn, knn_params = None) ``` Creates a dense vector search expression to identify the top n closest rows to the given dense vector. Suitable for working with dense vectors (dense embeddings). @@ -1503,7 +1503,7 @@ A dictionary representing additional parameters for the KNN or ANN search. # Find the 100 nearest neighbors using Euclidean distance # If no vector index is created on the column being queried, then the vector search defaults to a brute-force search. # In such case, set `knn_params` to `None` or leave it blank. -table_object.knn("vec", [0.1,0.2,0.3], "float", "l2", 100) +table_object.match_dense("vec", [0.1,0.2,0.3], "float", "l2", 100) ``` :::caution NOTE @@ -1522,7 +1522,7 @@ table_object.create_index("my_index", IndexInfo("vec", IndexType.Hnsw, [InitPara # If an HNSW index is successfully built on the column being queried, then the vector search uses this index, # regardless of whether `knn_params` is set. # If you leave `knn_params` blank, the search takes the `"ef"` value set in `create_index()`. -table_object.knn("vec", [1, 2, 3], "uint8", "cosine", 2) +table_object.match_dense("vec", [1, 2, 3], "uint8", "cosine", 2) ``` ```python @@ -1532,7 +1532,7 @@ table_object.create_index("my_index", IndexInfo("vec", IndexType.Hnsw, [InitPara # If an HNSW index is successfully built on the column being queried, then the vector search uses this index, # regardless of whether `knn_params` is set. 
# You can specify the value of `"ef"` in `knn_params`, which overrides the value set in `create_index()` -table_object.knn("vec", [0.1,0.2,0.3], "float", "ip", 2, {"ef": "100"}) +table_object.match_dense("vec", [0.1,0.2,0.3], "float", "ip", 2, {"ef": "100"}) ``` :::tip NOTE @@ -1559,12 +1559,23 @@ To display your query results, you must chain this method with `output(columns)` A non-empty string indicating the name of the column to query on. -#### sparse_data: `dict[str, list[int | float]]`, *Required* +#### sparse_data: `SparseVector(list[int], list[int] | list[float])`, *Required* -The query sparse vector data to compare against. The `sparse_data` parameter should be provided as a dictionary like `{"indices": list[int], "values": list[int | float]}`: +The query sparse vector data to compare against. The `sparse_data` parameter should be provided as a SparseVector object, which has two members: -- `"indices"`: A list of the indices, each corresponding to a non-zero value in the sparse vector. -- `"values"`: A list of non-zero values in the sparse vector. +- `indices`: A list of the indices, each corresponding to a non-zero value in the sparse vector. +- `values`: A list of the corresponding values for each index in the `indices` list. + +:::tip NOTE +If you have a dictionary of indices and values, you can create a SparseVector object using the `SparseVector` class. For example: + +```python +from infinity.common import SparseVector +dic_sparse_vector = {"indices": [0, 10, 20], "values": [0.1, 0.2, 0.3]} +sparse_vector = SparseVector(**dic_sparse_vector) +``` + +::: #### distance_type: `str`, *Required* @@ -1597,13 +1608,14 @@ A dictionary representing additional parameters for the sparse vector search. 
Fo ```python # As demonstrated in the following example: # The sparse vector search is performed on column "sparse_column" to find the 100 nearest neighbors using inner product -# {"indices": [0, 10, 20], "values": [0.1, 0.2, 0.3]} represents the sparse vector to compare against: +# SparseVector(**{"indices": [0, 10, 20], "values": [0.1, 0.2, 0.3]}) represents the sparse vector to compare against: # - 0: the index of 0.1 # - 10: the index of 0.2 # - 20: the index of 0.3 # If no sparse vector index is created on the column being queried, then the search defaults to a brute-force search. # In such case, set `opt_params` to `None` or leave it blank. -table_object.match_sparse('sparse', {"indices": [0, 10, 20], "values": [0.1, 0.2, 0.3]}, 'ip', 100) +from infinity.common import SparseVector +table_object.match_sparse('sparse', SparseVector([0, 10, 20], [0.1, 0.2, 0.3]), 'ip', 100) ``` :::caution NOTE @@ -1614,30 +1626,32 @@ table_object.match_sparse('sparse', {"indices": [0, 10, 20], "values": [0.1, 0.2 ```python from infinity.index import IndexInfo, IndexType, InitParameter -table_object.create_index("my_index", [IndexInfo("sparse", IndexType.BMP, [])]) +table_object.create_index("my_index", [IndexInfo("sparse", IndexType.BMP)]) # Find the 100 nearest neighbors using inner product # If a BMP index is successfully built on the column being queried, then the sparse vector search uses this index, # regardless of whether `opt_params` is set. # If you leave `opt_params` blank, the search takes the default settings for `"alpha"` and `"beta"`. 
-table_object.match_sparse('sparse', {"indices": [0, 10, 20], "values": [0.1, 0.2, 0.3]}, 'ip', 100, {"alpha": "1.0", "beta": "1.0"}) +from infinity.common import SparseVector +table_object.match_sparse('sparse', SparseVector([0, 10, 20], [0.1, 0.2, 0.3]), 'ip', 100, {"alpha": "1.0", "beta": "1.0"}) ``` ```python from infinity.index import IndexInfo, IndexType, InitParameter -table_object.create_index("my_index", IndexInfo("sparse", IndexType.BMP, [])) +table_object.create_index("my_index", IndexInfo("sparse", IndexType.BMP)) # Find the 100 nearest neighbors using inner product # If a BMP index is successfully built on the column being queried, then the sparse vector search uses this index, # regardless of whether `opt_params` is set. # You can set the values of `"alpha"` or `"beta"` in `opt_params`, which overrides the default settings. -table_object.match_sparse('sparse', {"indices": [0, 10, 20], "values": [8, 10, 66]}, 'ip', 100, {"alpha": "1.0", "beta": "1.0"}) +from infinity.common import SparseVector +table_object.match_sparse('sparse', SparseVector([0, 10, 20], [8, 10, 66]), 'ip', 100, {"alpha": "1.0", "beta": "1.0"}) ``` --- -## match +## match_text ```python -table_object.match(fields, matching_text, distance_type, options_text) +table_object.match_text(fields, matching_text, topn, extra_options) ``` Creates a full-text search expression on the specified field(s)/column(s) to identify the most relevant rows. @@ -1665,30 +1679,42 @@ To display your query results, you must chain this method with `output(columns)` A non-empty text string to search for. 
You can use various search options within the matching text, including: - Single terms: `"blooms"` -- OR multiple terms: `"Bloom filter"` +- OR multiple terms: `"Bloom OR filter"`, `"Bloom || filter"` or just `"Bloom filter"` - Phrase search: `'"Bloom filter"'` -- AND multiple terms: "space efficient" -- Escaping reserved characters: "space\-efficient" -- Sloppy phrase search: "harmful chemical"~10 -- Field-specific search: title:(quick OR brown) AND body:foobar +- AND multiple terms: `"space AND efficient"`, `"space && efficient"` or `"space + efficient"` +- Escaping reserved characters: `"space\-efficient"` +- Sloppy phrase search: `'"harmful chemical"~10'` +- Field-specific search: `"title:(quick OR brown) AND body:foobar"` -#### options_text: `str`, *Required* +#### topn: `int`, *Required* + +Specifies the number of the most relevant rows to retrieve, e.g., assign `10` to obtain the ten most relevant rows. + +#### extra_options: `dict`, *Optional* -A non-empty string specifying the following search options: +An optional dictionary specifying the following search options: -- **"topn"**: `str`, *Required* - Specifies the number of the most relevant rows to retrieve, e.g., `"topn=10"` to obtain the ten most relevant rows. +- **"default_field"**: `str`, *Optional* + - If `"fields"` is an empty string, this parameter specifies the default field to search on. - **"operator"**: `str`, *Optional* - - If not specified, the search follows Infinity's full-text search syntax, meaning that logical and arithmetic operators and escape characters will function as full-text search operators, such as: - - `&&`, `+`, `||`, `!`, `NOT`, `AND`, `OR` `-`, `(`, `)`, `~`, `^`, `:`, `"`. - - Escape characters like `\`, `\t`, and more. 
+ - If not specified, the search follows Infinity's full-text search syntax, meaning that logical and arithmetic operators, quotation marks and escape characters will function as full-text search operators, such as: + - AND operator: `AND`, `&&`, `+` + - OR operator: `OR`, `||` + - NOT operator: `NOT`, `!`, `-` + - PAREN operator: `(`, `)`, need to appear in pairs, and can be nested. + - COLON operator: `:`: Used to specify field-specific search, e.g., `body:foobar` searches for `foobar` in the `body` field. + - CARAT operator: `^`: Used to boost the importance of a term, e.g., `quick^2 brown` boosts the importance of `quick` by a factor of 2, making it twice as important as `brown`. + - TILDE operator: `~`: Used for sloppy phrase search, e.g., `"harmful chemical"~10` searches for the phrase `"harmful chemical"` within a tolerable distance of 10 words. + - SINGLE_QUOTED_STRING: Used to search for a phrase, e.g., `'Bloom filter'`. + - DOUBLE_QUOTED_STRING: Used to search for a phrase, e.g., `"Bloom filter"`. + - Escape characters: Used to escape reserved characters, e.g., `space\-efficient`. Starting with a backslash `\` will escape the following characters: + `' '`, `'+'`, `'-'`, `'='`, `'&'`, `'|'`, `'!'`, `'('`, `')'`, `'{'`, `'}'`, `'['`, `']'`, `'^'`, `'"'`, `'~'`, `'*'`, `'?'`, `':'`, `'\'`, `'/'` - If specified, Infinity's full-text search syntax will not take effect, and the specified operator will be interpolated into `matching_text`. - - `"operator=OR"`/`"operator=or"`: Interpolates the `OR` operator between words in `matching_text` to create a new search text. - - `"operator=AND"`/`"operator=and"`: Interpolates the `AND` operator between words in `matching_text` to create a new search text. Useful for searching text including code numbers like `"A01-233:BC"`, resulting in `"A01" AND "-233" AND "BC"`. 
- -:::tip NOTE -If both `"topn"` and `"operator"` options are specified, separate them with a semicolon, e.g., `"topn=100;operator=OR"` -::: + Useful for searching text including code numbers like `"A01-233:BC"`. + - `{"operator": "or"}`: Interpolates the `OR` operator between words in `matching_text` to create a new search text. + For example, reinterprets `"A01-233:BC"` as `'"A01" OR "-233" OR "BC"'`. + - `{"operator": "and"}`: Interpolates the `AND` operator between words in `matching_text` to create a new search text. + For example, reinterprets `"A01-233:BC"` as `'"A01" AND "-233" AND "BC"'`. ### Returns @@ -1712,7 +1738,8 @@ questions = [ r'title:(quick OR brown) AND body:foobar', # search `(quick OR brown)` in the `title` field. keep fields empty. ] for question in questions: - table_object.match('body', question, 'topn=2') + table_object.match_text('body', question, 2) + table_object.match_text('', question, 2, {'default_field': 'body'}) ``` --- @@ -1796,20 +1823,22 @@ The following code snippets illustrate the use of fused reranking in a three-way #### Use RRF for reranking ```python {6} +from infinity.common import SparseVector table_object.output(["num", "body", "vec", "sparse", "year", "tensor", "_score"]) - .knn("vec", [3.0, 2.8, 2.7, 3.1], "float", "cosine", 3) - .match_sparse("sparse", {"indices": [0, 20, 80], "values": [1.0, 2.0, 3.0]}, "ip", 3) - .match("body", "blooms", "topn=10") + .match_dense("vec", [3.0, 2.8, 2.7, 3.1], "float", "cosine", 3) + .match_sparse("sparse", SparseVector([0, 20, 80], [1.0, 2.0, 3.0]), "ip", 3) + .match_text("body", "blooms", 10) .filter("year < 2024") .fusion("rrf", 2) .to_pl() ``` ```python {6} +from infinity.common import SparseVector table_object.output(["num", "body", "vec", "sparse", "year", "tensor", "_score"]) - .knn("vec", [3.0, 2.8, 2.7, 3.1], "float", "cosine", 3) - .match_sparse("sparse", {"indices": [0, 20, 80], "values": [1.0, 2.0, 3.0]}, "ip", 3) - .match("body", "blooms", "topn=10") + 
.match_dense("vec", [3.0, 2.8, 2.7, 3.1], "float", "cosine", 3) + .match_sparse("sparse", SparseVector([0, 20, 80], [1.0, 2.0, 3.0]), "ip", 3) + .match_text("body", "blooms", 10) .filter("year < 2024") .fusion("rrf", 2, {"rank_constant": 30}) .to_pl() @@ -1818,10 +1847,11 @@ table_object.output(["num", "body", "vec", "sparse", "year", "tensor", "_score"] #### Use Weighted Sum for reranking ```python {6} +from infinity.common import SparseVector table_object.output(["num", "body", "vec", "sparse", "year", "tensor", "_score"]) - .knn("vec", [3.0, 2.8, 2.7, 3.1], "float", "cosine", 3) - .match_sparse("sparse", {"indices": [0, 20, 80], "values": [1.0, 2.0, 3.0]}, "ip", 3) - .match("body", "blooms", "topn=10") + .match_dense("vec", [3.0, 2.8, 2.7, 3.1], "float", "cosine", 3) + .match_sparse("sparse", SparseVector([0, 20, 80], [1.0, 2.0, 3.0]), "ip", 3) + .match_text("body", "blooms", 10) .filter("year < 2024") .fusion("weighted_sum", 2, {"weights": "1,2,0.5"}) .to_pl() @@ -1830,10 +1860,11 @@ table_object.output(["num", "body", "vec", "sparse", "year", "tensor", "_score"] #### Use tensor reranking ```python {8} +from infinity.common import SparseVector table_object.output(["num", "body", "vec", "sparse", "year", "tensor", "_score"]) - .knn("vec", [3.0, 2.8, 2.7, 3.1], "float", "cosine", 3) - .match_sparse("sparse", {"indices": [0, 20, 80], "values": [1.0, 2.0, 3.0]}, "ip", 3) - .match("body", "blooms", "topn=10") + .match_dense("vec", [3.0, 2.8, 2.7, 3.1], "float", "cosine", 3) + .match_sparse("sparse", SparseVector([0, 20, 80], [1.0, 2.0, 3.0]), "ip", 3) + .match_text("body", "blooms", 10) .filter("year < 2024") .fusion("match_tensor", 2, {"field": "tensor", "data_type": "float", "data": [[0.0, -10.0, 0.0, 0.7], [9.2, 45.6, -55.8, 3.5]]}) .to_pl() @@ -1906,7 +1937,7 @@ A `polas.DataFrame` object. ```python # Format a vector search result into a Polas DataFrame. 
res = table_object.output(["*"]) - .knn("vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 10) + .match_dense("vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 10) .to_pl() ``` diff --git a/example/ColBERT_reranker_example/helper.py b/example/ColBERT_reranker_example/helper.py index 878fbff331..24dc5ac305 100644 --- a/example/ColBERT_reranker_example/helper.py +++ b/example/ColBERT_reranker_example/helper.py @@ -125,8 +125,8 @@ def query_bm25(self, query_str: str, output_columns: list[str], top_n: int): output_columns.append('_row_id') if '_score' not in output_columns: output_columns.append('_score') - query_result = self.colbert_test_table.output(output_columns).match(self.inner_col_txt, query_str, - f'topn={top_n}').to_pl() + query_result = self.colbert_test_table.output(output_columns).match_text(self.inner_col_txt, query_str, + top_n).to_pl() print(query_result) return query_result @@ -161,8 +161,8 @@ def query_fusion(self, query_str: str, output_columns: list[str], final_top_n: i query_tensor = self.ckpt.queryFromText([query_str])[0] if query_tensor.dim() != 2 or query_tensor.size(1) != 128: raise ValueError("Dimension error.") - query_result = self.colbert_test_table.output(output_columns).match(self.inner_col_txt, query_str, - f'topn={first_stage_top_n}').fusion( + query_result = self.colbert_test_table.output(output_columns).match_text(self.inner_col_txt, query_str, + first_stage_top_n).fusion( method='match_tensor', topn=final_top_n, fusion_params={"field": target_col_name, "data": query_tensor.numpy(force=True), "data_type": "float"}).to_pl() diff --git a/example/fulltext_search.py b/example/fulltext_search.py index 1deb180566..20dd32187f 100644 --- a/example/fulltext_search.py +++ b/example/fulltext_search.py @@ -86,7 +86,7 @@ for question in questions: qb_result = ( table_instance.output(["num", "body", "_score"]) - .match("body", question, "topn=10") + .match_text("body", question, 10) .to_pl() ) print(f"question: {question}") diff --git 
a/example/fulltext_search_zh.py b/example/fulltext_search_zh.py index de98dc81de..bbef298bef 100644 --- a/example/fulltext_search_zh.py +++ b/example/fulltext_search_zh.py @@ -111,7 +111,7 @@ r'"Bloom filter"', # phrase: adjacent multiple terms ] for question in questions: - qb_result = table_instance.output(["num", "body", "_score"]).match("body", question, "topn=10").to_pl() + qb_result = table_instance.output(["num", "body", "_score"]).match_text("body", question, 10).to_pl() print(f"question: {question}") print(qb_result) diff --git a/example/hybrid_search.py b/example/hybrid_search.py index a0386499f1..48dd2ef4d3 100644 --- a/example/hybrid_search.py +++ b/example/hybrid_search.py @@ -17,6 +17,7 @@ """ import infinity +from infinity.common import SparseVector try: # open a local directory to store the data @@ -51,7 +52,7 @@ "num": 1, "body": r"unnecessary and harmful", "vec": [1.0, 1.2, 0.8, 0.9], - "sparse": {"indices": [10, 20, 30], "values": [1.1, 2.2, 3.3]}, + "sparse": SparseVector([10, 20, 30], [1.1, 2.2, 3.3]), "year": 2024, "tensor": [[1.0, 0.0, 0.0, 0.0], [1.1, 0.0, 0.0, 0.0]], }, @@ -59,7 +60,7 @@ "num": 2, "body": r"Office for Harmful Blooms", "vec": [4.0, 4.2, 4.3, 4.5], - "sparse": {"indices": [40, 50, 60], "values": [4.4, 5.5, 6.6]}, + "sparse": SparseVector([40, 50, 60], [4.4, 5.5, 6.6]), "year": 2023, "tensor": [[4.0, 0.0, 4.3, 4.5], [4.0, 4.2, 4.4, 5.0]], }, @@ -67,7 +68,7 @@ "num": 3, "body": r"A Bloom filter is a space-efficient probabilistic data structure, conceived by Burton Howard Bloom in 1970, that is used to test whether an element is a member of a set.", "vec": [4.0, 4.2, 4.3, 4.2], - "sparse": {"indices": [70, 80, 90], "values": [7.7, 8.8, 9.9]}, + "sparse": SparseVector([70, 80, 90], [7.7, 8.8, 9.9]), "year": 2019, "tensor": [[0.9, 0.1, 0.0, 0.0], [1.1, 0.0, 0.0, 0.0]], }, @@ -75,7 +76,7 @@ "num": 4, "body": r"The American Football Conference (AFC) harm chemical anarchism add test is one of harm chemical the two conferences of the 
National Football League (NFL). This add test conference and its counterpart, the National Football Conference (NFC), currently contain 16 teams each, making up the 32 teams of the NFL. The current AFC title holder is the New England Patriots.", "vec": [4.0, 4.2, 4.3, 4.5], - "sparse": {"indices": [20, 80, 90], "values": [7.7, 7.8, 97.9]}, + "sparse": SparseVector([20, 80, 90], [7.7, 7.8, 97.9]), "year": 2018, "tensor": [[4.0, 4.2, 4.3, 4.5], [4.0, 4.2, 4.3, 4.4]], }, @@ -83,17 +84,17 @@ ) # TODO: dense vector + sparse vector + full-text + structured data filter + tensor reranker - # result = table_instance.output(["num", "body"]).knn("vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 3).match("body", "blooms","topn=1").fusion(method="rrf").to_pl() + # result = table_instance.output(["num", "body"]).match_dense("vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 3).match_text("body", "blooms", 1).fusion(method="rrf").to_pl() result = ( table_instance.output( ["num", "body", "vec", "sparse", "year", "tensor", "_score"] ) - .knn("vec", [3.0, 2.8, 2.7, 3.1], "float", "cosine", 3) + .match_dense("vec", [3.0, 2.8, 2.7, 3.1], "float", "cosine", 3) .match_sparse( "sparse", {"indices": [0, 20, 80], "values": [1.0, 2.0, 3.0]}, "ip", 3 ) - .match("body", "blooms", "topn=10") + .match_text("body", "blooms", 10) .filter("year < 2024") .fusion( method="match_tensor", topn=2, diff --git a/example/tensor_search.py b/example/tensor_search.py index 7182d83df1..c4557c10a5 100644 --- a/example/tensor_search.py +++ b/example/tensor_search.py @@ -61,10 +61,9 @@ }, ] ) - result = table_instance.output(["num", "vec", "_score"]).match_tensor(column_name="vec", topn=2, - query_data=[[0.9, 0.0, 0.0, 0.0], - [1.1, 0.0, 0.0, 0.0]], - query_data_type='float').to_pl() + result = table_instance.output(["num", "vec", "_score"]).match_tensor("vec", + [[0.9, 0.0, 0.0, 0.0], [1.1, 0.0, 0.0, 0.0]], + 'float', 2).to_pl() print(result) infinity_instance.disconnect() diff --git a/example/vector_search.py
b/example/vector_search.py index cb1ec7e3a2..fa2473216b 100644 --- a/example/vector_search.py +++ b/example/vector_search.py @@ -59,8 +59,8 @@ ] ) - result = table_instance.output(["num", "vec", "_similarity"]).knn("vec", [3.0, 2.8, 2.7, 3.1], "float", "cosine", - 3).to_pl() + result = table_instance.output(["num", "vec", "_similarity"]).match_dense("vec", [3.0, 2.8, 2.7, 3.1], "float", + "cosine", 3).to_pl() print(result) infinity_instance.disconnect() diff --git a/python/README.md b/python/README.md index 811bd45833..a95d872f53 100644 --- a/python/README.md +++ b/python/README.md @@ -24,7 +24,7 @@ table.insert([{"num": 1, "body": "undesirable, unnecessary, and harmful", "vec": table.insert([{"num": 2, "body": "publisher=US National Office for Harmful Algal Blooms", "vec": [4.0] * 5}]) table.insert([{"num": 3, "body": "in the case of plants, growth and chemical", "vec": [7.0] * 5}]) -res = table.output(["*"]).knn("vec", [3.0] * 5, "float", "ip", 2).to_pl() +res = table.output(["*"]).match_dense("vec", [3.0] * 5, "float", "ip", 2).to_pl() print(res) ``` diff --git a/python/benchmark/clients/infinity_client.py b/python/benchmark/clients/infinity_client.py index 86ac74cf21..7d9ffea126 100644 --- a/python/benchmark/clients/infinity_client.py +++ b/python/benchmark/clients/infinity_client.py @@ -18,6 +18,7 @@ import infinity import infinity.index as index from infinity import NetworkAddress +from infinity.common import SparseVector from infinity.remote_thrift.query_builder import InfinityThriftQueryBuilder from .base_client import BaseClient @@ -166,7 +167,7 @@ def do_single_query(self, query_id, client_id) -> list[Any]: if self.data_mode == "vector": res, _ = ( table_obj.output(["_row_id"]) - .knn( + .match_dense( self.data["vector_name"], query, "float", @@ -179,10 +180,11 @@ def do_single_query(self, query_id, client_id) -> list[Any]: elif self.data_mode == "fulltext": res, _ = ( table_obj.output(["_row_id", "_score"]) - .match( + .match_text( "", query, - 
f"topn={self.data['topK']};default_field=body", + self.data['topK'], + {"default_field": "body"} ) .to_result() ) @@ -193,7 +195,7 @@ def do_single_query(self, query_id, client_id) -> list[Any]: query_builder.output(["_row_id"]) query_builder.match_sparse( list(self.data["schema"].keys())[0],#vector column name:col1 - {"indices": indices, "values": values}, + SparseVector(**{"indices": indices, "values": values}), self.data["metric_type"],#ip self.data["topK"], {"alpha": str(self.data["alpha"]), "beta": str(self.data["beta"])}, diff --git a/python/benchmark/legacy_benchmark/remote_benchmark_knn.py b/python/benchmark/legacy_benchmark/remote_benchmark_knn.py index 1efb868378..b021b380a3 100644 --- a/python/benchmark/legacy_benchmark/remote_benchmark_knn.py +++ b/python/benchmark/legacy_benchmark/remote_benchmark_knn.py @@ -119,10 +119,10 @@ def work(queries, topk, metric_type, column_name, data_type,ef: int, remote: boo table = infinity_obj.get_database("default_db").get_table(table_name) for query in queries: # print(len(query)) - # table.knn(column_name, query_vec, data_type, metric_type, topk).output(["_row_id"]).to_result() + # table.match_dense(column_name, query_vec, data_type, metric_type, topk).output(["_row_id"]).to_result() query_builder = InfinityThriftQueryBuilder(table) query_builder.output(["_row_id"]) - query_builder.knn(column_name, query, data_type, metric_type, topk, {"ef": str(ef)}) + query_builder.match_dense(column_name, query, data_type, metric_type, topk, {"ef": str(ef)}) query_builder.to_result() infinity_obj.disconnect() @@ -188,7 +188,7 @@ def one_thread(rounds, query_path, ground_truth_path, ef: int, remote: bool, tab table = infinity_obj.get_database("default_db").get_table(table_name) query_builder = InfinityThriftQueryBuilder(table) query_builder.output(["_row_id"]) - query_builder.knn('col1', queries[0], 'float', 'l2', 100, {'ef': str(ef)}) + query_builder.match_dense('col1', queries[0], 'float', 'l2', 100, {'ef': str(ef)}) res, _ = 
query_builder.to_result() dur_sum = 0 @@ -203,7 +203,7 @@ def one_thread(rounds, query_path, ground_truth_path, ef: int, remote: bool, tab query_builder = InfinityThriftQueryBuilder(table) query_builder.output(["_row_id"]) - query_builder.knn('col1', query_vec, 'float', 'l2', 100, {'index_name': 'hnsw_index', 'ef': str(ef)}) + query_builder.match_dense('col1', query_vec, 'float', 'l2', 100, {'index_name': 'hnsw_index', 'ef': str(ef)}) res, _ = query_builder.to_result() end = time.time() diff --git a/python/benchmark/legacy_benchmark/remote_benchmark_milvus_knn.py b/python/benchmark/legacy_benchmark/remote_benchmark_milvus_knn.py index 6e3629c1bd..f7d2fcf2b6 100644 --- a/python/benchmark/legacy_benchmark/remote_benchmark_milvus_knn.py +++ b/python/benchmark/legacy_benchmark/remote_benchmark_milvus_knn.py @@ -185,7 +185,7 @@ def one_thread(rounds, query_path, ground_truth_path, ef: int, limit: int, remot for idx, query_vec in enumerate(queries): start = time.time() - #query_builder.knn('col1', query_vec, 'float', 'l2', 100, {'ef': str(ef)}) + #query_builder.match_dense('col1', query_vec, 'float', 'l2', 100, {'ef': str(ef)}) res_list = client.search( collection_name="sift_benchmark_collection", data=[query_vec], diff --git a/python/benchmark/legacy_benchmark/remote_benchmark_sparse.py b/python/benchmark/legacy_benchmark/remote_benchmark_sparse.py index 395cd35c56..ea205f0186 100644 --- a/python/benchmark/legacy_benchmark/remote_benchmark_sparse.py +++ b/python/benchmark/legacy_benchmark/remote_benchmark_sparse.py @@ -20,7 +20,7 @@ import infinity from infinity.remote_thrift.query_builder import InfinityThriftQueryBuilder -from infinity.common import LOCAL_HOST, LOCAL_INFINITY_PATH +from infinity.common import LOCAL_HOST, LOCAL_INFINITY_PATH, SparseVector class SparseMatrix: @@ -95,7 +95,7 @@ def work(remote, queries, topk, alpha, beta): query_builder.output(["_row_id"]) query_builder.match_sparse( "col1", - {"indices": indices, "values": value}, + 
SparseVector(**{"indices": indices, "values": value}), "ip", topk, {"alpha": str(alpha), "beta": str(beta)}, @@ -164,7 +164,7 @@ def one_thread(remote, rounds, query_path, gt_path, alpha, beta): query_builder.output(["_row_id"]) query_builder.match_sparse( "col1", - {"indices": indices, "values": values}, + SparseVector(**{"indices": indices, "values": values}), "ip", topk, {"alpha": str(alpha), "beta": str(beta)}, diff --git a/python/benchmark/legacy_benchmark/remote_benchmark_sparse_import.py b/python/benchmark/legacy_benchmark/remote_benchmark_sparse_import.py index 9ec44903e7..7206a9425d 100644 --- a/python/benchmark/legacy_benchmark/remote_benchmark_sparse_import.py +++ b/python/benchmark/legacy_benchmark/remote_benchmark_sparse_import.py @@ -73,12 +73,7 @@ def import_data( index.IndexInfo( "col1", index.IndexType.BMP, - [ - index.InitParameter("block_size", str(block_size)), - index.InitParameter( - "compress_type", "compress" if compress else "raww" - ), - ], + {"block_size": str(block_size), "compress_type": "compress" if compress else "raww"} ) ) assert res.error_code == ErrorCode.OK diff --git a/python/benchmark/mldr_benchmark/insert_data.py b/python/benchmark/mldr_benchmark/insert_data.py index df76aaeb62..c834d11dff 100644 --- a/python/benchmark/mldr_benchmark/insert_data.py +++ b/python/benchmark/mldr_benchmark/insert_data.py @@ -16,7 +16,7 @@ from tqdm import tqdm from mldr_common_tools import load_corpus, fvecs_read_yield, read_mldr_sparse_embedding_yield, get_all_part_begin_ends import infinity -from infinity.common import ConflictType, LOCAL_HOST +from infinity.common import ConflictType, LOCAL_HOST, SparseVector import infinity.index as index from infinity.errors import ErrorCode @@ -77,7 +77,7 @@ def main(self): sparse_base_name = f"sparse-{sparse_pos_part_begin}-{sparse_pos_part_end}.data" sparse_data = read_mldr_sparse_embedding_yield(os.path.join(sparse_embedding_dir, sparse_base_name)) insert_dict = {"docid_col": docid_list[row_pos], 
"fulltext_col": corpus_text_list[row_pos], - "dense_col": next(dense_data), "sparse_col": next(sparse_data)} + "dense_col": next(dense_data), "sparse_col": SparseVector(**(next(sparse_data)))} buffer.append(insert_dict) self.infinity_table.insert(buffer) buffer.clear() diff --git a/python/benchmark/mldr_benchmark/insert_data_50000.py b/python/benchmark/mldr_benchmark/insert_data_50000.py index 04be9eac69..5a46c802c6 100644 --- a/python/benchmark/mldr_benchmark/insert_data_50000.py +++ b/python/benchmark/mldr_benchmark/insert_data_50000.py @@ -17,7 +17,7 @@ from mldr_common_tools import load_corpus, fvecs_read_yield, read_mldr_sparse_embedding_yield, get_all_part_begin_ends import infinity import infinity.index as index -from infinity.common import ConflictType, LOCAL_HOST +from infinity.common import ConflictType, LOCAL_HOST, SparseVector from infinity.errors import ErrorCode @@ -83,7 +83,7 @@ def main(self): if int(docid_str.split('-')[-1]) >= 189796: continue insert_dict = {"docid_col": docid_str, "fulltext_col": corpus_text_list[row_pos], - "dense_col": insert_dense_data, "sparse_col": insert_sparse_data} + "dense_col": insert_dense_data, "sparse_col": SparseVector(**insert_sparse_data)} buffer.append(insert_dict) if len(buffer) > 0: self.infinity_table.insert(buffer) diff --git a/python/benchmark/mldr_benchmark/insert_data_with_colbert.py b/python/benchmark/mldr_benchmark/insert_data_with_colbert.py index e489a8fbcf..e19e905794 100644 --- a/python/benchmark/mldr_benchmark/insert_data_with_colbert.py +++ b/python/benchmark/mldr_benchmark/insert_data_with_colbert.py @@ -18,7 +18,7 @@ from mldr_common_tools import load_corpus, fvecs_read_yield, read_mldr_sparse_embedding_yield, read_colbert_data_yield from mldr_common_tools import get_all_part_begin_ends, get_bit_array import infinity.index as index -from infinity.common import ConflictType, LOCAL_HOST +from infinity.common import ConflictType, LOCAL_HOST, SparseVector from infinity.errors import ErrorCode @@ 
-90,7 +90,7 @@ def main(self): insert_sparse_data = next(sparse_data) colbert_list = next(colbert_data) insert_dict = {"docid_col": docid_list[row_pos], "fulltext_col": corpus_text_list[row_pos], - "dense_col": insert_dense_data, "sparse_col": insert_sparse_data, + "dense_col": insert_dense_data, "sparse_col": SparseVector(**insert_sparse_data), "colbert_col": colbert_list, "colbert_bit_col": get_bit_array(colbert_list)} self.infinity_table.insert(insert_dict) print("Finish inserting data.") @@ -131,11 +131,9 @@ def main(self): self.infinity_table.optimize("bmp_index", {"topk": "1000", "bp_reorder": ""}) print("Finish creating BMP index.") # print("Start creating EMVB index...") - # res = self.infinity_table.create_index("emvb_index", - # index.IndexInfo("colbert_col", - # index.IndexType.EMVB, - # [index.InitParameter("pq_subspace_num", "32"), - # index.InitParameter("pq_subspace_bits", "8")]) + # res = self.infinity_table.create_index("emvb_index", index.IndexInfo("colbert_col", index.IndexType.EMVB, + # {"pq_subspace_num": "32", + # "pq_subspace_bits": "8"}) # , ConflictType.Error) # assert res.error_code == ErrorCode.OK # print("Finish creating EMVB index.") diff --git a/python/benchmark/mldr_benchmark/insert_data_with_colbert_50000.py b/python/benchmark/mldr_benchmark/insert_data_with_colbert_50000.py index 7612866dc1..50ef849155 100644 --- a/python/benchmark/mldr_benchmark/insert_data_with_colbert_50000.py +++ b/python/benchmark/mldr_benchmark/insert_data_with_colbert_50000.py @@ -18,7 +18,7 @@ from mldr_common_tools import load_corpus, fvecs_read_yield, read_mldr_sparse_embedding_yield, read_colbert_data_yield from mldr_common_tools import get_all_part_begin_ends, get_bit_array import infinity.index as index -from infinity.common import ConflictType, LOCAL_HOST +from infinity.common import ConflictType, LOCAL_HOST, SparseVector from infinity.errors import ErrorCode @@ -93,7 +93,7 @@ def main(self): if int(docid_str.split('-')[-1]) >= 189796: continue 
insert_dict = {"docid_col": docid_str, "fulltext_col": corpus_text_list[row_pos], - "dense_col": insert_dense_data, "sparse_col": insert_sparse_data, + "dense_col": insert_dense_data, "sparse_col": SparseVector(**insert_sparse_data), "colbert_col": colbert_list, "colbert_bit_col": get_bit_array(colbert_list)} self.infinity_table.insert(insert_dict) print("Finish inserting data.") @@ -132,11 +132,9 @@ def main(self): self.infinity_table.optimize("bmp_index", {"topk": "1000", "bp_reorder": ""}) print("Finish creating BMP index.") # print("Start creating EMVB index...") - # res = self.infinity_table.create_index("emvb_index", - # index.IndexInfo("colbert_col", - # index.IndexType.EMVB, - # [index.InitParameter("pq_subspace_num", "32"), - # index.InitParameter("pq_subspace_bits", "8")]), + # res = self.infinity_table.create_index("emvb_index", index.IndexInfo("colbert_col", index.IndexType.EMVB, + # {"pq_subspace_num": "32", + # "pq_subspace_bits": "8"}), # ConflictType.Error) # assert res.error_code == ErrorCode.OK # print("Finish creating EMVB index.") diff --git a/python/benchmark/mldr_benchmark/mldr_common_tools.py b/python/benchmark/mldr_benchmark/mldr_common_tools.py index f2efa4adf2..1d194321d5 100644 --- a/python/benchmark/mldr_benchmark/mldr_common_tools.py +++ b/python/benchmark/mldr_benchmark/mldr_common_tools.py @@ -5,6 +5,7 @@ import datasets from dataclasses import dataclass, field from pyserini.output_writer import get_output_writer, OutputFormat +from infinity.common import SparseVector @dataclass @@ -265,15 +266,15 @@ def colbert_query_yield(queries: list[str], embedding_file: str): def apply_bm25(table, query_str: str, max_hits: int): - return table.match('fulltext_col', query_str, f'topn={max_hits}') + return table.match_text('fulltext_col', query_str, max_hits) def apply_dense(table, query_embedding, max_hits: int): - return table.knn("dense_col", query_embedding, "float", "ip", max_hits, {"ef": str(max_hits)}) + return 
table.match_dense("dense_col", query_embedding, "float", "ip", max_hits, {"ef": str(max_hits)}) def apply_sparse(table, query_embedding: dict, max_hits: int): - return table.match_sparse("sparse_col", query_embedding, "ip", max_hits, {"alpha": "0.9", "beta": "0.5"}) + return table.match_sparse("sparse_col", SparseVector(**query_embedding), "ip", max_hits, {"alpha": "0.9", "beta": "0.5"}) def apply_colbert(table, query_tensor: list[list], max_hits: int): diff --git a/python/infinity/common.py b/python/infinity/common.py index ccf693c859..163b940b9f 100644 --- a/python/infinity/common.py +++ b/python/infinity/common.py @@ -13,7 +13,7 @@ # limitations under the License. from pathlib import Path from typing import Union - +from dataclasses import dataclass import numpy as np @@ -26,16 +26,41 @@ def __str__(self): return f'IP: {self.ip}, Port: {self.port}' +@dataclass +class SparseVector: + indices: list[int] + values: Union[list[float], list[int], None] = None + + def __post_init__(self): + assert (self.values is None) or (len(self.indices) == len(self.values)) + + def to_dict(self): + d = {"indices": self.indices} + if self.values is not None: + d["values"] = self.values + return d + + @staticmethod + def from_dict(d): + return SparseVector(d["indices"], d.get("values")) + + def __str__(self): + return f"SparseVector(indices={self.indices}{'' if self.values is None else f', values={self.values}'})" + + def __repr__(self): + return str(self) + + URI = Union[NetworkAddress, Path] VEC = Union[list, np.ndarray] -SPARSE = dict[str, Union[list[int], list[float]]] -INSERT_DATA = dict[str, Union[str, int, float, list[Union[int, float]]], SPARSE] +INSERT_DATA = dict[str, Union[str, int, float, list[Union[int, float]]], SparseVector] LOCAL_HOST = NetworkAddress("127.0.0.1", 23817) # test embedded_infinity LOCAL_INFINITY_PATH = "/var/infinity" + class ConflictType(object): Ignore = 0 Error = 1 @@ -47,5 +72,6 @@ def __init__(self, error_code=0, error_message=None): 
self.error_code = error_code self.error_message = error_message + DEFAULT_MATCH_VECTOR_TOPN = 10 DEFAULT_MATCH_SPARSE_TOPN = 10 diff --git a/python/infinity/local_infinity/query_builder.py b/python/infinity/local_infinity/query_builder.py index a87ae29b4b..9ab466892b 100644 --- a/python/infinity/local_infinity/query_builder.py +++ b/python/infinity/local_infinity/query_builder.py @@ -10,7 +10,7 @@ from pyarrow import Table from sqlglot import condition, maybe_parse -from infinity.common import VEC, SPARSE, InfinityException +from infinity.common import VEC, SparseVector, InfinityException from infinity.embedded_infinity_ext import * from infinity.local_infinity.types import logic_type_to_dtype, make_match_tensor_expr from infinity.local_infinity.utils import traverse_conditions, parse_expr @@ -63,7 +63,7 @@ def reset(self): self._limit = None self._offset = None - def knn( + def match_dense( self, vector_column_name: str, embedding_data: VEC, @@ -179,7 +179,7 @@ def knn( def match_sparse( self, vector_column_name: str, - sparse_data: SPARSE, + sparse_data: SparseVector, metric_type: str, topn: int, opt_params: {} = None, @@ -208,18 +208,22 @@ def match_sparse( match_sparse_expr.column_expr = column_expr sparse_expr = WrapConstantExpr() - if isinstance(sparse_data["values"][0], int): - sparse_expr.literal_type = LiteralType.kLongSparseArray - sparse_expr.i64_array_idx = sparse_data["indices"] - sparse_expr.i64_array_value = sparse_data["values"] - elif isinstance(sparse_data["values"][0], float): - sparse_expr.literal_type = LiteralType.kDoubleSparseArray - sparse_expr.i64_array_idx = sparse_data["indices"] - sparse_expr.f64_array_value = sparse_data["values"] - else: - raise InfinityException( - ErrorCode.INVALID_CONSTANT_TYPE, f"Invalid sparse data {sparse_data['values'][0]} type" - ) + match sparse_data: + case SparseVector([int(), *_] as indices, [int(), *_] as values): + sparse_expr.literal_type = LiteralType.kLongSparseArray + sparse_expr.i64_array_idx = 
indices + sparse_expr.i64_array_value = values + case SparseVector([int(), *_] as indices, [float(), *_] as values): + sparse_expr.literal_type = LiteralType.kDoubleSparseArray + sparse_expr.i64_array_idx = indices + sparse_expr.f64_array_value = values + case SparseVector([int(), *_], None): + raise InfinityException(ErrorCode.INVALID_CONSTANT_TYPE, + f"No values! Sparse data does not support bool value type now") + case _: + raise InfinityException(ErrorCode.INVALID_CONSTANT_TYPE, + f"Invalid sparse data type {type(sparse_data)}") + match_sparse_expr.sparse_expr = sparse_expr match_sparse_expr.metric_type = metric_type @@ -232,8 +236,8 @@ def match_sparse( self._search.match_exprs += [generic_match_expr] return self - def match( - self, fields: str, matching_text: str, options_text: str = "" + def match_text( + self, fields: str, matching_text: str, topn: int, extra_options: Optional[dict] ) -> InfinityLocalQueryBuilder: if self._search is None: self._search = WrapSearchExpr() @@ -241,6 +245,10 @@ def match( match_expr = WrapMatchExpr() match_expr.fields = fields match_expr.matching_text = matching_text + options_text = f"topn={topn}" + if extra_options is not None: + for k, v in extra_options.items(): + options_text += f";{k}={v}" match_expr.options_text = options_text generic_match_expr = WrapParsedExpr() diff --git a/python/infinity/local_infinity/table.py b/python/infinity/local_infinity/table.py index 67ddba2885..ebfae7d1d6 100644 --- a/python/infinity/local_infinity/table.py +++ b/python/infinity/local_infinity/table.py @@ -21,7 +21,7 @@ from infinity.embedded_infinity_ext import WrapIndexInfo, ImportOptions, CopyFileType, WrapParsedExpr, \ ParsedExprType, WrapUpdateExpr, ExportOptions, WrapOptimizeOptions from infinity.common import ConflictType, DEFAULT_MATCH_VECTOR_TOPN -from infinity.common import INSERT_DATA, VEC, SPARSE, InfinityException +from infinity.common import INSERT_DATA, VEC, SparseVector, InfinityException from infinity.errors import 
ErrorCode from infinity.index import IndexInfo from infinity.local_infinity.query_builder import Query, InfinityLocalQueryBuilder, ExplainQuery @@ -32,6 +32,7 @@ from infinity.table import Table, ExplainType import infinity.index as index from infinity.index import InitParameter +from infinity.utils import deprecated_api from sqlglot import condition @@ -317,21 +318,30 @@ def update(self, cond: Optional[str], else: raise InfinityException(res.error_code, res.error_msg) - def knn(self, vector_column_name: str, embedding_data: VEC, embedding_data_type: str, distance_type: str, - topn: int = DEFAULT_MATCH_VECTOR_TOPN, knn_params: {} = None): - self.query_builder.knn( + def match_dense(self, vector_column_name: str, embedding_data: VEC, embedding_data_type: str, distance_type: str, + topn: int = DEFAULT_MATCH_VECTOR_TOPN, knn_params: {} = None): + self.query_builder.match_dense( vector_column_name, embedding_data, embedding_data_type, distance_type, topn, knn_params) return self - def match_sparse(self, vector_column_name: str, sparse_data: SPARSE, distance_type: str, topn: int, opt_params: {} = None): + def knn(self, *args, **kwargs): + deprecated_api("knn is deprecated, please use match_dense instead") + return self.match_dense(*args, **kwargs) + + def match_sparse(self, vector_column_name: str, sparse_data: SparseVector, distance_type: str, topn: int, + opt_params: {} = None): self.query_builder.match_sparse(vector_column_name, sparse_data, distance_type, topn, opt_params) return self @params_type_check - def match(self, fields: str, matching_text: str, options_text: str = ''): - self.query_builder.match(fields, matching_text, options_text) + def match_text(self, fields: str, matching_text: str, topn: int, extra_options: Optional[dict] = None): + self.query_builder.match_text(fields, matching_text, topn, extra_options) return self + def match(self, *args, **kwargs): + deprecated_api("match is deprecated, please use match_text instead") + return 
self.match_text(*args, **kwargs) + @params_type_check def match_tensor(self, column_name: str, query_data: VEC, query_data_type: str, topn: int, extra_option: Optional[dict] = None): diff --git a/python/infinity/local_infinity/types.py b/python/infinity/local_infinity/types.py index 1c4bbba68b..b29900e944 100644 --- a/python/infinity/local_infinity/types.py +++ b/python/infinity/local_infinity/types.py @@ -18,7 +18,7 @@ import polars as pl import numpy as np from numpy import dtype -from infinity.common import VEC, SPARSE, InfinityException, DEFAULT_MATCH_VECTOR_TOPN +from infinity.common import VEC, SparseVector, InfinityException, DEFAULT_MATCH_VECTOR_TOPN from infinity.embedded_infinity_ext import * from infinity.errors import ErrorCode @@ -304,7 +304,7 @@ def parse_sparse_bytes(column_data_type, column_vector): pass case _: raise NotImplementedError(f"Unsupported type {element_type}") - res.append({"indices": indices, "values": values}) + res.append(SparseVector(list(indices), values).to_dict()) return res diff --git a/python/infinity/local_infinity/utils.py b/python/infinity/local_infinity/utils.py index 8633a77f57..a66d1e2d6e 100644 --- a/python/infinity/local_infinity/utils.py +++ b/python/infinity/local_infinity/utils.py @@ -20,7 +20,7 @@ import sqlglot.expressions as exp import numpy as np from infinity.errors import ErrorCode -from infinity.common import InfinityException +from infinity.common import InfinityException, SparseVector from infinity.local_infinity.types import build_result, logic_type_to_dtype from infinity.utils import binary_exp_to_paser_exp from infinity.embedded_infinity_ext import WrapParsedExpr, WrapFunctionExpr, WrapColumnExpr, WrapSearchExpr, WrapConstantExpr, ParsedExprType, LiteralType @@ -195,14 +195,14 @@ def get_local_constant_expr_from_python_value(value) -> WrapConstantExpr: case [[[float(), *_], *_], *_]: constant_expression.literal_type = LiteralType.kSubArrayArray constant_expression.f64_tensor_array_value = value - case 
{"indices": [int(), *_], "values": [int(), *_]}: + case SparseVector([int(), *_] as indices, [int(), *_] as values): constant_expression.literal_type = LiteralType.kLongSparseArray - constant_expression.i64_array_idx = value["indices"] - constant_expression.i64_array_value = value["values"] - case {"indices": [int(), *_], "values": [float(), *_]}: + constant_expression.i64_array_idx = indices + constant_expression.i64_array_value = values + case SparseVector([int(), *_] as indices, [float(), *_] as values): constant_expression.literal_type = LiteralType.kDoubleSparseArray - constant_expression.i64_array_idx = value["indices"] - constant_expression.f64_array_value = value["values"] + constant_expression.i64_array_idx = indices + constant_expression.f64_array_value = values case _: raise InfinityException(ErrorCode.INVALID_EXPRESSION, f"Invalid constant type: {type(value)}") return constant_expression diff --git a/python/infinity/remote_thrift/query_builder.py b/python/infinity/remote_thrift/query_builder.py index 1f1680ce8a..db341bb72c 100644 --- a/python/infinity/remote_thrift/query_builder.py +++ b/python/infinity/remote_thrift/query_builder.py @@ -24,7 +24,7 @@ from pyarrow import Table from sqlglot import condition, maybe_parse -from infinity.common import VEC, SPARSE, InfinityException +from infinity.common import VEC, SparseVector, InfinityException from infinity.errors import ErrorCode from infinity.remote_thrift.infinity_thrift_rpc.ttypes import * from infinity.remote_thrift.types import ( @@ -83,7 +83,7 @@ def reset(self): self._limit = None self._offset = None - def knn( + def match_dense( self, vector_column_name: str, embedding_data: VEC, @@ -191,10 +191,10 @@ def knn( def match_sparse( self, vector_column_name: str, - sparse_data: SPARSE, + sparse_data: SparseVector, metric_type: str, topn: int, - opt_params: {} = None, + opt_params: Optional[dict] = None, ) -> InfinityThriftQueryBuilder: if self._search is None: self._search = SearchExpr() @@ -207,8 
+207,8 @@ def match_sparse( self._search.match_exprs.append(generic_match_expr) return self - def match( - self, fields: str, matching_text: str, options_text: str = "" + def match_text( + self, fields: str, matching_text: str, topn: int, extra_options: Optional[dict] ) -> InfinityThriftQueryBuilder: if self._search is None: self._search = SearchExpr() @@ -216,6 +216,10 @@ def match( match_expr = MatchExpr() match_expr.fields = fields match_expr.matching_text = matching_text + options_text = f"topn={topn}" + if extra_options is not None: + for k, v in extra_options.items(): + options_text += f";{k}={v}" match_expr.options_text = options_text generic_match_expr = GenericMatchExpr(match_text_expr=match_expr) self._search.match_exprs.append(generic_match_expr) diff --git a/python/infinity/remote_thrift/table.py b/python/infinity/remote_thrift/table.py index 4ba5cfef0b..689b9c2dac 100644 --- a/python/infinity/remote_thrift/table.py +++ b/python/infinity/remote_thrift/table.py @@ -21,7 +21,7 @@ from sqlglot import condition import infinity.remote_thrift.infinity_thrift_rpc.ttypes as ttypes -from infinity.common import INSERT_DATA, VEC, InfinityException +from infinity.common import INSERT_DATA, VEC, InfinityException, SparseVector from infinity.errors import ErrorCode from infinity.index import IndexInfo from infinity.remote_thrift.query_builder import Query, InfinityThriftQueryBuilder, ExplainQuery @@ -30,6 +30,7 @@ from infinity.remote_thrift.utils import get_remote_constant_expr_from_python_value from infinity.table import Table, ExplainType from infinity.common import ConflictType, DEFAULT_MATCH_VECTOR_TOPN +from infinity.utils import deprecated_api class RemoteTable(Table, ABC): @@ -330,24 +331,33 @@ def update(self, cond: Optional[str], else: raise InfinityException(res.error_code, res.error_msg) - def knn(self, vector_column_name: str, embedding_data: VEC, embedding_data_type: str, distance_type: str, - topn: int = DEFAULT_MATCH_VECTOR_TOPN, knn_params: {} = 
None): - self.query_builder.knn( + def match_dense(self, vector_column_name: str, embedding_data: VEC, embedding_data_type: str, distance_type: str, + topn: int = DEFAULT_MATCH_VECTOR_TOPN, knn_params: {} = None): + self.query_builder.match_dense( vector_column_name, embedding_data, embedding_data_type, distance_type, topn, knn_params) return self + def knn(self, *args, **kwargs): + deprecated_api("knn is deprecated, please use match_dense instead") + return self.match_dense(*args, **kwargs) + @params_type_check - def match(self, fields: str, matching_text: str, options_text: str = ''): - self.query_builder.match(fields, matching_text, options_text) + def match_text(self, fields: str, matching_text: str, topn: int, extra_options: Optional[dict] = None): + self.query_builder.match_text(fields, matching_text, topn, extra_options) return self + def match(self, *args, **kwargs): + deprecated_api("match is deprecated, please use match_text instead") + return self.match_text(*args, **kwargs) + @params_type_check def match_tensor(self, column_name: str, query_data: VEC, query_data_type: str, topn: int, extra_option: Optional[dict] = None): self.query_builder.match_tensor(column_name, query_data, query_data_type, topn, extra_option) return self - def match_sparse(self, vector_column_name: str, sparse_data, distance_type: str, topn: int, opt_params: {} = None): + def match_sparse(self, vector_column_name: str, sparse_data: SparseVector, distance_type: str, topn: int, + opt_params: Optional[dict] = None): self.query_builder.match_sparse(vector_column_name, sparse_data, distance_type, topn, opt_params) return self diff --git a/python/infinity/remote_thrift/types.py b/python/infinity/remote_thrift/types.py index 5aadb79715..b22cff9b2e 100644 --- a/python/infinity/remote_thrift/types.py +++ b/python/infinity/remote_thrift/types.py @@ -14,10 +14,10 @@ import struct import numpy as np -from infinity.common import VEC, SPARSE, InfinityException +from infinity.common import VEC, 
SparseVector, InfinityException from infinity.remote_thrift.infinity_thrift_rpc.ttypes import * from collections import defaultdict -from typing import Any, Tuple, Dict, List +from typing import Any, Tuple, Dict, List, Optional import polars as pl from numpy import dtype @@ -399,7 +399,7 @@ def parse_sparse_bytes(column_data_type: ttypes.DataType, column_vector): case _: raise NotImplementedError(f"Unsupported type {element_type}") # print("indices: {}, values: {}".format(indices, values)) - res.append({"indices": indices, "values": values}) + res.append(SparseVector(list(indices), values).to_dict()) return res @@ -476,20 +476,27 @@ def make_match_tensor_expr(vector_column_name: str, embedding_data: VEC, embeddi match_tensor_expr.embedding_data = data return match_tensor_expr -def make_match_sparse_expr(vector_column_name: str, sparse_data: SPARSE, metric_type: str, topn: int, opt_params: {} = None): + +def make_match_sparse_expr(vector_column_name: str, sparse_data: SparseVector, metric_type: str, topn: int, + opt_params: Optional[dict] = None): column_expr = ColumnExpr(column_name=[vector_column_name], star=False) query_sparse_expr = ConstantExpr() - if isinstance(sparse_data["values"][0], int): - query_sparse_expr.literal_type = LiteralType.SparseIntegerArray - query_sparse_expr.i64_array_idx = sparse_data["indices"] - query_sparse_expr.i64_array_value = sparse_data["values"] - elif isinstance(sparse_data["values"][0], float): - query_sparse_expr.literal_type = LiteralType.SparseDoubleArray - query_sparse_expr.i64_array_idx = sparse_data["indices"] - query_sparse_expr.f64_array_value = sparse_data["values"] - else: - raise InfinityException(ErrorCode.INVALID_CONSTANT_TYPE, f"Invalid sparse data {sparse_data['values'][0]} type") + + match sparse_data: + case SparseVector([int(), *_] as indices, [int(), *_] as values): + query_sparse_expr.literal_type = LiteralType.SparseIntegerArray + query_sparse_expr.i64_array_idx = indices + query_sparse_expr.i64_array_value = 
values + case SparseVector([int(), *_] as indices, [float(), *_] as values): + query_sparse_expr.literal_type = LiteralType.SparseDoubleArray + query_sparse_expr.i64_array_idx = indices + query_sparse_expr.f64_array_value = values + case SparseVector([int(), *_], None): + raise InfinityException(ErrorCode.INVALID_CONSTANT_TYPE, + f"No values! Sparse data does not support bool value type now") + case _: + raise InfinityException(ErrorCode.INVALID_CONSTANT_TYPE, f"Invalid sparse data type {type(sparse_data)}") match_sparse_options = [] if opt_params is not None: diff --git a/python/infinity/remote_thrift/utils.py b/python/infinity/remote_thrift/utils.py index bc14a8785e..2f74ac16c8 100644 --- a/python/infinity/remote_thrift/utils.py +++ b/python/infinity/remote_thrift/utils.py @@ -22,7 +22,7 @@ import infinity.remote_thrift.infinity_thrift_rpc.ttypes as ttypes from infinity.remote_thrift.types import build_result, logic_type_to_dtype from infinity.utils import binary_exp_to_paser_exp -from infinity.common import InfinityException +from infinity.common import InfinityException, SparseVector from infinity.errors import ErrorCode @@ -221,16 +221,16 @@ def get_remote_constant_expr_from_python_value(value) -> ttypes.ConstantExpr: case [[[float(), *_], *_], *_]: constant_expression = ttypes.ConstantExpr(literal_type=ttypes.LiteralType.DoubleTensorArray, f64_tensor_array_value=value) - case {"indices": [int(), *_], "values": [int(), *_]}: + case SparseVector([int(), *_] as indices, [int(), *_] as values): constant_expression = ttypes.ConstantExpr( literal_type=ttypes.LiteralType.SparseIntegerArray, - i64_array_value=value["values"], - i64_array_idx=value["indices"]) - case {"indices": [int(), *_], "values": [float(), *_]}: + i64_array_idx=indices, + i64_array_value=values) + case SparseVector([int(), *_] as indices, [float(), *_] as values): constant_expression = ttypes.ConstantExpr( literal_type=ttypes.LiteralType.SparseDoubleArray, - f64_array_value=value["values"], - 
i64_array_idx=value["indices"]) + i64_array_idx=indices, + f64_array_value=values) case _: raise InfinityException(ErrorCode.INVALID_EXPRESSION, f"Invalid constant type: {type(value)}") return constant_expression diff --git a/python/infinity/utils.py b/python/infinity/utils.py index 865588892f..2514dfe445 100644 --- a/python/infinity/utils.py +++ b/python/infinity/utils.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import warnings from infinity.common import InfinityException from infinity.errors import ErrorCode @@ -45,3 +46,6 @@ def binary_exp_to_paser_exp(binary_expr_key) -> str: return "%" else: raise InfinityException(ErrorCode.INVALID_EXPRESSION, f"unknown binary expression: {binary_expr_key}") + +def deprecated_api(message): + warnings.warn(message, DeprecationWarning, stacklevel=2) diff --git a/python/infinity_http.py b/python/infinity_http.py index b4613ea660..3a2959d9d7 100644 --- a/python/infinity_http.py +++ b/python/infinity_http.py @@ -4,11 +4,12 @@ import logging import os from test_pysdk.common.common_data import * -from infinity.common import ConflictType, InfinityException +from infinity.common import ConflictType, InfinityException, SparseVector from test_pysdk.common import common_values import infinity from typing import Optional from infinity.errors import ErrorCode +from infinity.utils import deprecated_api import numpy as np import pandas as pd import polars as pl @@ -418,6 +419,8 @@ def insert(self,values=[]): for idx in range(len(value[key])): if isinstance(value[key][idx], np.ndarray): value[key][idx] = value[key][idx].tolist() + if isinstance(value[key], SparseVector): + value[key] = value[key].to_dict() url = f"databases/{self.database_name}/tables/{self.table_name}/docs" h = self.set_up_header(["accept", "content-type"]) @@ -553,13 +556,21 @@ def output( self._match_sparse = {} return self - def match(self, fields, query, operator="top=10"): + def 
match_text(self, fields: str, query: str, topn: int, opt_params: Optional[dict] = None): self._match = {} self._match["fields"] = fields self._match["query"] = query - self._match["operator"] = operator + operator_str = f"topn={topn}" + if opt_params is not None: + for k, v in opt_params.items(): + operator_str += f";{k}={v}" + self._match["operator"] = operator_str return self + def match(self, *args, **kwargs): + deprecated_api("match is deprecated, please use match_text instead") + return self.match_text(*args, **kwargs) + def match_tensor(self, column_name: str, query_data, query_data_type: str, topn: int): self._match_tensor = {} self._match_tensor["search_method"] = "maxsim" @@ -580,7 +591,7 @@ def match_sparse(self, vector_column_name, sparse_data, distance_type="ip", topn else: vector_column_name = [vector_column_name] self._match_sparse["fields"] = vector_column_name - self._match_sparse["query_sparse"] = sparse_data + self._match_sparse["query_sparse"] = sparse_data.to_dict() self._match_sparse["metric_type"] = distance_type self._match_sparse["topn"] = topn self._match_sparse["opt_params"] = opt_params @@ -590,7 +601,7 @@ def filter(self, filter): self._filter = filter return self - def knn(self, fields, query_vector, element_type, metric_type, top_k, opt_params : {} = None): + def match_dense(self, fields, query_vector, element_type, metric_type, top_k, opt_params : {} = None): self._knn = {} self._knn["fields"] = [fields] self._knn["query_vector"] = query_vector @@ -602,6 +613,10 @@ def knn(self, fields, query_vector, element_type, metric_type, top_k, opt_params self._knn[key] = opt_params[key] return self + def knn(self, *args, **kwargs): + deprecated_api("knn is deprecated, please use match_dense instead") + return self.match_dense(*args, **kwargs) + def fusion(self, method: str, topn: int, fusion_params: Optional[dict] = None): _fusion = {"method": method} fusion_option_str = f"topn={topn}" diff --git a/python/parallel_test/test_chaos.py 
b/python/parallel_test/test_chaos.py index e7f858fa86..a453c8fcf8 100644 --- a/python/parallel_test/test_chaos.py +++ b/python/parallel_test/test_chaos.py @@ -84,13 +84,13 @@ def read_out_data(): def search_fulltext(table_obj: Table): - res = table_obj.output(["index", "body", "other_vector", "_row_id", "_score"]).match( - "body^5", "harmful chemical", "topn=3").to_pl() + res = table_obj.output(["index", "body", "other_vector", "_row_id", "_score"]).match_text( + "body^5", "harmful chemical", 3).to_pl() print(res) def search_vector(table_obj: Table): - res = table_obj.output(["*"]).knn("other_vector", [2] * 4, "float", "l2", 3).to_pl() + res = table_obj.output(["*"]).match_dense("other_vector", [2] * 4, "float", "l2", 3).to_pl() print(res) diff --git a/python/parallel_test/test_index_parallel.py b/python/parallel_test/test_index_parallel.py index 974e5ce490..b54db05d71 100644 --- a/python/parallel_test/test_index_parallel.py +++ b/python/parallel_test/test_index_parallel.py @@ -49,8 +49,8 @@ def read_worker(connection_pool: ConnectionPool, end_time): table_obj = db_obj.get_table("test_fulltext_index_parallel") while time.time() < end_time: - res = table_obj.output(["doctitle", "docdate", "_row_id", "_score"]).match( - "body^5", "harmful chemical", "topn=3").to_pl() + res = table_obj.output(["doctitle", "docdate", "_row_id", "_score"]).match_text( + "body^5", "harmful chemical", 3).to_pl() print(res) time.sleep(0.1) @@ -147,7 +147,7 @@ def test_vector_index_single_thread(self, get_infinity_connection_pool, index_ty print("begin import") table_obj.import_data(file_path) print("import complete") - res = table_obj.output(["variant_id"]).knn( + res = table_obj.output(["variant_id"]).match_dense( knn_column_name, [1] * 4, "float", knn_distance_type, 5).to_pl() print(res) @@ -197,7 +197,7 @@ def read_worker(connection_pool: ConnectionPool, end_time, knn_column_name, knn_ table_obj = db_obj.get_table("test_vector_index_parallel") while time.time() < end_time: - res = 
table_obj.output(["variant_id"]).knn( + res = table_obj.output(["variant_id"]).match_dense( knn_column_name, [1] * 4, "float", knn_distance_type, 5).to_pl() print(res) time.sleep(0.1) diff --git a/python/restart_test/test_memidx.py b/python/restart_test/test_memidx.py index bb4462954e..5e4f4718e4 100644 --- a/python/restart_test/test_memidx.py +++ b/python/restart_test/test_memidx.py @@ -23,17 +23,8 @@ def test_mem_hnsw(self, infinity_runner: InfinityRunner): ) res = table_obj.create_index( "idx1", - index.IndexInfo( - "c2", - index.IndexType.Hnsw, - [ - index.InitParameter("M", "16"), - index.InitParameter("ef_construction", "20"), - index.InitParameter("ef", "20"), - index.InitParameter("metric", "l2"), - index.InitParameter("block_size", "1"), - ], - ) + index.IndexInfo("c2", index.IndexType.Hnsw, + {"M": "16", "ef_construction": "20", "ef": "20", "metric": "l2", "block_size": "1"}) ) assert res.error_code == infinity.ErrorCode.OK @@ -57,7 +48,7 @@ def test_mem_hnsw(self, infinity_runner: InfinityRunner): table_obj = db_obj.get_table("test_memidx1") data_dict, data_type_dict = ( table_obj.output(["c1"]) - .knn("c2", [0.3, 0.3, 0.2, 0.2], "float", "l2", 6) + .match_dense("c2", [0.3, 0.3, 0.2, 0.2], "float", "l2", 6) .to_result() ) # print(data_dict["c1"]) @@ -85,7 +76,7 @@ def test_mem_hnsw(self, infinity_runner: InfinityRunner): def check(): data_dict, data_type_dict = ( table_obj.output(["c1"]) - .knn("c2", [0.3, 0.3, 0.2, 0.2], "float", "l2", 6) + .match_dense("c2", [0.3, 0.3, 0.2, 0.2], "float", "l2", 6) .to_result() ) assert data_dict["c1"] == [8, 6, 6, 4, 4, 4] diff --git a/python/test_pysdk/doc/pysdk_api_test_document.md b/python/test_pysdk/doc/pysdk_api_test_document.md index 2130e5db16..42a018271e 100644 --- a/python/test_pysdk/doc/pysdk_api_test_document.md +++ b/python/test_pysdk/doc/pysdk_api_test_document.md @@ -54,8 +54,8 @@ ### query_builder.output ### query_builder.filter -### query_builder.knn -### query_builder.match +### 
query_builder.match_dense +### query_builder.match_text ### query_builder.fusion ### query_builder.to_result ### query_builder.to_df diff --git a/python/test_pysdk/test_index.py b/python/test_pysdk/test_index.py index 6cb3c6ff2e..b9320cd67c 100644 --- a/python/test_pysdk/test_index.py +++ b/python/test_pysdk/test_index.py @@ -565,8 +565,8 @@ def test_insert_data_fulltext_index_search(self, file_format, suffix): "docdate": data["docdate"][i], "body": data["body"][i]}) table_obj.insert(value) time.sleep(5) - res = table_obj.output(["doctitle", "docdate", "_row_id", "_score"]).match( - "body^5", "harmful chemical", "topn=3").to_pl() + res = table_obj.output(["doctitle", "docdate", "_row_id", "_score"]).match_text( + "body^5", "harmful chemical", 3).to_pl() assert not res.is_empty() print(res) diff --git a/python/test_pysdk/test_insert.py b/python/test_pysdk/test_insert.py index cc914c2e74..62bc11c452 100644 --- a/python/test_pysdk/test_insert.py +++ b/python/test_pysdk/test_insert.py @@ -11,7 +11,7 @@ from common import common_values import infinity import infinity.index as index -from infinity.common import ConflictType, InfinityException +from infinity.common import ConflictType, InfinityException, SparseVector from infinity.errors import ErrorCode from infinity_http import infinity_http @@ -502,15 +502,16 @@ def _test_insert_sparse(self, suffix): table_obj = db_obj.create_table("test_insert_sparse"+suffix, {"c1": {"type": "sparse,100,float,int"}}, ConflictType.Error) assert table_obj - res = table_obj.insert([{"c1": {"indices": [10, 20, 30], "values": [1.1, 2.2, 3.3]}}]) + res = table_obj.insert([{"c1": SparseVector(**{"indices": [10, 20, 30], "values": [1.1, 2.2, 3.3]})}]) assert res.error_code == ErrorCode.OK - res = table_obj.insert([{"c1": {"indices": [40, 50, 60], "values": [4.4, 5.5, 6.6]}}]) + res = table_obj.insert([{"c1": SparseVector(**{"indices": [40, 50, 60], "values": [4.4, 5.5, 6.6]})}]) assert res.error_code == ErrorCode.OK - res = 
table_obj.insert([{"c1": {"indices": [70, 80, 90], "values": [7.7, 8.8, 9.9]}}, - {"c1": {"indices": [70, 80, 90], "values": [-7.7, -8.8, -9.9]}}]) + res = table_obj.insert([{"c1": SparseVector(**{"indices": [70, 80, 90], "values": [7.7, 8.8, 9.9]})}, + {"c1": SparseVector(**{"indices": [70, 80, 90], "values": [-7.7, -8.8, -9.9]})}]) assert res.error_code == ErrorCode.OK - + print(table_obj.output(["*"]).to_pl()) res = table_obj.output(["*"]).to_df() + print(res) pd.testing.assert_frame_equal(res, pd.DataFrame( {'c1': ( {"indices": [10, 20, 30], "values": [1.1, 2.2, 3.3]}, diff --git a/python/test_pysdk/test_knn.py b/python/test_pysdk/test_knn.py index 76a1d53a01..7468da4120 100644 --- a/python/test_pysdk/test_knn.py +++ b/python/test_pysdk/test_knn.py @@ -10,7 +10,7 @@ from infinity.remote_thrift.infinity import RemoteThriftInfinityConnection import infinity.index as index from infinity.errors import ErrorCode -from infinity.common import ConflictType, InfinityException +from infinity.common import ConflictType, InfinityException, SparseVector from common.utils import copy_data, generate_commas_enwiki import pandas as pd from numpy import dtype @@ -82,16 +82,16 @@ def test_knn(self, check_data, suffix): # res = table_obj.output(["variant_id"]).to_df() # true - # res = table_obj.output(["variant_id", "query_price"]).knn('gender_vector', [1.0] * 768, "float", "ip", - # 10).to_pl() + # res = table_obj.output(["variant_id", "query_price"]).match_dense('gender_vector', [1.0] * 768, "float", "ip", + # 10).to_pl() # print(res) # true - res = table_obj.output(["variant_id", "_row_id"]).knn("gender_vector", [1.0] * 4, "float", "ip", 10).to_pl() + res = table_obj.output(["variant_id", "_row_id"]).match_dense("gender_vector", [1.0] * 4, "float", "ip", 10).to_pl() print(res) # FIXME - # res = table_obj.output(["variant_id", "query_is_recommend", "query_gender", "query_color", "query_price"]).knn( + # res = table_obj.output(["variant_id", "query_is_recommend", "query_gender", 
"query_color", "query_price"]).match_dense( # "gender_vector", [1.0] * 4, "float", "ip", 3).to_pl() # print(res) @@ -119,7 +119,7 @@ def test_knn_u8(self, check_data, suffix): pd.testing.assert_frame_equal(res, pd.DataFrame( {'c1': (1, 5, 9, 11), 'c2': ([2, 3, 4], [6, 7, 8], [10, 11, 12], [127, 128, 255])}).astype( {'c1': dtype('int32')})) - res = table_obj.output(["c1", "_distance"]).knn('c2', [0, 0, 0], "uint8", "l2", 10).to_df() + res = table_obj.output(["c1", "_distance"]).match_dense('c2', [0, 0, 0], "uint8", "l2", 10).to_df() print(res) pd.testing.assert_frame_equal(res, pd.DataFrame( {'c1': (1, 5, 9, 11), 'DISTANCE': (29.0, 149.0, 365.0, 97538.0)}).astype( @@ -141,13 +141,13 @@ def test_knn_u8(self, check_data, suffix): "metric": "l2" }), ConflictType.Error) assert res.error_code == ErrorCode.OK - res = table_obj.output(["c1", "_distance"]).knn('c2', [0, 0, 0], "uint8", "l2", 10).to_df() + res = table_obj.output(["c1", "_distance"]).match_dense('c2', [0, 0, 0], "uint8", "l2", 10).to_df() print(res) pd.testing.assert_frame_equal(res, pd.DataFrame( {'c1': (1, 5, 9, 11), 'DISTANCE': (29.0, 149.0, 365.0, 97538.0)}).astype( {'c1': dtype('int32'), 'DISTANCE': dtype('float32')})) with pytest.raises(InfinityException): - table_obj.output(["c1", "_distance"]).knn('c2', [0, 0, 0], "int8", "l2", 10).to_result() + table_obj.output(["c1", "_distance"]).match_dense('c2', [0, 0, 0], "int8", "l2", 10).to_result() res = db_obj.drop_table("test_knn_u8"+suffix, ConflictType.Error) assert res.error_code == ErrorCode.OK @@ -173,7 +173,7 @@ def test_knn_fp16_bf16(self, check_data, save_elem_type, query_elem_type, suffix pd.testing.assert_frame_equal(res, pd.DataFrame( {'c1': (1, 5, 9, 11), 'c2': ([2, 3, 4], [6, 7, 8], [10, 11, 12], [127, 128, 255])}).astype( {'c1': dtype('int32')})) - res = table_obj.output(["c1", "_distance"]).knn('c2', [0, 0, 0], query_elem_type, "l2", 10).to_df() + res = table_obj.output(["c1", "_distance"]).match_dense('c2', [0, 0, 0], query_elem_type, "l2", 
10).to_df() print(res) pd.testing.assert_frame_equal(res, pd.DataFrame( {'c1': (1, 5, 9, 11), 'DISTANCE': (29.0, 149.0, 365.0, 97538.0)}).astype( @@ -237,7 +237,7 @@ def test_knn_on_vector_column(self, check_data, column_name, suffix): copy_data("tmp_20240116.csv") test_csv_dir = "/var/infinity/test_data/tmp_20240116.csv" table_obj.import_data(test_csv_dir, None) - res = table_obj.output(["variant_id", "_row_id", "_similarity"]).knn( + res = table_obj.output(["variant_id", "_row_id", "_similarity"]).match_dense( column_name, [1.0] * 4, "float", "ip", 2).to_pl() print(res) @@ -274,7 +274,7 @@ def test_knn_on_non_vector_column(self, check_data, column_name, suffix): test_csv_dir = "/var/infinity/test_data/tmp_20240116.csv" table_obj.import_data(test_csv_dir, None) with pytest.raises(InfinityException) as e: - table_obj.output(["variant_id", "_row_id"]).knn(column_name, [1.0] * 4, "float", "ip", 2).to_pl() + table_obj.output(["variant_id", "_row_id"]).match_dense(column_name, [1.0] * 4, "float", "ip", 2).to_pl() assert e.type == InfinityException assert e.value.args[0] == ErrorCode.SYNTAX_ERROR @@ -309,7 +309,7 @@ def test_valid_embedding_data(self, check_data, embedding_data, suffix): copy_data("tmp_20240116.csv") test_csv_dir = "/var/infinity/test_data/tmp_20240116.csv" table_obj.import_data(test_csv_dir, None) - res = table_obj.output(["variant_id", "_row_id"]).knn( + res = table_obj.output(["variant_id", "_row_id"]).match_dense( "gender_vector", embedding_data, "float", "ip", 2).to_pl() print(res) @@ -349,7 +349,7 @@ def test_invalid_embedding_data(self, check_data, embedding_data, suffix): test_csv_dir = "/var/infinity/test_data/tmp_20240116.csv" table_obj.import_data(test_csv_dir, None) with pytest.raises(Exception): - res = table_obj.output(["variant_id", "_row_id"]).knn( + res = table_obj.output(["variant_id", "_row_id"]).match_dense( "gender_vector", embedding_data, "float", "ip", 2).to_pl() print(res) @@ -387,16 +387,16 @@ def 
test_valid_embedding_data_type(self, check_data, embedding_data, embedding_d test_csv_dir = "/var/infinity/test_data/tmp_20240116.csv" table_obj.import_data(test_csv_dir, None) if embedding_data_type[1]: - res = table_obj.output(["variant_id", "_distance"]).knn("gender_vector", embedding_data, - embedding_data_type[0], - "l2", - 2).to_pl() + res = table_obj.output(["variant_id", "_distance"]).match_dense("gender_vector", embedding_data, + embedding_data_type[0], + "l2", + 2).to_pl() print(res) else: - res = table_obj.output(["variant_id", "_similarity"]).knn("gender_vector", embedding_data, - embedding_data_type[0], - "ip", - 2).to_pl() + res = table_obj.output(["variant_id", "_similarity"]).match_dense("gender_vector", embedding_data, + embedding_data_type[0], + "ip", + 2).to_pl() res = db_obj.drop_table( "test_valid_embedding_data_type"+suffix, ConflictType.Error) @@ -436,14 +436,16 @@ def test_invalid_embedding_data_type(self, check_data, embedding_data, embedding table_obj.import_data(test_csv_dir, None) with pytest.raises(Exception): if embedding_data_type[1]: - res = table_obj.output(["variant_id"]).knn("gender_vector", embedding_data, embedding_data_type[0], - "l2", - 2).to_pl() + res = table_obj.output(["variant_id"]).match_dense("gender_vector", embedding_data, + embedding_data_type[0], + "l2", + 2).to_pl() print(res) else: - res = table_obj.output(["variant_id"]).knn("gender_vector", embedding_data, embedding_data_type[0], - "ip", - 2).to_pl() + res = table_obj.output(["variant_id"]).match_dense("gender_vector", embedding_data, + embedding_data_type[0], + "ip", + 2).to_pl() res = db_obj.drop_table( "test_invalid_embedding_data_type"+suffix, ConflictType.Error) assert res.error_code == ErrorCode.OK @@ -487,15 +489,15 @@ def test_various_distance_type(self, check_data, embedding_data, embedding_data_ test_csv_dir = "/var/infinity/test_data/tmp_20240116.csv" table_obj.import_data(test_csv_dir, None) if distance_type[1] and embedding_data_type[1]: - res = 
table_obj.output(["variant_id"]).knn("gender_vector", embedding_data, embedding_data_type[0], - distance_type[0], - 2).to_pl() + res = table_obj.output(["variant_id"]).match_dense("gender_vector", embedding_data, embedding_data_type[0], + distance_type[0], + 2).to_pl() print(res) else: with pytest.raises(InfinityException) as e: - table_obj.output(["variant_id"]).knn("gender_vector", embedding_data, embedding_data_type[0], - distance_type[0], - 2).to_pl() + table_obj.output(["variant_id"]).match_dense("gender_vector", embedding_data, embedding_data_type[0], + distance_type[0], + 2).to_pl() assert e.type == InfinityException assert e.value.args[0] == ErrorCode.NOT_SUPPORTED @@ -538,12 +540,12 @@ def test_various_topn(self, check_data, topn, suffix): test_csv_dir = "/var/infinity/test_data/tmp_20240116.csv" table_obj.import_data(test_csv_dir, None) if topn[1]: - res = table_obj.output(["variant_id"]).knn( + res = table_obj.output(["variant_id"]).match_dense( "gender_vector", [1] * 4, "float", "l2", topn[0]).to_pl() print(res) else: with pytest.raises(InfinityException) as e: - table_obj.output(["variant_id"]).knn( + table_obj.output(["variant_id"]).match_dense( "gender_vector", [1] * 4, "float", "l2", topn[0]).to_pl() assert e.type == InfinityException @@ -598,7 +600,7 @@ def test_with_index_before(self, check_data, index_column_name, knn_column_name, assert res.error_code == ErrorCode.OK - res = table_obj.output(["variant_id"]).knn( + res = table_obj.output(["variant_id"]).match_dense( knn_column_name, [1.0] * 4, "float", knn_distance_type, 5).to_pl() print(res) @@ -643,7 +645,7 @@ def test_with_index_after(self, check_data, copy_data("pysdk_test_knn.csv") test_csv_dir = "/var/infinity/test_data/pysdk_test_knn.csv" table_obj.import_data(test_csv_dir, None) - res = table_obj.output(["variant_id"]).knn( + res = table_obj.output(["variant_id"]).match_dense( knn_column_name, [1.0] * 4, "float", knn_distance_type, 5).to_pl() print(res) res = 
table_obj.create_index("my_index", @@ -682,15 +684,15 @@ def test_fulltext_operator_option(self, check_data, suffix): test_csv_dir = common_values.TEST_TMP_DIR + "enwiki_99.csv" table_obj.import_data(test_csv_dir, import_options={"delimiter": "\t"}) print('Test fulltext operator OR for query "TO BE OR NOT":') - print(table_obj.output(["*", "_row_id", "_score"]).match("doctitle,body^5", "TO BE OR NOT", - "topn=5;operator=or").to_pl()) + print(table_obj.output(["*", "_row_id", "_score"]).match_text("doctitle,body^5", "TO BE OR NOT", 5, + {"operator": "or"}).to_pl()) print('Test fulltext operator AND for query "TO BE OR NOT":') - print(table_obj.output(["*", "_row_id", "_score"]).match("doctitle,body^5", "TO BE OR NOT", - "topn=5;operator=and").to_pl()) + print(table_obj.output(["*", "_row_id", "_score"]).match_text("doctitle,body^5", "TO BE OR NOT", 5, + {"operator": "and"}).to_pl()) # expect throw print('No operator option for query "TO BE OR NOT", expect throw:') with pytest.raises(InfinityException) as e_info: - table_obj.output(["*", "_row_id", "_score"]).match("doctitle,body^5", "TO BE OR NOT", "topn=5").to_pl() + table_obj.output(["*", "_row_id", "_score"]).match_text("doctitle,body^5", "TO BE OR NOT", 5).to_pl() print(e_info.value.error_message) res = table_obj.drop_index("my_index", ConflictType.Error) assert res.error_code == ErrorCode.OK @@ -725,8 +727,8 @@ def test_with_fulltext_match_with_valid_columns(self, check_data, match_param_1, table_obj.import_data(test_csv_dir, import_options={"delimiter": ","}) res = (table_obj .output(["*"]) - .knn("vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 1) - .match(match_param_1, "black", "topn=1") + .match_dense("vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 1) + .match_text(match_param_1, "black", 1) .fusion(method='rrf', topn=10) .to_pl()) print(res) @@ -772,8 +774,8 @@ def test_with_fulltext_match_with_invalid_columns(self, check_data, match_param_ with pytest.raises(Exception): res = (table_obj .output(["*"]) - 
.knn("vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 1) - .match(match_param_1, "black", "topn=1") + .match_dense("vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 1) + .match_text(match_param_1, "black", 1) .fusion(method='rrf', topn=10) .to_pl()) print(res) @@ -814,8 +816,8 @@ def test_with_fulltext_match_with_valid_words(self, check_data, match_param_2, s table_obj.import_data(test_csv_dir, import_options={"delimiter": ","}) res = (table_obj .output(["*"]) - .knn("vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 1) - .match("doctitle,num,body^5", match_param_2, "topn=1") + .match_dense("vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 1) + .match_text("doctitle,num,body^5", match_param_2, 1) .fusion(method='rrf', topn=10) .to_pl()) print(res) @@ -862,8 +864,8 @@ def test_with_fulltext_match_with_invalid_words(self, check_data, match_param_2, with pytest.raises(Exception): res = (table_obj .output(["*"]) - .knn("vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 1) - .match("doctitle,num,body^5", match_param_2, "topn=1") + .match_dense("vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 1) + .match_text("doctitle,num,body^5", match_param_2, 1) .fusion(method='rrf', topn=10) .to_pl()) print(res) @@ -875,96 +877,6 @@ def test_with_fulltext_match_with_invalid_words(self, check_data, match_param_2, "test_with_fulltext_match_with_invalid_words"+suffix, ConflictType.Error) assert res.error_code == ErrorCode.OK - @pytest.mark.parametrize("match_param_3", [pytest.param("@#$!#@$SDa^sdf3!@#$"), - "topn=1", - "1"]) - @pytest.mark.parametrize("check_data", [{"file_name": "enwiki_embedding_99_commas.csv", - "data_dir": common_values.TEST_TMP_DIR}], indirect=True) - def test_with_fulltext_match_with_options(self, check_data, match_param_3, suffix): - db_obj = self.infinity_obj.get_database("default_db") - db_obj.drop_table( - "test_with_fulltext_match_with_options"+suffix, ConflictType.Ignore) - table_obj = db_obj.create_table("test_with_fulltext_match_with_options"+suffix, - {"doctitle": {"type": "varchar"}, - 
"docdate": {"type": "varchar"}, - "body": {"type": "varchar"}, - "num": {"type": "int"}, - "vec": {"type": "vector, 4, float"}}) - table_obj.create_index("my_index", - index.IndexInfo("body", - index.IndexType.FullText, - {"ANALYZER": "standard"}), - ConflictType.Error) - - if not check_data: - generate_commas_enwiki( - "enwiki_99.csv", "enwiki_embedding_99_commas.csv", 1) - copy_data("enwiki_embedding_99_commas.csv") - - test_csv_dir = common_values.TEST_TMP_DIR + "enwiki_embedding_99_commas.csv" - table_obj.import_data(test_csv_dir, import_options={"delimiter": ","}) - res = (table_obj - .output(["*"]) - .knn("vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 1) - .match("doctitle,num,body^5", "word", match_param_3) - .fusion(method='rrf', topn=10) - .to_pl()) - print(res) - - res = table_obj.drop_index("my_index", ConflictType.Error) - assert res.error_code == ErrorCode.OK - - res = db_obj.drop_table( - "test_with_fulltext_match_with_options"+suffix, ConflictType.Error) - assert res.error_code == ErrorCode.OK - - @pytest.mark.parametrize("match_param_3", [pytest.param(1), - pytest.param(1.1), - pytest.param([]), - pytest.param({}), - pytest.param(()), ]) - @pytest.mark.parametrize("check_data", [{"file_name": "enwiki_embedding_99_commas.csv", - "data_dir": common_values.TEST_TMP_DIR}], indirect=True) - def test_with_fulltext_match_with_invalid_options(self, check_data, match_param_3, suffix): - db_obj = self.infinity_obj.get_database("default_db") - db_obj.drop_table( - "test_with_fulltext_match_with_invalid_options"+suffix, ConflictType.Ignore) - table_obj = db_obj.create_table("test_with_fulltext_match_with_invalid_options"+suffix, - {"doctitle": {"type": "varchar"}, - "docdate": {"type": "varchar"}, - "body": {"type": "varchar"}, - "num": {"type": "int"}, - "vec": {"type": "vector, 4, float"}}) - table_obj.create_index("my_index", - index.IndexInfo("body", - index.IndexType.FullText, - {"ANALYZER": "standard"}), - ConflictType.Error) - - if not check_data: - 
generate_commas_enwiki( - "enwiki_99.csv", "enwiki_embedding_99_commas.csv", 1) - copy_data("enwiki_embedding_99_commas.csv") - - test_csv_dir = common_values.TEST_TMP_DIR + "enwiki_embedding_99_commas.csv" - table_obj.import_data(test_csv_dir, import_options={"delimiter": ","}) - - with pytest.raises(Exception): - res = (table_obj - .output(["*"]) - .knn("vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 1) - .match("doctitle,num,body^5", "word", match_param_3) - .fusion(method='rrf', topn=10) - .to_pl()) - print(res) - - res = table_obj.drop_index("my_index", ConflictType.Error) - assert res.error_code == ErrorCode.OK - - res = db_obj.drop_table( - "test_with_fulltext_match_with_invalid_options"+suffix, ConflictType.Error) - assert res.error_code == ErrorCode.OK - @pytest.mark.parametrize("check_data", [{"file_name": "tensor_maxsim.csv", "data_dir": common_values.TEST_TMP_DIR}], indirect=True) @pytest.mark.parametrize("save_elem_t", ["float32", "float16", "bfloat16"]) @@ -1006,7 +918,7 @@ def test_sparse_knn(self, check_data, suffix): test_csv_dir = common_values.TEST_TMP_DIR + "sparse_knn.csv" table_obj.import_data(test_csv_dir, import_options={"delimiter": ","}) res = (table_obj.output(["*", "_row_id", "_similarity"]) - .match_sparse("c2", {"indices": [0, 20, 80], "values": [1.0, 2.0, 3.0]}, "ip", 3) + .match_sparse("c2", SparseVector(**{"indices": [0, 20, 80], "values": [1.0, 2.0, 3.0]}), "ip", 3) .to_pl()) print(res) @@ -1034,7 +946,7 @@ def test_sparse_knn_with_index(self, check_data, suffix): res = (table_obj .output(["*", "_row_id", "_similarity"]) - .match_sparse("c2", {"indices": [0, 20, 80], "values": [1.0, 2.0, 3.0]}, "ip", 3, + .match_sparse("c2", SparseVector(**{"indices": [0, 20, 80], "values": [1.0, 2.0, 3.0]}), "ip", 3, {"alpha": "1.0", "beta": "1.0"}) .to_pl()) print(res) @@ -1061,7 +973,7 @@ def test_with_multiple_fusion(self, check_data, suffix): table_obj.import_data(test_csv_dir, import_options={"delimiter": ","}) res = (table_obj .output(["*", 
"_row_id", "_score"]) - .match('body', 'off', 'topn=4') + .match_text('body', 'off', 4) .match_tensor('t', [[1.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0]], 'float', 2) .fusion(method='rrf', topn=10) .fusion(method='match_tensor', topn=2, fusion_params={'field': 't', 'data_type': 'float', @@ -1111,7 +1023,7 @@ def test_with_various_index_knn_distance_combination(self, check_data, index_col "metric": index_distance_type }), ConflictType.Error) assert res.error_code == ErrorCode.OK - res = table_obj.output(["variant_id"]).knn( + res = table_obj.output(["variant_id"]).match_dense( knn_column_name, [1.0] * 4, "float", knn_distance_type, 5).to_pl() print(res) res = table_obj.drop_index("my_index", ConflictType.Error) @@ -1134,7 +1046,7 @@ def test_with_various_index_knn_distance_combination(self, check_data, index_col ConflictType.Error) assert res.error_code == ErrorCode.OK # for IVFFlat, index_distance_type has to match knn_distance_type? - res = table_obj.output(["variant_id"]).knn( + res = table_obj.output(["variant_id"]).match_dense( knn_column_name, [1.0] * 4, "float", index_distance_type, 5).to_pl() print(res) res = table_obj.drop_index("my_index", ConflictType.Error) @@ -1157,7 +1069,7 @@ def test_zero_dimension_vector(self, suffix): assert e.type == InfinityException assert e.value.args[0] == ErrorCode.DATA_TYPE_MISMATCH with pytest.raises(InfinityException) as e: - res = table_obj.output(["_row_id"]).knn( + res = table_obj.output(["_row_id"]).match_dense( "zero_vector", [0.0], "float", "l2", 5).to_pl() assert e.type == InfinityException assert e.value.args[0] == ErrorCode.SYNTAX_ERROR @@ -1166,7 +1078,7 @@ def test_zero_dimension_vector(self, suffix): with pytest.raises(Exception): table_obj.insert([{"zero_vector": []}]) try: - res = table_obj.output(["_row_id"]).knn( + res = table_obj.output(["_row_id"]).match_dense( "zero_vector", [], "float", "l2", 5).to_pl() except: print("Exception") @@ -1187,7 +1099,7 @@ def test_big_dimension_vector(self, dim, suffix): 
{"big_vector": [3.0] * dim}, {"big_vector": [4.0] * dim}, {"big_vector": [5.0] * dim}]) - res = table_obj.output(["_row_id"]).knn( + res = table_obj.output(["_row_id"]).match_dense( "big_vector", [0.0] * dim, "float", "l2", 5).to_pl() print(res) @@ -1244,8 +1156,8 @@ def test_with_various_fulltext_match(self, check_data, fields_and_matching_text, table_obj.import_data(test_csv_dir, import_options={"delimiter": ","}) res = (table_obj .output(["*"]) - .knn("vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 1) - .match(fields_and_matching_text[0], fields_and_matching_text[1], "topn=1") + .match_dense("vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 1) + .match_text(fields_and_matching_text[0], fields_and_matching_text[1], 1) .fusion(method='rrf', topn=10) .to_pl()) print(res) @@ -1393,7 +1305,7 @@ def test_sparse_with_invalid_table_params(self, check_data, table_params ,suffix with pytest.raises(InfinityException) as e: res = (table_obj.output(["*", "_row_id", "_similarity"]) - .match_sparse("c2", {"indices": [0, 20, 80], "values": [1.0, 2.0, 3.0]}, "ip", 3) + .match_sparse("c2", SparseVector(**{"indices": [0, 20, 80], "values": [1.0, 2.0, 3.0]}), "ip", 3) .to_pl()) assert e.value.args[0] == ErrorCode.SYNTAX_ERROR @@ -1406,7 +1318,7 @@ def test_sparse_with_invalid_table_params(self, check_data, table_params ,suffix with pytest.raises(InfinityException) as e: res = (table_obj.output(["*", "_row_id", "_similarity"]) - .match_sparse("c2", {"indices": [0, 20, 80], "values": [1.0, 2.0, 3.0]}, "ip", 3) + .match_sparse("c2", SparseVector(**{"indices": [0, 20, 80], "values": [1.0, 2.0, 3.0]}), "ip", 3) .to_pl()) assert e.value.args[0] == ErrorCode.DATA_TYPE_MISMATCH @@ -1534,7 +1446,7 @@ def test_sparse_knn_with_invalid_alpha_beta(self, check_data, alpha, beta, suffi with pytest.raises(InfinityException) as e: res = (table_obj .output(["*", "_row_id", "_similarity"]) - .match_sparse("c2", {"indices": [0, 20, 80], "values": [1.0, 2.0, 3.0]}, "ip", 3, + .match_sparse("c2", 
SparseVector(**{"indices": [0, 20, 80], "values": [1.0, 2.0, 3.0]}), "ip", 3, {"alpha": alpha, "beta": beta}) .to_pl()) @@ -1560,7 +1472,7 @@ def test_sparse_knn_with_indices_values_mismatch(self, check_data, suffix): res = (table_obj .output(["*", "_row_id", "_similarity"]) - .match_sparse("c2", {"indices": [0, 20], "values": [1.0, 2.0, 3.0]}, "ip", 3, + .match_sparse("c2", SparseVector(**{"indices": [0, 20], "values": [1.0, 2.0, 3.0]}), "ip", 3, {"alpha": "1.0", "beta": "1.0"}) .to_pl()) print(res) @@ -1585,7 +1497,7 @@ def test_sparse_knn_with_invalid_distance_type(self, check_data, distance_type, with pytest.raises(Exception): res = (table_obj .output(["*", "_row_id", "_similarity"]) - .match_sparse("c2", {"indices": [0, 20, 80], "values": [1.0, 2.0, 3.0]}, distance_type, 3, + .match_sparse("c2", SparseVector(**{"indices": [0, 20, 80], "values": [1.0, 2.0, 3.0]}), distance_type, 3, {"alpha": "1.0", "beta": "1.0"}) .to_pl()) @@ -1638,11 +1550,11 @@ def test_knn_with_given_index_name(self, check_data, knn_distance_type, suffix): assert res.error_code == ErrorCode.OK - res = table_obj.output(["variant_id"]).knn( + res = table_obj.output(["variant_id"]).match_dense( "gender_vector", [1.0] * 4, "float", knn_distance_type, 5, {"index_name":"my_index_l2"}).to_pl() print(res) - res = table_obj.output(["variant_id"]).knn( + res = table_obj.output(["variant_id"]).match_dense( "gender_vector", [1.0] * 4, "float", knn_distance_type, 5, {"index_name": "my_index_ip"}).to_pl() print(res) @@ -1689,7 +1601,7 @@ def test_knn_with_ignore_index(self, check_data, knn_distance_type, suffix): assert res.error_code == ErrorCode.OK - res = table_obj.output(["variant_id"]).knn( + res = table_obj.output(["variant_id"]).match_dense( "gender_vector", [1.0] * 4, "float", knn_distance_type, 5, {"ignore_index":"true"}).to_pl() print(res) @@ -1734,7 +1646,7 @@ def test_knn_with_given_invalid_index_name(self, check_data, knn_distance_type, assert res.error_code == ErrorCode.OK with 
pytest.raises(InfinityException) as e: - res = table_obj.output(["variant_id"]).knn( + res = table_obj.output(["variant_id"]).match_dense( "gender_vector", [1.0] * 4, "float", knn_distance_type, 5, {"index_name": "my_index_ip"}).to_pl() assert e.value.args[0] == ErrorCode.INDEX_NOT_EXIST @@ -1778,12 +1690,12 @@ def test_knn_with_given_index_name_and_ignore_index(self, check_data, knn_distan assert res.error_code == ErrorCode.OK - res = table_obj.output(["variant_id"]).knn( + res = table_obj.output(["variant_id"]).match_dense( "gender_vector", [1.0] * 4, "float", knn_distance_type, 5, {"index_name": "my_index_l2", "ignore_index":"false"}).to_pl() print(res) with pytest.raises(InfinityException) as e: - res = table_obj.output(["variant_id"]).knn( + res = table_obj.output(["variant_id"]).match_dense( "gender_vector", [1.0] * 4, "float", knn_distance_type, 5, {"index_name": "my_index_l2", "ignore_index":"true"}).to_pl() assert e.value.args[0] == ErrorCode.SYNTAX_ERROR diff --git a/python/test_pysdk/test_query.py b/python/test_pysdk/test_query.py index 260dd879a8..86f78dda49 100644 --- a/python/test_pysdk/test_query.py +++ b/python/test_pysdk/test_query.py @@ -76,8 +76,8 @@ def test_query(self): # Create a query builder query_builder = InfinityThriftQueryBuilder(table) query_builder.output(["num", "body"]) - query_builder.knn('vec', [3.0] * 5, 'float', 'ip', 2) - query_builder.match('body', 'harmful', 'topn=2') + query_builder.match_dense('vec', [3.0] * 5, 'float', 'ip', 2) + query_builder.match_text('body', 'harmful', 2, None) query_builder.fusion(method='rrf', topn=10, fusion_params=None) res = query_builder.to_df() print(res) diff --git a/python/tmp_test.py b/python/tmp_test.py index d182b34c32..6096755b1d 100644 --- a/python/tmp_test.py +++ b/python/tmp_test.py @@ -116,5 +116,5 @@ print("Time to index geonames: ", end - start) start = time.time() -table_obj.output(["name"]).match("name", "Sankt Georgen").to_pl() +table_obj.output(["name"]).match_text("name", "Sankt 
Georgen", 10).to_pl() end = time.time()