From 52f38ca3c2ad4584dba71e832d2fa875f220df79 Mon Sep 17 00:00:00 2001 From: writinwaters <93570324+writinwaters@users.noreply.github.com> Date: Mon, 9 Dec 2024 16:13:58 +0800 Subject: [PATCH] Updated full-text search scoring and syntax (#2343) ### What problem does this PR solve? ### Type of change - [x] Documentation Update --- docs/getstarted/build_from_source.mdx | 2 + docs/getstarted/deploy_infinity_server.mdx | 2 + docs/guides/search_guide.md | 82 +++++++++++++++---- docs/guides/set_up_cluster.md | 2 + docs/references/benchmark.md | 3 +- docs/references/configurations.mdx | 4 +- docs/references/faq.md | 2 +- docs/references/http_api_reference.mdx | 2 +- docs/references/pysdk_api_reference.md | 4 +- .../search/phrase_doc_iterator.cpp | 2 +- 10 files changed, 81 insertions(+), 24 deletions(-) diff --git a/docs/getstarted/build_from_source.mdx b/docs/getstarted/build_from_source.mdx index 39c1ede016..a22ba62893 100644 --- a/docs/getstarted/build_from_source.mdx +++ b/docs/getstarted/build_from_source.mdx @@ -9,6 +9,8 @@ import TabItem from '@theme/TabItem'; Build Infinity from source, build and run unit/functional tests. +--- + This document provides instructions for building Infinity from source, as well as building and running unit and functional tests. :::tip NOTE diff --git a/docs/getstarted/deploy_infinity_server.mdx b/docs/getstarted/deploy_infinity_server.mdx index 3f09210ccd..5a36c8493e 100644 --- a/docs/getstarted/deploy_infinity_server.mdx +++ b/docs/getstarted/deploy_infinity_server.mdx @@ -9,6 +9,8 @@ import TabItem from '@theme/TabItem'; Three ways to deploy Infinity. +--- + This document provides guidance on deploying the Infinity database. In general, you can deploy Infinity in the following three ways: - [Import Infinity as a Python module](#import-infinity-as-a-python-module): To run Infinity locally as a Python module. 
diff --git a/docs/guides/search_guide.md b/docs/guides/search_guide.md index 2a35e0c6ea..9c3924d5f7 100644 --- a/docs/guides/search_guide.md +++ b/docs/guides/search_guide.md @@ -6,6 +6,8 @@ slug: /search_guide Full-text, vector, sparse vector, tensor, hybrid search. +--- + ## Overview This document offers guidance on conducting a search within Infinity. @@ -71,32 +73,78 @@ Both RAG tokenization and fine-grained RAG tokenization are used in RAGFlow to e #### IK analyzer -The IK analyzer is a bilingual tokenizer that supports Chinese (simplified and traditional) and English. It is a C++ adaptation of the [IK Analyzer](https://github.com/infinilabs/analysis-ik), widely used as a tokenizer by Chinese Elasticsearch users. - -Use `"ik"` to select this analyzer, which works the same as the `ik_smart` argument in the [IK Analyzer](https://github.com/infinilabs/analysis-ik), or `"ik-fine"` for fine-grained mode, which works the same as the `ik_max_word` argument in the [IK Analyzer](https://github.com/infinilabs/analysis-ik). +The IK analyzer is a bilingual tokenizer that supports Chinese (simplified and traditional) and English. It is a C++ adaptation of the [IK Analyzer](https://github.com/infinilabs/analysis-ik), which is widely used as a tokenizer by Chinese Elasticsearch users. +Use `"ik"` to select this analyzer, which is equivalent to the `ik_smart` option in the [IK Analyzer](https://github.com/infinilabs/analysis-ik), or `"ik-fine"` for fine-grained mode, which is equivalent to the `ik_max_word` option in the [IK Analyzer](https://github.com/infinilabs/analysis-ik). #### Keyword analyzer The keyword analyzer is a "noop" analyzer used for columns containing keywords only, where traditional scoring methods like `BM25` do not apply. It scores `0` or `1`, depending on whether any keywords are matched. -### Search and ranking +Use `"keyword"` to select this analyzer. 
+ +### Search and ranking syntax + +Infinity supports the following syntax for full-text search expressions: + +- Single term +- AND multiple terms +- OR multiple terms +- Phrase search +- Caret operator +- Sloppy phrase search +- Field-specific search +- Escape character + +#### Single term + +Example: `"blooms"` + +#### AND multiple terms + +- `"space AND efficient"` +- `"space && efficient"` +- `"space + efficient"` + +#### OR multiple terms + +- `"Bloom OR filter"` +- `"Bloom || filter"` +- `"Bloom filter"` + +:::tip NOTE +`OR` is the default semantic in a multi-term full-text search unless explicitly specified otherwise. +::: + +#### Phrase search + +- `"Bloom filter"` +- `'Bloom filter'` + +#### Caret operator + +Use `^` to boost the importance of a specific term. For example: `quick^2 brown` boosts the importance of `quick` by a factor of 2, making it twice as important as `brown`. + +#### Sloppy phrase search + +Example: `'"harmful chemical"~10'` + +#### Field-specific search + +Example: `"title:(quick OR brown) AND body:foobar"` + +#### Escape character -Infinity offers following syntax for full-text search: +Use `\` to escape reserved characters like `:` `~` `(` `)` `"` `+` `-` `=` `&` `|` `[` `]` `{` `}` `*` `?` `\` `/`. For example: `"space\-efficient"`. -- Single term: `"blooms"` -- AND multiple terms: `"space AND efficient"`, `"space && efficient"` or `"space + efficient"` -- OR multiple terms: `"Bloom OR filter"`, `"Bloom || filter"` or just `"Bloom filter"` . -- Phrase search: `"Bloom filter" or 'Bloom filter'` -- CARAT operator: `^`: Used to boost the importance of a term, e.g., `quick^2 brown` boosts the importance of `quick` by a factor of 2, making it twice as important as `brown`. -- Sloppy phrase search: `'"harmful chemical"~10'` -- Field-specific search: `"title:(quick OR brown) AND body:foobar"` -- Escaping reserved characters: `"space\-efficient"` . 
`:` `~` `()` `""` `+` `-` `=` `&` `|` `[]` `{}` `*` `?` `\` `/` are reserved characters for search syntax. +### Scoring -`OR` is the default semantic among multiple terms if user does not specify in search syntax. Infinity offers `BM25` scoring and block-max `WAND` for dynamic pruning to accelerate the multiple terms search processing. There are two approaches to bypass `BM25` scoring: +Infinity offers `BM25` scoring and block-max `WAND` for dynamic pruning to accelerate multi-term searches. To *not* use `BM25` scoring, do either of the following: -- Using `keyword` analyzer when creating index, then `BM25` will not be used and it will return the score based on whether keywords are hit. -- Specifying `similarity=boolean` during searching. Then the scoring is decided by the number of keywords hits. +- Set `"analyzer"` to `"keyword"` when creating an index (to select the keyword analyzer). + *The returned score will then be based on whether keywords are matched.* +- Add `{"similarity": "boolean"}` as a search option. + *The scoring will then depend on the number of matched keywords.* ## Dense vector search @@ -149,7 +197,7 @@ Infinity offers three types of rerankers for fusion: ## Conditional filters -Conditional filters in Infinity must work through an index to facilitate search. There are two types of indexes in Infinity that support conditional filters: +Conditional filters in Infinity must work through an index to facilitate search. The following two types of indexes in Infinity support conditional filters: - **Secondary index**: Built on numeric or string columns. This index does not apply any tokenization to a string column when using conditional filters. - **Full-text index**: Built on full-text columns. This index applies tokenization to the full-text column but does not trigger any relevance scoring procedure. 
diff --git a/docs/guides/set_up_cluster.md b/docs/guides/set_up_cluster.md index b6a8371cf9..abfeb8a1a9 100644 --- a/docs/guides/set_up_cluster.md +++ b/docs/guides/set_up_cluster.md @@ -6,6 +6,8 @@ slug: /set_up_cluster Architecture overview and user guide for Infinity cluster. +--- + ## Overview An Infinity cluster consists of one leader node, up to four follower nodes, and several learner nodes: diff --git a/docs/references/benchmark.md b/docs/references/benchmark.md index f056a974b7..b66c36bd76 100644 --- a/docs/references/benchmark.md +++ b/docs/references/benchmark.md @@ -1,8 +1,9 @@ --- -sidebar_position: 1 +sidebar_position: 3 slug: /benchmark --- # Benchmark + This document compares the following key specifications of Elasticsearch, Qdrant, Quickwit and Infinity: - Time to insert & build index diff --git a/docs/references/configurations.mdx b/docs/references/configurations.mdx index fdfd17f841..89de586e10 100644 --- a/docs/references/configurations.mdx +++ b/docs/references/configurations.mdx @@ -1,5 +1,5 @@ --- -sidebar_position: 5 +sidebar_position: 0 slug: /configurations --- @@ -9,6 +9,8 @@ import TabItem from '@theme/TabItem'; How to set and load configuration file when starting Infinity. +--- + This document provides instructions for loading configuration file for Infinity and descriptions of each configuration entry. 
diff --git a/docs/references/faq.md b/docs/references/faq.md index db612ee646..4fff1765a7 100644 --- a/docs/references/faq.md +++ b/docs/references/faq.md @@ -1,5 +1,5 @@ --- -sidebar_position: 2 +sidebar_position: 4 slug: /FAQ --- diff --git a/docs/references/http_api_reference.mdx b/docs/references/http_api_reference.mdx index 4af4ef303d..fab96783bd 100644 --- a/docs/references/http_api_reference.mdx +++ b/docs/references/http_api_reference.mdx @@ -1,5 +1,5 @@ --- -sidebar_position: 3 +sidebar_position: 1 slug: /http_api_reference --- diff --git a/docs/references/pysdk_api_reference.md b/docs/references/pysdk_api_reference.md index 884f5a1121..6a3629cc6a 100644 --- a/docs/references/pysdk_api_reference.md +++ b/docs/references/pysdk_api_reference.md @@ -1,5 +1,5 @@ --- -sidebar_position: 4 +sidebar_position: 2 slug: /pysdk_api_reference --- # Python API Reference @@ -1571,7 +1571,7 @@ table_object.delete("c1 >= 70 and c1 <= 90") --- -### update data +### update ```python table_object.update(cond, data) diff --git a/src/storage/invertedindex/search/phrase_doc_iterator.cpp b/src/storage/invertedindex/search/phrase_doc_iterator.cpp index d1eed06d26..dc131afc18 100644 --- a/src/storage/invertedindex/search/phrase_doc_iterator.cpp +++ b/src/storage/invertedindex/search/phrase_doc_iterator.cpp @@ -188,7 +188,7 @@ bool PhraseDocIterator::GetSloppyPhraseMatchData() { term_pos_i : term i's current position in document phrase_pos_i: term_pos_i - pos_i - For a solution (term_pos_0, term_pos_1, ..., term_pos_n), it's acceptable iff: + For a solution (term_pos_0, term_pos_1, ..., term_pos_n), it's acceptable if and only if: for any i, j (0<=i<=n, 0<=j<=n), |phrase_pos_i - phrase_pos_j| <= slop For an acceptable solution, its matchLength is: