From 52f38ca3c2ad4584dba71e832d2fa875f220df79 Mon Sep 17 00:00:00 2001
From: writinwaters <93570324+writinwaters@users.noreply.github.com>
Date: Mon, 9 Dec 2024 16:13:58 +0800
Subject: [PATCH] Updated full-text search scoring and syntax (#2343)

### What problem does this PR solve?

### Type of change

- [x] Documentation Update
---
 docs/getstarted/build_from_source.mdx         |  2 +
 docs/getstarted/deploy_infinity_server.mdx    |  2 +
 docs/guides/search_guide.md                   | 82 +++++++++++++++----
 docs/guides/set_up_cluster.md                 |  2 +
 docs/references/benchmark.md                  |  3 +-
 docs/references/configurations.mdx            |  4 +-
 docs/references/faq.md                        |  2 +-
 docs/references/http_api_reference.mdx        |  2 +-
 docs/references/pysdk_api_reference.md        |  4 +-
 .../search/phrase_doc_iterator.cpp            |  2 +-
 10 files changed, 81 insertions(+), 24 deletions(-)

diff --git a/docs/getstarted/build_from_source.mdx b/docs/getstarted/build_from_source.mdx
index 39c1ede016..a22ba62893 100644
--- a/docs/getstarted/build_from_source.mdx
+++ b/docs/getstarted/build_from_source.mdx
@@ -9,6 +9,8 @@ import TabItem from '@theme/TabItem';
 
 Build Infinity from source, build and run unit/functional tests.
 
+---
+
 This document provides instructions for building Infinity from source, as well as building and running unit and functional tests. 
 
 :::tip NOTE
diff --git a/docs/getstarted/deploy_infinity_server.mdx b/docs/getstarted/deploy_infinity_server.mdx
index 3f09210ccd..5a36c8493e 100644
--- a/docs/getstarted/deploy_infinity_server.mdx
+++ b/docs/getstarted/deploy_infinity_server.mdx
@@ -9,6 +9,8 @@ import TabItem from '@theme/TabItem';
 
 Three ways to deploy Infinity.
 
+---
+
 This document provides guidance on deploying the Infinity database. In general, you can deploy Infinity in the following three ways: 
 
 - [Import Infinity as a Python module](#import-infinity-as-a-python-module): To run Infinity locally as a Python module.
diff --git a/docs/guides/search_guide.md b/docs/guides/search_guide.md
index 2a35e0c6ea..9c3924d5f7 100644
--- a/docs/guides/search_guide.md
+++ b/docs/guides/search_guide.md
@@ -6,6 +6,8 @@ slug: /search_guide
 
 Full-text, vector, sparse vector, tensor, hybrid search.
 
+---
+
 ## Overview
 
 This document offers guidance on conducting a search within Infinity.
@@ -71,32 +73,78 @@ Both RAG tokenization and fine-grained RAG tokenization are used in RAGFlow to e
 
 #### IK analyzer
 
-The IK analyzer is a bilingual tokenizer that supports Chinese (simplified and traditional) and English. It is a C++ adaptation of the [IK Analyzer](https://github.com/infinilabs/analysis-ik), widely used as a tokenizer by Chinese Elasticsearch users.
-
-Use `"ik"` to select this analyzer, which works the same as the `ik_smart` argument in the [IK Analyzer](https://github.com/infinilabs/analysis-ik), or `"ik-fine"` for fine-grained mode, which works the same as the `ik_max_word` argument in the [IK Analyzer](https://github.com/infinilabs/analysis-ik).
+The IK analyzer is a bilingual tokenizer that supports Chinese (simplified and traditional) and English. It is a C++ adaptation of the [IK Analyzer](https://github.com/infinilabs/analysis-ik), which is widely used as a tokenizer by Chinese Elasticsearch users.
 
+Use `"ik"` to select this analyzer, which is equivalent to the `ik_smart` option in the [IK Analyzer](https://github.com/infinilabs/analysis-ik), or `"ik-fine"` for fine-grained mode, which is equivalent to the `ik_max_word` option in the [IK Analyzer](https://github.com/infinilabs/analysis-ik).
 
 #### Keyword analyzer
 
 The keyword analyzer is a "noop" analyzer used for columns containing keywords only, where traditional scoring methods like `BM25` do not apply. It scores `0` or `1`, depending on whether any keywords are matched.
 
-### Search and ranking
+Use `"keyword"` to select this analyzer.
+
+### Search and ranking syntax
+
+Infinity supports the following syntax for full-text search expressions:
+
+- Single term
+- AND multiple terms
+- OR multiple terms
+- Phrase search
+- CARAT operator
+- Sloppy phrase search
+- Field-specific search
+- Escape character
+
+#### Single term
+
+Example: `"blooms"`
+
+#### AND multiple terms
+
+- `"space AND efficient"`
+- `"space && efficient"`
+- `"space + efficient"`
+
+#### OR multiple terms
+
+- `"Bloom OR filter"`
+- `"Bloom || filter"`
+- `"Bloom filter"`
+
+:::tip NOTE
+`OR` is the default semantic in a multi-term full-text search unless explicitly specified otherwise.
+:::
+
+#### Phrase search
+
+- `"Bloom filter"`
+- `'Bloom filter'`
+
+#### CARAT operator
+
+Use `^` to boost the importance of a specific term. For example: `quick^2 brown` boosts the importance of `quick` by a factor of 2, making it twice as important as `brown`.
+
+#### Sloppy phrase search
+
+Example: `'"harmful chemical"~10'`
+
+#### Field-specific search
+
+Example: `"title:(quick OR brown) AND body:foobar"`
+
+#### Escape character
 
-Infinity offers following syntax for full-text search:
+Use `\` to escape reserved characters like `:` `~` `(` `)` `"` `+` `-` `=` `&` `|` `[` `]` `{` `}` `*` `?` `\` `/`. For example: `"space\-efficient"`.
 
-- Single term: `"blooms"`
-- AND multiple terms: `"space AND efficient"`, `"space && efficient"` or `"space + efficient"`
-- OR multiple terms: `"Bloom OR filter"`, `"Bloom || filter"` or just `"Bloom filter"` .
-- Phrase search: `"Bloom filter" or 'Bloom filter'`
-- CARAT operator: `^`: Used to boost the importance of a term, e.g., `quick^2 brown` boosts the importance of `quick` by a factor of 2, making it twice as important as `brown`.
-- Sloppy phrase search: `'"harmful chemical"~10'`
-- Field-specific search: `"title:(quick OR brown) AND body:foobar"`
-- Escaping reserved characters: `"space\-efficient"` . `:` `~` `()` `""` `+` `-` `=` `&` `|` `[]` `{}` `*` `?` `\` `/` are reserved characters for search syntax.
+### Scoring
 
-`OR`  is the default semantic among multiple terms if user does not specify in search syntax. Infinity offers `BM25` scoring and block-max `WAND` for dynamic pruning to accelerate the multiple terms search processing. There are two approaches to bypass `BM25` scoring:
+Infinity offers `BM25` scoring and block-max `WAND` for dynamic pruning to accelerate multi-term searches. To *not* use `BM25` scoring, do either of the following:
 
-- Using `keyword` analyzer when creating index, then `BM25` will not be used and it will return the score based on whether keywords are hit.
-- Specifying `similarity=boolean` during searching. Then the scoring is decided by the number of keywords hits.
+- Set `"analyzer"` to `"keyword"` when creating an index (to select the keyword analyzer).  
+  *The returned score will then be based on whether keywords are matched.*
+- Add `{"similarity": "boolean"}` as a search option.  
+  *The scoring will then depend on the number of matched keywords.*
 
 ## Dense vector search
 
@@ -149,7 +197,7 @@ Infinity offers three types of rerankers for fusion:
 
 ## Conditional filters
 
-Conditional filters in Infinity must work through an index to facilitate search. There are two types of indexes in Infinity that support conditional filters:
+Conditional filters in Infinity must work through an index to facilitate search. The following two types of indexes in Infinity support conditional filters:
 
 - **Secondary index**: Built on numeric or string columns. This index does not apply any tokenization to a string column when using conditional filters.
 - **Full-text index**: Built on full-text columns. This index applies tokenization to the full-text column but does not trigger any relevance scoring procedure.
diff --git a/docs/guides/set_up_cluster.md b/docs/guides/set_up_cluster.md
index b6a8371cf9..abfeb8a1a9 100644
--- a/docs/guides/set_up_cluster.md
+++ b/docs/guides/set_up_cluster.md
@@ -6,6 +6,8 @@ slug: /set_up_cluster
 
 Architecture overview and user guide for Infinity cluster.
 
+---
+
 ## Overview
 
 An Infinity cluster consists of one leader node, up to four follower nodes, and several learner nodes:
diff --git a/docs/references/benchmark.md b/docs/references/benchmark.md
index f056a974b7..b66c36bd76 100644
--- a/docs/references/benchmark.md
+++ b/docs/references/benchmark.md
@@ -1,8 +1,9 @@
 ---
-sidebar_position: 1
+sidebar_position: 3
 slug: /benchmark
 ---
 # Benchmark
+
 This document compares the following key specifications of Elasticsearch, Qdrant, Quickwit and Infinity:
 
 - Time to insert & build index
diff --git a/docs/references/configurations.mdx b/docs/references/configurations.mdx
index fdfd17f841..89de586e10 100644
--- a/docs/references/configurations.mdx
+++ b/docs/references/configurations.mdx
@@ -1,5 +1,5 @@
 ---
-sidebar_position: 5
+sidebar_position: 0
 slug: /configurations
 ---
 
@@ -9,6 +9,8 @@ import TabItem from '@theme/TabItem';
 
 How to set and load configuration file when starting Infinity.
 
+---
+
 This document provides instructions for loading configuration file for Infinity and descriptions of each configuration entry.
 
 
diff --git a/docs/references/faq.md b/docs/references/faq.md
index db612ee646..4fff1765a7 100644
--- a/docs/references/faq.md
+++ b/docs/references/faq.md
@@ -1,5 +1,5 @@
 ---
-sidebar_position: 2
+sidebar_position: 4
 slug: /FAQ
 ---
 
diff --git a/docs/references/http_api_reference.mdx b/docs/references/http_api_reference.mdx
index 4af4ef303d..fab96783bd 100644
--- a/docs/references/http_api_reference.mdx
+++ b/docs/references/http_api_reference.mdx
@@ -1,5 +1,5 @@
 ---
-sidebar_position: 3
+sidebar_position: 1
 slug: /http_api_reference
 ---
 
diff --git a/docs/references/pysdk_api_reference.md b/docs/references/pysdk_api_reference.md
index 884f5a1121..6a3629cc6a 100644
--- a/docs/references/pysdk_api_reference.md
+++ b/docs/references/pysdk_api_reference.md
@@ -1,5 +1,5 @@
 ---
-sidebar_position: 4
+sidebar_position: 2
 slug: /pysdk_api_reference
 ---
 # Python API Reference
@@ -1571,7 +1571,7 @@ table_object.delete("c1 >= 70 and c1 <= 90")
 
 ---
 
-### update data
+### update
 
 ```python
 table_object.update(cond, data)
diff --git a/src/storage/invertedindex/search/phrase_doc_iterator.cpp b/src/storage/invertedindex/search/phrase_doc_iterator.cpp
index d1eed06d26..dc131afc18 100644
--- a/src/storage/invertedindex/search/phrase_doc_iterator.cpp
+++ b/src/storage/invertedindex/search/phrase_doc_iterator.cpp
@@ -188,7 +188,7 @@ bool PhraseDocIterator::GetSloppyPhraseMatchData() {
     term_pos_i : term i's current position in document
     phrase_pos_i: term_pos_i - pos_i
 
-    For a solution (term_pos_0, term_pos_1, ..., term_pos_n), it's acceptable iff:
+    For a solution (term_pos_0, term_pos_1, ..., term_pos_n), it's acceptable if and only if:
     for any i, j (0<=i<=n, 0<=j<=n), |phrase_pos_i - phrase_pos_j| <= slop
 
     For an acceptable solution, its matchLength is: