diff --git a/_includes/code/config-refs/reference.collections.py b/_includes/code/config-refs/reference.collections.py
index 7228c112..04c1539f 100644
--- a/_includes/code/config-refs/reference.collections.py
+++ b/_includes/code/config-refs/reference.collections.py
@@ -316,6 +316,7 @@
# highlight-start
index_filterable=True,
index_searchable=True,
+ tokenization="word",
# highlight-end
),
Property(
@@ -324,6 +325,7 @@
# highlight-start
index_filterable=True,
index_searchable=True,
+ tokenization="field",
# highlight-end
),
Property(
diff --git a/_includes/code/howto/manage-data.collections.py b/_includes/code/howto/manage-data.collections.py
index 46d439dd..a8dca55f 100644
--- a/_includes/code/howto/manage-data.collections.py
+++ b/_includes/code/howto/manage-data.collections.py
@@ -564,6 +564,31 @@
)
# END PropModuleSettings
+# ====================================
+# ======= TRIGRAM TOKENIZATION =======
+# ====================================
+
+# Clean slate
+client.collections.delete("Article")
+
+# START TrigramTokenization
+from weaviate.classes.config import Configure, Property, DataType, Tokenization
+
+client.collections.create(
+ "Article",
+ vector_config=Configure.Vectors.text2vec_cohere(),
+ properties=[
+ Property(
+ name="title",
+ data_type=DataType.TEXT,
+ # highlight-start
+ tokenization=Tokenization.TRIGRAM, # Use "trigram" tokenization
+ # highlight-end
+ ),
+ ],
+)
+# END TrigramTokenization
+
# ====================================
# ===== MODULE SETTINGS PROPERTY =====
# ====================================
diff --git a/_includes/code/howto/manage-data.collections.ts b/_includes/code/howto/manage-data.collections.ts
index 9c383e9a..9d81b448 100644
--- a/_includes/code/howto/manage-data.collections.ts
+++ b/_includes/code/howto/manage-data.collections.ts
@@ -347,6 +347,36 @@ const newCollection = await client.collections.create({
})
// END PropModuleSettings
+// Delete the class to recreate it
+await client.collections.delete(collectionName)
+
+// ====================================
+// ======= TRIGRAM TOKENIZATION =======
+// ====================================
+
+/*
+// START TrigramTokenization
+import { vectors, dataType, tokenization } from 'weaviate-client';
+
+// END TrigramTokenization
+*/
+{
+// START TrigramTokenization
+const newCollection = await client.collections.create({
+ name: 'Article',
+ vectorizers: vectors.text2VecHuggingFace(),
+ properties: [
+ {
+ name: 'title',
+ dataType: dataType.TEXT,
+ // highlight-start
+ tokenization: tokenization.TRIGRAM // Use "trigram" tokenization
+ // highlight-end
+ },
+ ],
+})
+// END TrigramTokenization
+}
// Test vectorizeCollectionName
result = client.collections.use(collectionName).config.get()
diff --git a/docs/weaviate/concepts/indexing/inverted-index.md b/docs/weaviate/concepts/indexing/inverted-index.md
index 8c86bbd1..35197c10 100644
--- a/docs/weaviate/concepts/indexing/inverted-index.md
+++ b/docs/weaviate/concepts/indexing/inverted-index.md
@@ -21,6 +21,14 @@ Understanding Weaviate's indexing architecture is crucial for optimizing perform
This architecture provides flexibility and performance optimization but also means that enabling multiple index types increases storage requirements and indexing overhead.
+For `text` properties specifically, the indexing process follows these steps:
+
+1. **Tokenization**: The text is first tokenized according to the [tokenization method](../../config-refs/collections.mdx#tokenization) configured for that property
+2. **Token processing**: Tokens may be further processed (e.g., lowercased, stopwords handled)
+3. **Index entry creation**: Each processed token gets an entry in the inverted index, pointing to the object containing it
+
+This process ensures that your text searches and filters can quickly locate relevant objects based on the tokens they contain.
+
Performance improvements added in Oct 2024
@@ -35,7 +43,7 @@ As always, we recommend upgrading to the latest version of Weaviate to benefit f
-### BlockMax WAND algorithm
+## BlockMax WAND algorithm
:::info Added in `v1.30`
:::
@@ -50,7 +58,7 @@ Due to the nature of the BlockMax WAND algorithm, the scoring of BM25 and hybrid
:::
-### Configure inverted indexes
+## Configure inverted indexes
There are three inverted index types in Weaviate:
diff --git a/docs/weaviate/config-refs/collections.mdx b/docs/weaviate/config-refs/collections.mdx
index e244a269..9476a64d 100644
--- a/docs/weaviate/config-refs/collections.mdx
+++ b/docs/weaviate/config-refs/collections.mdx
@@ -351,6 +351,12 @@ The `gse` tokenizer is not loaded by default to save resources. To use it, set t
- `"素早い茶色の狐が怠けた犬を飛び越えた"`: `["素早", "素早い", "早い", "茶色", "の", "狐", "が", "怠け", "けた", "犬", "を", "飛び", "飛び越え", "越え", "た", "素早い茶色の狐が怠けた犬を飛び越えた"]`
- `"すばやいちゃいろのきつねがなまけたいぬをとびこえた"`: `["すばや", "すばやい", "やい", "いち", "ちゃ", "ちゃい", "ちゃいろ", "いろ", "のき", "きつ", "きつね", "つね", "ねが", "がな", "なま", "なまけ", "まけ", "けた", "けたい", "たい", "いぬ", "を", "とび", "とびこえ", "こえ", "た", "すばやいちゃいろのきつねがなまけたいぬをとびこえた"]`
+:::note `trigram` for fuzzy matching
+
+While originally designed for Asian languages, `trigram` tokenization is also highly effective for fuzzy matching and typo tolerance in other languages.
+
+:::
+
@@ -405,6 +411,42 @@ You can limit the combined number of `gse` and `Kagome` tokenizers running at th
+
+ Fuzzy matching with `trigram` tokenization
+
+The `trigram` tokenization method provides fuzzy matching capabilities by breaking text into overlapping 3-character sequences. This enables BM25 searches to find matches even with spelling errors or variations.
+
+**Use cases for trigram fuzzy matching:**
+
+- **Typo tolerance**: Find matches despite spelling errors (e.g., "Reliace" matches "Reliance")
+- **Name reconciliation**: Match entity names with variations across datasets
+- **Search-as-you-type**: Build autocomplete functionality
+- **Partial matching**: Find objects with partial string matches
+
+**How it works:**
+
+When text is tokenized with `trigram`, it's broken into all possible 3-character sequences:
+
+- `"hello"` → `["hel", "ell", "llo"]`
+- `"world"` → `["wor", "orl", "rld"]`
+
+Similar strings share many trigrams, enabling fuzzy matching:
+
+- `"Morgan Stanley"` and `"Stanley Morgn"` share trigrams like `"sta", "tan", "anl", "nle", "ley"`
+
+**Performance considerations:**
+
+- Creates larger inverted indexes due to more tokens
+- May impact query performance for large datasets
+
+:::tip
+
+Use trigram tokenization selectively on fields where fuzzy matching is preferred. Keep exact-match fields with `word` or `field` tokenization for precision.
+
+:::
+
+
+
---
### Inverted index {#inverted-index}
diff --git a/docs/weaviate/config-refs/indexing/inverted-index.mdx b/docs/weaviate/config-refs/indexing/inverted-index.mdx
index 0a52bf76..96bd007a 100644
--- a/docs/weaviate/config-refs/indexing/inverted-index.mdx
+++ b/docs/weaviate/config-refs/indexing/inverted-index.mdx
@@ -170,7 +170,11 @@ As of `v1.18`, stopwords are indexed. Thus stopwords are included in the inverte
Stopwords can now be configured at runtime. You can use the RESTful API to update the list of stopwords after your data has been indexed.
-Note that stopwords are only removed when [tokenization](../collections.mdx#tokenization) is set to `word`.
+:::info
+
+Stopwords are only removed when [tokenization](../collections.mdx#tokenization) is set to `word`.
+
+:::
#### `indexTimestamps`
@@ -196,8 +200,8 @@ Using these features requires more resources. The additional inverted indexes mu
## How Weaviate creates inverted indexes
-Weaviate creates **separate inverted indexes for each property and each index type**. For example, if you have a `title` property that is both searchable and filterable,
-Weaviate will create two separate inverted indexes for that property - one optimized for search operations and another for filtering operations.
+Weaviate creates **separate inverted indexes for each property and each index type**. For example, if you have a `title` property that is both searchable and filterable,
+Weaviate will create two separate inverted indexes for that property - one optimized for search operations and another for filtering operations.
Find out more in [Concepts: Inverted index](../../concepts/indexing/inverted-index.md#how-weaviate-creates-inverted-indexes).
### Adding a property after collection creation
@@ -213,10 +217,35 @@ To avoid this, you can either:
We are working on a re-indexing API to allow you to re-index the data after adding a property. This will be available in a future release.
+## How tokenization affects inverted indexing
+
+For `text` properties, Weaviate first **[tokenizes](../collections.mdx#tokenization)** the text before creating inverted index entries. Tokenization is the process of breaking text into individual tokens (words, phrases, or characters) that can be indexed and searched.
+
+The tokenization method you choose directly impacts:
+
+- What tokens are created from your text
+- How these tokens are stored in the inverted index
+- How search queries and filters match against your data
+
+For example, with the text `"Hello, (beautiful) world"`:
+
+- **`word` tokenization** creates tokens: `["hello", "beautiful", "world"]`
+- **`whitespace` tokenization** creates tokens: `["hello", "(beautiful)", "world"]`
+- **`field` tokenization** creates a single token: `["Hello, (beautiful) world"]`
+
+Each token becomes a separate entry in the inverted index, pointing back to the objects containing that token.
+
+:::tip
+
+Different tokenization methods suit different use cases. Use `word` (default) for general text search, `field` for exact matching, or specialized tokenizers like `gse` or `kagome_ja` for Asian languages. Learn more about [tokenization options](../collections.mdx#tokenization).
+
+:::
+
## Further resources
- [Concepts: Inverted index](../../concepts/indexing/inverted-index.md)
-- [How-to: Configure collections](../../manage-collections/collection-operations.mdx#set-inverted-index-parameters)
+- [How-to: Set inverted index parameters](../../manage-collections/collection-operations.mdx#set-inverted-index-parameters)
+- [Reference: Tokenization options](../collections.mdx#tokenization) - Learn about different tokenization methods and how they affect text indexing
## Questions and feedback
diff --git a/docs/weaviate/search/bm25.md b/docs/weaviate/search/bm25.md
index 462d9f27..1205759b 100644
--- a/docs/weaviate/search/bm25.md
+++ b/docs/weaviate/search/bm25.md
@@ -1,5 +1,6 @@
---
title: Keyword search
+description: Weaviate BM25 keyword search documentation covering basic queries, search operators, scoring, property targeting, weighting, tokenization, filtering and fuzzy matching.
sidebar_position: 40
image: og/docs/howto.jpg
# tags: ['how to', 'similarity search']
@@ -8,13 +9,13 @@ image: og/docs/howto.jpg
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock';
-import PyCode from '!!raw-loader!/_includes/code/howto/search.bm25.py';
-import PyCodeV3 from '!!raw-loader!/_includes/code/howto/search.bm25-v3.py';
-import TSCode from '!!raw-loader!/_includes/code/howto/search.bm25.ts';
-import TSCodeLegacy from '!!raw-loader!/_includes/code/howto/search.bm25-v2.ts';
-import GoCode from '!!raw-loader!/_includes/code/howto/go/docs/mainpkg/search-bm25_test.go';
-import JavaCode from '!!raw-loader!/_includes/code/howto/java/src/test/java/io/weaviate/docs/search/KeywordSearchTest.java';
-import GQLCode from '!!raw-loader!/_includes/code/howto/search.bm25.gql.py';
+import PyCode from '!!raw-loader!/\_includes/code/howto/search.bm25.py';
+import PyCodeV3 from '!!raw-loader!/\_includes/code/howto/search.bm25-v3.py';
+import TSCode from '!!raw-loader!/\_includes/code/howto/search.bm25.ts';
+import TSCodeLegacy from '!!raw-loader!/\_includes/code/howto/search.bm25-v2.ts';
+import GoCode from '!!raw-loader!/\_includes/code/howto/go/docs/mainpkg/search-bm25_test.go';
+import JavaCode from '!!raw-loader!/\_includes/code/howto/java/src/test/java/io/weaviate/docs/search/KeywordSearchTest.java';
+import GQLCode from '!!raw-loader!/\_includes/code/howto/search.bm25.gql.py';
`Keyword` search, also called "BM25 (Best match 25)" or "sparse vector" search, returns objects that have the highest BM25F scores.
@@ -239,9 +240,6 @@ The response is like this:
## Search on selected properties only
-:::info Added in `v1.19.0`
-:::
-
A keyword search can be directed to only search a subset of object properties. In this example, the BM25 search only uses the `question` property to produce the BM25F score.
@@ -320,13 +318,11 @@ The response is like this:
endMarker="# END Expected BM25WithProperties results"
language="json"
/>
+
## Use weights to boost properties
-:::info Added in `v1.19.0`
-:::
-
You can weight how much each property affects the overall BM25F score. This example boosts the `question` property by a factor of 2 while the `answer` property remains static.
@@ -384,7 +380,6 @@ You can weight how much each property affects the overall BM25F score. This exam
/>
-
-
## Set tokenization
The BM25 query string is [tokenized](../config-refs/collections.mdx#tokenization) before it is used to search for objects using the inverted index.
You must specify the tokenization method in the collection definition for [each property](../manage-collections/vector-config.mdx#property-level-settings).
-import TknPyCode from '!!raw-loader!/_includes/code/howto/manage-data.collections.py';
-import TknPyCodeV3 from '!!raw-loader!/_includes/code/howto/manage-data.collections-v3.py';
-import TknTsCode from '!!raw-loader!/_includes/code/howto/manage-data.collections.ts';
-import TknTsCodeLegacy from '!!raw-loader!/_includes/code/howto/manage-data.collections-v2.ts';
-
+import TknPyCode from '!!raw-loader!/\_includes/code/howto/manage-data.collections.py';
+import TknPyCodeV3 from '!!raw-loader!/\_includes/code/howto/manage-data.collections-v3.py';
+import TknTsCode from '!!raw-loader!/\_includes/code/howto/manage-data.collections.ts';
+import TknTsCodeLegacy from '!!raw-loader!/\_includes/code/howto/manage-data.collections-v2.ts';
@@ -469,6 +462,12 @@ import TknTsCodeLegacy from '!!raw-loader!/_includes/code/howto/manage-data.coll
+:::tip Tokenization and fuzzy matching
+
+For fuzzy matching and typo tolerance, use `trigram` tokenization. See the [fuzzy matching section](#fuzzy-matching) below for details.
+
+:::
+
## `limit` & `offset`
Use `limit` to set a fixed maximum number of objects to return.
@@ -747,18 +746,56 @@ The response is like this:
### Tokenization
-import TokenizationNote from '/_includes/tokenization.mdx'
+import TokenizationNote from '/\_includes/tokenization.mdx'
-## Related pages
+## Fuzzy matching
+
+You can enable fuzzy matching and typo tolerance in BM25 searches by using [`trigram` tokenization](../config-refs/collections.mdx#tokenization). This technique breaks text into overlapping 3-character sequences, allowing BM25 to find matches even when there are spelling errors or variations.
+
+This enables matching between similar but not identical strings because they share many trigrams:
+
+- `"Morgn"` and `"Morgan"` share trigrams like `"mor", "org"`
+
+Set the tokenization method to `trigram` at the property level when creating your collection:
+
+
+
+
+
+
+
+
+
+
+:::tip Best practices
+
+- Use trigram tokenization selectively on fields that need fuzzy matching.
+- Keep exact-match fields with `word` or `field` tokenization for precision.
+
+:::
+
+
+## Further resources
-- [Connect to Weaviate](/weaviate/connections/index.mdx)
+- [Connect to Weaviate](../connections/index.mdx)
- [API References: Search operators # BM25](../api/graphql/search-operators.md#bm25)
+- [Reference: Tokenization options](../config-refs/collections.mdx#tokenization)
- [Weaviate Academy: Tokenization](../../academy/py/tokenization/index.md)
## Questions and feedback
-import DocsFeedback from '/_includes/docs-feedback.mdx';
+import DocsFeedback from '/\_includes/docs-feedback.mdx';