# [Improvement] Fuzzy matching #137
```diff
@@ -316,6 +316,7 @@
     # highlight-start
     index_filterable=True,
     index_searchable=True,
+    tokenization="word",
     # highlight-end
 ),
 Property(
@@ -324,6 +325,7 @@
     # highlight-start
     index_filterable=True,
     index_searchable=True,
+    tokenization="field",
     # highlight-end
 ),
 Property(
```

> **Review comment** (on `tokenization="field"`): I think this should be:
>
> ```python
> from weaviate.classes.config import Tokenization
>
> Property(
>     # ...
>     tokenization=Tokenization.FIELD
> )
> ```
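The diff above sets `tokenization="word"` on one property and `"field"` on another. Roughly, `word` lowercases the text and splits it on non-alphanumeric characters, while `field` trims whitespace and keeps the whole value as a single token. The sketch below illustrates that documented behavior in plain Python; it is not Weaviate's actual tokenizer, and details such as whether `field` lowercases are per the tokenization reference, not this sketch.

```python
import re

def tokenize_word(text: str) -> list[str]:
    # "word"-style tokenization: lowercase, then split on non-alphanumeric characters
    return [t for t in re.split(r"[^0-9a-zA-Z]+", text.lower()) if t]

def tokenize_field(text: str) -> list[str]:
    # "field"-style tokenization: trim whitespace, keep the entire value as one token
    return [text.strip().lower()]

print(tokenize_word("Hello, (beautiful) world"))   # ['hello', 'beautiful', 'world']
print(tokenize_field("Hello, (beautiful) world"))  # ['hello, (beautiful) world']
```

With `field`, only an exact match on the whole value hits the token, which is why the reviewer's precision note below recommends it for identifier-like properties.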
```diff
@@ -351,6 +351,12 @@ The `gse` tokenizer is not loaded by default to save resources. To use it, set t
 - `"素早い茶色の狐が怠けた犬を飛び越えた"`: `["素早", "素早い", "早い", "茶色", "の", "狐", "が", "怠け", "けた", "犬", "を", "飛び", "飛び越え", "越え", "た", "素早い茶色の狐が怠けた犬を飛び越えた"]`
 - `"すばやいちゃいろのきつねがなまけたいぬをとびこえた"`: `["すばや", "すばやい", "やい", "いち", "ちゃ", "ちゃい", "ちゃいろ", "いろ", "のき", "きつ", "きつね", "つね", "ねが", "がな", "なま", "なまけ", "まけ", "けた", "けたい", "たい", "いぬ", "を", "とび", "とびこえ", "こえ", "た", "すばやいちゃいろのきつねがなまけたいぬをとびこえた"]`

+:::note `trigram` for fuzzy matching
+
+While originally designed for Asian languages, `trigram` tokenization is also highly effective for fuzzy matching and typo tolerance in other languages.
+
+:::
+
 </details>

 <details>
```
```diff
@@ -405,6 +411,42 @@ You can limit the combined number of `gse` and `Kagome` tokenizers running at th

 </details>

+<details>
+<summary>Fuzzy matching with `trigram` tokenization</summary>
+
+The `trigram` tokenization method provides fuzzy matching capabilities by breaking text into overlapping 3-character sequences. This enables BM25 searches to find matches even with spelling errors or variations.
+
+**Use cases for trigram fuzzy matching:**
+
+- **Typo tolerance**: Find matches despite spelling errors (e.g., "Reliace" matches "Reliance")
+- **Name reconciliation**: Match entity names with variations across datasets
+- **Search-as-you-type**: Build autocomplete functionality
+- **Partial matching**: Find objects with partial string matches
+
+**How it works:**
+
+When text is tokenized with `trigram`, it is broken into all possible 3-character sequences:
+
+- `"hello"` → `["hel", "ell", "llo"]`
+- `"world"` → `["wor", "orl", "rld"]`
+
+Similar strings share many trigrams, enabling fuzzy matching:
+
+- `"Morgan Stanley"` and `"Stanley Morgn"` share trigrams like `"sta", "tan", "anl", "nle", "ley"`
```
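The breakdown described in this hunk can be reproduced in a few lines of Python. This is an illustrative sketch only; Weaviate's trigram tokenizer may additionally strip punctuation or whitespace.

```python
def trigrams(text: str) -> list[str]:
    """Break text into overlapping 3-character sequences."""
    text = text.lower()
    return [text[i:i + 3] for i in range(len(text) - 2)]

print(trigrams("hello"))  # ['hel', 'ell', 'llo']
print(trigrams("world"))  # ['wor', 'orl', 'rld']

# Similar strings share many trigrams, which is what makes fuzzy matching work:
shared = set(trigrams("Morgan Stanley")) & set(trigrams("Stanley Morgn"))
print(sorted(shared))
```

Strings that differ by a typo still overlap on most of their trigrams, so a BM25 search over trigram tokens scores the near-match highly.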
```diff
+**Performance considerations:**
+
+- Creates larger inverted indexes due to more tokens
+- May impact query performance for large datasets
```

> **Review comment:** Filtering behavior will change significantly, as text filtering will be done based on trigram-tokenized text, instead of whole words.

```diff
+:::tip
+
+Use trigram tokenization selectively on fields where fuzzy matching is preferred. Keep exact-match fields with `word` or `field` tokenization for precision.
+
+:::
+
+</details>
+
+---
+
 ### Inverted index {#inverted-index}
```
```diff
@@ -1,5 +1,6 @@
 ---
 title: Keyword search
+description: Weaviate BM25 keyword search documentation covering basic queries, search operators, scoring, property targeting, weighting, tokenization, filtering and fuzzy matching.
 sidebar_position: 40
 image: og/docs/howto.jpg
 # tags: ['how to', 'similarity search']
@@ -8,13 +9,13 @@ image: og/docs/howto.jpg
 import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';
 import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock';
-import PyCode from '!!raw-loader!/_includes/code/howto/search.bm25.py';
-import PyCodeV3 from '!!raw-loader!/_includes/code/howto/search.bm25-v3.py';
-import TSCode from '!!raw-loader!/_includes/code/howto/search.bm25.ts';
-import TSCodeLegacy from '!!raw-loader!/_includes/code/howto/search.bm25-v2.ts';
-import GoCode from '!!raw-loader!/_includes/code/howto/go/docs/mainpkg/search-bm25_test.go';
-import JavaCode from '!!raw-loader!/_includes/code/howto/java/src/test/java/io/weaviate/docs/search/KeywordSearchTest.java';
-import GQLCode from '!!raw-loader!/_includes/code/howto/search.bm25.gql.py';
+import PyCode from '!!raw-loader!/\_includes/code/howto/search.bm25.py';
+import PyCodeV3 from '!!raw-loader!/\_includes/code/howto/search.bm25-v3.py';
+import TSCode from '!!raw-loader!/\_includes/code/howto/search.bm25.ts';
+import TSCodeLegacy from '!!raw-loader!/\_includes/code/howto/search.bm25-v2.ts';
+import GoCode from '!!raw-loader!/\_includes/code/howto/go/docs/mainpkg/search-bm25_test.go';
+import JavaCode from '!!raw-loader!/\_includes/code/howto/java/src/test/java/io/weaviate/docs/search/KeywordSearchTest.java';
+import GQLCode from '!!raw-loader!/\_includes/code/howto/search.bm25.gql.py';

 `Keyword` search, also called "BM25 (Best match 25)" or "sparse vector" search, returns objects that have the highest BM25F scores.
```
|
@@ -239,9 +240,6 @@ The response is like this: | |
|
||
## Search on selected properties only | ||
|
||
:::info Added in `v1.19.0` | ||
::: | ||
|
||
A keyword search can be directed to only search a subset of object properties. In this example, the BM25 search only uses the `question` property to produce the BM25F score. | ||
|
||
<Tabs groupId="languages"> | ||
|
@@ -320,13 +318,11 @@ The response is like this: | |
endMarker="# END Expected BM25WithProperties results" | ||
language="json" | ||
/> | ||
|
||
</details> | ||
|
||
## Use weights to boost properties | ||
|
||
:::info Added in `v1.19.0` | ||
::: | ||
|
||
You can weight how much each property affects the overall BM25F score. This example boosts the `question` property by a factor of 2 while the `answer` property remains static. | ||
|
||
<Tabs groupId="languages"> | ||
|
@@ -384,7 +380,6 @@ You can weight how much each property affects the overall BM25F score. This exam | |
/> | ||
</TabItem> | ||
|
||
|
||
<TabItem value="graphql" label="GraphQL"> | ||
<FilteredTextBlock | ||
text={PyCodeV3} | ||
|
@@ -409,18 +404,16 @@ The response is like this: | |
|
||
</details> | ||
|
||
|
||
## Set tokenization | ||
|
||
The BM25 query string is [tokenized](../config-refs/collections.mdx#tokenization) before it is used to search for objects using the inverted index. | ||
|
||
You must specify the tokenization method in the collection definition for [each property](../manage-collections/vector-config.mdx#property-level-settings). | ||
|
||
import TknPyCode from '!!raw-loader!/_includes/code/howto/manage-data.collections.py'; | ||
import TknPyCodeV3 from '!!raw-loader!/_includes/code/howto/manage-data.collections-v3.py'; | ||
import TknTsCode from '!!raw-loader!/_includes/code/howto/manage-data.collections.ts'; | ||
import TknTsCodeLegacy from '!!raw-loader!/_includes/code/howto/manage-data.collections-v2.ts'; | ||
|
||
import TknPyCode from '!!raw-loader!/\_includes/code/howto/manage-data.collections.py'; | ||
import TknPyCodeV3 from '!!raw-loader!/\_includes/code/howto/manage-data.collections-v3.py'; | ||
import TknTsCode from '!!raw-loader!/\_includes/code/howto/manage-data.collections.ts'; | ||
import TknTsCodeLegacy from '!!raw-loader!/\_includes/code/howto/manage-data.collections-v2.ts'; | ||
|
||
<Tabs groupId="languages"> | ||
<TabItem value="py" label="Python Client v4"> | ||
|
@@ -469,6 +462,12 @@ import TknTsCodeLegacy from '!!raw-loader!/_includes/code/howto/manage-data.coll | |
</TabItem> | ||
</Tabs> | ||
|
||
:::tip Tokenization and fuzzy matching | ||
|
||
For fuzzy matching and typo tolerance, use `trigram` tokenization. See the [fuzzy matching section](#fuzzy-matching) above for details. | ||
|
||
::: | ||
|
||
## `limit` & `offset` | ||
|
||
Use `limit` to set a fixed maximum number of objects to return. | ||
|
```diff
@@ -747,18 +746,56 @@ The response is like this:

 ### Tokenization

-import TokenizationNote from '/_includes/tokenization.mdx'
+import TokenizationNote from '/\_includes/tokenization.mdx'

 <TokenizationNote />

-## Related pages
+## Fuzzy matching
+
+You can enable fuzzy matching and typo tolerance in BM25 searches by using [`trigram` tokenization](../config-refs/collections.mdx#tokenization). This technique breaks text into overlapping 3-character sequences, allowing BM25 to find matches even when there are spelling errors or variations.
+
+This enables matching between similar but not identical strings because they share trigrams:
+
+- `"Morgn"` and `"Morgan"` share the trigrams `"mor"` and `"org"`
+
+Set the tokenization method to `trigram` at the property level when creating your collection:
+
+<Tabs groupId="languages">
+  <TabItem value="py" label="Python Client v4">
+    <FilteredTextBlock
+      text={TknPyCode}
+      startMarker="# START TrigramTokenization"
+      endMarker="# END TrigramTokenization"
+      language="py"
+    />
+  </TabItem>
+  <TabItem value="js" label="JS/TS Client v3">
+    <FilteredTextBlock
+      text={TknTsCode}
+      startMarker="// START TrigramTokenization"
+      endMarker="// END TrigramTokenization"
+      language="ts"
+    />
+  </TabItem>
+</Tabs>
+
+:::tip Best practices
+
+- Use trigram tokenization selectively on fields that need fuzzy matching.
+- Keep exact-match fields with `word` or `field` tokenization for precision.
+
+:::
```

> **Review comment:** Note trigram tokenization will impact filtering behavior, as token comparisons will be based on trigrams, rather than words.

```diff
+
+## Further resources

-- [Connect to Weaviate](/weaviate/connections/index.mdx)
+- [Connect to Weaviate](../connections/index.mdx)
 - [API References: Search operators # BM25](../api/graphql/search-operators.md#bm25)
 - [Reference: Tokenization options](../config-refs/collections.mdx#tokenization)
 - [Weaviate Academy: Tokenization](../../academy/py/tokenization/index.md)

 ## Questions and feedback

-import DocsFeedback from '/_includes/docs-feedback.mdx';
+import DocsFeedback from '/\_includes/docs-feedback.mdx';

 <DocsFeedback/>
```
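The typo tolerance that the new "Fuzzy matching" section describes can be approximated outside Weaviate by ranking candidates on trigram overlap (Jaccard similarity). This is a sketch of the principle only; BM25 over trigram tokens weighs matches differently (document frequency, length normalization), so scores will not be identical.

```python
def trigrams(text: str) -> set[str]:
    """Collect the overlapping 3-character sequences of a string."""
    text = text.lower()
    return {text[i:i + 3] for i in range(len(text) - 2)}

def trigram_similarity(a: str, b: str) -> float:
    """Jaccard similarity over trigram sets (0.0 = disjoint, 1.0 = identical)."""
    ta, tb = trigrams(a), trigrams(b)
    return len(ta & tb) / len(ta | tb) if ta | tb else 0.0

candidates = ["Reliance", "Alliance", "Radiance"]
query = "Reliace"  # typo for "Reliance"

# The misspelling still shares most of its trigrams with the intended word:
best = max(candidates, key=lambda c: trigram_similarity(query, c))
print(best)  # Reliance
```

The misspelled query shares three trigrams (`rel`, `eli`, `lia`) with "Reliance" but at most one with the other candidates, so the intended word ranks first despite the typo.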
> **Review comment** (continuing the `Tokenization` suggestion above): I think this should be the enum (unless it accepts raw strings also? I guess maybe it does, since it's just an enum). The function signature is this:
>
> ```python
> tokenization: Optional[Tokenization] = Field(default=None)
> ```
>
> so even if the string works I wonder if the enum is preferable.
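Whether a raw string is accepted comes down to how the enum is defined and validated. The stand-in below is hypothetical (it is not the actual client source, and `Tokenization`'s real definition may differ): if the enum mixes in `str`, raw strings compare equal to members and convert cleanly, while the enum itself remains the more typo-proof choice.

```python
from enum import Enum

class Tokenization(str, Enum):
    """Hypothetical stand-in for weaviate.classes.config.Tokenization."""
    WORD = "word"
    FIELD = "field"
    TRIGRAM = "trigram"

# A str-valued enum member compares equal to its raw string value...
assert Tokenization.FIELD == "field"

# ...and the raw string converts to the member, which is how validation
# layers typically coerce string input into the enum:
assert Tokenization("field") is Tokenization.FIELD

# A typo in a raw string only fails at runtime, whereas a typo in an enum
# attribute fails immediately (AttributeError) and is caught by type checkers:
try:
    Tokenization("feild")
except ValueError as err:
    print(f"rejected: {err}")
```

Under this assumption both spellings work, which supports preferring the enum for editor support and early error detection rather than for correctness.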