From db4f5b6bb2238956f97cdb8301534114463d59a8 Mon Sep 17 00:00:00 2001 From: aditya-balachander Date: Wed, 18 Dec 2024 11:33:46 +0530 Subject: [PATCH] Add warning for exact match approximation in high record volumes --- cumulusci/tasks/bulkdata/select_utils.py | 6 ++++++ docs/data.md | 3 +++ 2 files changed, 9 insertions(+) diff --git a/cumulusci/tasks/bulkdata/select_utils.py b/cumulusci/tasks/bulkdata/select_utils.py index cf8d50f110..b37aa457ad 100644 --- a/cumulusci/tasks/bulkdata/select_utils.py +++ b/cumulusci/tasks/bulkdata/select_utils.py @@ -343,6 +343,12 @@ def annoy_post_process( threshold: T.Union[float, None], ) -> T.Tuple[T.List[dict], list]: """Processes the query results for the similarity selection strategy using Annoy algorithm for large number of records""" + # Add warning when threshold is 0 + if threshold is not None and threshold == 0: + logger.warning( + "Warning: A threshold of 0 may miss exact matches in high volumes. Use a small value like 0.1 for better accuracy." + ) + selected_records = [] insertion_candidates = [] diff --git a/docs/data.md b/docs/data.md index fe9396a4ae..ba61076315 100644 --- a/docs/data.md +++ b/docs/data.md @@ -352,6 +352,9 @@ This parameter is **optional**; if not specified, no threshold will be applied a This feature is particularly useful during version upgrades, where records that closely match can be selected, while those that do not match sufficiently can be inserted into the target org. +**Important Note:** +For high volumes of records, an approximation algorithm is applied to improve performance. In such cases, setting a threshold of `0` may not guarantee the selection of exact matches, as the algorithm can assign a small non-zero similarity score to exact matches. To ensure accurate selection, it is recommended to set the threshold to a small value slightly greater than `0`, such as `0.1`. This ensures both precision and efficiency in the selection process. + --- #### Example