Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

@W-17427085: Set ANNOY related dependencies to be optional #3858

Merged
merged 4 commits into from
Dec 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions .github/workflows/feature_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,30 @@ jobs:
- name: Run Pytest
run: uv run pytest --cov-report= --cov=cumulusci

# Runs the unit test suite with all optional extras installed, once per
# OS / Python version combination in the matrix below.
unit_tests_opt_deps:
  name: "Unit tests with optional dependencies: ${{ matrix.os }}-${{ matrix.python-version }}"
  runs-on: ${{ matrix.os }}
  strategy:
    # Let the remaining matrix combinations finish even if one of them fails.
    fail-fast: false
    matrix:
      os: [macos-latest, SFDO-Tooling-Ubuntu, SFDO-Tooling-Windows]
      python-version: ["3.11", "3.12", "3.13"]
  steps:
    - uses: actions/checkout@v4
    - name: Set up Python
      uses: actions/setup-python@v4
      with:
        python-version: "${{ matrix.python-version }}"
    - name: Set up uv
      uses: SFDO-Tooling/setup-uv@main
      with:
        version: "0.5.0"
        enable-cache: true
    - name: Install dependencies
      # --all-extras installs every optional dependency group on top of the
      # base dependencies, so tests that need the extras are exercised here.
      run: uv sync --all-extras -p ${{ matrix.python-version }}
    - name: Run Pytest
      run: uv run pytest --cov-report= --cov=cumulusci

robot_api:
name: "Robot: No browser"
runs-on: SFDO-Tooling-Ubuntu
Expand Down
33 changes: 27 additions & 6 deletions cumulusci/tasks/bulkdata/select_utils.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,37 @@
import logging
import random
import re
import typing as T
from enum import Enum

import numpy as np
import pandas as pd
from annoy import AnnoyIndex
from pydantic import Field, root_validator, validator
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.preprocessing import StandardScaler

from cumulusci.core.enums import StrEnum
from cumulusci.tasks.bulkdata.extract_dataset_utils.hardcoded_default_declarations import (
DEFAULT_DECLARATIONS,
)
from cumulusci.tasks.bulkdata.utils import CaseInsensitiveDict
from cumulusci.utils import get_cci_upgrade_command
from cumulusci.utils.yaml.model_parser import CCIDictModel

logger = logging.getLogger(__name__)
# The packages below are only needed by the optimized (Annoy-based) similarity
# selection path. They are optional, so a failed import must not break module
# import: record the outcome in OPTIONAL_DEPENDENCIES_AVAILABLE and let callers
# choose a fallback.
try:
    import numpy as np
    import pandas as pd
    from annoy import AnnoyIndex
    from sklearn.feature_extraction.text import HashingVectorizer
    from sklearn.preprocessing import StandardScaler
except ImportError:
    # Warn once at import time; the rendered message is unchanged, but the
    # upgrade command is passed as a lazy logging argument instead of being
    # interpolated through an f-string.
    logger.warning(
        "Optional dependencies are missing. "
        "Handling high volumes of records for the 'select' functionality will be significantly slower, "
        "as optimizations for this feature are currently disabled. "
        "To enable optimized performance, install all required dependencies using: %s[select]\n",
        get_cci_upgrade_command(),
    )
    OPTIONAL_DEPENDENCIES_AVAILABLE = False
else:
    OPTIONAL_DEPENDENCIES_AVAILABLE = True


class SelectStrategy(StrEnum):
"""Enum defining the different selection strategies requested."""
Expand Down Expand Up @@ -308,7 +323,7 @@ def similarity_post_process(
select_records = []
insert_records = []

if complexity_constant < 1000:
if complexity_constant < 1000 or not OPTIONAL_DEPENDENCIES_AVAILABLE:
select_records, insert_records = levenshtein_post_process(
load_records, query_records, fields, weights, threshold
)
Expand All @@ -328,6 +343,12 @@ def annoy_post_process(
threshold: T.Union[float, None],
) -> T.Tuple[T.List[dict], list]:
"""Processes the query results for the similarity selection strategy using Annoy algorithm for large number of records"""
# Add warning when threshold is 0
if threshold is not None and threshold == 0:
logger.warning(
"Warning: A threshold of 0 may miss exact matches in high volumes. Use a small value like 0.1 for better accuracy."
)

selected_records = []
insertion_candidates = []

Expand Down
50 changes: 49 additions & 1 deletion cumulusci/tasks/bulkdata/tests/test_select_utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pandas as pd
import pytest

from cumulusci.tasks.bulkdata.select_utils import (
OPTIONAL_DEPENDENCIES_AVAILABLE,
SelectOperationExecutor,
SelectStrategy,
add_limit_offset_to_user_filter,
Expand All @@ -15,6 +15,14 @@
vectorize_records,
)

# pandas is not guaranteed to be installed. Record whether it can be imported
# so that tests which build DataFrames can be skipped (via PANDAS_AVAILABLE)
# instead of failing at collection time.
try:
    import pandas as pd

    PANDAS_AVAILABLE = True
except ImportError:
    PANDAS_AVAILABLE = False


# Test Cases for standard_generate_query
def test_standard_generate_query_with_default_record_declaration():
Expand Down Expand Up @@ -511,6 +519,10 @@ def test_calculate_levenshtein_distance_weights_length_doesnt_match():
assert "Records must be same size as fields (weights)." in str(e.value)


@pytest.mark.skipif(
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
reason="requires optional dependencies for annoy",
)
def test_all_numeric_columns():
df_db = pd.DataFrame({"A": ["1", "2", "3"], "B": ["4.5", " 5.5", "6.5"]})
df_query = pd.DataFrame({"A": ["4", "5", ""], "B": ["4.5", "5.5", "6.5"]})
Expand All @@ -526,6 +538,10 @@ def test_all_numeric_columns():
assert determine_field_types(df_db, df_query, weights) == expected_output


@pytest.mark.skipif(
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
reason="requires optional dependencies for annoy",
)
def test_numeric_columns__one_non_numeric():
df_db = pd.DataFrame({"A": ["1", "2", "3"], "B": ["4.5", "5.5", "6.5"]})
df_query = pd.DataFrame({"A": ["4", "5", "6"], "B": ["abcd", "5.5", "6.5"]})
Expand All @@ -541,6 +557,10 @@ def test_numeric_columns__one_non_numeric():
assert determine_field_types(df_db, df_query, weights) == expected_output


@pytest.mark.skipif(
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
reason="requires optional dependencies for annoy",
)
def test_all_boolean_columns():
df_db = pd.DataFrame(
{"A": ["true", "false", "true"], "B": ["false", "true", "false"]}
Expand All @@ -560,6 +580,10 @@ def test_all_boolean_columns():
assert determine_field_types(df_db, df_query, weights) == expected_output


@pytest.mark.skipif(
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
reason="requires optional dependencies for annoy",
)
def test_all_categorical_columns():
df_db = pd.DataFrame(
{"A": ["apple", "banana", "cherry"], "B": ["dog", "cat", "mouse"]}
Expand All @@ -579,6 +603,10 @@ def test_all_categorical_columns():
assert determine_field_types(df_db, df_query, weights) == expected_output


@pytest.mark.skipif(
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
reason="requires optional dependencies for annoy",
)
def test_mixed_types():
df_db = pd.DataFrame(
{
Expand Down Expand Up @@ -606,6 +634,10 @@ def test_mixed_types():
assert determine_field_types(df_db, df_query, weights) == expected_output


@pytest.mark.skipif(
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
reason="requires optional dependencies for annoy",
)
def test_vectorize_records_mixed_numerical_boolean_categorical():
# Test data with mixed types: numerical and categorical only
db_records = [["1.0", "true", "apple"], ["2.0", "false", "banana"]]
Expand Down Expand Up @@ -633,6 +665,10 @@ def test_vectorize_records_mixed_numerical_boolean_categorical():
), "Query vectors column count mismatch"


@pytest.mark.skipif(
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
reason="requires optional dependencies for annoy",
)
def test_annoy_post_process():
# Test data
load_records = [["Alice", "Engineer"], ["Bob", "Doctor"]]
Expand All @@ -659,6 +695,10 @@ def test_annoy_post_process():
assert not insert_records


@pytest.mark.skipif(
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
reason="requires optional dependencies for annoy",
)
def test_annoy_post_process__insert_records():
# Test data
load_records = [["Alice", "Engineer"], ["Bob", "Doctor"]]
Expand Down Expand Up @@ -714,6 +754,10 @@ def test_annoy_post_process__no_query_records():
] # The first insert record should match the second load record


@pytest.mark.skipif(
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
reason="requires optional dependencies for annoy",
)
def test_annoy_post_process__insert_records_with_polymorphic_fields():
# Test data
load_records = [
Expand Down Expand Up @@ -749,6 +793,10 @@ def test_annoy_post_process__insert_records_with_polymorphic_fields():
] # The first insert record should match the second load record


@pytest.mark.skipif(
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
reason="requires optional dependencies for annoy",
)
def test_single_record_match_annoy_post_process():
# Mock data where only the first query record matches the first load record
load_records = [["Alice", "Engineer"], ["Bob", "Doctor"]]
Expand Down
3 changes: 3 additions & 0 deletions docs/data.md
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,9 @@ This parameter is **optional**; if not specified, no threshold will be applied a

This feature is particularly useful during version upgrades, where records that closely match can be selected, while those that do not match sufficiently can be inserted into the target org.

**Important Note:**
For high volumes of records, an approximation algorithm is applied to improve performance (this optimization is only used when the optional `select` dependencies are installed). In such cases, a threshold of `0` does not guarantee that exact matches are selected, because the approximation can assign a small non-zero similarity score to records that match exactly. To make sure exact matches are still selected, set the threshold to a small value slightly greater than `0`, such as `0.1`. This keeps the selection both accurate and efficient.

---

#### Example
Expand Down
12 changes: 8 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ classifiers = [
"Programming Language :: Python :: 3.13",
]
dependencies = [
"annoy",
"click>=8.1",
"cryptography",
"python-dateutil",
Expand All @@ -35,8 +34,6 @@ dependencies = [
"defusedxml",
"lxml",
"MarkupSafe",
"numpy",
"pandas",
"psutil",
"pydantic<2",
"PyJWT",
Expand All @@ -53,7 +50,6 @@ dependencies = [
"rst2ansi>=0.1.5",
"salesforce-bulk",
"sarge",
"scikit-learn",
"selenium<4",
"simple-salesforce==1.11.4",
"snowfakery>=4.0.0",
Expand Down Expand Up @@ -88,6 +84,14 @@ lint = [
"pre-commit>=3.5.0",
]

[project.optional-dependencies]
# Packages needed only for the optimized similarity "select" path on large
# record volumes; installed via the `select` extra.
select = [
    "annoy",
    "numpy",
    "pandas",
    "scikit-learn",
]

[project.scripts]
cci = "cumulusci.cli.cci:main"
snowfakery = "snowfakery.cli:main"
Expand Down
Loading