diff --git a/.github/workflows/test-and-deploy.yml b/.github/workflows/test-and-deploy.yml
index 17ecd09f..d750d9d0 100644
--- a/.github/workflows/test-and-deploy.yml
+++ b/.github/workflows/test-and-deploy.yml
@@ -69,7 +69,7 @@ jobs:
       - name: run unit tests with coverage
         id: fast-tests
-        run: poetry run pytest --cov -m 'not slow' --doctest-modules
+        run: poetry run pytest --cov -m 'not slow and not datahub' --doctest-modules

       - name: Prepare Selenium
         # https://github.com/marketplace/actions/setup-chromedriver
@@ -78,7 +78,7 @@
       - name: run selenium tests
         id: slow-tests
         if: steps.fast-tests.outcome == 'success'
-        run: poetry run pytest -m 'slow' --axe-version 4.9.0
+        run: poetry run pytest tests/selenium --axe-version 4.9.0

   javascript:
     runs-on: ubuntu-latest
diff --git a/README.md b/README.md
index 14e51c96..07dc89ea 100644
--- a/README.md
+++ b/README.md
@@ -34,4 +34,5 @@ Run `pre-commit install` from inside the poetry environment to set up pre commit
 - Python unit tests: `pytest -m 'not slow'`
 - Javascript unit tests: `npm test`
-- Selenium tests: `pytest -m 'slow'`
+- Selenium tests: `pytest tests/selenium`
+- Search benchmarks (these query the real Datahub backend): `pytest tests/benchmark`
diff --git a/core/settings.py b/core/settings.py
index 307d7e7b..5184c4cb 100644
--- a/core/settings.py
+++ b/core/settings.py
@@ -1,7 +1,7 @@
 import os
 import sys
 from pathlib import Path
-from socket import gethostbyname, gethostname
+from socket import gaierror, gethostbyname, gethostname

 import sentry_sdk
 import yaml
@@ -24,7 +24,11 @@
 DEBUG: bool = DEBUG_STR in TRUTHY_VALUES

 ALLOWED_HOSTS = str(os.environ.get("DJANGO_ALLOWED_HOSTS")).split(" ")
-ALLOWED_HOSTS.append(gethostbyname(gethostname()))
+
+try:
+    ALLOWED_HOSTS.append(gethostbyname(gethostname()))
+except gaierror:
+    pass

 # Application definition
 INSTALLED_APPS = [
diff --git a/pyproject.toml b/pyproject.toml
index a6c78777..6ba0dd3d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -16,7 +16,7 @@
 markdown = "^3.5.2"
 python-dotenv = "^1.0.1"
 markdown-headdown = "^0.1.3"
 nltk = "^3.8.1"
-sentry-sdk = {extras = ["django"], version = "^2.0.1"}
+sentry-sdk = { extras = ["django"], version = "^2.0.1" }
 ministryofjustice-data-platform-catalogue = { path = "lib/datahub-client", develop = true }

 [tool.poetry.group.dev.dependencies]
@@ -38,7 +38,10 @@
 build-backend = "poetry.core.masonry.api"

 [tool.pytest.ini_options]
 DJANGO_SETTINGS_MODULE = "core.settings"
 python_files = ["test_*.py", "*_test.py", "testing/python/*.py"]
-markers = ["slow: marks tests as slow (deselect with '-m \"not slow\"')"]
+markers = [
+    "slow: marks tests as slow (deselect with '-m \"not slow\"')",
+    "datahub: marks tests that query Datahub rather than mocking",
+]
 testpaths = ["tests", "lib/data_platform_catalogue/tests"]

 [tool.isort]
diff --git a/tests/benchmark/__init__.py b/tests/benchmark/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/benchmark/test_exact_matches.py b/tests/benchmark/test_exact_matches.py
new file mode 100644
index 00000000..503f9426
--- /dev/null
+++ b/tests/benchmark/test_exact_matches.py
@@ -0,0 +1,99 @@
+import re
+
+import pytest
+
+from home.forms.search import SearchForm
+from home.service.search import SearchService
+
+WORD_TOKEN = re.compile(r"[^_\-\s]+")
+OVERLAP_THRESHOLD = 0.75
+
+
+@pytest.mark.slow
+@pytest.mark.datahub
+@pytest.mark.parametrize(
+    "query,expected_urn",
+    [
+        (
+            "prison_population_history.chunk_assignment",
"urn:li:dataset:(urn:li:dataPlatform:dbt,awsdatacatalog.prison_population_history.chunk_assignment,PROD)", + ), + ( + "Accommodation on the first night following release", + "urn:li:chart:(justice-data,accommodation-on-release)", + ), + ( + "vcms_activations", + "urn:li:dataset:(urn:li:dataPlatform:dbt,awsdatacatalog.alpha_vcms_data.vcms_activations,PROD)", + ), + ( + "ns_postcode_lookup_latest_2011census", + "urn:li:dataset:(urn:li:dataPlatform:dbt,awsdatacatalog.common_lookup.ns_postcode_lookup_latest_2011census,PROD)", + ), + ], +) +def test_exact_title_match(query, expected_urn): + """ + Test that tables can be retrieved by searching for their exact name + """ + form = SearchForm({"query": query}) + assert form.is_valid() + + service = SearchService(form=form, page="1") + results = service.results + + assert results.total_results >= 1 + assert results.page_results[0].urn == expected_urn + + +@pytest.mark.xfail +@pytest.mark.slow +@pytest.mark.datahub +@pytest.mark.parametrize( + "query", + ( + ("prison_population_history.chunk_assignment",), + ("Accommodation on the first night following release",), + ("vcms_activations",), + ("ns_postcode_lookup_latest_2011census",), + ), +) +def test_no_duplicates(query): + """ + Test that there are no entries with similar names in the first page + + """ + form = SearchForm({"query": query}) + assert form.is_valid() + + service = SearchService(form=form, page="1") + results = service.results + + titles = [result.fully_qualified_name for result in results.page_results] + assert_no_fuzzy_match(titles) + + +def assert_no_fuzzy_match(titles): + """ + Check for similar looking titles by tokenising and comparing the number of tokens + common to both titles to the number of tokens that are unique to one or the other + """ + for i, title1 in enumerate(titles, 1): + for j, title2 in enumerate(titles, 1): + if i == j: + continue + + assert ( + title1 != title2 + ), f'"{title1}" @ position {i} duplicates {title2} @ position {j}"' + + tokens1 = set(WORD_TOKEN.findall(title1)) + if not tokens1: + continue + + tokens2 = set(WORD_TOKEN.findall(title2)) + intersection = tokens1.intersection(tokens2) + union = tokens1.union(tokens2) + assert ( + len(intersection) / len(union) <= OVERLAP_THRESHOLD + ), f'"{title1}" @ position {i} is similar to {title2} @ position {j}"' diff --git a/tests/conftest.py b/tests/conftest.py index a5bf6ca3..f020e698 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,7 +5,21 @@ import pytest from data_platform_catalogue.client.datahub_client import DataHubCatalogueClient -from data_platform_catalogue.entities import RelationshipType, Table, DomainRef, Governance, OwnerRef, TagRef, Column, ColumnRef, UsageRestrictions, EntityRef, CustomEntityProperties, DataSummary, AccessInformation +from data_platform_catalogue.entities import ( + AccessInformation, + Column, + ColumnRef, + CustomEntityProperties, + DataSummary, + DomainRef, + EntityRef, + Governance, + OwnerRef, + RelationshipType, + Table, + TagRef, + UsageRestrictions, +) from data_platform_catalogue.search_types import ( FacetOption, ResultType, @@ -69,49 +83,47 @@ def generate_table_metadata( Generate a fake table metadata object """ return Table( - urn="urn:li:table:fake", - display_name="Foo.Dataset", - name=name or fake.unique.name(), - fully_qualified_name="Foo.Dataset", - description=description or fake.paragraph(), - relationships=relations or {RelationshipType.PARENT: []}, - domain=DomainRef(display_name="LAA", urn="LAA"), - governance=Governance( - 
data_owner=OwnerRef( - display_name="", email="Contact email for the user", urn="" - ), - data_stewards=[ - OwnerRef( - display_name="", email="Contact email for the user", urn="" + urn="urn:li:table:fake", + display_name="Foo.Dataset", + name=name or fake.unique.name(), + fully_qualified_name="Foo.Dataset", + description=description or fake.paragraph(), + relationships=relations or {RelationshipType.PARENT: []}, + domain=DomainRef(display_name="LAA", urn="LAA"), + governance=Governance( + data_owner=OwnerRef( + display_name="", email="Contact email for the user", urn="" + ), + data_stewards=[ + OwnerRef(display_name="", email="Contact email for the user", urn="") + ], + ), + tags=[TagRef(display_name="some-tag", urn="urn:li:tag:Entity")], + last_modified=datetime(2024, 3, 5, 6, 16, 47, 814000, tzinfo=timezone.utc), + created=None, + column_details=[ + Column( + name="urn", + display_name="urn", + type="string", + description="The primary identifier for the dataset entity.", + nullable=False, + is_primary_key=True, + foreign_keys=[ + ColumnRef( + name="urn", + display_name="urn", + table=EntityRef( + urn="urn:li:dataset:(urn:li:dataPlatform:datahub,Dataset,PROD)", + display_name="Dataset", + ), ) ], ), - tags=[TagRef(display_name="some-tag", urn="urn:li:tag:Entity")], - last_modified=datetime(2024, 3, 5, 6, 16, 47, 814000, tzinfo=timezone.utc), - created=None, - column_details=[ - Column( - name="urn", - display_name="urn", - type="string", - description="The primary identifier for the dataset entity.", - nullable=False, - is_primary_key=True, - foreign_keys=[ - ColumnRef( - name="urn", - display_name="urn", - table=EntityRef( - urn="urn:li:dataset:(urn:li:dataPlatform:datahub,Dataset,PROD)", - display_name="Dataset", - ), - ) - ], - ), - ], - platform=EntityRef(urn="urn:li:dataPlatform:athena", display_name="athena"), - custom_properties=CustomEntityProperties(), - ) + ], + platform=EntityRef(urn="urn:li:dataPlatform:athena", display_name="athena"), + custom_properties=CustomEntityProperties(), + ) def generate_page(page_size=20, result_type: ResultType | None = None): @@ -147,7 +159,11 @@ def client(): @pytest.fixture(autouse=True) -def mock_catalogue(): +def mock_catalogue(request): + if "datahub" in request.keywords: + yield None + return + patcher = patch("home.service.base.GenericService._get_catalogue_client") mock_fn = patcher.start() mock_catalogue = MagicMock(spec=DataHubCatalogueClient) @@ -178,39 +194,37 @@ def mock_list_database_tables_response(mock_catalogue, total_results, page_resul def mock_get_table_details_response(mock_catalogue): mock_catalogue.get_table_details.return_value = Table( - urn="urn:li:table:fake", - display_name="abc", - name="abc", - fully_qualified_name="abc", - description="abc", - relationships={}, - domain=DomainRef(display_name="LAA", urn="LAA"), - governance=Governance( - data_owner=OwnerRef( - display_name="", email="Contact email for the user", urn="" - ), - data_stewards=[ - OwnerRef( - display_name="", email="Contact email for the user", urn="" - ) - ], + urn="urn:li:table:fake", + display_name="abc", + name="abc", + fully_qualified_name="abc", + description="abc", + relationships={}, + domain=DomainRef(display_name="LAA", urn="LAA"), + governance=Governance( + data_owner=OwnerRef( + display_name="", email="Contact email for the user", urn="" ), - tags=[TagRef(display_name="some-tag", urn="urn:li:tag:Entity")], - last_modified=datetime(2024, 3, 5, 6, 16, 47, 814000, tzinfo=timezone.utc), - created=None, - column_details=[ - Column( - name="foo", 
- display_name="foo", - type="string", - description="description **with markdown**", - nullable=False, - is_primary_key=True, - foreign_keys=[] - ), + data_stewards=[ + OwnerRef(display_name="", email="Contact email for the user", urn="") ], - platform=EntityRef(urn="urn:li:dataPlatform:athena", display_name="athena"), - ) + ), + tags=[TagRef(display_name="some-tag", urn="urn:li:tag:Entity")], + last_modified=datetime(2024, 3, 5, 6, 16, 47, 814000, tzinfo=timezone.utc), + created=None, + column_details=[ + Column( + name="foo", + display_name="foo", + type="string", + description="description **with markdown**", + nullable=False, + is_primary_key=True, + foreign_keys=[], + ), + ], + platform=EntityRef(urn="urn:li:dataPlatform:athena", display_name="athena"), + ) def mock_search_response(mock_catalogue, total_results=0, page_results=()):