ministryofjustice · MatMoore · May 22, 2024 · May 22, 2024 · May 22, 2024
@@ -69,7 +69,7 @@ jobs:
 
       - name: run unit tests with coverage
         id: fast-tests
-        run: poetry run pytest --cov -m 'not slow' --doctest-modules
+        run: poetry run pytest --cov -m 'not slow and not datahub' --doctest-modules
 
       - name: Prepare Selenium
         # https://github.com/marketplace/actions/setup-chromedriver
@@ -78,7 +78,7 @@ jobs:
       - name: run selenium tests
         id: slow-tests
         if: steps.fast-tests.outcome == 'success'
-        run: poetry run pytest -m 'slow' --axe-version 4.9.0
+        run: poetry run pytest tests/selenium --axe-version 4.9.0
 
   javascript:
     runs-on: ubuntu-latest

@@ -34,4 +34,5 @@ Run `pre-commit install` from inside the poetry environment to set up pre commit
 
 - Python unit tests: `pytest -m 'not slow'`
 - Javascript unit tests: `npm test`
-- Selenium tests: `pytest -m 'slow'`
+- Selenium tests: `pytest tests/selenium`
+- Search benchmarks (these query the real Datahub backend): `pytest tests/benchmarks`
@@ -1,7 +1,7 @@
 import os
 import sys
 from pathlib import Path
-from socket import gethostbyname, gethostname
+from socket import gaierror, gethostbyname, gethostname
 
 import sentry_sdk
 import yaml
@@ -24,7 +24,11 @@
 DEBUG: bool = DEBUG_STR in TRUTHY_VALUES
 
 ALLOWED_HOSTS = str(os.environ.get("DJANGO_ALLOWED_HOSTS")).split(" ")
-ALLOWED_HOSTS.append(gethostbyname(gethostname()))
+
+try:
+    ALLOWED_HOSTS.append(gethostbyname(gethostname()))
+except gaierror:
+    pass
 
 # Application definition
 INSTALLED_APPS = [

@@ -16,7 +16,7 @@ markdown = "^3.5.2"
 python-dotenv = "^1.0.1"
 markdown-headdown = "^0.1.3"
 nltk = "^3.8.1"
-sentry-sdk = {extras = ["django"], version = "^2.0.1"}
+sentry-sdk = { extras = ["django"], version = "^2.0.1" }
 ministryofjustice-data-platform-catalogue = { path = "lib/datahub-client", develop = true }
 
 [tool.poetry.group.dev.dependencies]
@@ -38,7 +38,10 @@ build-backend = "poetry.core.masonry.api"
 [tool.pytest.ini_options]
 DJANGO_SETTINGS_MODULE = "core.settings"
 python_files = ["test_*.py", "*_test.py", "testing/python/*.py"]
-markers = ["slow: marks tests as slow (deselect with '-m \"not slow\"')"]
+markers = [
+    "slow: marks tests as slow (deselect with '-m \"not slow\"')",
+    "datahub: marks tests that query Datahub rather than mocking",
+]
 testpaths = ["tests", "lib/data_platform_catalogue/tests"]
 
 [tool.isort]

@@ -0,0 +1,99 @@
+import re
+
+import pytest
+
+from home.forms.search import SearchForm
+from home.service.search import SearchService
+
+WORD_TOKEN = re.compile(r"[^_\-\s]+")
+OVERLAP_THRESHOLD = 0.75
+
+
+@pytest.mark.slow
+@pytest.mark.datahub
+@pytest.mark.parametrize(
+    "query,expected_urn",
+    [
+        (
+            "prison_population_history.chunk_assignment",
+            "urn:li:dataset:(urn:li:dataPlatform:dbt,awsdatacatalog.prison_population_history.chunk_assignment,PROD)",
+        ),
+        (
+            "Accommodation on the first night following release",
+            "urn:li:chart:(justice-data,accommodation-on-release)",
+        ),
+        (
+            "vcms_activations",
+            "urn:li:dataset:(urn:li:dataPlatform:dbt,awsdatacatalog.alpha_vcms_data.vcms_activations,PROD)",
+        ),
+        (
+            "ns_postcode_lookup_latest_2011census",
+            "urn:li:dataset:(urn:li:dataPlatform:dbt,awsdatacatalog.common_lookup.ns_postcode_lookup_latest_2011census,PROD)",
+        ),
+    ],
+)
+def test_exact_title_match(query, expected_urn):
+    """
+    Test that searching for an entity's exact name returns it as the first result
+    """
+    form = SearchForm({"query": query})
+    assert form.is_valid()
+
+    service = SearchService(form=form, page="1")
+    results = service.results
+
+    assert results.total_results >= 1
+    assert results.page_results[0].urn == expected_urn
+
+
+@pytest.mark.xfail
+@pytest.mark.slow
+@pytest.mark.datahub
+@pytest.mark.parametrize(
+    "query",
+    (
+        "prison_population_history.chunk_assignment",
+        "Accommodation on the first night following release",
+        "vcms_activations",
+        "ns_postcode_lookup_latest_2011census",
+    ),
+)
+def test_no_duplicates(query):
+    """
+    Test that there are no entries with similar names in the first page
+    Each query is a bare string: with a single argname pytest passes values as-is.
+    """
+    form = SearchForm({"query": query})
+    assert form.is_valid()
+
+    service = SearchService(form=form, page="1")
+    results = service.results
+
+    titles = [result.fully_qualified_name for result in results.page_results]
+    assert_no_fuzzy_match(titles)
+
+
+def assert_no_fuzzy_match(titles):
+    """
+    Assert no two titles are equal or similar. Similarity is the number of tokens
+    common to both titles divided by the number of distinct tokens across both.
+    """
+    for i, title1 in enumerate(titles, 1):
+        for j, title2 in enumerate(titles, 1):
+            if i == j:
+                continue
+
+            assert (
+                title1 != title2
+            ), f'"{title1}" @ position {i} duplicates "{title2}" @ position {j}'
+
+            tokens1 = set(WORD_TOKEN.findall(title1))
+            if not tokens1:
+                continue
+
+            tokens2 = set(WORD_TOKEN.findall(title2))
+            intersection = tokens1.intersection(tokens2)
+            union = tokens1.union(tokens2)
+            assert (
+                len(intersection) / len(union) <= OVERLAP_THRESHOLD
+            ), f'"{title1}" @ position {i} is similar to "{title2}" @ position {j}'
@@ -5,7 +5,21 @@
 
 import pytest
 from data_platform_catalogue.client.datahub_client import DataHubCatalogueClient
-from data_platform_catalogue.entities import RelationshipType, Table, DomainRef, Governance, OwnerRef, TagRef, Column, ColumnRef, UsageRestrictions, EntityRef, CustomEntityProperties, DataSummary, AccessInformation
+from data_platform_catalogue.entities import (
+    AccessInformation,
+    Column,
+    ColumnRef,
+    CustomEntityProperties,
+    DataSummary,
+    DomainRef,
+    EntityRef,
+    Governance,
+    OwnerRef,
+    RelationshipType,
+    Table,
+    TagRef,
+    UsageRestrictions,
+)
 from data_platform_catalogue.search_types import (
     FacetOption,
     ResultType,
@@ -69,49 +83,47 @@ def generate_table_metadata(
     Generate a fake table metadata object
     """
     return Table(
-            urn="urn:li:table:fake",
-            display_name="Foo.Dataset",
-            name=name or fake.unique.name(),
-            fully_qualified_name="Foo.Dataset",
-            description=description or fake.paragraph(),
-            relationships=relations or {RelationshipType.PARENT: []},
-            domain=DomainRef(display_name="LAA", urn="LAA"),
-            governance=Governance(
-                data_owner=OwnerRef(
-                    display_name="", email="Contact email for the user", urn=""
-                ),
-                data_stewards=[
-                    OwnerRef(
-                        display_name="", email="Contact email for the user", urn=""
+        urn="urn:li:table:fake",
+        display_name="Foo.Dataset",
+        name=name or fake.unique.name(),
+        fully_qualified_name="Foo.Dataset",
+        description=description or fake.paragraph(),
+        relationships=relations or {RelationshipType.PARENT: []},
+        domain=DomainRef(display_name="LAA", urn="LAA"),
+        governance=Governance(
+            data_owner=OwnerRef(
+                display_name="", email="Contact email for the user", urn=""
+            ),
+            data_stewards=[
+                OwnerRef(display_name="", email="Contact email for the user", urn="")
+            ],
+        ),
+        tags=[TagRef(display_name="some-tag", urn="urn:li:tag:Entity")],
+        last_modified=datetime(2024, 3, 5, 6, 16, 47, 814000, tzinfo=timezone.utc),
+        created=None,
+        column_details=[
+            Column(
+                name="urn",
+                display_name="urn",
+                type="string",
+                description="The primary identifier for the dataset entity.",
+                nullable=False,
+                is_primary_key=True,
+                foreign_keys=[
+                    ColumnRef(
+                        name="urn",
+                        display_name="urn",
+                        table=EntityRef(
+                            urn="urn:li:dataset:(urn:li:dataPlatform:datahub,Dataset,PROD)",
+                            display_name="Dataset",
+                        ),
                     )
                 ],
             ),
-            tags=[TagRef(display_name="some-tag", urn="urn:li:tag:Entity")],
-            last_modified=datetime(2024, 3, 5, 6, 16, 47, 814000, tzinfo=timezone.utc),
-            created=None,
-            column_details=[
-                Column(
-                    name="urn",
-                    display_name="urn",
-                    type="string",
-                    description="The primary identifier for the dataset entity.",
-                    nullable=False,
-                    is_primary_key=True,
-                    foreign_keys=[
-                        ColumnRef(
-                            name="urn",
-                            display_name="urn",
-                            table=EntityRef(
-                                urn="urn:li:dataset:(urn:li:dataPlatform:datahub,Dataset,PROD)",
-                                display_name="Dataset",
-                            ),
-                        )
-                    ],
-                ),
-            ],
-            platform=EntityRef(urn="urn:li:dataPlatform:athena", display_name="athena"),
-            custom_properties=CustomEntityProperties(),
-        )
+        ],
+        platform=EntityRef(urn="urn:li:dataPlatform:athena", display_name="athena"),
+        custom_properties=CustomEntityProperties(),
+    )
 
 
 def generate_page(page_size=20, result_type: ResultType | None = None):
@@ -147,7 +159,11 @@ def client():
 
 
 @pytest.fixture(autouse=True)
-def mock_catalogue():
+def mock_catalogue(request):
+    if "datahub" in request.keywords:
+        yield None
+        return
+
     patcher = patch("home.service.base.GenericService._get_catalogue_client")
     mock_fn = patcher.start()
     mock_catalogue = MagicMock(spec=DataHubCatalogueClient)
@@ -178,39 +194,37 @@ def mock_list_database_tables_response(mock_catalogue, total_results, page_resul
 
 def mock_get_table_details_response(mock_catalogue):
     mock_catalogue.get_table_details.return_value = Table(
-            urn="urn:li:table:fake",
-            display_name="abc",
-            name="abc",
-            fully_qualified_name="abc",
-            description="abc",
-            relationships={},
-            domain=DomainRef(display_name="LAA", urn="LAA"),
-            governance=Governance(
-                data_owner=OwnerRef(
-                    display_name="", email="Contact email for the user", urn=""
-                ),
-                data_stewards=[
-                    OwnerRef(
-                        display_name="", email="Contact email for the user", urn=""
-                    )
-                ],
+        urn="urn:li:table:fake",
+        display_name="abc",
+        name="abc",
+        fully_qualified_name="abc",
+        description="abc",
+        relationships={},
+        domain=DomainRef(display_name="LAA", urn="LAA"),
+        governance=Governance(
+            data_owner=OwnerRef(
+                display_name="", email="Contact email for the user", urn=""
             ),
-            tags=[TagRef(display_name="some-tag", urn="urn:li:tag:Entity")],
-            last_modified=datetime(2024, 3, 5, 6, 16, 47, 814000, tzinfo=timezone.utc),
-            created=None,
-            column_details=[
-                Column(
-                    name="foo",
-                    display_name="foo",
-                    type="string",
-                    description="description **with markdown**",
-                    nullable=False,
-                    is_primary_key=True,
-                    foreign_keys=[]
-                ),
+            data_stewards=[
+                OwnerRef(display_name="", email="Contact email for the user", urn="")
             ],
-            platform=EntityRef(urn="urn:li:dataPlatform:athena", display_name="athena"),
-        )
+        ),
+        tags=[TagRef(display_name="some-tag", urn="urn:li:tag:Entity")],
+        last_modified=datetime(2024, 3, 5, 6, 16, 47, 814000, tzinfo=timezone.utc),
+        created=None,
+        column_details=[
+            Column(
+                name="foo",
+                display_name="foo",
+                type="string",
+                description="description **with markdown**",
+                nullable=False,
+                is_primary_key=True,
+                foreign_keys=[],
+            ),
+        ],
+        platform=EntityRef(urn="urn:li:dataPlatform:athena", display_name="athena"),
+    )
 
 
 def mock_search_response(mock_catalogue, total_results=0, page_results=()):