Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Search benchmark #362

Merged
merged 2 commits into from
May 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/test-and-deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ jobs:

- name: run unit tests with coverage
id: fast-tests
run: poetry run pytest --cov -m 'not slow' --doctest-modules
run: poetry run pytest --cov -m 'not slow and not datahub' --doctest-modules

- name: Prepare Selenium
# https://github.com/marketplace/actions/setup-chromedriver
Expand All @@ -78,7 +78,7 @@ jobs:
- name: run selenium tests
id: slow-tests
if: steps.fast-tests.outcome == 'success'
run: poetry run pytest -m 'slow' --axe-version 4.9.0
run: poetry run pytest tests/selenium --axe-version 4.9.0

javascript:
runs-on: ubuntu-latest
Expand Down
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,5 @@ Run `pre-commit install` from inside the poetry environment to set up pre commit

- Python unit tests: `pytest -m 'not slow'`
- Javascript unit tests: `npm test`
- Selenium tests: `pytest -m 'slow'`
- Selenium tests: `pytest tests/selenium`
- Search benchmarks (these query the real Datahub backend): `pytest tests/benchmark`
8 changes: 6 additions & 2 deletions core/settings.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
import sys
from pathlib import Path
from socket import gethostbyname, gethostname
from socket import gaierror, gethostbyname, gethostname

import sentry_sdk
import yaml
Expand All @@ -24,7 +24,11 @@
DEBUG: bool = DEBUG_STR in TRUTHY_VALUES

ALLOWED_HOSTS = str(os.environ.get("DJANGO_ALLOWED_HOSTS")).split(" ")
ALLOWED_HOSTS.append(gethostbyname(gethostname()))

try:
ALLOWED_HOSTS.append(gethostbyname(gethostname()))
except gaierror:
pass

# Application definition
INSTALLED_APPS = [
Expand Down
7 changes: 5 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ markdown = "^3.5.2"
python-dotenv = "^1.0.1"
markdown-headdown = "^0.1.3"
nltk = "^3.8.1"
sentry-sdk = {extras = ["django"], version = "^2.0.1"}
sentry-sdk = { extras = ["django"], version = "^2.0.1" }
ministryofjustice-data-platform-catalogue = { path = "lib/datahub-client", develop = true }

[tool.poetry.group.dev.dependencies]
Expand All @@ -38,7 +38,10 @@ build-backend = "poetry.core.masonry.api"
[tool.pytest.ini_options]
DJANGO_SETTINGS_MODULE = "core.settings"
python_files = ["test_*.py", "*_test.py", "testing/python/*.py"]
markers = ["slow: marks tests as slow (deselect with '-m \"not slow\"')"]
markers = [
"slow: marks tests as slow (deselect with '-m \"not slow\"')",
"datahub: marks tests that query Datahub rather than mocking",
]
testpaths = ["tests", "lib/data_platform_catalogue/tests"]

[tool.isort]
Expand Down
Empty file added tests/benchmark/__init__.py
Empty file.
99 changes: 99 additions & 0 deletions tests/benchmark/test_exact_matches.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import re

import pytest

from home.forms.search import SearchForm
from home.service.search import SearchService

WORD_TOKEN = re.compile(r"[^_\-\s]+")
OVERLAP_THRESHOLD = 0.75


@pytest.mark.slow
@pytest.mark.datahub
@pytest.mark.parametrize(
    "query,expected_urn",
    [
        (
            "prison_population_history.chunk_assignment",
            "urn:li:dataset:(urn:li:dataPlatform:dbt,awsdatacatalog.prison_population_history.chunk_assignment,PROD)",
        ),
        (
            "Accommodation on the first night following release",
            "urn:li:chart:(justice-data,accommodation-on-release)",
        ),
        (
            "vcms_activations",
            "urn:li:dataset:(urn:li:dataPlatform:dbt,awsdatacatalog.alpha_vcms_data.vcms_activations,PROD)",
        ),
        (
            "ns_postcode_lookup_latest_2011census",
            "urn:li:dataset:(urn:li:dataPlatform:dbt,awsdatacatalog.common_lookup.ns_postcode_lookup_latest_2011census,PROD)",
        ),
    ],
)
def test_exact_title_match(query, expected_urn):
    """
    Searching for an entity's exact name should return that entity as the
    first result on the first page.
    """
    search_form = SearchForm({"query": query})
    assert search_form.is_valid()

    search_results = SearchService(form=search_form, page="1").results

    # Check there is at least one hit before indexing into the page.
    assert search_results.total_results >= 1

    top_hit = search_results.page_results[0]
    assert top_hit.urn == expected_urn


@pytest.mark.xfail
@pytest.mark.slow
@pytest.mark.datahub
@pytest.mark.parametrize(
    "query",
    # With a single argname pytest passes each value through unchanged, so
    # these must be plain strings; wrapping them in 1-tuples would hand the
    # test a tuple instead of the query text.
    (
        "prison_population_history.chunk_assignment",
        "Accommodation on the first night following release",
        "vcms_activations",
        "ns_postcode_lookup_latest_2011census",
    ),
)
def test_no_duplicates(query):
    """
    Test that there are no entries with identical or similar names in the
    first page of results for the given query.
    """
    form = SearchForm({"query": query})
    assert form.is_valid()

    service = SearchService(form=form, page="1")
    results = service.results

    titles = [result.fully_qualified_name for result in results.page_results]
    assert_no_fuzzy_match(titles)


def assert_no_fuzzy_match(titles):
    """
    Assert that no two titles are identical or near-identical.

    Each title is split into word tokens with WORD_TOKEN, and every pair of
    titles is compared by the ratio of the tokens they share to the tokens
    that appear in either title (intersection over union). The assertion
    fails when two titles are equal, or when that ratio is greater than
    OVERLAP_THRESHOLD.

    Positions quoted in failure messages are 1-based.
    """
    titles = list(titles)
    # Tokenise each title once rather than once per comparison.
    token_sets = [set(WORD_TOKEN.findall(title)) for title in titles]

    for i, title1 in enumerate(titles, 1):
        tokens1 = token_sets[i - 1]
        # Only compare against later titles so each pair is checked once;
        # the first failing pair reported is the same as when checking both
        # orderings.
        for j, title2 in enumerate(titles[i:], i + 1):
            assert (
                title1 != title2
            ), f'"{title1}" @ position {i} duplicates "{title2}" @ position {j}'

            if not tokens1:
                # A title with no word tokens is not scored for overlap.
                continue

            tokens2 = token_sets[j - 1]
            # tokens1 is non-empty here, so the union is never empty.
            overlap = len(tokens1 & tokens2) / len(tokens1 | tokens2)
            assert (
                overlap <= OVERLAP_THRESHOLD
            ), f'"{title1}" @ position {i} is similar to "{title2}" @ position {j}'
160 changes: 87 additions & 73 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,21 @@

import pytest
from data_platform_catalogue.client.datahub_client import DataHubCatalogueClient
from data_platform_catalogue.entities import RelationshipType, Table, DomainRef, Governance, OwnerRef, TagRef, Column, ColumnRef, UsageRestrictions, EntityRef, CustomEntityProperties, DataSummary, AccessInformation
from data_platform_catalogue.entities import (
AccessInformation,
Column,
ColumnRef,
CustomEntityProperties,
DataSummary,
DomainRef,
EntityRef,
Governance,
OwnerRef,
RelationshipType,
Table,
TagRef,
UsageRestrictions,
)
from data_platform_catalogue.search_types import (
FacetOption,
ResultType,
Expand Down Expand Up @@ -69,49 +83,47 @@ def generate_table_metadata(
Generate a fake table metadata object
"""
return Table(
urn="urn:li:table:fake",
display_name="Foo.Dataset",
name=name or fake.unique.name(),
fully_qualified_name="Foo.Dataset",
description=description or fake.paragraph(),
relationships=relations or {RelationshipType.PARENT: []},
domain=DomainRef(display_name="LAA", urn="LAA"),
governance=Governance(
data_owner=OwnerRef(
display_name="", email="Contact email for the user", urn=""
),
data_stewards=[
OwnerRef(
display_name="", email="Contact email for the user", urn=""
urn="urn:li:table:fake",
display_name="Foo.Dataset",
name=name or fake.unique.name(),
fully_qualified_name="Foo.Dataset",
description=description or fake.paragraph(),
relationships=relations or {RelationshipType.PARENT: []},
domain=DomainRef(display_name="LAA", urn="LAA"),
governance=Governance(
data_owner=OwnerRef(
display_name="", email="Contact email for the user", urn=""
),
data_stewards=[
OwnerRef(display_name="", email="Contact email for the user", urn="")
],
),
tags=[TagRef(display_name="some-tag", urn="urn:li:tag:Entity")],
last_modified=datetime(2024, 3, 5, 6, 16, 47, 814000, tzinfo=timezone.utc),
created=None,
column_details=[
Column(
name="urn",
display_name="urn",
type="string",
description="The primary identifier for the dataset entity.",
nullable=False,
is_primary_key=True,
foreign_keys=[
ColumnRef(
name="urn",
display_name="urn",
table=EntityRef(
urn="urn:li:dataset:(urn:li:dataPlatform:datahub,Dataset,PROD)",
display_name="Dataset",
),
)
],
),
tags=[TagRef(display_name="some-tag", urn="urn:li:tag:Entity")],
last_modified=datetime(2024, 3, 5, 6, 16, 47, 814000, tzinfo=timezone.utc),
created=None,
column_details=[
Column(
name="urn",
display_name="urn",
type="string",
description="The primary identifier for the dataset entity.",
nullable=False,
is_primary_key=True,
foreign_keys=[
ColumnRef(
name="urn",
display_name="urn",
table=EntityRef(
urn="urn:li:dataset:(urn:li:dataPlatform:datahub,Dataset,PROD)",
display_name="Dataset",
),
)
],
),
],
platform=EntityRef(urn="urn:li:dataPlatform:athena", display_name="athena"),
custom_properties=CustomEntityProperties(),
)
],
platform=EntityRef(urn="urn:li:dataPlatform:athena", display_name="athena"),
custom_properties=CustomEntityProperties(),
)


def generate_page(page_size=20, result_type: ResultType | None = None):
Expand Down Expand Up @@ -147,7 +159,11 @@ def client():


@pytest.fixture(autouse=True)
def mock_catalogue():
def mock_catalogue(request):
if "datahub" in request.keywords:
yield None
return

patcher = patch("home.service.base.GenericService._get_catalogue_client")
mock_fn = patcher.start()
mock_catalogue = MagicMock(spec=DataHubCatalogueClient)
Expand Down Expand Up @@ -178,39 +194,37 @@ def mock_list_database_tables_response(mock_catalogue, total_results, page_resul

def mock_get_table_details_response(mock_catalogue):
mock_catalogue.get_table_details.return_value = Table(
urn="urn:li:table:fake",
display_name="abc",
name="abc",
fully_qualified_name="abc",
description="abc",
relationships={},
domain=DomainRef(display_name="LAA", urn="LAA"),
governance=Governance(
data_owner=OwnerRef(
display_name="", email="Contact email for the user", urn=""
),
data_stewards=[
OwnerRef(
display_name="", email="Contact email for the user", urn=""
)
],
urn="urn:li:table:fake",
display_name="abc",
name="abc",
fully_qualified_name="abc",
description="abc",
relationships={},
domain=DomainRef(display_name="LAA", urn="LAA"),
governance=Governance(
data_owner=OwnerRef(
display_name="", email="Contact email for the user", urn=""
),
tags=[TagRef(display_name="some-tag", urn="urn:li:tag:Entity")],
last_modified=datetime(2024, 3, 5, 6, 16, 47, 814000, tzinfo=timezone.utc),
created=None,
column_details=[
Column(
name="foo",
display_name="foo",
type="string",
description="description **with markdown**",
nullable=False,
is_primary_key=True,
foreign_keys=[]
),
data_stewards=[
OwnerRef(display_name="", email="Contact email for the user", urn="")
],
platform=EntityRef(urn="urn:li:dataPlatform:athena", display_name="athena"),
)
),
tags=[TagRef(display_name="some-tag", urn="urn:li:tag:Entity")],
last_modified=datetime(2024, 3, 5, 6, 16, 47, 814000, tzinfo=timezone.utc),
created=None,
column_details=[
Column(
name="foo",
display_name="foo",
type="string",
description="description **with markdown**",
nullable=False,
is_primary_key=True,
foreign_keys=[],
),
],
platform=EntityRef(urn="urn:li:dataPlatform:athena", display_name="athena"),
)


def mock_search_response(mock_catalogue, total_results=0, page_results=()):
Expand Down
Loading