Skip to content

Commit

Permalink
Merge branch 'main' into fmd-449-remove-to-level-domain
Browse files Browse the repository at this point in the history
  • Loading branch information
LavMatt authored Jun 27, 2024
2 parents 4be02ff + bc02e95 commit b0bc9d9
Show file tree
Hide file tree
Showing 34 changed files with 389 additions and 91 deletions.
32 changes: 30 additions & 2 deletions core/middleware.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import logging

from data_platform_catalogue.client.exceptions import ConnectivityError
from django.core.exceptions import BadRequest
from django.http import Http404
from django.shortcuts import render

logger = logging.getLogger(__name__)
Expand All @@ -14,6 +16,32 @@ def __call__(self, request):
return self.get_response(request)

def process_exception(self, request, exception):
    """
    Render an error page for an exception raised while handling a request.

    The exception is logged once (with traceback) and then matched, in
    order, against the known exception types. The first match decides the
    template, the ``h1_value`` heading passed to it and the HTTP status.

    Returns the rendered response, or None if no entry matches.
    """
    logger.exception(exception)

    # Ordered from most specific to the generic Exception fallback:
    # (exception type, template, page heading, HTTP status).
    error_pages = (
        (
            ConnectivityError,
            "500_datahub_unavailable.html",
            "Catalogue service unavailable",
            500,
        ),
        (BadRequest, "400.html", "Bad request", 400),
        (Http404, "404.html", "Page not found", 404),
        (Exception, "500.html", "Server error", 500),
    )
    for exception_type, template, heading, status in error_pages:
        if isinstance(exception, exception_type):
            return render(
                request,
                template,
                context={"h1_value": heading},
                status=status,
            )
    return None
8 changes: 8 additions & 0 deletions home/forms/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from ..models.domain_model import Domain, DomainModel
from ..service.search_facet_fetcher import SearchFacetFetcher
from ..service.search_tag_fetcher import SearchTagFetcher


def get_domain_choices() -> list[Domain]:
Expand Down Expand Up @@ -47,6 +48,11 @@ def get_entity_types():
)


def get_tags():
    """Return whatever a freshly constructed SearchTagFetcher fetches."""
    return SearchTagFetcher().fetch()


class SelectWithOptionAttribute(forms.Select):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
Expand Down Expand Up @@ -125,6 +131,8 @@ class SearchForm(forms.Form):
clear_filter = forms.BooleanField(initial=False, required=False)
clear_label = forms.BooleanField(initial=False, required=False)

tags = forms.MultipleChoiceField(choices=get_tags, required=False)

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.initial["sort"] = "relevance"
Expand Down
14 changes: 14 additions & 0 deletions home/service/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ def _get_search_results(self, page: str, items_per_page: int) -> SearchResponse:
sort = form_data.get("sort", "relevance")
domain = form_data.get("domain", "")
subdomain = form_data.get("subdomain", "")
tags = form_data.get("tags", "")
domains_and_subdomains = domains_with_their_subdomains(
domain, subdomain, self.domain_model
)
Expand All @@ -92,6 +93,10 @@ def _get_search_results(self, page: str, items_per_page: int) -> SearchResponse:
filter_value.append(MultiSelectFilter("domains", domains_and_subdomains))
if where_to_access:
filter_value.append(MultiSelectFilter("customProperties", where_to_access))
if tags:
filter_value.append(
MultiSelectFilter("tags", [f"urn:li:tag:{tag}" for tag in tags])
)

page_for_search = str(int(page) - 1)
if sort == "ascending":
Expand Down Expand Up @@ -122,6 +127,7 @@ def _generate_remove_filter_hrefs(self) -> dict[str, dict[str, str]] | None:
domain = self.form.cleaned_data.get("domain", "")
entity_types = self.form.cleaned_data.get("entity_types", [])
where_to_access = self.form.cleaned_data.get("where_to_access", [])
tags = self.form.cleaned_data.get("tags", [])
remove_filter_hrefs = {}
if domain:
remove_filter_hrefs["domain"] = self._generate_domain_clear_href()
Expand All @@ -144,6 +150,14 @@ def _generate_remove_filter_hrefs(self) -> dict[str, dict[str, str]] | None:
)
)
remove_filter_hrefs["Where To Access"] = where_to_access_clear_href

if tags:
tags_clear_href = {}
for tag in tags:
tags_clear_href[tag] = self.form.encode_without_filter(
filter_name="tags", filter_value=tag
)
remove_filter_hrefs["Tags"] = tags_clear_href
else:
remove_filter_hrefs = None

Expand Down
2 changes: 1 addition & 1 deletion home/service/search_facet_fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ class SearchFacetFetcher(GenericService):
def __init__(self):
self.client = self._get_catalogue_client()
self.cache_key = "search_facets"
self.cache_timeout_seconds = 5
self.cache_timeout_seconds = 300

def fetch(self) -> SearchFacets:
"""
Expand Down
24 changes: 24 additions & 0 deletions home/service/search_tag_fetcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from django.core.cache import cache

from .base import GenericService


class SearchTagFetcher(GenericService):
    """Fetch the list of tags from the catalogue client, with caching."""

    def __init__(self):
        self.client = self._get_catalogue_client()
        self.cache_key = "search_tags"
        self.cache_timeout_seconds = 300

    def fetch(self) -> list:
        """
        Fetch a static list of options that is independent of the search query
        and any applied filters. Values are cached for
        ``cache_timeout_seconds`` seconds to avoid unnecessary queries.
        """
        result = cache.get(self.cache_key)
        if not result:
            # A missing or empty cached value is treated as a cache miss.
            result = self.client.get_tags()
            # Write to the cache only on a miss. Re-setting the key on every
            # hit would keep pushing the expiry back, so the cached tags
            # would never be refreshed while requests keep arriving.
            cache.set(self.cache_key, result, timeout=self.cache_timeout_seconds)

        return result
52 changes: 29 additions & 23 deletions lib/datahub-client/data_platform_catalogue/client/datahub_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,30 @@
from importlib.resources import files
from typing import Sequence

from datahub.configuration.common import ConfigurationError
from datahub.emitter import mce_builder
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
from datahub.ingestion.source.common.subtypes import (
DatasetContainerSubTypes,
DatasetSubTypes,
)
from datahub.metadata import schema_classes
from datahub.metadata.com.linkedin.pegasus2avro.common import DataPlatformInstance
from datahub.metadata.schema_classes import (
ChangeTypeClass,
ContainerClass,
ContainerPropertiesClass,
DatasetPropertiesClass,
DomainPropertiesClass,
DomainsClass,
OtherSchemaClass,
SchemaFieldClass,
SchemaFieldDataTypeClass,
SchemaMetadataClass,
SubTypesClass,
)

from data_platform_catalogue.client.exceptions import (
AspectDoesNotExist,
ConnectivityError,
Expand Down Expand Up @@ -39,29 +63,6 @@
SearchResponse,
SortOption,
)
from datahub.configuration.common import ConfigurationError
from datahub.emitter import mce_builder
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
from datahub.ingestion.source.common.subtypes import (
DatasetContainerSubTypes,
DatasetSubTypes,
)
from datahub.metadata import schema_classes
from datahub.metadata.com.linkedin.pegasus2avro.common import DataPlatformInstance
from datahub.metadata.schema_classes import (
ChangeTypeClass,
ContainerClass,
ContainerPropertiesClass,
DatasetPropertiesClass,
DomainPropertiesClass,
DomainsClass,
OtherSchemaClass,
SchemaFieldClass,
SchemaFieldDataTypeClass,
SchemaMetadataClass,
SubTypesClass,
)

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -210,6 +211,7 @@ def search_facets(
result_types: Sequence[ResultType] = (
ResultType.TABLE,
ResultType.CHART,
ResultType.DATABASE,
),
filters: Sequence[MultiSelectFilter] = (),
) -> SearchFacets:
Expand All @@ -224,6 +226,10 @@ def get_glossary_terms(self, count: int = 1000) -> SearchResponse:
"""Wraps the client's glossary terms query"""
return self.search_client.get_glossary_terms(count)

def get_tags(self, count: int = 2000):
    """Delegate to the search client's get tags query, passing `count` through."""
    tags = self.search_client.get_tags(count)
    return tags

def get_table_details(self, urn) -> Table:
if self.check_entity_exists_by_urn(urn):
response = self.graph.execute_graphql(self.dataset_query, {"urn": urn})[
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,17 @@ query getChartDetails($urn: String!) {
platform {
name
}
tags {
tags {
tag {
urn
properties {
name
description
}
}
}
}
domain {
domain {
urn
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Lists tag entities by searching across entities with a wildcard query.
# $count is the number of results requested.
query getTags(
$count: Int!
) {
searchAcrossEntities(
# Restricted to TAG entities; the search always starts at offset 0, so
# only the first $count results are returned (no further pages are fetched).
input: {types: TAG, query: "*", start: 0, count: $count}
) {
start
count
total
searchResults {
entity {
# Only the urn of each tag is requested.
urn
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,13 @@ def parse_tags(entity: dict[str, Any]) -> list[TagRef]:
tags = []
for tag in outer_tags.get("tags", []):
properties = tag.get("tag", {}).get("properties", {})
# This is needed because tags created by dbt seemingly don't have properties
# populated
if not properties and tag.get("tag", {}).get("urn"):
properties = {
"name": tag.get("tag", {}).get("urn").replace("urn:li:tag:", "")
}

if properties:
tags.append(
TagRef(
Expand Down
35 changes: 35 additions & 0 deletions lib/datahub-client/data_platform_catalogue/client/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@ def __init__(self, graph: DataHubGraph):
.joinpath("getGlossaryTerms.graphql")
.read_text()
)
self.get_tags_query = (
files("data_platform_catalogue.client.graphql")
.joinpath("getTags.graphql")
.read_text()
)

def search(
self,
Expand Down Expand Up @@ -321,6 +326,36 @@ def get_glossary_terms(self, count: int = 1000) -> SearchResponse:
total_results=response["total"], page_results=page_results
)

def get_tags(self, count: int = 2000):
    """
    Get a list of tag urns from DataHub.

    `count` is passed to the getTags query as the number of results to
    request. If DataHub holds more tags than that, `count` should be
    increased to get all tags.

    Raises CatalogueError if the getTags query cannot be executed.
    """
    try:
        raw_response = self.graph.execute_graphql(
            self.get_tags_query, {"count": count}
        )
    except GraphError as e:
        raise CatalogueError("Unable to execute getTags query") from e

    search_response = raw_response["searchAcrossEntities"]
    logger.debug(json.dumps(search_response, indent=2))

    return self._parse_global_tags(search_response)

def _parse_global_tags(self, tag_query_results) -> list[tuple[str, str]]:
    """
    Parse the results of the get tags query.

    Returns one ``(tag name, tag urn)`` tuple per entry in
    ``tag_query_results["searchResults"]``, in the same order.
    """
    # name properties of tags are often not set, i.e. all those from dbt
    # so better to get tag name from tag urn.
    urns = (
        result["entity"]["urn"] for result in tag_query_results["searchResults"]
    )
    # removeprefix strips only the leading marker; str.replace would also
    # remove any later occurrence of it inside the tag name itself.
    return [(urn.removeprefix("urn:li:tag:"), urn) for urn in urns]

def _parse_container(self, entity: dict[str, Any], matches) -> SearchResult:
"""
Map a Container entity to a SearchResult
Expand Down
14 changes: 13 additions & 1 deletion lib/datahub-client/data_platform_catalogue/entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,7 @@ class Entity(BaseModel):
tags: list[TagRef] = Field(
default_factory=list,
description="Additional tags to add.",
examples=[[TagRef(display_name="ESDA", urn="urn:li:tag:PII")]],
examples=[[TagRef(display_name="ESDA", urn="urn:li:tag:ESDA")]],
)
glossary_terms: list[GlossaryTermRef] = Field(
default_factory=list,
Expand Down Expand Up @@ -383,6 +383,18 @@ class Entity(BaseModel):
description="Fields to add to DataHub custom properties",
default_factory=CustomEntityProperties,
)
tags_to_display: list[str] = Field(
description="a list of tag display_names where tags starting 'dc_' are filtered out", # noqa: E501
init=False,
default=[],
)

def model_post_init(self, __context):
    """Set tags_to_display to the display names of tags not starting 'dc_'."""
    visible_names = []
    for tag in self.tags:
        name = tag.display_name
        if name.startswith("dc_"):
            continue
        visible_names.append(name)
    self.tags_to_display = visible_names


class Database(Entity):
Expand Down
6 changes: 4 additions & 2 deletions lib/datahub-client/data_platform_catalogue/search_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,11 +68,13 @@ class SearchResult:
glossary_terms: list[GlossaryTermRef] = field(default_factory=list)
last_modified: datetime | None = None
created: datetime | None = None
tags_to_display: list[TagRef] = field(init=False)
tags_to_display: list[str] = field(init=False)

def __post_init__(self):
    """
    Fill in tags_to_display after the dataclass fields are initialised.

    Stores the display name (a string, not the tag object) of every tag
    whose display name does not start with 'dc_'.
    """
    self.tags_to_display = [
        tag.display_name
        for tag in self.tags
        if not tag.display_name.startswith("dc_")
    ]


Expand Down
Loading

0 comments on commit b0bc9d9

Please sign in to comment.