Skip to content

Commit

Permalink
feat: Add validation for import and export of format and media_type
Browse files Browse the repository at this point in the history
  • Loading branch information
kovalch committed Nov 27, 2023
1 parent 72f1c59 commit 7ce9385
Showing 1 changed file with 59 additions and 62 deletions.
121 changes: 59 additions & 62 deletions ckanext/dcatapchharvest/profiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,33 @@ def _qualified_relations(self, subject):

return qualified_relations

def _get_eu_or_iana_format(self, subject):
    """Return the dct:format value of ``subject`` if it is a known key.

    The value is compared case-insensitively against the keys of
    ``valid_formats`` and ``valid_media_types``. On a match the value is
    returned exactly as it was read from the graph; otherwise an empty
    string is returned.
    """
    # Lowercased key sets, built up front so the comparison below does
    # not depend on the casing used in the vocabularies.
    known_formats = set(key.lower() for key in valid_formats.keys())
    known_media_types = set(
        key.lower() for key in valid_media_types.keys()
    )

    format_value = self._object_value(subject, DCT['format'])
    is_known = (
        format_value.lower() in known_formats
        or format_value.lower() in known_media_types
    )
    # The original (not the lowercased) spelling is handed back.
    return format_value if is_known else ''

def _get_iana_media_type(self, subject):
    """Return the dcat:mediaType value of ``subject`` if it is a known key.

    The raw value is reduced to its trailing run of non-'/' characters
    (the whole value is kept when it ends in '/' or is empty). The
    result is compared case-insensitively against the keys of
    ``valid_media_types``, in the same way ``_get_eu_or_iana_format``
    validates formats.

    Returns the reduced value in its original spelling on a match,
    otherwise an empty string.
    """
    valid_media_types_lower = {
        key.lower() for key in valid_media_types.keys()
    }

    media_type_value_raw = self._object_value(subject, DCAT.mediaType)
    # Match one or more characters that are not '/' at the end of the value
    trailing_segment = re.search(r'[^/]+$', media_type_value_raw)
    if trailing_segment:
        media_type_value = trailing_segment.group(0)
    else:
        media_type_value = media_type_value_raw

    # The key set is lowercased, so the candidate must be lowercased as
    # well; comparing the raw spelling would reject e.g. 'CSV' and could
    # never match a vocabulary key that contains an uppercase letter.
    if media_type_value.lower() in valid_media_types_lower:
        return media_type_value
    return ''

def _license_rights_name(self, subject, predicate):
for node in self.g.objects(subject, predicate):
# DCAT-AP CH v1: the license as a literal (should be
Expand Down Expand Up @@ -579,9 +606,6 @@ def parse_dataset(self, dataset_dict, dataset_ref): # noqa
# Simple values
for key, predicate in (
('identifier', DCT.identifier),
('format', DCT['format']),
('mimetype', DCAT.mediaType),
('media_type', DCAT.mediaType),
('download_url', DCAT.downloadURL),
('url', DCAT.accessURL),
('coverage', DCT.coverage),
Expand All @@ -606,10 +630,15 @@ def parse_dataset(self, dataset_dict, dataset_ref): # noqa
resource_dict['license'] = rights
resource_dict['rights'] = license

# if media type is not set, use format as fallback
if (not resource_dict.get('media_type') and
resource_dict.get('format')):
# Format & Media type
resource_dict['format'] = self._get_eu_or_iana_format(distribution)
resource_dict['media_type'] = self._get_iana_media_type(distribution)
# Set 'media_type' as 'format' if 'media_type' is not set but 'format' exists
if not resource_dict.get('media_type') and resource_dict.get('format'):
resource_dict['media_type'] = resource_dict['format']
# Set 'format' as 'media_type' if 'format' is not set but 'media_type' exists
elif not resource_dict.get('format') and resource_dict.get('media_type'):
resource_dict['format'] = resource_dict['media_type']

# Documentation
resource_dict['documentation'] = self._object_value_list(
Expand Down Expand Up @@ -957,14 +986,6 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): # noqa
datatype=XSD.duration)
))

# Mime-Type
if resource_dict.get('mimetype'):
g.add((
distribution,
DCAT.mediaType,
Literal(resource_dict['mimetype'])
))

# Dates
items = [
('issued', DCT.issued, None, Literal),
Expand Down Expand Up @@ -1028,43 +1049,32 @@ def _rights_and_license_to_graph(self, resource_dict, distribution):

def _format_and_media_type_to_graph(self, resource_dict, distribution):
    """Add dct:format and dcat:mediaType triples for a distribution.

    Format: the resource's 'format' is looked up case-insensitively in
    ``valid_formats`` first and, if it is not found there, in
    ``valid_media_types``. The URI stored for the matching key is
    exported; when nothing matches or no format is set, a blank node is
    exported instead.

    Media type: only exported when the resource has a 'media_type'. The
    value is looked up case-insensitively in ``valid_media_types``; a
    blank node is exported when it is not a known key.

    :param resource_dict: resource dict, read for 'format' and
        'media_type'
    :param distribution: graph node the triples are attached to
    """
    g = self.g

    # Lowercased key -> URI tables. A mapping (not a set of keys) is
    # needed because the URI registered for the key has to be retrieved
    # after the case-insensitive match.
    format_uris = dict(
        (key.lower(), uri) for key, uri in valid_formats.items()
    )
    media_type_uris = dict(
        (key.lower(), uri) for key, uri in valid_media_types.items()
    )

    # Export format
    format_value = resource_dict.get('format')
    format_key = format_value.lower() if format_value else None
    if format_key in format_uris:
        format_node = URIRef(format_uris[format_key])
    elif format_key in media_type_uris:
        # Exception: a format that is not in the format vocabulary may
        # still be a key of the media type vocabulary.
        format_node = URIRef(media_type_uris[format_key])
    else:
        format_node = BNode()
    # Graph.add takes one (subject, predicate, object) tuple.
    g.add((distribution, DCT['format'], format_node))

    # Export media type
    media_type_value = resource_dict.get('media_type')
    if media_type_value:
        media_type_key = media_type_value.lower()
        if media_type_key in media_type_uris:
            media_type_node = URIRef(media_type_uris[media_type_key])
        else:
            media_type_node = BNode()
        g.add((distribution, DCAT.mediaType, media_type_node))

def graph_from_catalog(self, catalog_dict, catalog_ref):
g = self.g
Expand Down Expand Up @@ -1286,7 +1296,6 @@ def graph_from_dataset(self, dataset_dict, dataset_ref):
("status", ADMS.status, None, Literal),
("coverage", DCT.coverage, None, Literal),
("identifier", DCT.identifier, None, Literal),
("media_type", SCHEMA.mediaType, None, Literal),
("spatial", DCT.spatial, None, Literal),
]

Expand All @@ -1309,18 +1318,6 @@ def graph_from_dataset(self, dataset_dict, dataset_ref):
# Download URL & Access URL
self.download_access_url(resource_dict, distribution, g)

# Format
if resource_dict.get("format"):
g.add((distribution, DCT["format"],
Literal(resource_dict["format"])))

# Mime-Type
if resource_dict.get("mimetype"):
g.add(
(distribution, SCHEMA.mediaType,
Literal(resource_dict["mimetype"]))
)

# Dates
items = [
("issued", DCT.issued, None, Literal),
Expand Down

0 comments on commit 7ce9385

Please sign in to comment.