Skip to content

Commit

Permalink
Merge pull request #96 from opendata-swiss/feat/output-media-type-as-iana-uri
Browse files Browse the repository at this point in the history

feat: Add full media_type to valid_media_types mapping
  • Loading branch information
bellisk authored Dec 5, 2023
2 parents f87cc4b + 958f0bf commit 5a62cc9
Show file tree
Hide file tree
Showing 6 changed files with 64 additions and 22 deletions.
24 changes: 14 additions & 10 deletions ckanext/dcatapchharvest/dcat_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,17 +258,21 @@ def get_publisher_dict_from_dataset(publisher):


def get_iana_media_type_values():
    """Build a lookup of IANA media types keyed by their full type name.

    Parses ``iana_media_types.xml`` found under ``__location__`` and walks
    every ``registry`` element below the document root. For each ``record``
    in a registry that has both a ``name`` and a ``file`` child, an entry is
    added whose key is ``<registry id>/<lowercased name>`` and whose value is
    ``media_types_namespaces['ns'] + '/' + <file text>``.

    Keys are qualified with the registry id so that records sharing a name
    in different registries do not overwrite each other.

    Records lacking a ``name`` or ``file`` element (or with empty text in
    either), and registries without an ``id`` attribute, are skipped.

    :returns: dict mapping full media type names to their URIs
    :raises: whatever ``ET.parse`` raises if the XML file is missing or
        malformed
    """
    media_type_values = {}
    path = os.path.join(__location__, 'iana_media_types.xml')
    root = ET.parse(path).getroot()
    uri_prefix = media_types_namespaces['ns'] + '/'

    for registry in root.findall('.//ns:registry', media_types_namespaces):
        registry_type = registry.get('id')
        if not registry_type:
            # Without an id there is no top-level type to build the key from.
            continue
        for record in registry.findall('.//ns:record',
                                       media_types_namespaces):
            name_element = record.find('ns:name', media_types_namespaces)
            file_element = record.find('ns:file', media_types_namespaces)
            if name_element is None or file_element is None:
                continue
            # An element that exists but is empty has ``text`` set to None.
            if not name_element.text or not file_element.text:
                continue
            key = registry_type + '/' + name_element.text.lower()
            media_type_values[key] = uri_prefix + file_element.text
    return media_type_values
15 changes: 8 additions & 7 deletions ckanext/dcatapchharvest/profiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ def _get_eu_or_iana_format(self, subject):
if isinstance(format_value, dict):
log.debug("The format object is a dictionary type.")
else:
lowercase_format_value = format_value.lower()
lowercase_format_value = format_value.lower().split('/')[-1]
if lowercase_format_value in valid_formats \
or lowercase_format_value in valid_media_types:
return lowercase_format_value
Expand All @@ -252,10 +252,12 @@ def _get_iana_media_type(self, subject):
if isinstance(media_type_value_raw, dict):
log.debug("The media type object is a dictionary type.")
else:
pattern = r'[^/]+$' # Match characters that are not '/'
# This matches either a URI (http://example.com/foo/bar) or
# a string (foo/bar)
pattern = r'(.*\/|^)(.+\/.+)$'
media_type_value_re = re.search(pattern, media_type_value_raw)
if media_type_value_re:
media_type_value = media_type_value_re.group(0)
media_type_value = media_type_value_re.group(2)
else:
media_type_value = media_type_value_raw

Expand Down Expand Up @@ -1079,13 +1081,12 @@ def _format_and_media_type_to_graph(self, resource_dict, distribution):

# Export media type if it matches IANA media type vocabulary
if resource_dict.get('media_type'):
lowercase_media_type_value = \
resource_dict.get('media_type').lower()
if lowercase_media_type_value in valid_media_types:
media_type = resource_dict.get('media_type')
if media_type in valid_media_types:
g.add((
distribution,
DCAT.mediaType,
URIRef(valid_media_types[lowercase_media_type_value])
URIRef(valid_media_types[media_type])
))

def graph_from_catalog(self, catalog_dict, catalog_ref):
Expand Down
18 changes: 18 additions & 0 deletions ckanext/dcatapchharvest/tests/fixtures/dataset-media-types.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
<?xml version="1.0" encoding="UTF-8"?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:dcterms="http://purl.org/dc/terms/">
<dcat:Dataset rdf:about="https://opendata.swiss/dataset/7451e012-64b2-4bbc-af20-a0e2bc61b585">
<dcat:distribution>
<dcat:Distribution>
<dcat:mediaType rdf:resource="http://www.iana.org/assignments/text/html"/>
<dcterms:format rdf:resource="http://publications.europa.eu/resource/authority/file-type/HTML"/>
</dcat:Distribution>
</dcat:distribution>
<dcat:distribution>
<dcat:Distribution>
<dcat:mediaType rdf:datatype="http://www.w3.org/2001/XMLSchema#string">application/json</dcat:mediaType>
<dcterms:format rdf:datatype="http://www.w3.org/2001/XMLSchema#string">application/json</dcterms:format>
</dcat:Distribution>
</dcat:distribution>
</dcat:Dataset>
</rdf:RDF>
2 changes: 1 addition & 1 deletion ckanext/dcatapchharvest/tests/fixtures/dataset.json
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@
"temporal_resolution":"P1D",
"rights": "http://dcat-ap.ch/vocabulary/licenses/terms_by_ask",
"license": "http://dcat-ap.ch/vocabulary/licenses/cc-by/4.0",
"media_type": "1d-interleaved-parityfec"
"media_type": "application/1d-interleaved-parityfec"
}
],
"extras": [
Expand Down
23 changes: 21 additions & 2 deletions ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ def test_dataset_all_fields(self):
assert all(l in resource['description'] for l in ['de', 'fr', 'it', 'en']), "resource description contains all languages"
eq_(resource['description']['de'], u'')
eq_(resource['format'], u'html')
eq_(resource['media_type'], u'html')
eq_(resource['media_type'], u'text/html')
eq_(resource['identifier'], u'346265-fr@bundesamt-fur-statistik-bfs')
eq_(resource['rights'], u'NonCommercialAllowed-CommercialAllowed-ReferenceRequired')
eq_(resource['license'], u'Creative Commons CC Zero License (cc-zero)')
Expand Down Expand Up @@ -294,7 +294,6 @@ def test_distribution_format_format_only(self):

resource = datasets[0]['resources'][0]


def test_temporals_accepted_formats(self):
contents = self._get_file_contents('dataset-datetimes.xml')
p = RDFParser(profiles=['swiss_dcat_ap'])
Expand Down Expand Up @@ -404,6 +403,8 @@ def test_multiple_rights_statements(self):
dataset = [d for d in p.datasets()][0]
resource = dataset["resources"][0]

eq_(resource['rights'], u"NonCommercialAllowed-CommercialWithPermission-ReferenceRequired")

def test_eu_themes_mapping(self):
contents = self._get_file_contents('catalog-themes.xml')
p = RDFParser(profiles=['swiss_dcat_ap'])
Expand All @@ -421,3 +422,21 @@ def test_eu_themes_mapping(self):
dataset['identifier']
)
)

def test_format_media_type(self):
    """Check that format and media type parse from URIs and from strings.

    Parses the ``dataset-media-types.xml`` fixture with the
    ``swiss_dcat_ap`` profile and compares the sorted
    ``(format, media_type)`` pairs of the first dataset's resources with
    the expected values.
    """
    fixture_xml = self._get_file_contents('dataset-media-types.xml')
    parser = RDFParser(profiles=['swiss_dcat_ap'])
    parser.parse(fixture_xml)

    first_dataset = list(parser.datasets())[0]

    parsed_pairs = []
    for distribution in first_dataset['resources']:
        parsed_pairs.append(
            (distribution.get('format'), distribution.get('media_type'))
        )
    # Sort so the assertion does not depend on resource order.
    parsed_pairs.sort()

    expected_pairs = [('html', 'text/html'), ('json', 'application/json')]
    eq_(parsed_pairs, expected_pairs)
4 changes: 2 additions & 2 deletions ckanext/dcatapchharvest/tests/test_dcatap_ch_serialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,8 @@ def test_graph_from_dataset(self):
if resource_dict.get('format') == "HTML":
assert self._triple(g, distribution, DCT['format'], URIRef("http://publications.europa.eu/resource/authority/file-type/HTML"))

if resource_dict.get('format') == "1d-interleaved-parityfec":
assert self._triple(g, distribution, DCT['format'], URIRef("http://www.iana.org/assignments/video/1d-interleaved-parityfec"))
if resource_dict.get('media_type') == "application/1d-interleaved-parityfec":
assert self._triple(g, distribution, DCAT.mediaType, URIRef("http://www.iana.org/assignments/application/1d-interleaved-parityfec"))

if resource_dict.get('temporal_resolution') == "P1D":
expected_literal = Literal("P1D", datatype=XSD.duration)
Expand Down

0 comments on commit 5a62cc9

Please sign in to comment.