Merge pull request #96 from opendata-swiss/feat/output-media-type-as-iana-uri

feat: Add full media_type to valid_media_types mapping
opendata-swiss · Dec 5, 2023 · 5a62cc9
2 parents f87cc4b + 958f0bf
commit 5a62cc9
Show file tree

Hide file tree

Showing 6 changed files with 64 additions and 22 deletions.
diff --git a/ckanext/dcatapchharvest/dcat_helpers.py b/ckanext/dcatapchharvest/dcat_helpers.py
@@ -258,17 +258,21 @@ def get_publisher_dict_from_dataset(publisher):
 
 
 def get_iana_media_type_values():
+    media_type_values = {}
     file = os.path.join(__location__, 'iana_media_types.xml')
     tree = ET.parse(file)
     root = tree.getroot()
-    records = root.findall('.//ns:record', media_types_namespaces)
-    media_type_values = {}
-    for record in records:
-        if record.find('ns:file', media_types_namespaces) is None:
-            continue
-        if record.find('ns:name', media_types_namespaces) is None:
-            continue
-        name = record.find('ns:name', media_types_namespaces).text.lower()
-        file_value = record.find('ns:file', media_types_namespaces).text
-        media_type_values[name] = media_types_namespaces['ns']+'/'+file_value
+    registries = root.findall('.//ns:registry', media_types_namespaces)
+    for registry in registries:
+        registry_type = registry.get('id')
+        records = registry.findall('.//ns:record', media_types_namespaces)
+        for record in records:
+            if record.find('ns:file', media_types_namespaces) is None:
+                continue
+            if record.find('ns:name', media_types_namespaces) is None:
+                continue
+            name = record.find('ns:name', media_types_namespaces).text.lower()
+            file_value = record.find('ns:file', media_types_namespaces).text
+            media_type_values[registry_type + '/' + name] = \
+                media_types_namespaces['ns'] + '/' + file_value
     return media_type_values
diff --git a/ckanext/dcatapchharvest/profiles.py b/ckanext/dcatapchharvest/profiles.py
@@ -240,7 +240,7 @@ def _get_eu_or_iana_format(self, subject):
         if isinstance(format_value, dict):
             log.debug("The format object is a dictionary type.")
         else:
-            lowercase_format_value = format_value.lower()
+            lowercase_format_value = format_value.lower().split('/')[-1]
             if lowercase_format_value in valid_formats \
                     or lowercase_format_value in valid_media_types:
                 return lowercase_format_value
@@ -252,10 +252,12 @@ def _get_iana_media_type(self, subject):
         if isinstance(media_type_value_raw, dict):
             log.debug("The media type object is a dictionary type.")
         else:
-            pattern = r'[^/]+$'  # Match characters that are not '/'
+            # This matches either a URI (http://example.com/foo/bar) or
+            # a string (foo/bar)
+            pattern = r'(.*\/|^)(.+\/.+)$'
             media_type_value_re = re.search(pattern, media_type_value_raw)
             if media_type_value_re:
-                media_type_value = media_type_value_re.group(0)
+                media_type_value = media_type_value_re.group(2)
             else:
                 media_type_value = media_type_value_raw
 
@@ -1079,13 +1081,12 @@ def _format_and_media_type_to_graph(self, resource_dict, distribution):
 
         # Export media type if it matches IANA media type vocabulary
         if resource_dict.get('media_type'):
-            lowercase_media_type_value = \
-                resource_dict.get('media_type').lower()
-            if lowercase_media_type_value in valid_media_types:
+            media_type = resource_dict.get('media_type')
+            if media_type in valid_media_types:
                 g.add((
                     distribution,
                     DCAT.mediaType,
-                    URIRef(valid_media_types[lowercase_media_type_value])
+                    URIRef(valid_media_types[media_type])
                 ))
 
     def graph_from_catalog(self, catalog_dict, catalog_ref):

diff --git a/ckanext/dcatapchharvest/tests/fixtures/dataset-media-types.xml b/ckanext/dcatapchharvest/tests/fixtures/dataset-media-types.xml
@@ -0,0 +1,18 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+         xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:dcterms="http://purl.org/dc/terms/">
+  <dcat:Dataset rdf:about="https://opendata.swiss/dataset/7451e012-64b2-4bbc-af20-a0e2bc61b585">
+    <dcat:distribution>
+      <dcat:Distribution>
+        <dcat:mediaType rdf:resource="http://www.iana.org/assignments/text/html"/>
+        <dcterms:format rdf:resource="http://publications.europa.eu/resource/authority/file-type/HTML"/>
+      </dcat:Distribution>
+    </dcat:distribution>
+    <dcat:distribution>
+      <dcat:Distribution>
+        <dcat:mediaType rdf:datatype="http://www.w3.org/2001/XMLSchema#string">application/json</dcat:mediaType>
+        <dcterms:format rdf:datatype="http://www.w3.org/2001/XMLSchema#string">application/json</dcterms:format>
+      </dcat:Distribution>
+    </dcat:distribution>
+  </dcat:Dataset>
+</rdf:RDF>
diff --git a/ckanext/dcatapchharvest/tests/fixtures/dataset.json b/ckanext/dcatapchharvest/tests/fixtures/dataset.json
@@ -138,7 +138,7 @@
       "temporal_resolution":"P1D",
       "rights": "http://dcat-ap.ch/vocabulary/licenses/terms_by_ask",
       "license": "http://dcat-ap.ch/vocabulary/licenses/cc-by/4.0",
-      "media_type": "1d-interleaved-parityfec"
+      "media_type": "application/1d-interleaved-parityfec"
     }
   ],
   "extras": [

diff --git a/ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py b/ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py
@@ -144,7 +144,7 @@ def test_dataset_all_fields(self):
         assert all(l in resource['description'] for l in ['de', 'fr', 'it', 'en']), "resource description contains all languages"
         eq_(resource['description']['de'], u'')
         eq_(resource['format'], u'html')
-        eq_(resource['media_type'], u'html')
+        eq_(resource['media_type'], u'text/html')
         eq_(resource['identifier'], u'346265-fr@bundesamt-fur-statistik-bfs')
         eq_(resource['rights'], u'NonCommercialAllowed-CommercialAllowed-ReferenceRequired')
         eq_(resource['license'], u'Creative Commons CC Zero License (cc-zero)')
@@ -294,7 +294,6 @@ def test_distribution_format_format_only(self):
 
         resource = datasets[0]['resources'][0]
 
-
     def test_temporals_accepted_formats(self):
         contents = self._get_file_contents('dataset-datetimes.xml')
         p = RDFParser(profiles=['swiss_dcat_ap'])
@@ -404,6 +403,8 @@ def test_multiple_rights_statements(self):
         dataset = [d for d in p.datasets()][0]
         resource = dataset["resources"][0]
 
+        eq_(resource['rights'], u"NonCommercialAllowed-CommercialWithPermission-ReferenceRequired")
+
     def test_eu_themes_mapping(self):
         contents = self._get_file_contents('catalog-themes.xml')
         p = RDFParser(profiles=['swiss_dcat_ap'])
@@ -421,3 +422,21 @@ def test_eu_themes_mapping(self):
                     dataset['identifier']
                 )
             )
+
+    def test_format_media_type(self):
+        """Test that format and media type are parsed both from URIs and from
+        strings
+        """
+        contents = self._get_file_contents('dataset-media-types.xml')
+        p = RDFParser(profiles=['swiss_dcat_ap'])
+        p.parse(contents)
+
+        dataset = [d for d in p.datasets()][0]
+        results = [
+            (resource.get('format'), resource.get('media_type'))
+            for resource in dataset['resources']
+        ]
+        eq_(
+            sorted(results),
+            [('html', 'text/html'), ('json', 'application/json')]
+        )
diff --git a/ckanext/dcatapchharvest/tests/test_dcatap_ch_serialize.py b/ckanext/dcatapchharvest/tests/test_dcatap_ch_serialize.py
@@ -125,8 +125,8 @@ def test_graph_from_dataset(self):
             if resource_dict.get('format') == "HTML":
                 assert self._triple(g, distribution, DCT['format'], URIRef("http://publications.europa.eu/resource/authority/file-type/HTML"))
 
-            if resource_dict.get('format') == "1d-interleaved-parityfec":
-                assert self._triple(g, distribution, DCT['format'], URIRef("http://www.iana.org/assignments/video/1d-interleaved-parityfec"))
+            if resource_dict.get('media_type') == "application/1d-interleaved-parityfec":
+                assert self._triple(g, distribution, DCAT.mediaType, URIRef("http://www.iana.org/assignments/application/1d-interleaved-parityfec"))
 
             if resource_dict.get('temporal_resolution') == "P1D":
                 expected_literal = Literal("P1D", datatype=XSD.duration)