opendata-swiss · bellisk · Nov 23, 2023 · Nov 20, 2023 · Nov 20, 2023 · Nov 20, 2023
diff --git a/ckanext/dcatapchharvest/profiles.py b/ckanext/dcatapchharvest/profiles.py
@@ -34,8 +34,6 @@
 SPDX = Namespace('http://spdx.org/rdf/terms#')
 XSD = Namespace('http://www.w3.org/2001/XMLSchema#')
 EUTHEMES = dh.EUTHEMES
-CHTHEMES_URI = "http://dcat-ap.ch/vocabulary/themes/"
-CHTHEMES = Namespace(CHTHEMES_URI)
 ODRS = Namespace("http://schema.theodi.org/odrs#")
 
 GEOJSON_IMT = 'https://www.iana.org/assignments/media-types/application/vnd.geo+json'  # noqa
@@ -64,7 +62,9 @@
     'odrs': ODRS,
 }
 
-ogd_theme_base_url = 'http://opendata.swiss/themes'
+OGD_THEMES_URI = 'http://opendata.swiss/themes/'
+CHTHEMES_URI = 'http://dcat-ap.ch/vocabulary/themes/'
+EUTHEMES_URI = 'http://publications.europa.eu/resource/authority/data-theme/'
 
 slug_id_pattern = re.compile('[^/]+(?=/$|$)')
 
@@ -418,6 +418,50 @@ def _get_eu_accrual_periodicity(self, subject):
                  "in the official list of frequencies" % ogdch_value)
         return ""
 
+    def _get_groups(self, subject):
+        """Map the DCAT.theme values of a dataset to EU theme groups.
+
+        Themes are resolved to slugs of the EU theme vocabulary
+        http://publications.europa.eu/resource/authority/data-theme;
+        themes that cannot be resolved are logged and skipped.
+
+        :param subject: RDF node whose DCAT.theme values are read
+        :returns: de-duplicated, name-sorted list of {'name': slug} dicts
+        """
+        group_names = set()
+        dcat_theme_urls = self._object_value_list(subject, DCAT.theme)
+        for dcat_theme_url in dcat_theme_urls or []:
+            ch_theme_url = None
+            eu_theme_url = None
+
+            if dcat_theme_url.startswith(OGD_THEMES_URI):
+                # Deprecated opendata.swiss theme: swap in the dcat-ap.ch
+                # base url so it can be looked up in the theme mapping.
+                ch_theme_url = dcat_theme_url.replace(
+                    OGD_THEMES_URI, CHTHEMES_URI)
+            elif dcat_theme_url.startswith(CHTHEMES_URI):
+                ch_theme_url = dcat_theme_url
+            elif dcat_theme_url.startswith(EUTHEMES_URI):
+                # Already an EU theme: no lookup in the mapping needed.
+                eu_theme_url = dcat_theme_url
+
+            if ch_theme_url is not None:
+                try:
+                    eu_theme_url = unicode(
+                        eu_theme_mapping[URIRef(ch_theme_url)][0])
+                except (KeyError, IndexError):
+                    # Not in the mapping: log and skip instead of crashing.
+                    pass
+
+            if eu_theme_url is None:
+                # %s args, not str.format(): non-ascii unicode safe on py2
+                log.info("Could not find an EU theme that matched the "
+                         "given theme: %s", dcat_theme_url)
+                continue
+
+            group_names.add(slug_id_pattern.search(eu_theme_url).group())
+        return [{'name': name} for name in sorted(group_names)]
+
     def parse_dataset(self, dataset_dict, dataset_ref):  # noqa
         log.debug("Parsing dataset '%r'" % dataset_ref)
 
@@ -475,13 +519,7 @@ def parse_dataset(self, dataset_dict, dataset_ref):  # noqa
         dataset_dict['keywords'] = self._keywords(dataset_ref)
 
         # Themes
-        dcat_theme_urls = self._object_value_list(dataset_ref, DCAT.theme)
-        if dcat_theme_urls:
-            dataset_dict['groups'] = []
-            for dcat_theme_url in dcat_theme_urls:
-                search_result = slug_id_pattern.search(dcat_theme_url)
-                dcat_theme_slug = search_result.group()
-                dataset_dict['groups'].append({'name': dcat_theme_slug})
+        dataset_dict['groups'] = self._get_groups(dataset_ref)
 
         #  Languages
         languages = self._object_value_list(dataset_ref, DCT.language)

diff --git a/ckanext/dcatapchharvest/tests/fixtures/catalog-themes.xml b/ckanext/dcatapchharvest/tests/fixtures/catalog-themes.xml
@@ -0,0 +1,51 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<rdf:RDF xmlns:foaf="http://xmlns.com/foaf/0.1/" xmlns:hydra="http://www.w3.org/ns/hydra/core#" xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:schema="http://schema.org/" xmlns:vcard="http://www.w3.org/2006/vcard/ns#" xmlns:odrs="http://schema.theodi.org/odrs#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:dct="http://purl.org/dc/terms/">
+    <dcat:Catalog>
+        <dct:description>BFS - OGD Collection</dct:description>
+        <dct:title>BFS - OGD Collection</dct:title>
+        <dcat:dataset>
+            <dcat:Dataset>
+                <dct:identifier>opendata-swiss-themes@bundesamt-fur-statistik-bfs</dct:identifier>
+                <dcat:theme rdf:resource="http://opendata.swiss/themes/statistical-basis"/>
+                <dcat:theme rdf:resource="http://opendata.swiss/themes/work"/>
+                <dcat:theme rdf:resource="http://opendata.swiss/themes/construction"/>
+            </dcat:Dataset>
+        </dcat:dataset>
+        <dcat:dataset>
+            <dcat:Dataset>
+                <dct:identifier>opendata-swiss-themes-dedupe@bundesamt-fur-statistik-bfs</dct:identifier>
+                <dcat:theme rdf:resource="http://opendata.swiss/themes/statistical-basis"/>
+                <dcat:theme rdf:resource="http://opendata.swiss/themes/work"/>
+                <dcat:theme rdf:resource="http://opendata.swiss/themes/finances"/>
+                <dcat:theme rdf:resource="http://opendata.swiss/themes/construction"/>
+                <dcat:theme rdf:resource="http://opendata.swiss/themes/population"/>
+            </dcat:Dataset>
+        </dcat:dataset>
+        <dcat:dataset>
+            <dcat:Dataset>
+                <dct:identifier>dcat-ap-ch-themes@bundesamt-fur-statistik-bfs</dct:identifier>
+                <dcat:theme rdf:resource="http://dcat-ap.ch/vocabulary/themes/statistical-basis"/>
+                <dcat:theme rdf:resource="http://dcat-ap.ch/vocabulary/themes/work"/>
+                <dcat:theme rdf:resource="http://dcat-ap.ch/vocabulary/themes/construction"/>
+            </dcat:Dataset>
+        </dcat:dataset>
+        <dcat:dataset>
+            <dcat:Dataset>
+                <dct:identifier>dcat-ap-ch-themes-dedupe@bundesamt-fur-statistik-bfs</dct:identifier>
+                <dcat:theme rdf:resource="http://dcat-ap.ch/vocabulary/themes/statistical-basis"/>
+                <dcat:theme rdf:resource="http://dcat-ap.ch/vocabulary/themes/work"/>
+                <dcat:theme rdf:resource="http://dcat-ap.ch/vocabulary/themes/finances"/>
+                <dcat:theme rdf:resource="http://dcat-ap.ch/vocabulary/themes/construction"/>
+                <dcat:theme rdf:resource="http://dcat-ap.ch/vocabulary/themes/population"/>
+            </dcat:Dataset>
+        </dcat:dataset>
+        <dcat:dataset>
+            <dcat:Dataset>
+                <dct:identifier>eu-themes@bundesamt-fur-statistik-bfs</dct:identifier>
+                <dcat:theme rdf:resource="http://publications.europa.eu/resource/authority/data-theme/GOVE"/>
+                <dcat:theme rdf:resource="http://publications.europa.eu/resource/authority/data-theme/SOCI"/>
+                <dcat:theme rdf:resource="http://publications.europa.eu/resource/authority/data-theme/ECON"/>
+            </dcat:Dataset>
+        </dcat:dataset>
+    </dcat:Catalog>
+</rdf:RDF>
diff --git a/ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py b/ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py
@@ -120,7 +120,7 @@ def test_dataset_all_fields(self):
 
         #  Lists
         eq_(sorted(dataset['language']), [u'de', u'fr'])
-        eq_(sorted(dataset['groups']), [{'name': u'statistical-basis'}])
+        eq_(sorted(dataset['groups']), [{'name': u'GOVE'}])
         eq_(
             sorted(dataset['documentation']),
             ['https://example.com/documentation-dataset-1', 'https://example.com/documentation-dataset-2']
@@ -405,3 +405,21 @@ def test_multiple_rights_statements(self):
         p.parse(contents)
         dataset = [d for d in p.datasets()][0]
         resource = dataset["resources"][0]
+
+    def test_eu_themes_mapping(self):
+        """Every dataset in catalog-themes.xml maps to ECON, GOVE, SOCI."""
+        contents = self._get_file_contents('catalog-themes.xml')
+        p = RDFParser(profiles=['swiss_dcat_ap'])
+        p.parse(contents)
+        datasets = list(p.datasets())
+
+        # The fixture holds five datasets; never pass on an empty result.
+        eq_(len(datasets), 5)
+        for dataset in datasets:
+            eq_(
+                sorted(group['name'] for group in dataset['groups']),
+                [u'ECON', u'GOVE', u'SOCI'],
+                "Groups not mapped correctly for dataset {}".format(
+                    dataset['identifier']
+                )
+            )