From 9a4b893f4ce81cca76fe41afd74dd463fb09a142 Mon Sep 17 00:00:00 2001 From: Rae Knowler Date: Mon, 20 Nov 2023 15:50:10 +0100 Subject: [PATCH 01/12] feat: Map dataset groups as EU themes on import --- ckanext/dcatapchharvest/profiles.py | 49 +++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 10 deletions(-) diff --git a/ckanext/dcatapchharvest/profiles.py b/ckanext/dcatapchharvest/profiles.py index 60a272a..95e4e18 100644 --- a/ckanext/dcatapchharvest/profiles.py +++ b/ckanext/dcatapchharvest/profiles.py @@ -34,8 +34,6 @@ SPDX = Namespace('http://spdx.org/rdf/terms#') XSD = Namespace('http://www.w3.org/2001/XMLSchema#') EUTHEMES = dh.EUTHEMES -CHTHEMES_URI = "http://dcat-ap.ch/vocabulary/themes/" -CHTHEMES = Namespace(CHTHEMES_URI) ODRS = Namespace("http://schema.theodi.org/odrs#") GEOJSON_IMT = 'https://www.iana.org/assignments/media-types/application/vnd.geo+json' # noqa @@ -64,7 +62,9 @@ 'odrs': ODRS, } -ogd_theme_base_url = 'http://opendata.swiss/themes' +OGD_THEMES_URI = 'http://opendata.swiss/themes/' +CHTHEMES_URI = 'http://dcat-ap.ch/vocabulary/themes/' +EUTHEMES_URI = 'http://publications.europa.eu/resource/authority/data-theme/' slug_id_pattern = re.compile('[^/]+(?=/$|$)') @@ -418,6 +418,41 @@ def _get_eu_accrual_periodicity(self, subject): "in the official list of frequencies" % ogdch_value) return "" + def _get_groups(self, subject): + """Map the DCAT.theme values of a dataset to themes from the EU theme + vocabulary http://publications.europa.eu/resource/authority/data-theme + """ + groups = [] + dcat_theme_urls = self._object_value_list(subject, DCAT.theme) + log.warning(dcat_theme_urls) + if dcat_theme_urls: + for dcat_theme_url in dcat_theme_urls: + # Case 1: We get a deprecated opendata.swiss theme. Replace + # the base url with the dcat-ap.ch base url, so we can + # look it up in the theme mapping. + if dcat_theme_url.startswith(OGD_THEMES_URI): + dcat_theme_url = dcat_theme_url.replace( + OGD_THEMES_URI, CHTHEMES_URI) + + # Case 2: We get a dcat-ap.ch theme (the same as the + # opendata.swiss themes, but different base url). Get + # the correct EU theme from the theme mapping. + if dcat_theme_url.startswith(CHTHEMES_URI): + eu_theme_url = unicode( + eu_theme_mapping[URIRef(dcat_theme_url)]) + + # Case 3: We get an EU theme and don't need to look it up in + # the mapping. + if dcat_theme_url.startswith(EUTHEMES_URI): + eu_theme_url = dcat_theme_url + + search_result = slug_id_pattern.search(eu_theme_url) + eu_theme_slug = search_result.group() + groups.append({'name': eu_theme_slug}) + + log.warning(groups) + return groups + def parse_dataset(self, dataset_dict, dataset_ref): # noqa log.debug("Parsing dataset '%r'" % dataset_ref) @@ -475,13 +510,7 @@ def parse_dataset(self, dataset_dict, dataset_ref): # noqa dataset_dict['keywords'] = self._keywords(dataset_ref) # Themes - dcat_theme_urls = self._object_value_list(dataset_ref, DCAT.theme) - if dcat_theme_urls: - dataset_dict['groups'] = [] - for dcat_theme_url in dcat_theme_urls: - search_result = slug_id_pattern.search(dcat_theme_url) - dcat_theme_slug = search_result.group() - dataset_dict['groups'].append({'name': dcat_theme_slug}) + dataset_dict['groups'] = self._get_groups(dataset_ref) # Languages languages = self._object_value_list(dataset_ref, DCT.language) From 4d3af7fc8408a7a04e0d1d591d7923e997865d68 Mon Sep 17 00:00:00 2001 From: Rae Knowler Date: Mon, 20 Nov 2023 15:54:24 +0100 Subject: [PATCH 02/12] tests: Update test to use EU theme, not opendata.swiss theme --- ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py b/ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py index d854379..320cf07 100644 --- a/ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py +++ b/ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py @@ -120,7 +120,7 @@ def test_dataset_all_fields(self): # Lists eq_(sorted(dataset['language']), [u'de', u'fr']) - eq_(sorted(dataset['groups']), [{'name': u'statistical-basis'}]) + eq_(sorted(dataset['groups']), [{'name': u'GOVE'}]) eq_( sorted(dataset['documentation']), ['https://example.com/documentation-dataset-1', 'https://example.com/documentation-dataset-2'] From bf42f968a21715f5559ba5a3366f96d79fdfb528 Mon Sep 17 00:00:00 2001 From: Rae Knowler Date: Mon, 20 Nov 2023 15:55:31 +0100 Subject: [PATCH 03/12] chore: Remove logging --- ckanext/dcatapchharvest/profiles.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ckanext/dcatapchharvest/profiles.py b/ckanext/dcatapchharvest/profiles.py index 95e4e18..a0f2954 100644 --- a/ckanext/dcatapchharvest/profiles.py +++ b/ckanext/dcatapchharvest/profiles.py @@ -424,7 +424,7 @@ def _get_groups(self, subject): """ groups = [] dcat_theme_urls = self._object_value_list(subject, DCAT.theme) - log.warning(dcat_theme_urls) + if dcat_theme_urls: for dcat_theme_url in dcat_theme_urls: # Case 1: We get a deprecated opendata.swiss theme. Replace @@ -450,7 +450,6 @@ def _get_groups(self, subject): eu_theme_slug = search_result.group() groups.append({'name': eu_theme_slug}) - log.warning(groups) return groups def parse_dataset(self, dataset_dict, dataset_ref): # noqa From 70b7a8f43c293e65f2c2987c7880cda10637d022 Mon Sep 17 00:00:00 2001 From: Rae Knowler Date: Mon, 20 Nov 2023 15:55:55 +0100 Subject: [PATCH 04/12] feat: Deduplicate list of groups --- ckanext/dcatapchharvest/profiles.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ckanext/dcatapchharvest/profiles.py b/ckanext/dcatapchharvest/profiles.py index a0f2954..365bf5d 100644 --- a/ckanext/dcatapchharvest/profiles.py +++ b/ckanext/dcatapchharvest/profiles.py @@ -450,7 +450,7 @@ def _get_groups(self, subject): eu_theme_slug = search_result.group() groups.append({'name': eu_theme_slug}) - return groups + return list(set(groups)) def parse_dataset(self, dataset_dict, dataset_ref): # noqa log.debug("Parsing dataset '%r'" % dataset_ref) From b4c32b1472d0b16a82292b847b8039750c513a52 Mon Sep 17 00:00:00 2001 From: Rae Knowler Date: Mon, 20 Nov 2023 16:08:01 +0100 Subject: [PATCH 05/12] feat: Improve error handling and deduplication of groups --- ckanext/dcatapchharvest/profiles.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/ckanext/dcatapchharvest/profiles.py b/ckanext/dcatapchharvest/profiles.py index 365bf5d..3a266ec 100644 --- a/ckanext/dcatapchharvest/profiles.py +++ b/ckanext/dcatapchharvest/profiles.py @@ -422,35 +422,45 @@ def _get_groups(self, subject): """Map the DCAT.theme values of a dataset to themes from the EU theme vocabulary http://publications.europa.eu/resource/authority/data-theme """ - groups = [] + group_names = [] dcat_theme_urls = self._object_value_list(subject, DCAT.theme) if dcat_theme_urls: for dcat_theme_url in dcat_theme_urls: + eu_theme_url = None + # Case 1: We get a deprecated opendata.swiss theme. Replace # the base url with the dcat-ap.ch base url, so we can # look it up in the theme mapping. if dcat_theme_url.startswith(OGD_THEMES_URI): - dcat_theme_url = dcat_theme_url.replace( + new_theme_url = dcat_theme_url.replace( OGD_THEMES_URI, CHTHEMES_URI) + eu_theme_url = unicode( + eu_theme_mapping[URIRef(new_theme_url)]) # Case 2: We get a dcat-ap.ch theme (the same as the # opendata.swiss themes, but different base url). Get # the correct EU theme from the theme mapping. - if dcat_theme_url.startswith(CHTHEMES_URI): + elif dcat_theme_url.startswith(CHTHEMES_URI): eu_theme_url = unicode( eu_theme_mapping[URIRef(dcat_theme_url)]) # Case 3: We get an EU theme and don't need to look it up in # the mapping. - if dcat_theme_url.startswith(EUTHEMES_URI): + elif dcat_theme_url.startswith(EUTHEMES_URI): eu_theme_url = dcat_theme_url + if eu_theme_url is None: + log.info("Could not find an EU theme that matched the " + "given theme: {}".format(dcat_theme_url)) + continue + search_result = slug_id_pattern.search(eu_theme_url) eu_theme_slug = search_result.group() - groups.append({'name': eu_theme_slug}) + group_names.append(eu_theme_slug) - return list(set(groups)) + # Deduplicate group names before returning list of group dicts + return [{'name': name} for name in list(set(group_names))] def parse_dataset(self, dataset_dict, dataset_ref): # noqa log.debug("Parsing dataset '%r'" % dataset_ref) From 5471f3fff86284dbbc173d77254d7a74eb8e8cb1 Mon Sep 17 00:00:00 2001 From: Rae Knowler Date: Mon, 20 Nov 2023 16:16:55 +0100 Subject: [PATCH 06/12] fix: Fix getting EU theme urls from the mapping The EU theme mapping returns a list of URIRefs of themes for each DCAT theme, but each list only contains one entry. --- ckanext/dcatapchharvest/profiles.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ckanext/dcatapchharvest/profiles.py b/ckanext/dcatapchharvest/profiles.py index 3a266ec..401ba80 100644 --- a/ckanext/dcatapchharvest/profiles.py +++ b/ckanext/dcatapchharvest/profiles.py @@ -436,14 +436,14 @@ def _get_groups(self, subject): new_theme_url = dcat_theme_url.replace( OGD_THEMES_URI, CHTHEMES_URI) eu_theme_url = unicode( - eu_theme_mapping[URIRef(new_theme_url)]) + eu_theme_mapping[URIRef(new_theme_url)][0]) # Case 2: We get a dcat-ap.ch theme (the same as the # opendata.swiss themes, but different base url). Get # the correct EU theme from the theme mapping. elif dcat_theme_url.startswith(CHTHEMES_URI): eu_theme_url = unicode( - eu_theme_mapping[URIRef(dcat_theme_url)]) + eu_theme_mapping[URIRef(dcat_theme_url)][0]) # Case 3: We get an EU theme and don't need to look it up in # the mapping. From 748ff7b3b30c9d28a2c6b21f4ff365e4fecc1615 Mon Sep 17 00:00:00 2001 From: Rae Knowler Date: Mon, 20 Nov 2023 16:30:56 +0100 Subject: [PATCH 07/12] tests: Add test for mapping EU themes --- .../tests/fixtures/catalog-themes.xml | 49 +++++++++++++++++++ .../tests/test_dcatap_ch_parse.py | 18 +++++++ 2 files changed, 67 insertions(+) create mode 100644 ckanext/dcatapchharvest/tests/fixtures/catalog-themes.xml diff --git a/ckanext/dcatapchharvest/tests/fixtures/catalog-themes.xml b/ckanext/dcatapchharvest/tests/fixtures/catalog-themes.xml new file mode 100644 index 0000000..0ebb8c9 --- /dev/null +++ b/ckanext/dcatapchharvest/tests/fixtures/catalog-themes.xml @@ -0,0 +1,49 @@ + + + + BFS - OGD Collection + BFS - OGD Collection + + + opendata-swiss-themes@bundesamt-fur-statistik-bfs + + + + + + + + opendata-swiss-themes-dedupe@bundesamt-fur-statistik-bfs + + + + + + + + + + dcat-ap-ch-themes@bundesamt-fur-statistik-bfs + + + + + + dcat-ap-ch-themes-dedupe@bundesamt-fur-statistik-bfs + + + + + + + + + + eu-themes@bundesamt-fur-statistik-bfs + + + + + + + diff --git a/ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py b/ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py index 320cf07..fd13cd3 100644 --- a/ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py +++ b/ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py @@ -405,3 +405,21 @@ def test_multiple_rights_statements(self): p.parse(contents) dataset = [d for d in p.datasets()][0] resource = dataset["resources"][0] + + def test_eu_themes_mapping(self): + contents = self._get_file_contents('catalog-themes.xml') + p = RDFParser(profiles=['swiss_dcat_ap']) + p.parse(contents) + + for dataset in p.datasets(): + eq_( + sorted(dataset['groups']), + [ + {'name': u'ECON'}, + {'name': u'GOVE'}, + {'name': u'SOCI'}, + ], + "Groups not mapped correctly for dataset {}".format( + dataset['identifier'] + ) + ) From 16d33cd1a018e9582414eb9849a148b2ff038e23 Mon Sep 17 00:00:00 2001 From: Rae Knowler Date: Mon, 20 Nov 2023 16:35:06 +0100 Subject: [PATCH 08/12] tests: Fix xml file --- ckanext/dcatapchharvest/tests/fixtures/catalog-themes.xml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ckanext/dcatapchharvest/tests/fixtures/catalog-themes.xml b/ckanext/dcatapchharvest/tests/fixtures/catalog-themes.xml index 0ebb8c9..1525b00 100644 --- a/ckanext/dcatapchharvest/tests/fixtures/catalog-themes.xml +++ b/ckanext/dcatapchharvest/tests/fixtures/catalog-themes.xml @@ -28,6 +28,8 @@ + + dcat-ap-ch-themes-dedupe@bundesamt-fur-statistik-bfs From a6fc7af21392d19d1f0ced7de0940d010608fbab Mon Sep 17 00:00:00 2001 From: Rae Knowler Date: Tue, 21 Nov 2023 17:15:30 +0100 Subject: [PATCH 09/12] feat: Export EU themes to RDF --- ckanext/dcatapchharvest/profiles.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/ckanext/dcatapchharvest/profiles.py b/ckanext/dcatapchharvest/profiles.py index 401ba80..d499d67 100644 --- a/ckanext/dcatapchharvest/profiles.py +++ b/ckanext/dcatapchharvest/profiles.py @@ -874,18 +874,11 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): # noqa # Themes groups = self._get_dataset_value(dataset_dict, 'groups', []) for group_name in groups: - ogdch_theme_ref = URIRef(CHTHEMES_URI + group_name.get('name')) - eu_theme_ref_list = eu_theme_mapping.get(ogdch_theme_ref) - for eu_theme_ref in eu_theme_ref_list: - g.add(( - dataset_ref, - DCAT.theme, - eu_theme_ref, - )) + eu_theme_ref = URIRef(EUTHEMES_URI + group_name.get('name')) g.add(( dataset_ref, DCAT.theme, - ogdch_theme_ref, + eu_theme_ref, )) # Resources From 8437704d7f3c5237c0a05dd37c390f0897c06a42 Mon Sep 17 00:00:00 2001 From: Rae Knowler Date: Wed, 22 Nov 2023 16:02:52 +0100 Subject: [PATCH 10/12] feat: Set group names to lower case --- ckanext/dcatapchharvest/profiles.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ckanext/dcatapchharvest/profiles.py b/ckanext/dcatapchharvest/profiles.py index d499d67..df39aba 100644 --- a/ckanext/dcatapchharvest/profiles.py +++ b/ckanext/dcatapchharvest/profiles.py @@ -456,7 +456,7 @@ def _get_groups(self, subject): continue search_result = slug_id_pattern.search(eu_theme_url) - eu_theme_slug = search_result.group() + eu_theme_slug = search_result.group().lower() group_names.append(eu_theme_slug) # Deduplicate group names before returning list of group dicts From 7594afb8f049cf12eabceb2e8f7a79e2a4f37871 Mon Sep 17 00:00:00 2001 From: Rae Knowler Date: Wed, 22 Nov 2023 16:23:10 +0100 Subject: [PATCH 11/12] tests: Update tests with lower-case group names --- ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py b/ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py index fd13cd3..fb0a7e9 100644 --- a/ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py +++ b/ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py @@ -120,7 +120,7 @@ def test_dataset_all_fields(self): # Lists eq_(sorted(dataset['language']), [u'de', u'fr']) - eq_(sorted(dataset['groups']), [{'name': u'GOVE'}]) + eq_(sorted(dataset['groups']), [{'name': u'gove'}]) eq_( sorted(dataset['documentation']), ['https://example.com/documentation-dataset-1', 'https://example.com/documentation-dataset-2'] @@ -415,9 +415,9 @@ def test_eu_themes_mapping(self): eq_( sorted(dataset['groups']), [ - {'name': u'ECON'}, - {'name': u'GOVE'}, - {'name': u'SOCI'}, + {'name': u'econ'}, + {'name': u'gove'}, + {'name': u'soci'}, ], "Groups not mapped correctly for dataset {}".format( dataset['identifier'] From 62625d5e88f6e7cc12ed8d910e87c8660f3e370b Mon Sep 17 00:00:00 2001 From: Rae Knowler Date: Thu, 23 Nov 2023 10:33:50 +0100 Subject: [PATCH 12/12] style: Make quote marks consistent --- ckanext/dcatapchharvest/profiles.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ckanext/dcatapchharvest/profiles.py b/ckanext/dcatapchharvest/profiles.py index df39aba..984752d 100644 --- a/ckanext/dcatapchharvest/profiles.py +++ b/ckanext/dcatapchharvest/profiles.py @@ -22,11 +22,11 @@ DCT = dh.DCT -DCAT = Namespace("http://www.w3.org/ns/dcat#") -VCARD = Namespace("http://www.w3.org/2006/vcard/ns#") +DCAT = Namespace('http://www.w3.org/ns/dcat#') +VCARD = Namespace('http://www.w3.org/2006/vcard/ns#') SCHEMA = Namespace('http://schema.org/') -ADMS = Namespace("http://www.w3.org/ns/adms#") -FOAF = Namespace("http://xmlns.com/foaf/0.1/") +ADMS = Namespace('http://www.w3.org/ns/adms#') +FOAF = Namespace('http://xmlns.com/foaf/0.1/') TIME = Namespace('http://www.w3.org/2006/time') LOCN = Namespace('http://www.w3.org/ns/locn#') GSP = Namespace('http://www.opengis.net/ont/geosparql#') @@ -34,7 +34,7 @@ SPDX = Namespace('http://spdx.org/rdf/terms#') XSD = Namespace('http://www.w3.org/2001/XMLSchema#') EUTHEMES = dh.EUTHEMES -ODRS = Namespace("http://schema.theodi.org/odrs#") +ODRS = Namespace('http://schema.theodi.org/odrs#') GEOJSON_IMT = 'https://www.iana.org/assignments/media-types/application/vnd.geo+json' # noqa