def _get_groups(self, subject):
    """Map the DCAT.theme values of a dataset to CKAN group dicts.

    Every theme URI is resolved to a theme of the EU theme vocabulary
    http://publications.europa.eu/resource/authority/data-theme and the
    lower-cased last path segment of that URI becomes the group name.

    Three kinds of theme URIs are recognised:

    * deprecated opendata.swiss themes (``OGD_THEMES_URI``): the base url
      is swapped for the dcat-ap.ch base url and the result is looked up
      in ``eu_theme_mapping``
    * dcat-ap.ch themes (``CHTHEMES_URI``): looked up in
      ``eu_theme_mapping`` directly
    * EU themes (``EUTHEMES_URI``): used as they are

    Themes that match none of these, or that have no entry in the
    mapping, are logged and skipped instead of aborting the parsing of
    the whole dataset.

    :param subject: node of the dataset whose DCAT.theme objects are read
    :returns: list of ``{'name': <slug>}`` dicts without duplicates, in
        the order in which the themes were first encountered
    """
    group_names = []
    seen_names = set()

    # Tolerate a falsy result (no themes at all) from the helper.
    dcat_theme_urls = self._object_value_list(subject, DCAT.theme) or []

    for dcat_theme_url in dcat_theme_urls:
        ch_theme_url = None
        eu_theme_url = None

        if dcat_theme_url.startswith(OGD_THEMES_URI):
            # Case 1: deprecated opendata.swiss theme. Only the leading
            # base url must be swapped, hence count=1.
            ch_theme_url = dcat_theme_url.replace(
                OGD_THEMES_URI, CHTHEMES_URI, 1)
        elif dcat_theme_url.startswith(CHTHEMES_URI):
            # Case 2: dcat-ap.ch theme (same themes as opendata.swiss,
            # different base url).
            ch_theme_url = dcat_theme_url
        elif dcat_theme_url.startswith(EUTHEMES_URI):
            # Case 3: already an EU theme, no lookup needed.
            eu_theme_url = dcat_theme_url

        if ch_theme_url is not None:
            # .get() plus the truthiness check keep an unknown theme or
            # an empty mapping entry from raising KeyError/IndexError.
            eu_theme_refs = eu_theme_mapping.get(URIRef(ch_theme_url))
            if eu_theme_refs:
                eu_theme_url = unicode(eu_theme_refs[0])

        search_result = None
        if eu_theme_url is not None:
            search_result = slug_id_pattern.search(eu_theme_url)

        if search_result is None:
            # Lazy %-style arguments: formatting a unicode URI into a
            # byte-string template with str.format() can raise
            # UnicodeEncodeError on Python 2.
            log.info("Could not find an EU theme that matched the "
                     "given theme: %s", dcat_theme_url)
            continue

        eu_theme_slug = search_result.group().lower()
        # Deduplicate while keeping a deterministic order.
        if eu_theme_slug not in seen_names:
            seen_names.add(eu_theme_slug)
            group_names.append(eu_theme_slug)

    return [{'name': name} for name in group_names]
def test_eu_themes_mapping(self):
    """Check that every dataset of the themes fixture gets EU groups.

    The fixture ``catalog-themes.xml`` holds datasets whose themes are
    given as opendata.swiss, dcat-ap.ch or EU theme URIs (some of them
    with duplicates). Each dataset is expected to end up with exactly
    the groups ``econ``, ``gove`` and ``soci``.
    """
    contents = self._get_file_contents('catalog-themes.xml')
    p = RDFParser(profiles=['swiss_dcat_ap'])
    p.parse(contents)

    datasets = list(p.datasets())
    # Without this guard the loop below would pass vacuously if the
    # fixture could not be parsed into any dataset.
    assert datasets, "No datasets parsed from catalog-themes.xml"

    expected_groups = [
        {'name': u'econ'},
        {'name': u'gove'},
        {'name': u'soci'},
    ]

    for dataset in datasets:
        eq_(
            # Sort on the name explicitly instead of relying on the
            # ordering of dicts.
            sorted(dataset['groups'], key=lambda group: group['name']),
            expected_groups,
            "Groups not mapped correctly for dataset {}".format(
                dataset['identifier']
            )
        )