Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Map dataset groups as EU themes on import #93

Merged
merged 12 commits into from
Nov 23, 2023
58 changes: 48 additions & 10 deletions ckanext/dcatapchharvest/profiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,6 @@
SPDX = Namespace('http://spdx.org/rdf/terms#')
XSD = Namespace('http://www.w3.org/2001/XMLSchema#')
EUTHEMES = dh.EUTHEMES
CHTHEMES_URI = "http://dcat-ap.ch/vocabulary/themes/"
CHTHEMES = Namespace(CHTHEMES_URI)
bellisk marked this conversation as resolved.
Show resolved Hide resolved
ODRS = Namespace("http://schema.theodi.org/odrs#")
bellisk marked this conversation as resolved.
Show resolved Hide resolved

GEOJSON_IMT = 'https://www.iana.org/assignments/media-types/application/vnd.geo+json' # noqa
Expand Down Expand Up @@ -64,7 +62,9 @@
'odrs': ODRS,
}

# NOTE(review): lower-case variant of the opendata.swiss theme base url
# (without trailing slash). No use of it is visible in this section --
# confirm whether it is still needed alongside OGD_THEMES_URI.
ogd_theme_base_url = 'http://opendata.swiss/themes'
# Base URIs of the theme vocabularies that can appear as dcat:theme values.
# _get_groups compares incoming theme URLs against these prefixes:
# deprecated opendata.swiss themes, dcat-ap.ch themes and EU data themes.
OGD_THEMES_URI = 'http://opendata.swiss/themes/'
CHTHEMES_URI = 'http://dcat-ap.ch/vocabulary/themes/'
EUTHEMES_URI = 'http://publications.europa.eu/resource/authority/data-theme/'

# Matches the last run of non-slash characters in a string, i.e. the final
# path segment of a URL, whether or not the URL ends with a trailing slash.
slug_id_pattern = re.compile('[^/]+(?=/$|$)')

Expand Down Expand Up @@ -418,6 +418,50 @@ def _get_eu_accrual_periodicity(self, subject):
"in the official list of frequencies" % ogdch_value)
return ""

def _get_groups(self, subject):
    """Map the DCAT.theme values of a dataset to themes from the EU theme
    vocabulary http://publications.europa.eu/resource/authority/data-theme

    Three kinds of theme URLs are recognised:

    1. Deprecated opendata.swiss themes: the base url is swapped for the
       dcat-ap.ch base url and the result is looked up in the theme
       mapping.
    2. dcat-ap.ch themes: looked up in the theme mapping directly.
    3. EU themes: used as they are.

    Themes that match none of these, or that have no entry in the theme
    mapping, are logged and skipped instead of aborting the import of the
    whole dataset.

    :param subject: the dataset node whose DCAT.theme values are read
    :returns: list of group dicts ``{'name': <EU theme slug>}`` without
        duplicates, in the order in which the themes were first seen
    """
    group_names = []
    seen_names = set()
    dcat_theme_urls = self._object_value_list(subject, DCAT.theme) or []

    for dcat_theme_url in dcat_theme_urls:
        eu_theme_url = None
        ch_theme_url = None

        # Case 1: We get a deprecated opendata.swiss theme. Replace
        # the base url with the dcat-ap.ch base url, so we can
        # look it up in the theme mapping.
        if dcat_theme_url.startswith(OGD_THEMES_URI):
            ch_theme_url = (
                CHTHEMES_URI + dcat_theme_url[len(OGD_THEMES_URI):])

        # Case 2: We get a dcat-ap.ch theme (the same as the
        # opendata.swiss themes, but different base url).
        elif dcat_theme_url.startswith(CHTHEMES_URI):
            ch_theme_url = dcat_theme_url

        # Case 3: We get an EU theme and don't need to look it up in
        # the mapping.
        elif dcat_theme_url.startswith(EUTHEMES_URI):
            eu_theme_url = dcat_theme_url

        if ch_theme_url is not None:
            # An unknown theme slug must not break the whole import:
            # treat a missing or empty mapping entry as "no EU theme".
            try:
                eu_theme_url = unicode(
                    eu_theme_mapping[URIRef(ch_theme_url)][0])
            except (KeyError, IndexError):
                eu_theme_url = None

        search_result = None
        if eu_theme_url is not None:
            search_result = slug_id_pattern.search(eu_theme_url)

        if search_result is None:
            # Lazy logging arguments: no formatting error is raised for
            # theme urls that contain non-ascii characters.
            log.info(
                "Could not find an EU theme that matched the "
                "given theme: %s", dcat_theme_url)
            continue

        # Deduplicate while keeping a deterministic order.
        eu_theme_slug = search_result.group()
        if eu_theme_slug not in seen_names:
            seen_names.add(eu_theme_slug)
            group_names.append(eu_theme_slug)

    return [{'name': name} for name in group_names]

def parse_dataset(self, dataset_dict, dataset_ref): # noqa
log.debug("Parsing dataset '%r'" % dataset_ref)

Expand Down Expand Up @@ -475,13 +519,7 @@ def parse_dataset(self, dataset_dict, dataset_ref): # noqa
dataset_dict['keywords'] = self._keywords(dataset_ref)

# Themes
dcat_theme_urls = self._object_value_list(dataset_ref, DCAT.theme)
if dcat_theme_urls:
dataset_dict['groups'] = []
for dcat_theme_url in dcat_theme_urls:
search_result = slug_id_pattern.search(dcat_theme_url)
dcat_theme_slug = search_result.group()
dataset_dict['groups'].append({'name': dcat_theme_slug})
dataset_dict['groups'] = self._get_groups(dataset_ref)

# Languages
languages = self._object_value_list(dataset_ref, DCT.language)
Expand Down
49 changes: 49 additions & 0 deletions ckanext/dcatapchharvest/tests/fixtures/catalog-themes.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
<?xml version="1.0" encoding="UTF-8"?>
<rdf:RDF xmlns:foaf="http://xmlns.com/foaf/0.1/" xmlns:hydra="http://www.w3.org/ns/hydra/core#" xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#" xmlns:dcat="http://www.w3.org/ns/dcat#" xmlns:schema="http://schema.org/" xmlns:vcard="http://www.w3.org/2006/vcard/ns#" xmlns:odrs="http://schema.theodi.org/odrs#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:dct="http://purl.org/dc/terms/">
<dcat:Catalog>
<dct:description>BFS - OGD Collection</dct:description>
<dct:title>BFS - OGD Collection</dct:title>
<dcat:dataset>
<dcat:Dataset>
<dct:identifier>opendata-swiss-themes@bundesamt-fur-statistik-bfs</dct:identifier>
<dcat:theme rdf:resource="http://opendata.swiss/themes/statistical-basis"/>
<dcat:theme rdf:resource="http://opendata.swiss/themes/work"/>
<dcat:theme rdf:resource="http://opendata.swiss/themes/construction"/>
</dcat:Dataset>
</dcat:dataset>
<dcat:dataset>
<dcat:Dataset>
<dct:identifier>opendata-swiss-themes-dedupe@bundesamt-fur-statistik-bfs</dct:identifier>
<dcat:theme rdf:resource="http://opendata.swiss/themes/statistical-basis"/>
<dcat:theme rdf:resource="http://opendata.swiss/themes/work"/>
<dcat:theme rdf:resource="http://opendata.swiss/themes/finances"/>
<dcat:theme rdf:resource="http://opendata.swiss/themes/construction"/>
<dcat:theme rdf:resource="http://opendata.swiss/themes/population"/>
</dcat:Dataset>
</dcat:dataset>
<dcat:dataset>
<dcat:Dataset>
<dct:identifier>dcat-ap-ch-themes@bundesamt-fur-statistik-bfs</dct:identifier>
<dcat:theme rdf:resource="http://dcat-ap.ch/vocabulary/themes/statistical-basis"/>
<dcat:theme rdf:resource="http://dcat-ap.ch/vocabulary/themes/work"/>
<dcat:theme rdf:resource="http://dcat-ap.ch/vocabulary/themes/construction"/>
</dcat:Dataset>
</dcat:dataset>
<dcat:dataset>
<dcat:Dataset>
<dct:identifier>dcat-ap-ch-themes-dedupe@bundesamt-fur-statistik-bfs</dct:identifier>
<dcat:theme rdf:resource="http://dcat-ap.ch/vocabulary/themes/statistical-basis"/>
<dcat:theme rdf:resource="http://dcat-ap.ch/vocabulary/themes/work"/>
<dcat:theme rdf:resource="http://dcat-ap.ch/vocabulary/themes/finances"/>
<dcat:theme rdf:resource="http://dcat-ap.ch/vocabulary/themes/construction"/>
<dcat:theme rdf:resource="http://dcat-ap.ch/vocabulary/themes/population"/>
</dcat:Dataset>
</dcat:dataset>
<dcat:dataset>
<dcat:Dataset>
<dct:identifier>eu-themes@bundesamt-fur-statistik-bfs</dct:identifier>
<dcat:theme rdf:resource="http://publications.europa.eu/resource/authority/data-theme/GOVE"/>
<dcat:theme rdf:resource="http://publications.europa.eu/resource/authority/data-theme/SOCI"/>
<dcat:theme rdf:resource="http://publications.europa.eu/resource/authority/data-theme/ECON"/>
</dcat:Dataset>
</dcat:dataset>
</dcat:Catalog>
</rdf:RDF>
20 changes: 19 additions & 1 deletion ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ def test_dataset_all_fields(self):

# Lists
eq_(sorted(dataset['language']), [u'de', u'fr'])
eq_(sorted(dataset['groups']), [{'name': u'statistical-basis'}])
eq_(sorted(dataset['groups']), [{'name': u'GOVE'}])
eq_(
sorted(dataset['documentation']),
['https://example.com/documentation-dataset-1', 'https://example.com/documentation-dataset-2']
Expand Down Expand Up @@ -405,3 +405,21 @@ def test_multiple_rights_statements(self):
p.parse(contents)
dataset = [d for d in p.datasets()][0]
resource = dataset["resources"][0]

def test_eu_themes_mapping(self):
    """Every dataset in catalog-themes.xml must end up in the same three
    EU theme groups, whichever theme vocabulary it was described with.
    """
    expected_groups = [
        {'name': u'ECON'},
        {'name': u'GOVE'},
        {'name': u'SOCI'},
    ]

    rdf_contents = self._get_file_contents('catalog-themes.xml')
    parser = RDFParser(profiles=['swiss_dcat_ap'])
    parser.parse(rdf_contents)

    for parsed_dataset in parser.datasets():
        actual_groups = sorted(parsed_dataset['groups'])
        failure_message = (
            "Groups not mapped correctly for dataset {}".format(
                parsed_dataset['identifier']))
        eq_(actual_groups, expected_groups, failure_message)
kovalch marked this conversation as resolved.
Show resolved Hide resolved
Loading