From 41eb58945ee1d77ca515bdef5f44b96f3f074458 Mon Sep 17 00:00:00 2001 From: kovalch Date: Mon, 29 Jul 2024 16:07:32 +0200 Subject: [PATCH 01/19] fix: Update parsing of the dataset of the license and rights fields --- ckanext/dcatapchharvest/dcat_helpers.py | 89 ++++++++++++++++++++----- ckanext/dcatapchharvest/profiles.py | 37 +++++----- 2 files changed, 91 insertions(+), 35 deletions(-) diff --git a/ckanext/dcatapchharvest/dcat_helpers.py b/ckanext/dcatapchharvest/dcat_helpers.py index 2e1d800..9312cd5 100644 --- a/ckanext/dcatapchharvest/dcat_helpers.py +++ b/ckanext/dcatapchharvest/dcat_helpers.py @@ -4,7 +4,7 @@ from urlparse import urlparse from ckantoolkit import config from rdflib import URIRef, Graph -from rdflib.namespace import Namespace, RDF, SKOS +from rdflib.namespace import Namespace, RDF, SKOS, FOAF import xml.etree.ElementTree as ET import logging @@ -17,6 +17,7 @@ SKOSXL = Namespace("http://www.w3.org/2008/05/skos-xl#") RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#") +FOAF = Namespace("http://xmlns.com/foaf/0.1/") frequency_namespaces = { "skos": SKOS, @@ -38,6 +39,7 @@ "skosxl": SKOSXL, "rdf": RDF, "rdfs": RDFS, + "foaf": FOAF, } theme_namespaces = { @@ -167,38 +169,95 @@ def get_frequency_values(): return frequency_mapping -def get_license_uri_by_name(vocabulary_name): - license_vocabulary = get_license_values() - for key, value in license_vocabulary.items(): +def get_license_ref_uri_by_name(vocabulary_name): + _, license_ref_literal_vocabulary, _ = get_license_values() + for key, value in license_ref_literal_vocabulary.items(): if unicode(vocabulary_name) == unicode(value): return key return None -def get_license_name_by_uri(vocabulary_uri): - license_vocabulary = get_license_values() - for key, value in license_vocabulary.items(): +def get_license_ref_uri_by_homepage_uri(vocabulary_name): + _, _, license_homepage_ref_vocabulary = get_license_values() + for key, value in license_homepage_ref_vocabulary.items(): + if 
unicode(vocabulary_name) == unicode(key): + return value + return None + + +def get_license_name_by_ref_uri(vocabulary_uri): + _, license_ref_literal_vocabulary, _ = get_license_values() + for key, value in license_ref_literal_vocabulary.items(): + if unicode(vocabulary_uri) == unicode(key): + return unicode(value) + return None + + +def get_license_name_by_homepage_uri(vocabulary_uri): + license_homepages_literal_vocabulary, _, _ = get_license_values() + for key, value in license_homepages_literal_vocabulary.items(): if unicode(vocabulary_uri) == unicode(key): return unicode(value) return None +def get_license_homepage_uri_by_name(vocabulary_name): + license_homepages_literal_vocabulary, _, _ = get_license_values() + for key, value in license_homepages_literal_vocabulary.items(): + if unicode(vocabulary_name) == unicode(value): + return key + return None + + +def get_license_homepage_uri_by_uri(vocabulary_uri): + _, _, license_homepage_ref_vocabulary = get_license_values() + license_homepages = list(license_homepage_ref_vocabulary.keys()) + if vocabulary_uri in license_homepages: + return unicode(vocabulary_uri) + else: + for key, value in license_homepage_ref_vocabulary.items(): + if unicode(vocabulary_uri) == unicode(value): + return unicode(key) + return + + def get_license_values(): g = Graph() - license_mapping = {} + license_ref_literal_mapping = {} + license_homepages_literal_mapping = {} + license_homepage_ref_mapping = {} + for prefix, namespace in license_namespaces.items(): g.bind(prefix, namespace) file = os.path.join(__location__, 'license.ttl') g.parse(file, format='turtle') for ogdch_license_ref in g.subjects(predicate=RDF.type, object=SKOS.Concept): - license_mapping[ogdch_license_ref] = None - for license_pref_label in g.objects(subject=ogdch_license_ref, - predicate=SKOSXL.prefLabel): - for license_literal in g.objects(subject=license_pref_label, - predicate=SKOSXL.literalForm): - license_mapping[ogdch_license_ref] = license_literal - return 
license_mapping + license_homepage = None + for homepage in g.objects(subject=ogdch_license_ref, + predicate=FOAF.homepage): + license_homepage = homepage + break # Assume one homepage per concept + + license_literal = None + try: + for license_pref_label in g.objects(subject=ogdch_license_ref, + predicate=SKOSXL.prefLabel): + for literal in g.objects(subject=license_pref_label, + predicate=SKOSXL.literalForm): + license_literal = literal + break # Assume one literal per concept + + license_homepages_literal_mapping[license_homepage] = license_literal # noqa + license_ref_literal_mapping[ogdch_license_ref] = license_literal + license_homepage_ref_mapping[license_homepage] = ogdch_license_ref + + except Exception as e: + raise ValueError("SKOSXL.prefLabel is missing in the RDF-file: %s" + % e) + + return (license_homepages_literal_mapping, license_ref_literal_mapping, + license_homepage_ref_mapping) def get_theme_mapping(): diff --git a/ckanext/dcatapchharvest/profiles.py b/ckanext/dcatapchharvest/profiles.py index 16accca..b64234f 100644 --- a/ckanext/dcatapchharvest/profiles.py +++ b/ckanext/dcatapchharvest/profiles.py @@ -277,23 +277,13 @@ def _get_iana_media_type(self, subject): if media_type_key in valid_media_types: return media_type_key - def _license_rights_name(self, subject, predicate): - for node in self.g.objects(subject, predicate): - # DCAT-AP CH v1: the license as a literal (should be - # the code for one of the DCAT-AP CH licenses) - if isinstance(node, Literal): - return unicode(node) - if isinstance(node, URIRef): - return dh.get_license_name_by_uri(node) - return None - - def _license_rights_uri(self, subject, predicate): + def _license_rights_homepage_uri(self, subject, predicate): for node in self.g.objects(subject, predicate): # DCAT-AP CH v2 compatible license has to be a URI. 
if isinstance(node, Literal): - return dh.get_license_uri_by_name(node) + return dh.get_license_homepage_uri_by_name(node) if isinstance(node, URIRef): - return node + return dh.get_license_homepage_uri_by_uri(node) return None def _keywords(self, subject): @@ -633,21 +623,28 @@ def parse_dataset(self, dataset_dict, dataset_ref): # noqa if value: resource_dict[key] = value - # Rights & License save name - rights = self._license_rights_name(distribution, DCT.rights) - license = self._license_rights_name(distribution, DCT.license) + # Rights & License save homepage uri + rights = self._license_rights_homepage_uri(distribution, DCT.rights) + license = self._license_rights_homepage_uri(distribution, DCT.license) + if rights is None and license is not None: resource_dict['license'] = license resource_dict['rights'] = license - if rights is not None and license is None: - resource_dict['license'] = rights + elif rights is not None and license is None: resource_dict['rights'] = rights - if license is not None and rights is not None: + if 'cc' not in rights: + resource_dict['license'] = rights + else: + resource_dict['license'] = None + elif license is not None and rights is not None: resource_dict['license'] = license resource_dict['rights'] = rights - if 'cc' in rights: + if 'cc' in license and 'cc' not in rights: resource_dict['license'] = rights resource_dict['rights'] = license + else: + resource_dict['license'] = None + resource_dict['rights'] = None # Format & Media type resource_dict['format'] = \ From c5263bfe500e76afc1d565b815ad3fc197b27781 Mon Sep 17 00:00:00 2001 From: kovalch Date: Mon, 29 Jul 2024 16:40:27 +0200 Subject: [PATCH 02/19] fix: Update the dataset to graph logic --- ckanext/dcatapchharvest/dcat_helpers.py | 2 +- ckanext/dcatapchharvest/profiles.py | 18 ++++++++---------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/ckanext/dcatapchharvest/dcat_helpers.py b/ckanext/dcatapchharvest/dcat_helpers.py index 9312cd5..bc57608 100644 --- 
a/ckanext/dcatapchharvest/dcat_helpers.py +++ b/ckanext/dcatapchharvest/dcat_helpers.py @@ -13,11 +13,11 @@ DCT = Namespace("http://purl.org/dc/terms/") EUTHEMES = \ Namespace("http://publications.europa.eu/resource/authority/data-theme/") +FOAF = Namespace("http://xmlns.com/foaf/0.1/") # noqa HYDRA = Namespace('http://www.w3.org/ns/hydra/core#') SKOSXL = Namespace("http://www.w3.org/2008/05/skos-xl#") RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#") -FOAF = Namespace("http://xmlns.com/foaf/0.1/") frequency_namespaces = { "skos": SKOS, diff --git a/ckanext/dcatapchharvest/profiles.py b/ckanext/dcatapchharvest/profiles.py index b64234f..040d561 100644 --- a/ckanext/dcatapchharvest/profiles.py +++ b/ckanext/dcatapchharvest/profiles.py @@ -1035,7 +1035,7 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): # noqa def _rights_and_license_to_graph(self, resource_dict, distribution): g = self.g if resource_dict.get('rights'): - rights_uri = dh.get_license_uri_by_name( + rights_uri = dh.get_license_ref_uri_by_homepage_uri( resource_dict.get('rights') ) if rights_uri is not None: @@ -1043,13 +1043,12 @@ def _rights_and_license_to_graph(self, resource_dict, distribution): g.add((rights_ref, RDF.type, DCT.RightsStatement)) g.add((distribution, DCT.rights, rights_ref)) if rights_uri is None: - rights_name = dh.get_license_name_by_uri( + rights_name = dh.get_license_name_by_homepage_uri( resource_dict.get('rights') ) if rights_name is not None: - resource_rights_ref = URIRef( - resource_dict.get('rights') - ) + resource_rights_ref = dh.get_license_ref_uri_by_name( + rights_name) g.add(( resource_rights_ref, RDF.type, @@ -1058,7 +1057,7 @@ def _rights_and_license_to_graph(self, resource_dict, distribution): g.add((distribution, DCT.rights, resource_rights_ref)) if resource_dict.get('license'): - license_uri = dh.get_license_uri_by_name( + license_uri = dh.get_license_ref_uri_by_homepage_uri( resource_dict.get('license') ) if license_uri is not None: @@ -1066,13 
+1065,12 @@ def _rights_and_license_to_graph(self, resource_dict, distribution): g.add((license_ref, RDF.type, DCT.LicenseDocument)) g.add((distribution, DCT.license, license_ref)) if license_uri is None: - license_name = dh.get_license_name_by_uri( + license_name = dh.get_license_name_by_homepage_uri( resource_dict.get('license') ) if license_name is not None: - resource_license_ref = URIRef( - resource_dict.get('license') - ) + resource_license_ref = dh.get_license_ref_uri_by_name( + license_name) g.add(( resource_license_ref, RDF.type, From 0a114cec3fbbb3d062367e129b094118709ce0e9 Mon Sep 17 00:00:00 2001 From: kovalch Date: Mon, 29 Jul 2024 20:08:33 +0200 Subject: [PATCH 03/19] fix: Update tests tu use uri and not str as a value for license and rights --- ckanext/dcatapchharvest/dcat_helpers.py | 2 +- ckanext/dcatapchharvest/profiles.py | 8 ++++++-- .../dcatapchharvest/tests/test_dcatap_ch_parse.py | 10 +++++----- .../tests/test_dcatap_ch_serialize.py | 12 ++++++------ 4 files changed, 18 insertions(+), 14 deletions(-) diff --git a/ckanext/dcatapchharvest/dcat_helpers.py b/ckanext/dcatapchharvest/dcat_helpers.py index bc57608..62292b2 100644 --- a/ckanext/dcatapchharvest/dcat_helpers.py +++ b/ckanext/dcatapchharvest/dcat_helpers.py @@ -13,7 +13,7 @@ DCT = Namespace("http://purl.org/dc/terms/") EUTHEMES = \ Namespace("http://publications.europa.eu/resource/authority/data-theme/") -FOAF = Namespace("http://xmlns.com/foaf/0.1/") # noqa +FOAF = Namespace("http://xmlns.com/foaf/0.1/") HYDRA = Namespace('http://www.w3.org/ns/hydra/core#') SKOSXL = Namespace("http://www.w3.org/2008/05/skos-xl#") diff --git a/ckanext/dcatapchharvest/profiles.py b/ckanext/dcatapchharvest/profiles.py index 040d561..6f030c5 100644 --- a/ckanext/dcatapchharvest/profiles.py +++ b/ckanext/dcatapchharvest/profiles.py @@ -624,8 +624,12 @@ def parse_dataset(self, dataset_dict, dataset_ref): # noqa resource_dict[key] = value # Rights & License save homepage uri - rights = 
self._license_rights_homepage_uri(distribution, DCT.rights) - license = self._license_rights_homepage_uri(distribution, DCT.license) + rights = self._license_rights_homepage_uri( + distribution, DCT.rights + ) + license = self._license_rights_homepage_uri( + distribution, DCT.license + ) if rights is None and license is not None: resource_dict['license'] = license diff --git a/ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py b/ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py index 26de93d..d729b38 100644 --- a/ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py +++ b/ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py @@ -32,8 +32,8 @@ def test_rights_license(self): # Resources eq_(len(dataset['resources']), 1) resource = dataset['resources'][0] - eq_(resource['rights'], u'NonCommercialAllowed-CommercialAllowed-ReferenceRequired') - eq_(resource['license'], u'NonCommercialAllowed-CommercialWithPermission-ReferenceRequired') + eq_(unicode(resource['rights']), u'https://opendata.swiss/en/terms-of-use/#terms_by') + eq_(unicode(resource['license']), u'https://opendata.swiss/en/terms-of-use/#terms_by_ask') def test_dataset_all_fields(self): @@ -146,8 +146,8 @@ def test_dataset_all_fields(self): eq_(resource['format'], u'html') eq_(resource['media_type'], u'text/html') eq_(resource['identifier'], u'346265-fr@bundesamt-fur-statistik-bfs') - eq_(resource['rights'], u'NonCommercialAllowed-CommercialAllowed-ReferenceRequired') - eq_(resource['license'], u'Creative Commons Zero 1.0 Universal (CC0 1.0)') + eq_(resource['license'], u'https://opendata.swiss/en/terms-of-use/#terms_by') + eq_(resource['rights'], u'http://www.opendefinition.org/licenses/cc-zero') eq_(resource['language'], [u'fr']) eq_(resource['issued'], u'1900-12-31T00:00:00') eq_(resource['temporal_resolution'], u'P1D') @@ -402,7 +402,7 @@ def test_multiple_rights_statements(self): dataset = [d for d in p.datasets()][0] resource = dataset["resources"][0] - eq_(resource['rights'], 
u"NonCommercialAllowed-CommercialWithPermission-ReferenceRequired") + eq_(unicode(resource['rights']), u"https://opendata.swiss/en/terms-of-use/#terms_by_ask") def test_eu_themes_mapping(self): contents = self._get_file_contents('catalog-themes.xml') diff --git a/ckanext/dcatapchharvest/tests/test_dcatap_ch_serialize.py b/ckanext/dcatapchharvest/tests/test_dcatap_ch_serialize.py index f686931..e8cc47e 100644 --- a/ckanext/dcatapchharvest/tests/test_dcatap_ch_serialize.py +++ b/ckanext/dcatapchharvest/tests/test_dcatap_ch_serialize.py @@ -113,22 +113,22 @@ def test_graph_from_dataset(self): if resource_dict.get('rights') == 'Creative Commons Zero 1.0 Universal (CC0 1.0)': assert self._triple(g, distribution, DCT.rights, URIRef("https://creativecommons.org/publicdomain/zero/1.0/")) - if resource_dict.get('license') == 'NonCommercialAllowed-CommercialAllowed-ReferenceNotRequired': + if resource_dict.get('license') == 'https://opendata.swiss/terms-of-use/#terms_open': assert self._triple(g, distribution, DCT.license, URIRef("http://dcat-ap.ch/vocabulary/licenses/terms_open")) # 28e75e40-e1a1-497b-a1b9-8c1834d60201 - if resource_dict.get('rights') == "http://dcat-ap.ch/vocabulary/licenses/terms_by": + if resource_dict.get('rights') == "https://opendata.swiss/terms-of-use#terms_by": assert self._triple(g, distribution, DCT.rights, URIRef("http://dcat-ap.ch/vocabulary/licenses/terms_by")) - if resource_dict.get('license') == "NonCommercialAllowed-CommercialAllowed-ReferenceRequired": + if resource_dict.get('license') == "https://opendata.swiss/terms-of-use#terms_by": assert self._triple(g, distribution, DCT.license, URIRef("http://dcat-ap.ch/vocabulary/licenses/terms_by")) # 0cfce6ba-28f4-4229-b733-f6492c650395 - if resource_dict.get('rights') == "http://dcat-ap.ch/vocabulary/licenses/terms_by_ask": + if resource_dict.get('rights') == "https://opendata.swiss/terms-of-use#terms_by_ask": assert self._triple(g, distribution, DCT.rights, 
URIRef("http://dcat-ap.ch/vocabulary/licenses/terms_by_ask")) - if resource_dict.get('license') == "https://creativecommons.org/licenses/by/4.0/": - assert self._triple(g, distribution, DCT.license, URIRef("https://creativecommons.org/licenses/by/4.0/")) + if resource_dict.get('rights') == " http://www.opendefinition.org/licenses/cc-by/": + assert self._triple(g, distribution, DCT.rights, URIRef("https://creativecommons.org/licenses/by/4.0/")) if resource_dict.get('format') == "CSV": assert self._triple(g, distribution, DCT['format'], URIRef("http://publications.europa.eu/resource/authority/file-type/CSV")) From 14fd57f79d5d9ca679f6b1e1e55aeab5cca26ddc Mon Sep 17 00:00:00 2001 From: kovalch Date: Mon, 29 Jul 2024 20:13:18 +0200 Subject: [PATCH 04/19] fix: Remove unsuded Namespace --- ckanext/dcatapchharvest/dcat_helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ckanext/dcatapchharvest/dcat_helpers.py b/ckanext/dcatapchharvest/dcat_helpers.py index 62292b2..992b758 100644 --- a/ckanext/dcatapchharvest/dcat_helpers.py +++ b/ckanext/dcatapchharvest/dcat_helpers.py @@ -4,7 +4,7 @@ from urlparse import urlparse from ckantoolkit import config from rdflib import URIRef, Graph -from rdflib.namespace import Namespace, RDF, SKOS, FOAF +from rdflib.namespace import Namespace, RDF, SKOS import xml.etree.ElementTree as ET import logging From c6b7d567b0478588c3ed07aceb78aec4ec94a8ee Mon Sep 17 00:00:00 2001 From: kovalch Date: Tue, 30 Jul 2024 10:17:11 +0200 Subject: [PATCH 05/19] fix: License and Rights could be just a homepage URI in our dataset --- ckanext/dcatapchharvest/profiles.py | 49 ++++++----------------------- 1 file changed, 10 insertions(+), 39 deletions(-) diff --git a/ckanext/dcatapchharvest/profiles.py b/ckanext/dcatapchharvest/profiles.py index 6f030c5..4aa79d9 100644 --- a/ckanext/dcatapchharvest/profiles.py +++ b/ckanext/dcatapchharvest/profiles.py @@ -279,7 +279,7 @@ def _get_iana_media_type(self, subject): def 
_license_rights_homepage_uri(self, subject, predicate): for node in self.g.objects(subject, predicate): - # DCAT-AP CH v2 compatible license has to be a URI. + # Rights and license has to be a homepage URI if isinstance(node, Literal): return dh.get_license_homepage_uri_by_name(node) if isinstance(node, URIRef): @@ -1039,50 +1039,21 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): # noqa def _rights_and_license_to_graph(self, resource_dict, distribution): g = self.g if resource_dict.get('rights'): - rights_uri = dh.get_license_ref_uri_by_homepage_uri( + rights_ref_uri = dh.get_license_ref_uri_by_homepage_uri( resource_dict.get('rights') ) - if rights_uri is not None: - rights_ref = URIRef(rights_uri) - g.add((rights_ref, RDF.type, DCT.RightsStatement)) - g.add((distribution, DCT.rights, rights_ref)) - if rights_uri is None: - rights_name = dh.get_license_name_by_homepage_uri( - resource_dict.get('rights') - ) - if rights_name is not None: - resource_rights_ref = dh.get_license_ref_uri_by_name( - rights_name) - g.add(( - resource_rights_ref, - RDF.type, - DCT.RightsStatement) - ) - g.add((distribution, DCT.rights, resource_rights_ref)) + rights_ref = URIRef(rights_ref_uri) + g.add((rights_ref, RDF.type, DCT.RightsStatement)) + g.add((distribution, DCT.rights, rights_ref)) if resource_dict.get('license'): - license_uri = dh.get_license_ref_uri_by_homepage_uri( + license_ref_uri = dh.get_license_ref_uri_by_homepage_uri( resource_dict.get('license') ) - if license_uri is not None: - license_ref = URIRef(license_uri) - g.add((license_ref, RDF.type, DCT.LicenseDocument)) - g.add((distribution, DCT.license, license_ref)) - if license_uri is None: - license_name = dh.get_license_name_by_homepage_uri( - resource_dict.get('license') - ) - if license_name is not None: - resource_license_ref = dh.get_license_ref_uri_by_name( - license_name) - g.add(( - resource_license_ref, - RDF.type, - DCT.LicenseDocument) - ) - g.add( - (distribution, DCT.license, 
resource_license_ref) - ) + license_ref = URIRef(license_ref_uri) + g.add((license_ref, RDF.type, DCT.LicenseDocument)) + g.add((distribution, DCT.license, license_ref)) + def _format_and_media_type_to_graph(self, resource_dict, distribution): g = self.g From d54a75a74ad18ef2b9fa94355165633e1e5922f8 Mon Sep 17 00:00:00 2001 From: kovalch Date: Tue, 30 Jul 2024 10:19:18 +0200 Subject: [PATCH 06/19] style: Remove blank line --- ckanext/dcatapchharvest/profiles.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ckanext/dcatapchharvest/profiles.py b/ckanext/dcatapchharvest/profiles.py index 4aa79d9..458de82 100644 --- a/ckanext/dcatapchharvest/profiles.py +++ b/ckanext/dcatapchharvest/profiles.py @@ -1054,7 +1054,6 @@ def _rights_and_license_to_graph(self, resource_dict, distribution): g.add((license_ref, RDF.type, DCT.LicenseDocument)) g.add((distribution, DCT.license, license_ref)) - def _format_and_media_type_to_graph(self, resource_dict, distribution): g = self.g # Export format value if it matches EU vocabulary From eec20dd4f296effc46b5452bf3e735ecce9d83b0 Mon Sep 17 00:00:00 2001 From: kovalch Date: Tue, 30 Jul 2024 10:44:13 +0200 Subject: [PATCH 07/19] fix: Rollback to the previous function look, before we migrate all licenses to uri --- ckanext/dcatapchharvest/profiles.py | 48 +++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 9 deletions(-) diff --git a/ckanext/dcatapchharvest/profiles.py b/ckanext/dcatapchharvest/profiles.py index 458de82..6f030c5 100644 --- a/ckanext/dcatapchharvest/profiles.py +++ b/ckanext/dcatapchharvest/profiles.py @@ -279,7 +279,7 @@ def _get_iana_media_type(self, subject): def _license_rights_homepage_uri(self, subject, predicate): for node in self.g.objects(subject, predicate): - # Rights and license has to be a homepage URI + # DCAT-AP CH v2 compatible license has to be a URI. 
if isinstance(node, Literal): return dh.get_license_homepage_uri_by_name(node) if isinstance(node, URIRef): @@ -1039,20 +1039,50 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): # noqa def _rights_and_license_to_graph(self, resource_dict, distribution): g = self.g if resource_dict.get('rights'): - rights_ref_uri = dh.get_license_ref_uri_by_homepage_uri( + rights_uri = dh.get_license_ref_uri_by_homepage_uri( resource_dict.get('rights') ) - rights_ref = URIRef(rights_ref_uri) - g.add((rights_ref, RDF.type, DCT.RightsStatement)) - g.add((distribution, DCT.rights, rights_ref)) + if rights_uri is not None: + rights_ref = URIRef(rights_uri) + g.add((rights_ref, RDF.type, DCT.RightsStatement)) + g.add((distribution, DCT.rights, rights_ref)) + if rights_uri is None: + rights_name = dh.get_license_name_by_homepage_uri( + resource_dict.get('rights') + ) + if rights_name is not None: + resource_rights_ref = dh.get_license_ref_uri_by_name( + rights_name) + g.add(( + resource_rights_ref, + RDF.type, + DCT.RightsStatement) + ) + g.add((distribution, DCT.rights, resource_rights_ref)) if resource_dict.get('license'): - license_ref_uri = dh.get_license_ref_uri_by_homepage_uri( + license_uri = dh.get_license_ref_uri_by_homepage_uri( resource_dict.get('license') ) - license_ref = URIRef(license_ref_uri) - g.add((license_ref, RDF.type, DCT.LicenseDocument)) - g.add((distribution, DCT.license, license_ref)) + if license_uri is not None: + license_ref = URIRef(license_uri) + g.add((license_ref, RDF.type, DCT.LicenseDocument)) + g.add((distribution, DCT.license, license_ref)) + if license_uri is None: + license_name = dh.get_license_name_by_homepage_uri( + resource_dict.get('license') + ) + if license_name is not None: + resource_license_ref = dh.get_license_ref_uri_by_name( + license_name) + g.add(( + resource_license_ref, + RDF.type, + DCT.LicenseDocument) + ) + g.add( + (distribution, DCT.license, resource_license_ref) + ) def _format_and_media_type_to_graph(self, 
resource_dict, distribution): g = self.g From b8643298bae509c7744435f525e2c9ea823a402b Mon Sep 17 00:00:00 2001 From: kovalch Date: Tue, 30 Jul 2024 10:45:02 +0200 Subject: [PATCH 08/19] fix: Update license and right values for the tests --- ckanext/dcatapchharvest/tests/fixtures/dataset.json | 6 +++--- ckanext/dcatapchharvest/tests/test_dcatap_ch_serialize.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ckanext/dcatapchharvest/tests/fixtures/dataset.json b/ckanext/dcatapchharvest/tests/fixtures/dataset.json index cf863af..7bc5cbc 100644 --- a/ckanext/dcatapchharvest/tests/fixtures/dataset.json +++ b/ckanext/dcatapchharvest/tests/fixtures/dataset.json @@ -121,8 +121,8 @@ "https://example.com/documentation-resource-1", "https://example.com/documentation-resource-2" ], - "rights": "Creative Commons Zero 1.0 Universell (CC0 1.0)", - "license": "NonCommercialAllowed-CommercialAllowed-ReferenceNotRequired", + "rights": "http://www.opendefinition.org/licenses/cc-zero", + "license": "https://opendata.swiss/terms-of-use/#terms_open", "format": "CSV", "issued": "2015-06-26T15:21:09.034694", "modified": "2015-06-30T15:21:09.000000" @@ -135,7 +135,7 @@ "https://example.com/documentation-resource-2" ], "rights": "http://dcat-ap.ch/vocabulary/licenses/terms_by", - "license": "NonCommercialAllowed-CommercialAllowed-ReferenceRequired", + "license": "https://opendata.swiss/en/terms-of-use/#terms_by", "format": "HTML" }, { diff --git a/ckanext/dcatapchharvest/tests/test_dcatap_ch_serialize.py b/ckanext/dcatapchharvest/tests/test_dcatap_ch_serialize.py index e8cc47e..333936b 100644 --- a/ckanext/dcatapchharvest/tests/test_dcatap_ch_serialize.py +++ b/ckanext/dcatapchharvest/tests/test_dcatap_ch_serialize.py @@ -110,10 +110,10 @@ def test_graph_from_dataset(self): assert self._triple(g, distribution, DCAT.accessService, URIRef(link)) # e2c50e70-67ad-4f86-bb1b-3f93867eadaa - if resource_dict.get('rights') == 'Creative Commons Zero 1.0 Universal (CC0 1.0)': 
+ if resource_dict.get('rights') == "http://www.opendefinition.org/licenses/cc-zero": assert self._triple(g, distribution, DCT.rights, URIRef("https://creativecommons.org/publicdomain/zero/1.0/")) - if resource_dict.get('license') == 'https://opendata.swiss/terms-of-use/#terms_open': + if resource_dict.get('license') == "https://opendata.swiss/terms-of-use/#terms_open": assert self._triple(g, distribution, DCT.license, URIRef("http://dcat-ap.ch/vocabulary/licenses/terms_open")) # 28e75e40-e1a1-497b-a1b9-8c1834d60201 @@ -127,7 +127,7 @@ def test_graph_from_dataset(self): if resource_dict.get('rights') == "https://opendata.swiss/terms-of-use#terms_by_ask": assert self._triple(g, distribution, DCT.rights, URIRef("http://dcat-ap.ch/vocabulary/licenses/terms_by_ask")) - if resource_dict.get('rights') == " http://www.opendefinition.org/licenses/cc-by/": + if resource_dict.get('rights') == "http://www.opendefinition.org/licenses/cc-by/": assert self._triple(g, distribution, DCT.rights, URIRef("https://creativecommons.org/licenses/by/4.0/")) if resource_dict.get('format') == "CSV": From 71e0e7e90971a1412f38db32d33bf441a7765621 Mon Sep 17 00:00:00 2001 From: kovalch Date: Mon, 5 Aug 2024 09:49:20 +0200 Subject: [PATCH 09/19] feat: Add additional check if cc-license in dct:license --- ckanext/dcatapchharvest/profiles.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ckanext/dcatapchharvest/profiles.py b/ckanext/dcatapchharvest/profiles.py index 6f030c5..103791d 100644 --- a/ckanext/dcatapchharvest/profiles.py +++ b/ckanext/dcatapchharvest/profiles.py @@ -646,6 +646,8 @@ def parse_dataset(self, dataset_dict, dataset_ref): # noqa if 'cc' in license and 'cc' not in rights: resource_dict['license'] = rights resource_dict['rights'] = license + elif 'cc' in license and 'cc' in rights: + resource_dict['license'] = None else: resource_dict['license'] = None resource_dict['rights'] = None From 67b9ae83614131d1c01e32c5a786798349beffd5 Mon Sep 17 00:00:00 2001 From: kovalch Date: 
Mon, 5 Aug 2024 09:57:58 +0200 Subject: [PATCH 10/19] fix: Update the license_literal mapping --- ckanext/dcatapchharvest/dcat_helpers.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/ckanext/dcatapchharvest/dcat_helpers.py b/ckanext/dcatapchharvest/dcat_helpers.py index 992b758..444d594 100644 --- a/ckanext/dcatapchharvest/dcat_helpers.py +++ b/ckanext/dcatapchharvest/dcat_helpers.py @@ -243,11 +243,10 @@ def get_license_values(): try: for license_pref_label in g.objects(subject=ogdch_license_ref, predicate=SKOSXL.prefLabel): - for literal in g.objects(subject=license_pref_label, - predicate=SKOSXL.literalForm): - license_literal = literal + license_literal = next(g.objects(subject=license_pref_label, + predicate=SKOSXL.literalForm)) + if license_literal is not None: break # Assume one literal per concept - license_homepages_literal_mapping[license_homepage] = license_literal # noqa license_ref_literal_mapping[ogdch_license_ref] = license_literal license_homepage_ref_mapping[license_homepage] = ogdch_license_ref From 46f7d842a1ec77461df35e41f05aa89fca0e5d51 Mon Sep 17 00:00:00 2001 From: kovalch Date: Mon, 5 Aug 2024 10:24:20 +0200 Subject: [PATCH 11/19] fix: Restracture _rights_license to graph function --- ckanext/dcatapchharvest/profiles.py | 75 ++++++++++++----------------- 1 file changed, 31 insertions(+), 44 deletions(-) diff --git a/ckanext/dcatapchharvest/profiles.py b/ckanext/dcatapchharvest/profiles.py index 103791d..6087d26 100644 --- a/ckanext/dcatapchharvest/profiles.py +++ b/ckanext/dcatapchharvest/profiles.py @@ -1038,53 +1038,40 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): # noqa g.add((distribution, DCAT.byteSize, Literal(resource_dict['byte_size']))) + def _get_rights_and_license_uri(self, resource_dict, property='license'): + if property not in ['license', 'rights']: + raise ValueError("Property must be 'license' or 'rights'") + + homepage_uri = resource_dict.get(property) + if not homepage_uri: + return 
None + + uri = dh.get_license_ref_uri_by_homepage_uri(homepage_uri) + if uri is not None: + return URIRef(uri) + + name = dh.get_license_name_by_homepage_uri(homepage_uri) + if name is not None: + uri = dh.get_license_ref_uri_by_name(name) + if uri is not None: + return URIRef(uri) + + return None + def _rights_and_license_to_graph(self, resource_dict, distribution): g = self.g - if resource_dict.get('rights'): - rights_uri = dh.get_license_ref_uri_by_homepage_uri( - resource_dict.get('rights') - ) - if rights_uri is not None: - rights_ref = URIRef(rights_uri) - g.add((rights_ref, RDF.type, DCT.RightsStatement)) - g.add((distribution, DCT.rights, rights_ref)) - if rights_uri is None: - rights_name = dh.get_license_name_by_homepage_uri( - resource_dict.get('rights') - ) - if rights_name is not None: - resource_rights_ref = dh.get_license_ref_uri_by_name( - rights_name) - g.add(( - resource_rights_ref, - RDF.type, - DCT.RightsStatement) - ) - g.add((distribution, DCT.rights, resource_rights_ref)) - if resource_dict.get('license'): - license_uri = dh.get_license_ref_uri_by_homepage_uri( - resource_dict.get('license') - ) - if license_uri is not None: - license_ref = URIRef(license_uri) - g.add((license_ref, RDF.type, DCT.LicenseDocument)) - g.add((distribution, DCT.license, license_ref)) - if license_uri is None: - license_name = dh.get_license_name_by_homepage_uri( - resource_dict.get('license') - ) - if license_name is not None: - resource_license_ref = dh.get_license_ref_uri_by_name( - license_name) - g.add(( - resource_license_ref, - RDF.type, - DCT.LicenseDocument) - ) - g.add( - (distribution, DCT.license, resource_license_ref) - ) + rights_uri_ref = self._get_rights_and_license_uri(resource_dict, + 'rights') + if rights_uri_ref is not None: + g.add((rights_uri_ref, RDF.type, DCT.RightsStatement)) + g.add((distribution, DCT.rights, rights_uri_ref)) + + license_uri_ref = self._get_rights_and_license_uri(resource_dict, + 'license') + if license_uri_ref is not 
None: + g.add((license_uri_ref, RDF.type, DCT.LicenseDocument)) + g.add((distribution, DCT.license, license_uri_ref)) def _format_and_media_type_to_graph(self, resource_dict, distribution): g = self.g From df83d15aa38f4c30dfb1978deafa32818c0f689c Mon Sep 17 00:00:00 2001 From: kovalch Date: Mon, 5 Aug 2024 10:35:50 +0200 Subject: [PATCH 12/19] fix: Set keys and values to unicode when setting up the mappings --- ckanext/dcatapchharvest/dcat_helpers.py | 32 ++++++++++++++----------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/ckanext/dcatapchharvest/dcat_helpers.py b/ckanext/dcatapchharvest/dcat_helpers.py index 444d594..a99cbc1 100644 --- a/ckanext/dcatapchharvest/dcat_helpers.py +++ b/ckanext/dcatapchharvest/dcat_helpers.py @@ -172,7 +172,7 @@ def get_frequency_values(): def get_license_ref_uri_by_name(vocabulary_name): _, license_ref_literal_vocabulary, _ = get_license_values() for key, value in license_ref_literal_vocabulary.items(): - if unicode(vocabulary_name) == unicode(value): + if vocabulary_name == value: return key return None @@ -180,7 +180,7 @@ def get_license_ref_uri_by_name(vocabulary_name): def get_license_ref_uri_by_homepage_uri(vocabulary_name): _, _, license_homepage_ref_vocabulary = get_license_values() for key, value in license_homepage_ref_vocabulary.items(): - if unicode(vocabulary_name) == unicode(key): + if vocabulary_name == key: return value return None @@ -188,23 +188,23 @@ def get_license_ref_uri_by_homepage_uri(vocabulary_name): def get_license_name_by_ref_uri(vocabulary_uri): _, license_ref_literal_vocabulary, _ = get_license_values() for key, value in license_ref_literal_vocabulary.items(): - if unicode(vocabulary_uri) == unicode(key): - return unicode(value) + if vocabulary_uri == key: + return value return None def get_license_name_by_homepage_uri(vocabulary_uri): license_homepages_literal_vocabulary, _, _ = get_license_values() for key, value in license_homepages_literal_vocabulary.items(): - if 
unicode(vocabulary_uri) == unicode(key): - return unicode(value) + if vocabulary_uri == key: + return value return None def get_license_homepage_uri_by_name(vocabulary_name): license_homepages_literal_vocabulary, _, _ = get_license_values() for key, value in license_homepages_literal_vocabulary.items(): - if unicode(vocabulary_name) == unicode(value): + if vocabulary_name == value: return key return None @@ -213,12 +213,12 @@ def get_license_homepage_uri_by_uri(vocabulary_uri): _, _, license_homepage_ref_vocabulary = get_license_values() license_homepages = list(license_homepage_ref_vocabulary.keys()) if vocabulary_uri in license_homepages: - return unicode(vocabulary_uri) + return vocabulary_uri else: for key, value in license_homepage_ref_vocabulary.items(): - if unicode(vocabulary_uri) == unicode(value): - return unicode(key) - return + if vocabulary_uri == value: + return key + return None def get_license_values(): @@ -247,9 +247,13 @@ def get_license_values(): predicate=SKOSXL.literalForm)) if license_literal is not None: break # Assume one literal per concept - license_homepages_literal_mapping[license_homepage] = license_literal # noqa - license_ref_literal_mapping[ogdch_license_ref] = license_literal - license_homepage_ref_mapping[license_homepage] = ogdch_license_ref + + license_homepages_literal_mapping[unicode(license_homepage)] = \ + unicode(license_literal) + license_ref_literal_mapping[unicode(ogdch_license_ref)] = \ + unicode(license_literal) + license_homepage_ref_mapping[unicode(license_homepage)] = \ + unicode(ogdch_license_ref) except Exception as e: raise ValueError("SKOSXL.prefLabel is missing in the RDF-file: %s" From 4c2ac834c5bd0255d0597982c1fc6c06a3dc70ba Mon Sep 17 00:00:00 2001 From: kovalch Date: Mon, 5 Aug 2024 12:43:50 +0200 Subject: [PATCH 13/19] fix: Make vocabulary_uri to unicode for the comparison in helper functions --- ckanext/dcatapchharvest/dcat_helpers.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) 
diff --git a/ckanext/dcatapchharvest/dcat_helpers.py b/ckanext/dcatapchharvest/dcat_helpers.py index a99cbc1..8e4b263 100644 --- a/ckanext/dcatapchharvest/dcat_helpers.py +++ b/ckanext/dcatapchharvest/dcat_helpers.py @@ -172,7 +172,7 @@ def get_frequency_values(): def get_license_ref_uri_by_name(vocabulary_name): _, license_ref_literal_vocabulary, _ = get_license_values() for key, value in license_ref_literal_vocabulary.items(): - if vocabulary_name == value: + if unicode(vocabulary_name) == value: return key return None @@ -180,7 +180,7 @@ def get_license_ref_uri_by_name(vocabulary_name): def get_license_ref_uri_by_homepage_uri(vocabulary_name): _, _, license_homepage_ref_vocabulary = get_license_values() for key, value in license_homepage_ref_vocabulary.items(): - if vocabulary_name == key: + if unicode(vocabulary_name) == key: return value return None @@ -188,7 +188,7 @@ def get_license_ref_uri_by_homepage_uri(vocabulary_name): def get_license_name_by_ref_uri(vocabulary_uri): _, license_ref_literal_vocabulary, _ = get_license_values() for key, value in license_ref_literal_vocabulary.items(): - if vocabulary_uri == key: + if unicode(vocabulary_uri) == key: return value return None @@ -196,7 +196,7 @@ def get_license_name_by_ref_uri(vocabulary_uri): def get_license_name_by_homepage_uri(vocabulary_uri): license_homepages_literal_vocabulary, _, _ = get_license_values() for key, value in license_homepages_literal_vocabulary.items(): - if vocabulary_uri == key: + if unicode(vocabulary_uri) == key: return value return None @@ -204,7 +204,7 @@ def get_license_name_by_homepage_uri(vocabulary_uri): def get_license_homepage_uri_by_name(vocabulary_name): license_homepages_literal_vocabulary, _, _ = get_license_values() for key, value in license_homepages_literal_vocabulary.items(): - if vocabulary_name == value: + if unicode(vocabulary_name) == value: return key return None @@ -212,11 +212,11 @@ def get_license_homepage_uri_by_name(vocabulary_name): def 
get_license_homepage_uri_by_uri(vocabulary_uri): _, _, license_homepage_ref_vocabulary = get_license_values() license_homepages = list(license_homepage_ref_vocabulary.keys()) - if vocabulary_uri in license_homepages: - return vocabulary_uri + if unicode(vocabulary_uri) in license_homepages: + return unicode(vocabulary_uri) else: for key, value in license_homepage_ref_vocabulary.items(): - if vocabulary_uri == value: + if unicode(vocabulary_uri) == value: return key return None From ab73216844bbefe91a0536c36a513699913e1fec Mon Sep 17 00:00:00 2001 From: kovalch Date: Mon, 5 Aug 2024 13:34:15 +0200 Subject: [PATCH 14/19] refactor: Add LicenseHandler class to handle caching of license vocabulary --- ckanext/dcatapchharvest/dcat_helpers.py | 149 ++++++++++++++++-------- ckanext/dcatapchharvest/profiles.py | 13 +-- 2 files changed, 107 insertions(+), 55 deletions(-) diff --git a/ckanext/dcatapchharvest/dcat_helpers.py b/ckanext/dcatapchharvest/dcat_helpers.py index 8e4b263..c5785d4 100644 --- a/ckanext/dcatapchharvest/dcat_helpers.py +++ b/ckanext/dcatapchharvest/dcat_helpers.py @@ -169,56 +169,109 @@ def get_frequency_values(): return frequency_mapping -def get_license_ref_uri_by_name(vocabulary_name): - _, license_ref_literal_vocabulary, _ = get_license_values() - for key, value in license_ref_literal_vocabulary.items(): - if unicode(vocabulary_name) == value: - return key - return None - - -def get_license_ref_uri_by_homepage_uri(vocabulary_name): - _, _, license_homepage_ref_vocabulary = get_license_values() - for key, value in license_homepage_ref_vocabulary.items(): - if unicode(vocabulary_name) == key: - return value - return None - - -def get_license_name_by_ref_uri(vocabulary_uri): - _, license_ref_literal_vocabulary, _ = get_license_values() - for key, value in license_ref_literal_vocabulary.items(): - if unicode(vocabulary_uri) == key: - return value - return None - - -def get_license_name_by_homepage_uri(vocabulary_uri): -
license_homepages_literal_vocabulary, _, _ = get_license_values() - for key, value in license_homepages_literal_vocabulary.items(): - if unicode(vocabulary_uri) == key: - return value - return None - - -def get_license_homepage_uri_by_name(vocabulary_name): - license_homepages_literal_vocabulary, _, _ = get_license_values() - for key, value in license_homepages_literal_vocabulary.items(): - if unicode(vocabulary_name) == value: - return key - return None - - -def get_license_homepage_uri_by_uri(vocabulary_uri): - _, _, license_homepage_ref_vocabulary = get_license_values() - license_homepages = list(license_homepage_ref_vocabulary.keys()) - if unicode(vocabulary_uri) in license_homepages: - return unicode(vocabulary_uri) - else: +class LicenseHandler: + def __init__(self): + self._license_cache = None + + def _get_license_values(self): + if self._license_cache is None: + try: + g = Graph() + license_ref_literal_mapping = {} + license_homepages_literal_mapping = {} + license_homepage_ref_mapping = {} + + for prefix, namespace in license_namespaces.items(): + g.bind(prefix, namespace) + file = os.path.join(__location__, 'license.ttl') + g.parse(file, format='turtle') + for ogdch_license_ref in g.subjects(predicate=RDF.type, + object=SKOS.Concept): + license_homepage = None + for homepage in g.objects(subject=ogdch_license_ref, + predicate=FOAF.homepage): + license_homepage = homepage + break # Assume one homepage per concept + + license_literal = None + try: + for license_pref_label in g.objects( + subject=ogdch_license_ref, + predicate=SKOSXL.prefLabel): + license_literal = next( + g.objects(subject=license_pref_label, + predicate=SKOSXL.literalForm)) + if license_literal is not None: + break # Assume one literal per concept + + license_homepages_literal_mapping[ + unicode(license_homepage)] = \ + unicode(license_literal) + license_ref_literal_mapping[ + unicode(ogdch_license_ref)] = \ + unicode(license_literal) + license_homepage_ref_mapping[ + 
unicode(license_homepage)] = \ + unicode(ogdch_license_ref) + + except Exception as e: + raise ValueError( + "SKOSXL.prefLabel is missing in the RDF-file: %s" + % e) + + self._license_cache = (license_homepages_literal_mapping, + license_ref_literal_mapping, + license_homepage_ref_mapping) + except Exception as e: + raise RuntimeError("Failed to load license values: %s" + % e) + return self._license_cache + + def get_license_ref_uri_by_name(self, vocabulary_name): + _, license_ref_literal_vocabulary, _ = self._get_license_values() + for key, value in license_ref_literal_vocabulary.items(): + if unicode(vocabulary_name) == value: + return key + return None + + def get_license_ref_uri_by_homepage_uri(self, vocabulary_name): + _, _, license_homepage_ref_vocabulary = self._get_license_values() for key, value in license_homepage_ref_vocabulary.items(): - if unicode(vocabulary_uri) == value: + if unicode(vocabulary_name) == key: + return value + return None + + def get_license_name_by_ref_uri(self, vocabulary_uri): + _, license_ref_literal_vocabulary, _ = self._get_license_values() + for key, value in license_ref_literal_vocabulary.items(): + if unicode(vocabulary_uri) == key: + return value + return None + + def get_license_name_by_homepage_uri(self, vocabulary_uri): + license_homepages_literal_vocabulary, _, _ = self._get_license_values() + for key, value in license_homepages_literal_vocabulary.items(): + if unicode(vocabulary_uri) == key: + return value + return None + + def get_license_homepage_uri_by_name(self, vocabulary_name): + license_homepages_literal_vocabulary, _, _ = self._get_license_values() + for key, value in license_homepages_literal_vocabulary.items(): + if unicode(vocabulary_name) == value: return key - return None + return None + + def get_license_homepage_uri_by_uri(self, vocabulary_uri): + _, _, license_homepage_ref_vocabulary = self._get_license_values() + license_homepages = list(license_homepage_ref_vocabulary.keys()) + if 
unicode(vocabulary_uri) in license_homepages: + return unicode(vocabulary_uri) + else: + for key, value in license_homepage_ref_vocabulary.items(): + if unicode(vocabulary_uri) == value: + return key + return None def get_license_values(): diff --git a/ckanext/dcatapchharvest/profiles.py b/ckanext/dcatapchharvest/profiles.py index 6087d26..335e494 100644 --- a/ckanext/dcatapchharvest/profiles.py +++ b/ckanext/dcatapchharvest/profiles.py @@ -13,9 +13,8 @@ from ckanext.dcat.profiles import CleanedURIRef, RDFProfile, SchemaOrgProfile log = logging.getLogger(__name__) - +license_handler = dh.LicenseHandler() valid_frequencies = dh.get_frequency_values() -valid_licenses = dh.get_license_values() eu_theme_mapping = dh.get_theme_mapping() valid_formats = dh.get_format_values() valid_media_types = dh.get_iana_media_type_values() @@ -281,9 +280,9 @@ def _license_rights_homepage_uri(self, subject, predicate): for node in self.g.objects(subject, predicate): # DCAT-AP CH v2 compatible license has to be a URI. 
if isinstance(node, Literal): - return dh.get_license_homepage_uri_by_name(node) + return license_handler.get_license_homepage_uri_by_name(node) if isinstance(node, URIRef): - return dh.get_license_homepage_uri_by_uri(node) + return license_handler.get_license_homepage_uri_by_uri(node) return None def _keywords(self, subject): @@ -1046,13 +1045,13 @@ def _get_rights_and_license_uri(self, resource_dict, property='license'): if not homepage_uri: return None - uri = dh.get_license_ref_uri_by_homepage_uri(homepage_uri) + uri = license_handler.get_license_ref_uri_by_homepage_uri(homepage_uri) if uri is not None: return URIRef(uri) - name = dh.get_license_name_by_homepage_uri(homepage_uri) + name = license_handler.get_license_name_by_homepage_uri(homepage_uri) if name is not None: - uri = dh.get_license_ref_uri_by_name(name) + uri = license_handler.get_license_ref_uri_by_name(name) if uri is not None: return URIRef(uri) From 8638c5848354d60e270f6fc2a2b15641f8f3b136 Mon Sep 17 00:00:00 2001 From: kovalch Date: Mon, 5 Aug 2024 14:07:07 +0200 Subject: [PATCH 15/19] refactor: Divide license mapping function into smaller stages --- ckanext/dcatapchharvest/dcat_helpers.py | 142 +++++++++++++----------- 1 file changed, 76 insertions(+), 66 deletions(-) diff --git a/ckanext/dcatapchharvest/dcat_helpers.py b/ckanext/dcatapchharvest/dcat_helpers.py index c5785d4..17c279c 100644 --- a/ckanext/dcatapchharvest/dcat_helpers.py +++ b/ckanext/dcatapchharvest/dcat_helpers.py @@ -173,52 +173,63 @@ class LicenseHandler: def __init__(self): self._license_cache = None + def _bind_namespaces(self, graph): + for prefix, namespace in license_namespaces.items(): + graph.bind(prefix, namespace) + + def _parse_graph(self, graph): + file = os.path.join(__location__, 'license.ttl') + graph.parse(file, format='turtle') + + def _get_license_homepage(self, graph, license_ref): + for homepage in graph.objects(subject=license_ref, + predicate=FOAF.homepage): + return homepage + return None + + def
_get_license_literal(self, graph, license_ref): + for license_pref_label in graph.objects(subject=license_ref, + predicate=SKOSXL.prefLabel): + try: + return next(graph.objects(subject=license_pref_label, + predicate=SKOSXL.literalForm)) + except StopIteration: + continue + return None + + def _process_graph(self, graph): + license_ref_literal_mapping = {} + license_homepages_literal_mapping = {} + license_homepage_ref_mapping = {} + + for ogdch_license_ref in graph.subjects(predicate=RDF.type, + object=SKOS.Concept): + license_homepage = self._get_license_homepage(graph, + ogdch_license_ref) + license_literal = self._get_license_literal(graph, + ogdch_license_ref) + + license_homepages_literal_mapping[unicode(license_homepage)] = \ + unicode(license_literal) + license_ref_literal_mapping[unicode(ogdch_license_ref)] = \ + unicode(license_literal) + license_homepage_ref_mapping[unicode(license_homepage)] = \ + unicode(ogdch_license_ref) + + return (license_homepages_literal_mapping, + license_ref_literal_mapping, license_homepage_ref_mapping) + + def _get_license_values(self): if self._license_cache is None: try: g = Graph() - license_ref_literal_mapping = {} - license_homepages_literal_mapping = {} - license_homepage_ref_mapping = {} - - for prefix, namespace in license_namespaces.items(): - g.bind(prefix, namespace) - file = os.path.join(__location__, 'license.ttl') - g.parse(file, format='turtle') - for ogdch_license_ref in g.subjects(predicate=RDF.type, - object=SKOS.Concept): - license_homepage = None - for homepage in g.objects(subject=ogdch_license_ref, - predicate=FOAF.homepage): - license_homepage = homepage - break # Assume one homepage per concept - - license_literal = None - try: - for license_pref_label in g.objects( - subject=ogdch_license_ref, - predicate=SKOSXL.prefLabel): - license_literal = next( - g.objects(subject=license_pref_label, - predicate=SKOSXL.literalForm)) - if license_literal is not None: - break # Assume one literal per concept - - 
license_homepages_literal_mapping[ - unicode(license_homepage)] = \ - unicode(license_literal) - license_ref_literal_mapping[ - unicode(ogdch_license_ref)] = \ - unicode(license_literal) - license_homepage_ref_mapping[ - unicode(license_homepage)] = \ - unicode(ogdch_license_ref) - - except Exception as e: - raise ValueError( - "SKOSXL.prefLabel is missing in the RDF-file: %s" - % e) + self._bind_namespaces(g) + self._parse_graph(g) + license_homepages_literal_mapping, \ + license_ref_literal_mapping, \ + license_homepage_ref_mapping = self._process_graph(g) self._license_cache = (license_homepages_literal_mapping, license_ref_literal_mapping, license_homepage_ref_mapping) @@ -229,49 +240,48 @@ def _get_license_values(self): def get_license_ref_uri_by_name(self, vocabulary_name): _, license_ref_literal_vocabulary, _ = self._get_license_values() - for key, value in license_ref_literal_vocabulary.items(): - if unicode(vocabulary_name) == value: - return key - return None + return next((key for key, value in + license_ref_literal_vocabulary.items() + if unicode(vocabulary_name) == value), + None) def get_license_ref_uri_by_homepage_uri(self, vocabulary_name): _, _, license_homepage_ref_vocabulary = self._get_license_values() - for key, value in license_homepage_ref_vocabulary.items(): - if unicode(vocabulary_name) == key: - return value - return None + return next((value for key, value in + license_homepage_ref_vocabulary.items() + if unicode(vocabulary_name) == key), + None) def get_license_name_by_ref_uri(self, vocabulary_uri): _, license_ref_literal_vocabulary, _ = self._get_license_values() - for key, value in license_ref_literal_vocabulary.items(): - if unicode(vocabulary_uri) == key: - return value - return None + return next((value for key, value in + license_ref_literal_vocabulary.items() + if unicode(vocabulary_uri) == key), + None) def get_license_name_by_homepage_uri(self, vocabulary_uri): license_homepages_literal_vocabulary, _, _ = 
self._get_license_values() - for key, value in license_homepages_literal_vocabulary.items(): - if unicode(vocabulary_uri) == key: - return value - return None + return next((value for key, value in + license_homepages_literal_vocabulary.items() + if unicode(vocabulary_uri) == key), + None) def get_license_homepage_uri_by_name(self, vocabulary_name): license_homepages_literal_vocabulary, _, _ = self._get_license_values() - for key, value in license_homepages_literal_vocabulary.items(): - if unicode(vocabulary_name) == value: - return key - return None + return next((key for key, value in + license_homepages_literal_vocabulary.items() + if unicode(vocabulary_name) == value), + None) def get_license_homepage_uri_by_uri(self, vocabulary_uri): _, _, license_homepage_ref_vocabulary = self._get_license_values() license_homepages = list(license_homepage_ref_vocabulary.keys()) if unicode(vocabulary_uri) in license_homepages: return unicode(vocabulary_uri) - else: - for key, value in license_homepage_ref_vocabulary.items(): - if unicode(vocabulary_uri) == value: - return key - return None + return next((key for key, value in + license_homepage_ref_vocabulary.items() + if unicode(vocabulary_uri) == value), + None) def get_license_values(): From d7efda16e0d0f8c37b6379870e5911249e8f78ce Mon Sep 17 00:00:00 2001 From: kovalch Date: Mon, 5 Aug 2024 14:41:54 +0200 Subject: [PATCH 16/19] fix: Remove blank line --- ckanext/dcatapchharvest/dcat_helpers.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ckanext/dcatapchharvest/dcat_helpers.py b/ckanext/dcatapchharvest/dcat_helpers.py index 17c279c..fde330b 100644 --- a/ckanext/dcatapchharvest/dcat_helpers.py +++ b/ckanext/dcatapchharvest/dcat_helpers.py @@ -219,7 +219,6 @@ def _process_graph(self, graph): return (license_homepages_literal_mapping, license_ref_literal_mapping, license_homepage_ref_mapping) - def _get_license_values(self): if self._license_cache is None: try: @@ -227,9 +226,10 @@ def 
_get_license_values(self): self._bind_namespaces(g) self._parse_graph(g) - license_homepages_literal_mapping, \ - license_ref_literal_mapping, \ - license_homepage_ref_mapping = self._process_graph(g) + (license_homepages_literal_mapping, + license_ref_literal_mapping, + license_homepage_ref_mapping) = self._process_graph(g) + self._license_cache = (license_homepages_literal_mapping, license_ref_literal_mapping, license_homepage_ref_mapping) From f723c7633f672c42eb0bb3dd981f8763eb7c4a45 Mon Sep 17 00:00:00 2001 From: kovalch Date: Mon, 5 Aug 2024 21:47:05 +0200 Subject: [PATCH 17/19] fix: Remove unused function --- ckanext/dcatapchharvest/dcat_helpers.py | 42 ------------------------- 1 file changed, 42 deletions(-) diff --git a/ckanext/dcatapchharvest/dcat_helpers.py b/ckanext/dcatapchharvest/dcat_helpers.py index fde330b..0f8e609 100644 --- a/ckanext/dcatapchharvest/dcat_helpers.py +++ b/ckanext/dcatapchharvest/dcat_helpers.py @@ -284,48 +284,6 @@ def get_license_homepage_uri_by_uri(self, vocabulary_uri): None) -def get_license_values(): - g = Graph() - license_ref_literal_mapping = {} - license_homepages_literal_mapping = {} - license_homepage_ref_mapping = {} - - for prefix, namespace in license_namespaces.items(): - g.bind(prefix, namespace) - file = os.path.join(__location__, 'license.ttl') - g.parse(file, format='turtle') - for ogdch_license_ref in g.subjects(predicate=RDF.type, - object=SKOS.Concept): - license_homepage = None - for homepage in g.objects(subject=ogdch_license_ref, - predicate=FOAF.homepage): - license_homepage = homepage - break # Assume one homepage per concept - - license_literal = None - try: - for license_pref_label in g.objects(subject=ogdch_license_ref, - predicate=SKOSXL.prefLabel): - license_literal = next(g.objects(subject=license_pref_label, - predicate=SKOSXL.literalForm)) - if license_literal is not None: - break # Assume one literal per concept - - license_homepages_literal_mapping[unicode(license_homepage)] = \ - 
unicode(license_literal) - license_ref_literal_mapping[unicode(ogdch_license_ref)] = \ - unicode(license_literal) - license_homepage_ref_mapping[unicode(license_homepage)] = \ - unicode(ogdch_license_ref) - - except Exception as e: - raise ValueError("SKOSXL.prefLabel is missing in the RDF-file: %s" - % e) - - return (license_homepages_literal_mapping, license_ref_literal_mapping, - license_homepage_ref_mapping) - - def get_theme_mapping(): g = Graph() theme_mapping = {} From 3aa4ddfff974e02dc78c8fb61a2335f79530666d Mon Sep 17 00:00:00 2001 From: kovalch Date: Mon, 5 Aug 2024 21:56:47 +0200 Subject: [PATCH 18/19] fix: Return a value of a dict --- ckanext/dcatapchharvest/dcat_helpers.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/ckanext/dcatapchharvest/dcat_helpers.py b/ckanext/dcatapchharvest/dcat_helpers.py index 0f8e609..68c1d6e 100644 --- a/ckanext/dcatapchharvest/dcat_helpers.py +++ b/ckanext/dcatapchharvest/dcat_helpers.py @@ -247,24 +247,17 @@ def get_license_ref_uri_by_name(self, vocabulary_name): def get_license_ref_uri_by_homepage_uri(self, vocabulary_name): _, _, license_homepage_ref_vocabulary = self._get_license_values() - return next((value for key, value in - license_homepage_ref_vocabulary.items() - if unicode(vocabulary_name) == key), - None) + return license_homepage_ref_vocabulary.get(unicode(vocabulary_name)) def get_license_name_by_ref_uri(self, vocabulary_uri): _, license_ref_literal_vocabulary, _ = self._get_license_values() - return next((value for key, value in - license_ref_literal_vocabulary.items() - if unicode(vocabulary_uri) == key), - None) + return license_ref_literal_vocabulary.get( + unicode(vocabulary_uri)) def get_license_name_by_homepage_uri(self, vocabulary_uri): license_homepages_literal_vocabulary, _, _ = self._get_license_values() - return next((value for key, value in - license_homepages_literal_vocabulary.items() - if unicode(vocabulary_uri) == key), - None) + return 
license_homepages_literal_vocabulary.get( + unicode(vocabulary_uri)) def get_license_homepage_uri_by_name(self, vocabulary_name): license_homepages_literal_vocabulary, _, _ = self._get_license_values() From 2e2ed151badac04f11b65db7fd761d88c3ba3c49 Mon Sep 17 00:00:00 2001 From: kovalch Date: Wed, 7 Aug 2024 15:52:00 +0200 Subject: [PATCH 19/19] fix: Remove language lable from the uri --- ckanext/dcatapchharvest/license.ttl | 6 +++--- ckanext/dcatapchharvest/tests/fixtures/dataset.json | 2 +- ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/ckanext/dcatapchharvest/license.ttl b/ckanext/dcatapchharvest/license.ttl index 660043a..720da79 100644 --- a/ckanext/dcatapchharvest/license.ttl +++ b/ckanext/dcatapchharvest/license.ttl @@ -46,7 +46,7 @@ skosxl:literalForm "NonCommercialAllowed-CommercialAllowed-ReferenceRequired"@en ; rdfs:label "NonCommercialAllowed-CommercialAllowed-ReferenceRequired"@de ] ; - foaf:homepage . + foaf:homepage . a skos:Concept ; @@ -61,7 +61,7 @@ skosxl:literalForm "NonCommercialAllowed-CommercialWithPermission-ReferenceNotRequired"@en ; rdfs:label "NonCommercialAllowed-CommercialWithPermission-ReferenceNotRequired"@de ] ; - foaf:homepage . + foaf:homepage . a skos:Concept ; @@ -76,7 +76,7 @@ skosxl:literalForm "NonCommercialAllowed-CommercialWithPermission-ReferenceRequired"@en ; rdfs:label "NonCommercialAllowed-CommercialWithPermission-ReferenceRequired"@de ] ; - foaf:homepage . + foaf:homepage . 
a skos:Concept, cc:License ; diff --git a/ckanext/dcatapchharvest/tests/fixtures/dataset.json b/ckanext/dcatapchharvest/tests/fixtures/dataset.json index 7bc5cbc..ebd1883 100644 --- a/ckanext/dcatapchharvest/tests/fixtures/dataset.json +++ b/ckanext/dcatapchharvest/tests/fixtures/dataset.json @@ -135,7 +135,7 @@ "https://example.com/documentation-resource-2" ], "rights": "http://dcat-ap.ch/vocabulary/licenses/terms_by", - "license": "https://opendata.swiss/en/terms-of-use/#terms_by", + "license": "https://opendata.swiss/terms-of-use/#terms_by", "format": "HTML" }, { diff --git a/ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py b/ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py index d729b38..5866fd8 100644 --- a/ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py +++ b/ckanext/dcatapchharvest/tests/test_dcatap_ch_parse.py @@ -32,8 +32,8 @@ def test_rights_license(self): # Resources eq_(len(dataset['resources']), 1) resource = dataset['resources'][0] - eq_(unicode(resource['rights']), u'https://opendata.swiss/en/terms-of-use/#terms_by') - eq_(unicode(resource['license']), u'https://opendata.swiss/en/terms-of-use/#terms_by_ask') + eq_(unicode(resource['rights']), u'https://opendata.swiss/terms-of-use/#terms_by') + eq_(unicode(resource['license']), u'https://opendata.swiss/terms-of-use/#terms_by_ask') def test_dataset_all_fields(self): @@ -146,7 +146,7 @@ def test_dataset_all_fields(self): eq_(resource['format'], u'html') eq_(resource['media_type'], u'text/html') eq_(resource['identifier'], u'346265-fr@bundesamt-fur-statistik-bfs') - eq_(resource['license'], u'https://opendata.swiss/en/terms-of-use/#terms_by') + eq_(resource['license'], u'https://opendata.swiss/terms-of-use/#terms_by') eq_(resource['rights'], u'http://www.opendefinition.org/licenses/cc-zero') eq_(resource['language'], [u'fr']) eq_(resource['issued'], u'1900-12-31T00:00:00') @@ -402,7 +402,7 @@ def test_multiple_rights_statements(self): dataset = [d for d in p.datasets()][0] resource = 
dataset["resources"][0] - eq_(unicode(resource['rights']), u"https://opendata.swiss/en/terms-of-use/#terms_by_ask") + eq_(unicode(resource['rights']), u"https://opendata.swiss/terms-of-use/#terms_by_ask") def test_eu_themes_mapping(self): contents = self._get_file_contents('catalog-themes.xml')