From f1671d11e878c3a39ee76247eaf80c69e8b916b5 Mon Sep 17 00:00:00 2001 From: kovalch Date: Mon, 29 Jul 2024 16:07:32 +0200 Subject: [PATCH] fix: Update parsing of the dataset of the license and rights fields --- ckanext/dcatapchharvest/dcat_helpers.py | 89 ++++++++++++++++++++----- ckanext/dcatapchharvest/profiles.py | 37 +++++----- 2 files changed, 91 insertions(+), 35 deletions(-) diff --git a/ckanext/dcatapchharvest/dcat_helpers.py b/ckanext/dcatapchharvest/dcat_helpers.py index 2e1d800..9312cd5 100644 --- a/ckanext/dcatapchharvest/dcat_helpers.py +++ b/ckanext/dcatapchharvest/dcat_helpers.py @@ -4,7 +4,7 @@ from urlparse import urlparse from ckantoolkit import config from rdflib import URIRef, Graph -from rdflib.namespace import Namespace, RDF, SKOS +from rdflib.namespace import Namespace, RDF, SKOS, FOAF import xml.etree.ElementTree as ET import logging @@ -17,6 +17,7 @@ SKOSXL = Namespace("http://www.w3.org/2008/05/skos-xl#") RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#") +FOAF = Namespace("http://xmlns.com/foaf/0.1/") frequency_namespaces = { "skos": SKOS, @@ -38,6 +39,7 @@ "skosxl": SKOSXL, "rdf": RDF, "rdfs": RDFS, + "foaf": FOAF, } theme_namespaces = { @@ -167,38 +169,95 @@ def get_frequency_values(): return frequency_mapping -def get_license_uri_by_name(vocabulary_name): - license_vocabulary = get_license_values() - for key, value in license_vocabulary.items(): +def get_license_ref_uri_by_name(vocabulary_name): + _, license_ref_literal_vocabulary, _ = get_license_values() + for key, value in license_ref_literal_vocabulary.items(): if unicode(vocabulary_name) == unicode(value): return key return None -def get_license_name_by_uri(vocabulary_uri): - license_vocabulary = get_license_values() - for key, value in license_vocabulary.items(): +def get_license_ref_uri_by_homepage_uri(vocabulary_name): + _, _, license_homepage_ref_vocabulary = get_license_values() + for key, value in license_homepage_ref_vocabulary.items(): + if unicode(vocabulary_name) == unicode(key): + return value + return None + + +def get_license_name_by_ref_uri(vocabulary_uri): + _, license_ref_literal_vocabulary, _ = get_license_values() + for key, value in license_ref_literal_vocabulary.items(): + if unicode(vocabulary_uri) == unicode(key): + return unicode(value) + return None + + +def get_license_name_by_homepage_uri(vocabulary_uri): + license_homepages_literal_vocabulary, _, _ = get_license_values() + for key, value in license_homepages_literal_vocabulary.items(): if unicode(vocabulary_uri) == unicode(key): return unicode(value) return None +def get_license_homepage_uri_by_name(vocabulary_name): + license_homepages_literal_vocabulary, _, _ = get_license_values() + for key, value in license_homepages_literal_vocabulary.items(): + if unicode(vocabulary_name) == unicode(value): + return key + return None + + +def get_license_homepage_uri_by_uri(vocabulary_uri): + _, _, license_homepage_ref_vocabulary = get_license_values() + license_homepages = list(license_homepage_ref_vocabulary.keys()) + if vocabulary_uri in license_homepages: + return unicode(vocabulary_uri) + else: + for key, value in license_homepage_ref_vocabulary.items(): + if unicode(vocabulary_uri) == unicode(value): + return unicode(key) + return + + def get_license_values(): g = Graph() - license_mapping = {} + license_ref_literal_mapping = {} + license_homepages_literal_mapping = {} + license_homepage_ref_mapping = {} + for prefix, namespace in license_namespaces.items(): g.bind(prefix, namespace) file = os.path.join(__location__, 'license.ttl') g.parse(file, format='turtle') for ogdch_license_ref in g.subjects(predicate=RDF.type, object=SKOS.Concept): - license_mapping[ogdch_license_ref] = None - for license_pref_label in g.objects(subject=ogdch_license_ref, - predicate=SKOSXL.prefLabel): - for license_literal in g.objects(subject=license_pref_label, - predicate=SKOSXL.literalForm): - license_mapping[ogdch_license_ref] = license_literal - return license_mapping + license_homepage = None + for homepage in g.objects(subject=ogdch_license_ref, + predicate=FOAF.homepage): + license_homepage = homepage + break # Assume one homepage per concept + + license_literal = None + try: + for license_pref_label in g.objects(subject=ogdch_license_ref, + predicate=SKOSXL.prefLabel): + for literal in g.objects(subject=license_pref_label, + predicate=SKOSXL.literalForm): + license_literal = literal + break # Assume one literal per concept + + license_homepages_literal_mapping[license_homepage] = license_literal # noqa + license_ref_literal_mapping[ogdch_license_ref] = license_literal + license_homepage_ref_mapping[license_homepage] = ogdch_license_ref + + except Exception as e: + raise ValueError("SKOSXL.prefLabel is missing in the RDF-file: %s" + % e) + + return (license_homepages_literal_mapping, license_ref_literal_mapping, + license_homepage_ref_mapping) def get_theme_mapping(): diff --git a/ckanext/dcatapchharvest/profiles.py b/ckanext/dcatapchharvest/profiles.py index 50bf661..92a92fd 100644 --- a/ckanext/dcatapchharvest/profiles.py +++ b/ckanext/dcatapchharvest/profiles.py @@ -277,23 +277,13 @@ def _get_iana_media_type(self, subject): if media_type_key in valid_media_types: return media_type_key - def _license_rights_name(self, subject, predicate): - for node in self.g.objects(subject, predicate): - # DCAT-AP CH v1: the license as a literal (should be - # the code for one of the DCAT-AP CH licenses) - if isinstance(node, Literal): - return unicode(node) - if isinstance(node, URIRef): - return dh.get_license_name_by_uri(node) - return None - - def _license_rights_uri(self, subject, predicate): + def _license_rights_homepage_uri(self, subject, predicate): for node in self.g.objects(subject, predicate): # DCAT-AP CH v2 compatible license has to be a URI. if isinstance(node, Literal): - return dh.get_license_uri_by_name(node) + return dh.get_license_homepage_uri_by_name(node) if isinstance(node, URIRef): - return node + return dh.get_license_homepage_uri_by_uri(node) return None def _keywords(self, subject): @@ -633,21 +623,28 @@ def parse_dataset(self, dataset_dict, dataset_ref): # noqa if value: resource_dict[key] = value - # Rights & License save name - rights = self._license_rights_name(distribution, DCT.rights) - license = self._license_rights_name(distribution, DCT.license) + # Rights & License save homepage uri + rights = self._license_rights_homepage_uri(distribution, DCT.rights) + license = self._license_rights_homepage_uri(distribution, DCT.license) + if rights is None and license is not None: resource_dict['license'] = license resource_dict['rights'] = license - if rights is not None and license is None: - resource_dict['license'] = rights + elif rights is not None and license is None: resource_dict['rights'] = rights - if license is not None and rights is not None: + if 'cc' not in rights: + resource_dict['license'] = rights + else: + resource_dict['license'] = None + elif license is not None and rights is not None: resource_dict['license'] = license resource_dict['rights'] = rights - if 'cc' in rights: + if 'cc' in license and 'cc' not in rights: resource_dict['license'] = rights resource_dict['rights'] = license + else: + resource_dict['license'] = None + resource_dict['rights'] = None # Format & Media type resource_dict['format'] = \