Skip to content

Commit

Permalink
fix: Update parsing of the dataset of the license and rights fields
Browse files Browse the repository at this point in the history
  • Loading branch information
kovalch committed Jul 29, 2024
1 parent e6ba527 commit f1671d1
Show file tree
Hide file tree
Showing 2 changed files with 91 additions and 35 deletions.
89 changes: 74 additions & 15 deletions ckanext/dcatapchharvest/dcat_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from urlparse import urlparse
from ckantoolkit import config
from rdflib import URIRef, Graph
from rdflib.namespace import Namespace, RDF, SKOS
from rdflib.namespace import Namespace, RDF, SKOS, FOAF
import xml.etree.ElementTree as ET
import logging

Expand All @@ -17,6 +17,7 @@

# RDF namespaces referenced by the namespace dicts and graph queries below.
SKOSXL = Namespace("http://www.w3.org/2008/05/skos-xl#")
RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
# NOTE(review): FOAF is also imported from rdflib.namespace above; this
# assignment rebinds the name -- confirm which definition is intended.
FOAF = Namespace("http://xmlns.com/foaf/0.1/")

frequency_namespaces = {
"skos": SKOS,
Expand All @@ -38,6 +39,7 @@
"skosxl": SKOSXL,
"rdf": RDF,
"rdfs": RDFS,
"foaf": FOAF,
}

theme_namespaces = {
Expand Down Expand Up @@ -167,38 +169,95 @@ def get_frequency_values():
return frequency_mapping


def get_license_uri_by_name(vocabulary_name):
license_vocabulary = get_license_values()
for key, value in license_vocabulary.items():
def get_license_ref_uri_by_name(vocabulary_name):
    """Return the license concept reference whose literal equals the name.

    Scans the second mapping returned by ``get_license_values()``
    (concept reference -> literal) and compares both sides as unicode
    text.

    :param vocabulary_name: literal / name to look up
    :returns: the matching mapping key, or ``None`` if nothing matches
    """
    _, literals_by_ref, _ = get_license_values()
    matching_refs = (
        ref
        for ref, literal in literals_by_ref.items()
        if unicode(vocabulary_name) == unicode(literal)
    )
    # First hit wins, exactly like an early return from a loop.
    return next(matching_refs, None)


def get_license_name_by_uri(vocabulary_uri):
license_vocabulary = get_license_values()
for key, value in license_vocabulary.items():
def get_license_ref_uri_by_homepage_uri(vocabulary_name):
    """Return the license concept reference for a license homepage URI.

    Scans the third mapping returned by ``get_license_values()``
    (homepage -> concept reference) and compares the given value with
    each homepage key as unicode text.

    :param vocabulary_name: homepage URI to look up (the parameter name is
        kept for compatibility with existing callers)
    :returns: the concept reference stored for the first matching
        homepage, or ``None`` if nothing matches
    """
    _, _, refs_by_homepage = get_license_values()
    for homepage in refs_by_homepage:
        if unicode(vocabulary_name) != unicode(homepage):
            continue
        return refs_by_homepage[homepage]
    return None


def get_license_name_by_ref_uri(vocabulary_uri):
    """Return the literal stored for a license concept reference.

    Scans the second mapping returned by ``get_license_values()``
    (concept reference -> literal) and compares the given URI with each
    key as unicode text.

    :param vocabulary_uri: concept reference URI to look up
    :returns: the stored value converted with ``unicode()``, or ``None``
        if no key matches
    """
    _, literals_by_ref, _ = get_license_values()
    for ref_uri in literals_by_ref:
        if unicode(vocabulary_uri) != unicode(ref_uri):
            continue
        return unicode(literals_by_ref[ref_uri])
    return None


def get_license_name_by_homepage_uri(vocabulary_uri):
    """Return the literal stored for a license homepage URI.

    Scans the first mapping returned by ``get_license_values()``
    (homepage -> literal) and compares the given URI with each key as
    unicode text.

    :param vocabulary_uri: homepage URI to look up
    :returns: the stored value converted with ``unicode()``, or ``None``
        if no key matches
    """
    literals_by_homepage, _, _ = get_license_values()
    names = (
        unicode(literal)
        for homepage, literal in literals_by_homepage.items()
        if unicode(vocabulary_uri) == unicode(homepage)
    )
    return next(names, None)


def get_license_homepage_uri_by_name(vocabulary_name):
    """Return the license homepage whose literal equals the given name.

    Scans the first mapping returned by ``get_license_values()``
    (homepage -> literal) and compares the given name with each value as
    unicode text.

    :param vocabulary_name: literal / name to look up
    :returns: the matching mapping key, or ``None`` if nothing matches
    """
    literals_by_homepage, _, _ = get_license_values()
    wanted = (
        homepage
        for homepage, literal in literals_by_homepage.items()
        if unicode(vocabulary_name) == unicode(literal)
    )
    return next(wanted, None)


def get_license_homepage_uri_by_uri(vocabulary_uri):
    """Resolve a license URI to the homepage URI of that license.

    The input may already be a homepage URI (a key of the
    homepage -> concept reference mapping returned as third element by
    ``get_license_values()``), in which case it is returned as unicode
    text. Otherwise it is treated as a concept reference and the homepage
    stored for that reference is returned.

    :param vocabulary_uri: homepage URI or concept reference URI
    :returns: the homepage URI as unicode, or ``None`` when the URI is
        unknown or the matching concept has no homepage
    """
    _, _, refs_by_homepage = get_license_values()

    # Membership is tested on the dict itself; no throwaway key list.
    # The ``None`` guard keeps a homepage-less entry (key ``None``) from
    # being reported as the text u'None'.
    if vocabulary_uri is not None and vocabulary_uri in refs_by_homepage:
        return unicode(vocabulary_uri)

    for homepage, ref in refs_by_homepage.items():
        if homepage is None:
            # Concept without a homepage: nothing useful to return.
            continue
        if unicode(vocabulary_uri) == unicode(ref):
            return unicode(homepage)
    return None


def get_license_values():
    """Read ``license.ttl`` and build the license lookup mappings.

    For every ``skos:Concept`` in the file the function collects

    * the first ``foaf:homepage`` object (if any), and
    * the ``skosxl:literalForm`` reached through ``skosxl:prefLabel``
      (``None`` when the concept has no such label).

    :returns: a 3-tuple of dicts, in this order:
        ``(homepage -> literal, concept reference -> literal,
        homepage -> concept reference)``
    :raises ValueError: if reading the label of a concept from the graph
        fails

    NOTE: the turtle file is parsed again on every call.
    """
    g = Graph()
    license_ref_literal_mapping = {}
    license_homepages_literal_mapping = {}
    license_homepage_ref_mapping = {}

    for prefix, namespace in license_namespaces.items():
        g.bind(prefix, namespace)
    license_file = os.path.join(__location__, 'license.ttl')
    g.parse(license_file, format='turtle')

    for ogdch_license_ref in g.subjects(predicate=RDF.type,
                                        object=SKOS.Concept):
        # Assume one homepage per concept: take the first one found.
        license_homepage = next(
            iter(g.objects(subject=ogdch_license_ref,
                           predicate=FOAF.homepage)),
            None)

        license_literal = None
        try:
            for license_pref_label in g.objects(subject=ogdch_license_ref,
                                                predicate=SKOSXL.prefLabel):
                # Assume one literal per label: take the first one found.
                literal = next(
                    iter(g.objects(subject=license_pref_label,
                                   predicate=SKOSXL.literalForm)),
                    None)
                if literal is not None:
                    license_literal = literal
        except Exception as e:
            # A missing label does not raise (it just leaves the literal
            # as None), so the message describes a read failure instead.
            raise ValueError(
                "Could not read SKOSXL.prefLabel of %s from the RDF-file: %s"
                % (ogdch_license_ref, e))

        license_ref_literal_mapping[ogdch_license_ref] = license_literal
        # Only register homepage-keyed entries when there is a homepage;
        # otherwise all homepage-less concepts would collide under the
        # single key ``None``.
        if license_homepage is not None:
            license_homepages_literal_mapping[license_homepage] = license_literal  # noqa
            license_homepage_ref_mapping[license_homepage] = ogdch_license_ref

    return (license_homepages_literal_mapping, license_ref_literal_mapping,
            license_homepage_ref_mapping)


def get_theme_mapping():
Expand Down
37 changes: 17 additions & 20 deletions ckanext/dcatapchharvest/profiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,23 +277,13 @@ def _get_iana_media_type(self, subject):
if media_type_key in valid_media_types:
return media_type_key

def _license_rights_name(self, subject, predicate):
    """Return the license/rights name for the first usable object.

    Walks the objects of ``(subject, predicate)`` in ``self.g``. A
    literal is returned as unicode text (DCAT-AP CH v1: the license as a
    literal, which should be the code of one of the DCAT-AP CH licenses);
    a URI is resolved through ``dh.get_license_name_by_uri``. Objects of
    any other kind are skipped.

    :returns: the name, or ``None`` when no literal or URI object exists
    """
    for candidate in self.g.objects(subject, predicate):
        is_literal = isinstance(candidate, Literal)
        if not (is_literal or isinstance(candidate, URIRef)):
            continue
        if is_literal:
            return unicode(candidate)
        return dh.get_license_name_by_uri(candidate)
    return None

def _license_rights_uri(self, subject, predicate):
def _license_rights_homepage_uri(self, subject, predicate):
    """Return the license homepage URI for ``(subject, predicate)``.

    Walks the objects of ``(subject, predicate)`` in ``self.g``. A
    literal is resolved with ``dh.get_license_homepage_uri_by_name``, a
    URI with ``dh.get_license_homepage_uri_by_uri``; objects of any other
    kind are skipped.

    The first object that resolves to a homepage URI wins. Objects that
    cannot be resolved no longer end the search, so a later resolvable
    value is still found.

    :returns: the homepage URI, or ``None`` when no object resolves
    """
    for node in self.g.objects(subject, predicate):
        # DCAT-AP CH v2 compatible license has to be a URI.
        if isinstance(node, Literal):
            homepage_uri = dh.get_license_homepage_uri_by_name(node)
        elif isinstance(node, URIRef):
            homepage_uri = dh.get_license_homepage_uri_by_uri(node)
        else:
            continue
        if homepage_uri is not None:
            return homepage_uri
    return None

def _keywords(self, subject):
Expand Down Expand Up @@ -633,21 +623,28 @@ def parse_dataset(self, dataset_dict, dataset_ref): # noqa
if value:
resource_dict[key] = value

# Rights & License save name
rights = self._license_rights_name(distribution, DCT.rights)
license = self._license_rights_name(distribution, DCT.license)
# Rights & License save homepage uri
rights = self._license_rights_homepage_uri(distribution, DCT.rights)
license = self._license_rights_homepage_uri(distribution, DCT.license)

if rights is None and license is not None:
resource_dict['license'] = license
resource_dict['rights'] = license
if rights is not None and license is None:
resource_dict['license'] = rights
elif rights is not None and license is None:
resource_dict['rights'] = rights
if license is not None and rights is not None:
if 'cc' not in rights:
resource_dict['license'] = rights
else:
resource_dict['license'] = None
elif license is not None and rights is not None:
resource_dict['license'] = license
resource_dict['rights'] = rights
if 'cc' in rights:
if 'cc' in license and 'cc' not in rights:
resource_dict['license'] = rights
resource_dict['rights'] = license
else:
resource_dict['license'] = None
resource_dict['rights'] = None

# Format & Media type
resource_dict['format'] = \
Expand Down

0 comments on commit f1671d1

Please sign in to comment.