# NOTE: this class relies on module-level names of dcat_helpers.py:
# ``license_namespaces``, ``__location__``, ``Graph``, ``RDF``, ``SKOS``,
# ``SKOSXL`` and ``FOAF = Namespace("http://xmlns.com/foaf/0.1/")``.
class LicenseHandler(object):
    """Resolve licenses between name, reference URI and homepage URI.

    The vocabulary is parsed from ``license.ttl`` (looked up in the
    directory given by the module-level ``__location__``) the first time a
    lookup is made and is then cached on the instance as three dicts:

    * homepage URI  -> license name (``skosxl:literalForm``)
    * reference URI -> license name
    * homepage URI  -> reference URI

    All keys and values are ``unicode`` strings.
    """

    def __init__(self):
        # Filled lazily by _get_license_values(); None means "not loaded".
        self._license_cache = None

    def _bind_namespaces(self, graph):
        """Bind every prefix of ``license_namespaces`` on *graph*."""
        for prefix, namespace in license_namespaces.items():
            graph.bind(prefix, namespace)

    def _parse_graph(self, graph):
        """Load ``license.ttl`` (turtle format) into *graph*."""
        license_file = os.path.join(__location__, 'license.ttl')
        graph.parse(license_file, format='turtle')

    def _get_license_homepage(self, graph, license_ref):
        """Return the first ``foaf:homepage`` of *license_ref* or None."""
        homepages = graph.objects(subject=license_ref,
                                  predicate=FOAF.homepage)
        return next(iter(homepages), None)

    def _get_license_literal(self, graph, license_ref):
        """Return the first ``skosxl:literalForm`` found or None.

        Preferred labels that carry no literal form are skipped.
        """
        for pref_label in graph.objects(subject=license_ref,
                                        predicate=SKOSXL.prefLabel):
            literals = graph.objects(subject=pref_label,
                                     predicate=SKOSXL.literalForm)
            literal = next(iter(literals), None)
            if literal is not None:
                return literal
        return None

    def _process_graph(self, graph):
        """Build the three lookup dicts from every ``skos:Concept``.

        A license without homepage or without literal form is left out of
        the affected mapping. Calling ``unicode(None)`` on it would create
        a bogus ``u'None'`` entry that a lookup with a missing value could
        then match.

        :returns: tuple ``(homepage -> name, ref -> name, homepage -> ref)``
        """
        homepage_to_name = {}
        ref_to_name = {}
        homepage_to_ref = {}

        for license_ref in graph.subjects(predicate=RDF.type,
                                          object=SKOS.Concept):
            ref = unicode(license_ref)
            homepage = self._get_license_homepage(graph, license_ref)
            literal = self._get_license_literal(graph, license_ref)

            if literal is not None:
                ref_to_name[ref] = unicode(literal)
            if homepage is not None:
                homepage_to_ref[unicode(homepage)] = ref
                if literal is not None:
                    homepage_to_name[unicode(homepage)] = unicode(literal)

        return homepage_to_name, ref_to_name, homepage_to_ref

    def _get_license_values(self):
        """Return the cached lookup dicts, loading them on first use.

        :raises RuntimeError: if the vocabulary cannot be parsed or
            processed; the original error message is embedded.
        """
        if self._license_cache is None:
            try:
                g = Graph()
                self._bind_namespaces(g)
                self._parse_graph(g)
                self._license_cache = self._process_graph(g)
            except Exception as e:
                raise RuntimeError("Failed to load license values: %s"
                                   % e)
        return self._license_cache

    @staticmethod
    def _key_for_value(mapping, wanted):
        """Return a key of *mapping* whose value equals *wanted* or None."""
        wanted = unicode(wanted)
        return next((key for key, value in mapping.items()
                     if value == wanted),
                    None)

    def get_license_ref_uri_by_name(self, vocabulary_name):
        """Return the reference URI of the license named so, or None."""
        _, ref_to_name, _ = self._get_license_values()
        return self._key_for_value(ref_to_name, vocabulary_name)

    def get_license_ref_uri_by_homepage_uri(self, vocabulary_name):
        """Return the reference URI for a homepage URI, or None.

        The parameter holds a homepage URI despite its name; the name is
        kept so that keyword callers continue to work.
        """
        _, _, homepage_to_ref = self._get_license_values()
        return homepage_to_ref.get(unicode(vocabulary_name))

    def get_license_name_by_ref_uri(self, vocabulary_uri):
        """Return the license name for a reference URI, or None."""
        _, ref_to_name, _ = self._get_license_values()
        return ref_to_name.get(unicode(vocabulary_uri))

    def get_license_name_by_homepage_uri(self, vocabulary_uri):
        """Return the license name for a homepage URI, or None."""
        homepage_to_name, _, _ = self._get_license_values()
        return homepage_to_name.get(unicode(vocabulary_uri))

    def get_license_homepage_uri_by_name(self, vocabulary_name):
        """Return the homepage URI of the license named so, or None."""
        homepage_to_name, _, _ = self._get_license_values()
        return self._key_for_value(homepage_to_name, vocabulary_name)

    def get_license_homepage_uri_by_uri(self, vocabulary_uri):
        """Return the homepage URI for a homepage or reference URI.

        A known homepage URI is returned unchanged, a known reference URI
        is translated to its homepage URI, anything else yields None.
        """
        _, _, homepage_to_ref = self._get_license_values()
        uri = unicode(vocabulary_uri)
        if uri in homepage_to_ref:
            return uri
        return self._key_for_value(homepage_to_ref, uri)
# Methods of the profile class in ckanext/dcatapchharvest/profiles.py (the
# class header is outside this view). They rely on the module-level
# ``license_handler = dh.LicenseHandler()`` and on ``Literal``, ``URIRef``,
# ``RDF`` and ``DCT`` being in scope in that module.

def _license_rights_homepage_uri(self, subject, predicate):
    """Return the homepage URI of the first recognised license, or None.

    DCAT-AP CH v2 compatible licenses have to be URIs; a literal is
    looked up as a license name, a URI reference as a homepage or
    reference URI. Nodes that cannot be resolved are skipped so that an
    unknown statement does not hide a valid one that follows it.
    """
    for node in self.g.objects(subject, predicate):
        if isinstance(node, Literal):
            homepage_uri = \
                license_handler.get_license_homepage_uri_by_name(node)
        elif isinstance(node, URIRef):
            homepage_uri = \
                license_handler.get_license_homepage_uri_by_uri(node)
        else:
            continue
        if homepage_uri is not None:
            return homepage_uri
    return None


def _get_rights_and_license_uri(self, resource_dict, property='license'):
    """Return the license reference ``URIRef`` for a resource, or None.

    :param resource_dict: resource dict whose ``license`` / ``rights``
        entry is read as a license homepage URI
    :param property: which entry to read, ``'license'`` or ``'rights'``
    :raises ValueError: for any other value of *property*
    """
    if property not in ('license', 'rights'):
        raise ValueError("Property must be 'license' or 'rights'")

    homepage_uri = resource_dict.get(property)
    if not homepage_uri:
        return None

    ref_uri = license_handler.get_license_ref_uri_by_homepage_uri(
        homepage_uri)
    if ref_uri is None:
        # Second attempt via the license name.
        # NOTE(review): this only helps if the homepage->name mapping
        # knows homepages the homepage->ref mapping does not - confirm.
        name = license_handler.get_license_name_by_homepage_uri(
            homepage_uri)
        if name is not None:
            ref_uri = license_handler.get_license_ref_uri_by_name(name)

    if ref_uri is None:
        return None
    return URIRef(ref_uri)


def _rights_and_license_to_graph(self, resource_dict, distribution):
    """Add ``dct:rights`` and ``dct:license`` of a resource to the graph.

    For each of the two entries that resolves to a license reference URI,
    the URI is typed (``dct:RightsStatement`` resp.
    ``dct:LicenseDocument``) and linked from *distribution*. Entries that
    do not resolve are left out.
    """
    g = self.g
    # (resource key, predicate on the distribution, rdf:type of the node);
    # rights is written before license.
    statements = (
        ('rights', DCT.rights, DCT.RightsStatement),
        ('license', DCT.license, DCT.LicenseDocument),
    )
    for key, predicate, rdf_class in statements:
        uri_ref = self._get_rights_and_license_uri(resource_dict, key)
        if uri_ref is not None:
            g.add((uri_ref, RDF.type, rdf_class))
            g.add((distribution, predicate, uri_ref))
eq_(unicode(resource['license']), u'https://opendata.swiss/terms-of-use/#terms_by_ask') def test_dataset_all_fields(self): @@ -146,8 +146,8 @@ def test_dataset_all_fields(self): eq_(resource['format'], u'html') eq_(resource['media_type'], u'text/html') eq_(resource['identifier'], u'346265-fr@bundesamt-fur-statistik-bfs') - eq_(resource['rights'], u'NonCommercialAllowed-CommercialAllowed-ReferenceRequired') - eq_(resource['license'], u'Creative Commons Zero 1.0 Universal (CC0 1.0)') + eq_(resource['license'], u'https://opendata.swiss/terms-of-use/#terms_by') + eq_(resource['rights'], u'http://www.opendefinition.org/licenses/cc-zero') eq_(resource['language'], [u'fr']) eq_(resource['issued'], u'1900-12-31T00:00:00') eq_(resource['temporal_resolution'], u'P1D') @@ -402,7 +402,7 @@ def test_multiple_rights_statements(self): dataset = [d for d in p.datasets()][0] resource = dataset["resources"][0] - eq_(resource['rights'], u"NonCommercialAllowed-CommercialWithPermission-ReferenceRequired") + eq_(unicode(resource['rights']), u"https://opendata.swiss/terms-of-use/#terms_by_ask") def test_eu_themes_mapping(self): contents = self._get_file_contents('catalog-themes.xml') diff --git a/ckanext/dcatapchharvest/tests/test_dcatap_ch_serialize.py b/ckanext/dcatapchharvest/tests/test_dcatap_ch_serialize.py index f686931..333936b 100644 --- a/ckanext/dcatapchharvest/tests/test_dcatap_ch_serialize.py +++ b/ckanext/dcatapchharvest/tests/test_dcatap_ch_serialize.py @@ -110,25 +110,25 @@ def test_graph_from_dataset(self): assert self._triple(g, distribution, DCAT.accessService, URIRef(link)) # e2c50e70-67ad-4f86-bb1b-3f93867eadaa - if resource_dict.get('rights') == 'Creative Commons Zero 1.0 Universal (CC0 1.0)': + if resource_dict.get('rights') == "http://www.opendefinition.org/licenses/cc-zero": assert self._triple(g, distribution, DCT.rights, URIRef("https://creativecommons.org/publicdomain/zero/1.0/")) - if resource_dict.get('license') == 
'NonCommercialAllowed-CommercialAllowed-ReferenceNotRequired': + if resource_dict.get('license') == "https://opendata.swiss/terms-of-use/#terms_open": assert self._triple(g, distribution, DCT.license, URIRef("http://dcat-ap.ch/vocabulary/licenses/terms_open")) # 28e75e40-e1a1-497b-a1b9-8c1834d60201 - if resource_dict.get('rights') == "http://dcat-ap.ch/vocabulary/licenses/terms_by": + if resource_dict.get('rights') == "https://opendata.swiss/terms-of-use#terms_by": assert self._triple(g, distribution, DCT.rights, URIRef("http://dcat-ap.ch/vocabulary/licenses/terms_by")) - if resource_dict.get('license') == "NonCommercialAllowed-CommercialAllowed-ReferenceRequired": + if resource_dict.get('license') == "https://opendata.swiss/terms-of-use#terms_by": assert self._triple(g, distribution, DCT.license, URIRef("http://dcat-ap.ch/vocabulary/licenses/terms_by")) # 0cfce6ba-28f4-4229-b733-f6492c650395 - if resource_dict.get('rights') == "http://dcat-ap.ch/vocabulary/licenses/terms_by_ask": + if resource_dict.get('rights') == "https://opendata.swiss/terms-of-use#terms_by_ask": assert self._triple(g, distribution, DCT.rights, URIRef("http://dcat-ap.ch/vocabulary/licenses/terms_by_ask")) - if resource_dict.get('license') == "https://creativecommons.org/licenses/by/4.0/": - assert self._triple(g, distribution, DCT.license, URIRef("https://creativecommons.org/licenses/by/4.0/")) + if resource_dict.get('rights') == "http://www.opendefinition.org/licenses/cc-by/": + assert self._triple(g, distribution, DCT.rights, URIRef("https://creativecommons.org/licenses/by/4.0/")) if resource_dict.get('format') == "CSV": assert self._triple(g, distribution, DCT['format'], URIRef("http://publications.europa.eu/resource/authority/file-type/CSV"))