From 4f718e6b0836a2dc3610899709d5e8e07c32dc9d Mon Sep 17 00:00:00 2001 From: kovalch Date: Thu, 29 Feb 2024 08:43:11 +0100 Subject: [PATCH 1/8] feat: Add a DCAT-AP Switzerland I14Y RDF Harvester from harvesters.py template --- ckanext/dcatapchharvest/harvesters_i14y.py | 188 +++++++++++++++++++++ 1 file changed, 188 insertions(+) create mode 100644 ckanext/dcatapchharvest/harvesters_i14y.py diff --git a/ckanext/dcatapchharvest/harvesters_i14y.py b/ckanext/dcatapchharvest/harvesters_i14y.py new file mode 100644 index 0000000..351f820 --- /dev/null +++ b/ckanext/dcatapchharvest/harvesters_i14y.py @@ -0,0 +1,188 @@ +import json + +import ckan.plugins as p +import ckan.model as model + +from ckanext.dcat.harvesters.rdf import DCATRDFHarvester +from ckanext.dcat.interfaces import IDCATRDFHarvester +from ckanext.dcatapchharvest.dcat_helpers import get_pagination +from ckanext.dcatapchharvest.harvest_helper import ( + map_resources_to_ids, + check_package_change, + create_activity, +) + +import logging +log = logging.getLogger(__name__) + + +class SwissDCATI14YRDFHarvester(DCATRDFHarvester): + p.implements(IDCATRDFHarvester, inherit=True) + + harvest_job = None + current_page_url = None + + def info(self): + return { + 'name': 'dcat_ch_i14y_rdf', + 'title': 'DCAT-AP Switzerland I14Y RDF Harvester', + 'description': 'Harvester for DCAT-AP Switzerland datasets from an RDF graph desighned for i14Y' # noqa + } + + def validate_config(self, source_config): + source_config = super(SwissDCATI14YRDFHarvester, self).validate_config(source_config) # noqa + + if not source_config: + return source_config + + source_config_obj = json.loads(source_config) + + if 'excluded_dataset_identifiers' in source_config_obj: + excluded_dataset_identifiers = source_config_obj['excluded_dataset_identifiers'] # noqa + if not isinstance(excluded_dataset_identifiers, list): + raise ValueError('excluded_dataset_identifiers must be ' + 'a list of strings') + if not all(isinstance(item, basestring) + for 
item in excluded_dataset_identifiers): + raise ValueError('excluded_dataset_identifiers must be ' + 'a list of strings') + + if 'excluded_rights' in source_config_obj: + excluded_rights = source_config_obj['excluded_rights'] + if not isinstance(excluded_rights, list): + raise ValueError('excluded_rights must be ' + 'a list of strings') + if not all(isinstance(item, basestring) + for item in excluded_rights): + raise ValueError('excluded_rights must be ' + 'a list of strings') + + return source_config + + def before_download(self, url, harvest_job): + # save the harvest_job on the instance + self.harvest_job = harvest_job + self.current_page_url = url + + # fix broken URL for City of Zurich + url = url.replace('ogd.global.szh.loc', 'data.stadt-zuerich.ch') + return url, [] + + def _get_guid(self, dataset_dict, source_url=None): # noqa + ''' + Try to get a unique identifier for a harvested dataset + It will be the first found of: + * URI (rdf:about) + * dcat:identifier + * Source URL + Dataset name + * Dataset name + The last two are obviously not optimal, as depend on title, which + might change. + Returns None if no guid could be decided. + ''' + guid = None + + if dataset_dict.get('identifier'): + guid = dataset_dict['identifier'] + # check if the owner_org matches the identifier + try: + if '@' in guid: + org_name = guid.split('@')[-1] # get last element + org = model.Group.by_name(org_name) + if not org: + error_msg = ( + 'The organization in the dataset identifier (%s) ' + 'does not not exist. 
' % org_name + ) + log.error(error_msg) + self._save_gather_error(error_msg, self.harvest_job) + return None + + if org.id != dataset_dict['owner_org']: + error_msg = ( + 'The organization in the dataset identifier (%s) ' + 'does not match the organization in the harvester ' + 'config (%s)' % (org.id, dataset_dict['owner_org']) + ) + log.error(error_msg) + self._save_gather_error(error_msg, self.harvest_job) + return None + except Exception as e: + log.exception("Error when getting identifier: %s" % e) + return None + return dataset_dict['identifier'] + + for extra in dataset_dict.get('extras', []): + if extra['key'] == 'uri' and extra['value']: + return extra['value'] + + if dataset_dict.get('uri'): + return dataset_dict['uri'] + + for extra in dataset_dict.get('extras', []): + if extra['key'] == 'identifier' and extra['value']: + return extra['value'] + + for extra in dataset_dict.get('extras', []): + if extra['key'] == 'dcat_identifier' and extra['value']: + return extra['value'] + + if dataset_dict.get('name'): + guid = dataset_dict['name'] + if source_url: + guid = source_url.rstrip('/') + '/' + guid + + return guid + + def _gen_new_name(self, title): + return super(SwissDCATI14YRDFHarvester, self)._gen_new_name(_derive_flat_title(title)) # noqa + + def before_create(self, harvest_object, dataset_dict, temp_dict): + try: + source_config_obj = json.loads(harvest_object.job.source.config) + for excluded_dataset_identifier in source_config_obj.get('excluded_dataset_identifiers', []): # noqa + if excluded_dataset_identifier == dataset_dict.get('identifier'): # noqa + dataset_dict.clear() + excluded_rights = source_config_obj.get('excluded_rights', []) + dataset_rights = set([res.get('rights') for res in dataset_dict.get('resources', [])]) # noqa + if [rights for rights in dataset_rights if rights in excluded_rights]: # noqa + dataset_dict.clear() + except ValueError: + pass + + def before_update(self, harvest_object, dataset_dict, temp_dict): + existing_pkg = 
map_resources_to_ids(dataset_dict, dataset_dict['name']) + package_changed, msg = check_package_change(existing_pkg, dataset_dict) + if package_changed: + create_activity(package_id=dataset_dict['id'], message=msg) + + def after_download(self, content, harvest_job): + if not content: + after_download_error_msg = \ + 'The content of page-url {} could not be read'.format( + self.current_page_url + ) + log.info(after_download_error_msg) + return False, [after_download_error_msg] + return content, [] + + def after_parsing(self, rdf_parser, harvest_job): + parsed_content = rdf_parser.datasets() + dataset_identifiers = [dataset.get('identifier') + for dataset in parsed_content] + pagination = get_pagination(rdf_parser.g) + log.debug("pagination-info: {}".format(pagination)) + if not dataset_identifiers: + after_parsing_error_msg = \ + 'The content of page-url {} could not be parsed. ' \ + 'Therefore the harvesting was stopped.' \ + 'Pagination info: {}'.format(self.page_url, pagination) + log.info(after_parsing_error_msg) + return False, [after_parsing_error_msg] + log.debug("datasets parsed: {}".format(','.join(dataset_identifiers))) + return rdf_parser, [] + + +def _derive_flat_title(title_dict): + """localizes language dict if no language is specified""" + return title_dict.get('de') or title_dict.get('fr') or title_dict.get('en') or title_dict.get('it') or "" # noqa From 3e1623a335bff5de26e3acca9919e8d3cd946e91 Mon Sep 17 00:00:00 2001 From: kovalch Date: Thu, 29 Feb 2024 08:45:51 +0100 Subject: [PATCH 2/8] feat: Add new entry point for the i14y harvester --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index f769ce9..6f0869a 100644 --- a/setup.py +++ b/setup.py @@ -79,6 +79,7 @@ [ckan.plugins] ogdch_dcat=ckanext.dcatapchharvest.plugins:OgdchDcatPlugin dcat_ch_rdf_harvester=ckanext.dcatapchharvest.harvesters:SwissDCATRDFHarvester + dcat_ch_i14y_rdf_harvester=ckanext.dcatapchharvest.harvesters_i14y:SwissDCATI14YRDFHarvester 
[ckan.rdf.profiles] swiss_dcat_ap=ckanext.dcatapchharvest.profiles:SwissDCATAPProfile From 9252c82aef76d9ab4a520ca48fb538f4d34828e8 Mon Sep 17 00:00:00 2001 From: kovalch Date: Thu, 29 Feb 2024 11:18:01 +0100 Subject: [PATCH 3/8] feat: Add SwissDCATI14YRDFHarvester I14Y harvester that adapt the identifier --- ckanext/dcatapchharvest/harvesters.py | 47 +++++- ckanext/dcatapchharvest/harvesters_i14y.py | 188 --------------------- setup.py | 2 +- 3 files changed, 47 insertions(+), 190 deletions(-) delete mode 100644 ckanext/dcatapchharvest/harvesters_i14y.py diff --git a/ckanext/dcatapchharvest/harvesters.py b/ckanext/dcatapchharvest/harvesters.py index b4cfe2b..b783e19 100644 --- a/ckanext/dcatapchharvest/harvesters.py +++ b/ckanext/dcatapchharvest/harvesters.py @@ -2,6 +2,7 @@ import ckan.plugins as p import ckan.model as model +from ckan.logic import NotFound, get_action from ckanext.dcat.harvesters.rdf import DCATRDFHarvester from ckanext.dcat.interfaces import IDCATRDFHarvester @@ -73,7 +74,7 @@ def _get_guid(self, dataset_dict, source_url=None): # noqa Try to get a unique identifier for a harvested dataset It will be the first found of: * URI (rdf:about) - * dcat:identifier + * dct:identifier * Source URL + Dataset name * Dataset name The last two are obviously not optimal, as depend on title, which @@ -186,3 +187,47 @@ def after_parsing(self, rdf_parser, harvest_job): def _derive_flat_title(title_dict): """localizes language dict if no language is specified""" return title_dict.get('de') or title_dict.get('fr') or title_dict.get('en') or title_dict.get('it') or "" # noqa + + +class SwissDCATI14YRDFHarvester(SwissDCATRDFHarvester): + + def info(self): + info = super(SwissDCATI14YRDFHarvester, self).info() + + info['name'] = 'dcat_ch_i14y_rdf' + info['title'] = 'DCAT-AP Switzerland I14Y RDF Harvester' + info['description'] = \ + 'Harvester for DCAT-AP Switzerland datasets from an RDF graph desighned for I14Y' # noqa + + return info + + def _get_guid(self, 
dataset_dict, source_url=None): + guid = super(SwissDCATI14YRDFHarvester, self).\ + _get_guid(dataset_dict, source_url) + + # get organization name + try: + dataset_organization = get_action('organization_show')( + {}, + {'id': dataset_dict['owner_org']} + ) + dataset_organization_name = dataset_organization['name'] + + except NotFound: + raise ValueError( + 'The selected organization was not found.' + ) + + # identifier that has form of , + # should be changed to the form @, + # where slug is an organization name + if (dataset_dict.get('identifier') + and dataset_dict['identifier'] == guid + and '@' not in guid): + dataset_dict['identifier_i14y'] =\ + dataset_dict['identifier'] + dataset_dict['identifier'] =\ + dataset_dict['identifier'] + '@'\ + + dataset_organization_name + + return dataset_dict['identifier'] diff --git a/ckanext/dcatapchharvest/harvesters_i14y.py b/ckanext/dcatapchharvest/harvesters_i14y.py deleted file mode 100644 index 351f820..0000000 --- a/ckanext/dcatapchharvest/harvesters_i14y.py +++ /dev/null @@ -1,188 +0,0 @@ -import json - -import ckan.plugins as p -import ckan.model as model - -from ckanext.dcat.harvesters.rdf import DCATRDFHarvester -from ckanext.dcat.interfaces import IDCATRDFHarvester -from ckanext.dcatapchharvest.dcat_helpers import get_pagination -from ckanext.dcatapchharvest.harvest_helper import ( - map_resources_to_ids, - check_package_change, - create_activity, -) - -import logging -log = logging.getLogger(__name__) - - -class SwissDCATI14YRDFHarvester(DCATRDFHarvester): - p.implements(IDCATRDFHarvester, inherit=True) - - harvest_job = None - current_page_url = None - - def info(self): - return { - 'name': 'dcat_ch_i14y_rdf', - 'title': 'DCAT-AP Switzerland I14Y RDF Harvester', - 'description': 'Harvester for DCAT-AP Switzerland datasets from an RDF graph desighned for i14Y' # noqa - } - - def validate_config(self, source_config): - source_config = super(SwissDCATI14YRDFHarvester, self).validate_config(source_config) # noqa - 
- if not source_config: - return source_config - - source_config_obj = json.loads(source_config) - - if 'excluded_dataset_identifiers' in source_config_obj: - excluded_dataset_identifiers = source_config_obj['excluded_dataset_identifiers'] # noqa - if not isinstance(excluded_dataset_identifiers, list): - raise ValueError('excluded_dataset_identifiers must be ' - 'a list of strings') - if not all(isinstance(item, basestring) - for item in excluded_dataset_identifiers): - raise ValueError('excluded_dataset_identifiers must be ' - 'a list of strings') - - if 'excluded_rights' in source_config_obj: - excluded_rights = source_config_obj['excluded_rights'] - if not isinstance(excluded_rights, list): - raise ValueError('excluded_rights must be ' - 'a list of strings') - if not all(isinstance(item, basestring) - for item in excluded_rights): - raise ValueError('excluded_rights must be ' - 'a list of strings') - - return source_config - - def before_download(self, url, harvest_job): - # save the harvest_job on the instance - self.harvest_job = harvest_job - self.current_page_url = url - - # fix broken URL for City of Zurich - url = url.replace('ogd.global.szh.loc', 'data.stadt-zuerich.ch') - return url, [] - - def _get_guid(self, dataset_dict, source_url=None): # noqa - ''' - Try to get a unique identifier for a harvested dataset - It will be the first found of: - * URI (rdf:about) - * dcat:identifier - * Source URL + Dataset name - * Dataset name - The last two are obviously not optimal, as depend on title, which - might change. - Returns None if no guid could be decided. - ''' - guid = None - - if dataset_dict.get('identifier'): - guid = dataset_dict['identifier'] - # check if the owner_org matches the identifier - try: - if '@' in guid: - org_name = guid.split('@')[-1] # get last element - org = model.Group.by_name(org_name) - if not org: - error_msg = ( - 'The organization in the dataset identifier (%s) ' - 'does not not exist. 
' % org_name - ) - log.error(error_msg) - self._save_gather_error(error_msg, self.harvest_job) - return None - - if org.id != dataset_dict['owner_org']: - error_msg = ( - 'The organization in the dataset identifier (%s) ' - 'does not match the organization in the harvester ' - 'config (%s)' % (org.id, dataset_dict['owner_org']) - ) - log.error(error_msg) - self._save_gather_error(error_msg, self.harvest_job) - return None - except Exception as e: - log.exception("Error when getting identifier: %s" % e) - return None - return dataset_dict['identifier'] - - for extra in dataset_dict.get('extras', []): - if extra['key'] == 'uri' and extra['value']: - return extra['value'] - - if dataset_dict.get('uri'): - return dataset_dict['uri'] - - for extra in dataset_dict.get('extras', []): - if extra['key'] == 'identifier' and extra['value']: - return extra['value'] - - for extra in dataset_dict.get('extras', []): - if extra['key'] == 'dcat_identifier' and extra['value']: - return extra['value'] - - if dataset_dict.get('name'): - guid = dataset_dict['name'] - if source_url: - guid = source_url.rstrip('/') + '/' + guid - - return guid - - def _gen_new_name(self, title): - return super(SwissDCATI14YRDFHarvester, self)._gen_new_name(_derive_flat_title(title)) # noqa - - def before_create(self, harvest_object, dataset_dict, temp_dict): - try: - source_config_obj = json.loads(harvest_object.job.source.config) - for excluded_dataset_identifier in source_config_obj.get('excluded_dataset_identifiers', []): # noqa - if excluded_dataset_identifier == dataset_dict.get('identifier'): # noqa - dataset_dict.clear() - excluded_rights = source_config_obj.get('excluded_rights', []) - dataset_rights = set([res.get('rights') for res in dataset_dict.get('resources', [])]) # noqa - if [rights for rights in dataset_rights if rights in excluded_rights]: # noqa - dataset_dict.clear() - except ValueError: - pass - - def before_update(self, harvest_object, dataset_dict, temp_dict): - existing_pkg = 
map_resources_to_ids(dataset_dict, dataset_dict['name']) - package_changed, msg = check_package_change(existing_pkg, dataset_dict) - if package_changed: - create_activity(package_id=dataset_dict['id'], message=msg) - - def after_download(self, content, harvest_job): - if not content: - after_download_error_msg = \ - 'The content of page-url {} could not be read'.format( - self.current_page_url - ) - log.info(after_download_error_msg) - return False, [after_download_error_msg] - return content, [] - - def after_parsing(self, rdf_parser, harvest_job): - parsed_content = rdf_parser.datasets() - dataset_identifiers = [dataset.get('identifier') - for dataset in parsed_content] - pagination = get_pagination(rdf_parser.g) - log.debug("pagination-info: {}".format(pagination)) - if not dataset_identifiers: - after_parsing_error_msg = \ - 'The content of page-url {} could not be parsed. ' \ - 'Therefore the harvesting was stopped.' \ - 'Pagination info: {}'.format(self.page_url, pagination) - log.info(after_parsing_error_msg) - return False, [after_parsing_error_msg] - log.debug("datasets parsed: {}".format(','.join(dataset_identifiers))) - return rdf_parser, [] - - -def _derive_flat_title(title_dict): - """localizes language dict if no language is specified""" - return title_dict.get('de') or title_dict.get('fr') or title_dict.get('en') or title_dict.get('it') or "" # noqa diff --git a/setup.py b/setup.py index 6f0869a..732517e 100644 --- a/setup.py +++ b/setup.py @@ -79,7 +79,7 @@ [ckan.plugins] ogdch_dcat=ckanext.dcatapchharvest.plugins:OgdchDcatPlugin dcat_ch_rdf_harvester=ckanext.dcatapchharvest.harvesters:SwissDCATRDFHarvester - dcat_ch_i14y_rdf_harvester=ckanext.dcatapchharvest.harvesters_i14y:SwissDCATI14YRDFHarvester + dcat_ch_i14y_rdf_harvester=ckanext.dcatapchharvest.harvesters:SwissDCATI14YRDFHarvester [ckan.rdf.profiles] swiss_dcat_ap=ckanext.dcatapchharvest.profiles:SwissDCATAPProfile From 0e0c2bcd98b89332c6086fa238b2ce18166098ba Mon Sep 17 00:00:00 2001 From: 
kovalch Date: Fri, 1 Mar 2024 13:41:18 +0100 Subject: [PATCH 4/8] style: Update description of the new harvester --- ckanext/dcatapchharvest/harvesters.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ckanext/dcatapchharvest/harvesters.py b/ckanext/dcatapchharvest/harvesters.py index b783e19..c2f7e8b 100644 --- a/ckanext/dcatapchharvest/harvesters.py +++ b/ckanext/dcatapchharvest/harvesters.py @@ -197,7 +197,8 @@ def info(self): info['name'] = 'dcat_ch_i14y_rdf' info['title'] = 'DCAT-AP Switzerland I14Y RDF Harvester' info['description'] = \ - 'Harvester for DCAT-AP Switzerland datasets from an RDF graph desighned for I14Y' # noqa + 'Harvester for DCAT-AP Switzerland datasets from ' \ + 'an RDF graph designed for I14Y' return info From 965753749cd46c409cea6e205f27adca752c906e Mon Sep 17 00:00:00 2001 From: kovalch Date: Fri, 1 Mar 2024 14:28:09 +0100 Subject: [PATCH 5/8] style: Remove not needed noqa --- ckanext/dcatapchharvest/harvesters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ckanext/dcatapchharvest/harvesters.py b/ckanext/dcatapchharvest/harvesters.py index c2f7e8b..6667992 100644 --- a/ckanext/dcatapchharvest/harvesters.py +++ b/ckanext/dcatapchharvest/harvesters.py @@ -69,7 +69,7 @@ def before_download(self, url, harvest_job): url = url.replace('ogd.global.szh.loc', 'data.stadt-zuerich.ch') return url, [] - def _get_guid(self, dataset_dict, source_url=None): # noqa + def _get_guid(self, dataset_dict, source_url=None): ''' Try to get a unique identifier for a harvested dataset It will be the first found of: From 9364990082c7d85bb7ef8d32bfc569d7b9c51cda Mon Sep 17 00:00:00 2001 From: kovalch Date: Fri, 1 Mar 2024 14:31:52 +0100 Subject: [PATCH 6/8] style: Move back noqa --- ckanext/dcatapchharvest/harvesters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ckanext/dcatapchharvest/harvesters.py b/ckanext/dcatapchharvest/harvesters.py index 6667992..c2f7e8b 100644 --- 
a/ckanext/dcatapchharvest/harvesters.py +++ b/ckanext/dcatapchharvest/harvesters.py @@ -69,7 +69,7 @@ def before_download(self, url, harvest_job): url = url.replace('ogd.global.szh.loc', 'data.stadt-zuerich.ch') return url, [] - def _get_guid(self, dataset_dict, source_url=None): + def _get_guid(self, dataset_dict, source_url=None): # noqa ''' Try to get a unique identifier for a harvested dataset It will be the first found of: From 971010c26c99ad844d890c39623667e3f2b38bf9 Mon Sep 17 00:00:00 2001 From: kovalch Date: Tue, 28 May 2024 10:42:21 +0200 Subject: [PATCH 7/8] fix: Use ckan.plugins.toolkit instead of ckan.logic --- ckanext/dcatapchharvest/harvesters.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ckanext/dcatapchharvest/harvesters.py b/ckanext/dcatapchharvest/harvesters.py index c2f7e8b..6034d30 100644 --- a/ckanext/dcatapchharvest/harvesters.py +++ b/ckanext/dcatapchharvest/harvesters.py @@ -2,7 +2,7 @@ import ckan.plugins as p import ckan.model as model -from ckan.logic import NotFound, get_action +import ckan.plugins.toolkit as tk from ckanext.dcat.harvesters.rdf import DCATRDFHarvester from ckanext.dcat.interfaces import IDCATRDFHarvester @@ -208,13 +208,13 @@ def _get_guid(self, dataset_dict, source_url=None): # get organization name try: - dataset_organization = get_action('organization_show')( + dataset_organization = tk.get_action('organization_show')( {}, {'id': dataset_dict['owner_org']} ) dataset_organization_name = dataset_organization['name'] - except NotFound: + except tk.NotFound: raise ValueError( 'The selected organization was not found.' 
) From 2d407250c5dea733413377a622db6c69643cb2d1 Mon Sep 17 00:00:00 2001 From: kovalch Date: Tue, 28 May 2024 11:05:35 +0200 Subject: [PATCH 8/8] fix: ObjectNotFound exception for tk instead of logic.NotFound --- ckanext/dcatapchharvest/harvesters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ckanext/dcatapchharvest/harvesters.py b/ckanext/dcatapchharvest/harvesters.py index 6034d30..9520fc1 100644 --- a/ckanext/dcatapchharvest/harvesters.py +++ b/ckanext/dcatapchharvest/harvesters.py @@ -214,7 +214,7 @@ def _get_guid(self, dataset_dict, source_url=None): ) dataset_organization_name = dataset_organization['name'] - except tk.NotFound: + except tk.ObjectNotFound: raise ValueError( 'The selected organization was not found.' )