Skip to content

Commit

Permalink
feat: Modify the guid derived from an identifier that has the form <id>
Browse files Browse the repository at this point in the history
  • Loading branch information
kovalch committed Mar 1, 2024
1 parent 6be89f0 commit e01c202
Showing 1 changed file with 18 additions and 76 deletions.
94 changes: 18 additions & 76 deletions ckanext/dcatapchharvest/harvesters.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,33 +190,19 @@ def _derive_flat_title(title_dict):


class SwissDCATI14YRDFHarvester(SwissDCATRDFHarvester):
p.implements(IDCATRDFHarvester, inherit=True)

def info(self):
# call the parent class SwissDCATRDFHarvester
# to get the default info dictionary
info = super(SwissDCATRDFHarvester, self).info()
# source_config = super(SwissDCATRDFHarvester, self).validate_config(source_config) # noqa
info = super(SwissDCATI14YRDFHarvester, self).info()

info['name'] = 'dcat_ch_i14y_rdf'
info['title'] = 'DCAT-AP Switzerland I14Y RDF Harvester'
info['description'] = 'Harvester for DCAT-AP Switzerland datasets from an RDF graph desighned for I14Y' # noqa

return info

def _get_guid(self, dataset_dict, source_url=None): # noqa
'''
Try to get a unique identifier for a harvested dataset
It will be the first found of:
* URI (rdf:about)
* dct:identifier
* Source URL + Dataset name
* Dataset name
The last two are obviously not optimal, as depend on title, which
might change.
Returns None if no guid could be decided.
'''
guid = None
def _get_guid(self, dataset_dict, source_url=None):
guid = super(SwissDCATI14YRDFHarvester, self)._get_guid(dataset_dict, source_url)

# get organization name
try:
dataset_organization = get_action('organization_show')(
Expand All @@ -227,63 +213,19 @@ def _get_guid(self, dataset_dict, source_url=None): # noqa

except NotFound:
raise ValueError(
_('The selected organization was not found.') # noqa
_('The selected organization was not found.')
)

if dataset_dict.get('identifier'):
guid = dataset_dict['identifier']
try:
# check if the owner_org matches the identifier
if '@' in guid:
org_name = guid.split('@')[-1] # get last element
org = model.Group.by_name(org_name)
if not org:
error_msg = (
'The organization in the dataset identifier (%s) '
'does not not exist. ' % org_name
)
log.error(error_msg)
self._save_gather_error(error_msg, self.harvest_job)
return None

if org.id != dataset_dict['owner_org']:
error_msg = (
'The organization in the dataset identifier (%s) '
'does not match the organization in the harvester '
'config (%s)' % (org.id, dataset_dict['owner_org'])
)
log.error(error_msg)
self._save_gather_error(error_msg, self.harvest_job)
return None
else:
# save original I14Y identifier and create a new identifier of the form <id>@<slug>,# noqa
# where slug is an organization name
dataset_dict['identifier_i14y'] = dataset_dict['identifier'] # noqa
dataset_dict['identifier'] = \
dataset_dict['identifier'] + '@' + dataset_organization_name # noqa
except Exception as e:
log.exception("Error when getting identifier: %s" % e)
return None
return dataset_dict['identifier']

for extra in dataset_dict.get('extras', []):
if extra['key'] == 'uri' and extra['value']:
return extra['value']

if dataset_dict.get('uri'):
return dataset_dict['uri']

for extra in dataset_dict.get('extras', []):
if extra['key'] == 'identifier' and extra['value']:
return extra['value']

for extra in dataset_dict.get('extras', []):
if extra['key'] == 'dcat_identifier' and extra['value']:
return extra['value']

if dataset_dict.get('name'):
guid = dataset_dict['name']
if source_url:
guid = source_url.rstrip('/') + '/' + guid

return guid
# An identifier that has the form <id>
# is changed to the form <id>@<slug>,
# where <slug> is the organization name.
if (dataset_dict.get('identifier')
and dataset_dict['identifier'] == guid
and '@' not in guid):
dataset_dict['identifier_i14y'] =\
dataset_dict['identifier']
dataset_dict['identifier'] =\
dataset_dict['identifier'] + '@'\
+ dataset_organization_name

return dataset_dict['identifier']

0 comments on commit e01c202

Please sign in to comment.