From ec7fb73d4a3801092efe6766e318c34d72e9fc44 Mon Sep 17 00:00:00 2001 From: Michael Meinel Date: Wed, 18 Oct 2023 14:33:29 +0200 Subject: [PATCH 01/39] (Try to) Fix InvenioRDM license checks --- src/hermes/commands/deposit/invenio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hermes/commands/deposit/invenio.py b/src/hermes/commands/deposit/invenio.py index bbc4ab28..ecacf502 100644 --- a/src/hermes/commands/deposit/invenio.py +++ b/src/hermes/commands/deposit/invenio.py @@ -553,7 +553,7 @@ def _get_license_identifier(ctx: CodeMetaContext, license_api_url: str): parsed_url = urlparse(license_url) url_path = parsed_url.path.rstrip("/") - license_id = url_path.split("/")[-1] + license_id = url_path.split("/")[-1].lower() response = requests.get( f"{license_api_url}/{license_id}", headers={"User-Agent": hermes_user_agent} From d2f9147368400a5d51f57abd83b62381b4b7caff Mon Sep 17 00:00:00 2001 From: Michael Meinel Date: Wed, 18 Oct 2023 14:57:37 +0200 Subject: [PATCH 02/39] Add debug output --- src/hermes/commands/deposit/invenio.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/hermes/commands/deposit/invenio.py b/src/hermes/commands/deposit/invenio.py index ecacf502..c8603546 100644 --- a/src/hermes/commands/deposit/invenio.py +++ b/src/hermes/commands/deposit/invenio.py @@ -555,6 +555,8 @@ def _get_license_identifier(ctx: CodeMetaContext, license_api_url: str): url_path = parsed_url.path.rstrip("/") license_id = url_path.split("/")[-1].lower() + print(f"DEBUG: License is said to be {license_url} -> {license_id}") + response = requests.get( f"{license_api_url}/{license_id}", headers={"User-Agent": hermes_user_agent} ) From 855c52c1af2d8389c5382df89eae153b39a1f413 Mon Sep 17 00:00:00 2001 From: Sophie <133236526+SKernchen@users.noreply.github.com> Date: Thu, 19 Oct 2023 01:12:36 +0200 Subject: [PATCH 03/39] Add HERMES_PUSH_TOKEN --- docs/source/tutorials/automated-publication-with-ci.md | 5 +++++ 1 file changed, 5 insertions(+) diff 
--git a/docs/source/tutorials/automated-publication-with-ci.md b/docs/source/tutorials/automated-publication-with-ci.md index 2ff58d96..4c56e7f7 100644 --- a/docs/source/tutorials/automated-publication-with-ci.md +++ b/docs/source/tutorials/automated-publication-with-ci.md @@ -167,6 +167,11 @@ and activate the option "Allow GitHub Actions to create and approve pull request Copy the Zenodo sandbox token you just created into a new [GitLab CI variable](https://docs.gitlab.com/ee/ci/variables/#for-a-project) called `ZENODO_TOKEN`. +For Gitlab you also need the HERMES Push Token. That Token gives access to the project in order for HERMES to create Merge Requests. +Therefore, you [create an access token in your project](https://docs.gitlab.com/ee/user/project/settings/project_access_tokens.html#create-a-project-access-token). +The Token needs to have at least the `developer` role and `write` access (e.g. write_repository scope). +Then you create a Gitlab CI variable with the token called `HERMES_PUSH_TOKEN`. + Copy the [template file for GitLab to Zenodo Sandbox publication](https://github.com/hermes-hmc/ci-templates/blob/main/gitlab/hermes-ci.yml) into your project to `.gitlab/hermes-ci.yml`. From 98fe0a421e1528d71e37d1b6ab44612318b87c4d Mon Sep 17 00:00:00 2001 From: Sophie <133236526+SKernchen@users.noreply.github.com> Date: Thu, 19 Oct 2023 13:49:28 +0200 Subject: [PATCH 04/39] Note about protection of variables --- docs/source/tutorials/automated-publication-with-ci.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/source/tutorials/automated-publication-with-ci.md b/docs/source/tutorials/automated-publication-with-ci.md index 4c56e7f7..adf86cc2 100644 --- a/docs/source/tutorials/automated-publication-with-ci.md +++ b/docs/source/tutorials/automated-publication-with-ci.md @@ -172,6 +172,13 @@ Therefore, you [create an access token in your project](https://docs.gitlab.com/ The Token needs to have at least the `developer` role and `write` access (e.g. 
write_repository scope). Then you create a Gitlab CI variable with the token called `HERMES_PUSH_TOKEN`. +```{note} +The two GitLab CI variables contain sensitive and powerful information. +Therefore, you should at least select the flag `Mask variable` when creating them. +If possible, you should also select the flag `Protect variable` and define all branches `hermes/*` as +protected branches. +``` + Copy the [template file for GitLab to Zenodo Sandbox publication](https://github.com/hermes-hmc/ci-templates/blob/main/gitlab/hermes-ci.yml) into your project to `.gitlab/hermes-ci.yml`. From 388dc667996affbbd2f84185cc456b500ac1e5f5 Mon Sep 17 00:00:00 2001 From: Sophie <133236526+SKernchen@users.noreply.github.com> Date: Thu, 19 Oct 2023 13:58:08 +0200 Subject: [PATCH 05/39] Add File Contributor Sophie Kernchen --- docs/source/tutorials/automated-publication-with-ci.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/tutorials/automated-publication-with-ci.md b/docs/source/tutorials/automated-publication-with-ci.md index adf86cc2..cb6cde95 100644 --- a/docs/source/tutorials/automated-publication-with-ci.md +++ b/docs/source/tutorials/automated-publication-with-ci.md @@ -8,6 +8,7 @@ SPDX-License-Identifier: CC-BY-SA-4.0 SPDX-FileContributor: Oliver Bertuch SPDX-FileContributor: Michael Meinel SPDX-FileContributor: Stephan Druskat +SPDX-FileContributor: Sophie Kernchen --> # Set up automatic software publishing From 4cc416572228a52e418c68c0770ef77c4c537c5b Mon Sep 17 00:00:00 2001 From: "Meinel, Michael" Date: Mon, 30 Oct 2023 15:29:04 +0100 Subject: [PATCH 06/39] Retrieve license identifier by crawling vocabulary endpoint.
--- src/hermes/commands/deposit/invenio.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/src/hermes/commands/deposit/invenio.py b/src/hermes/commands/deposit/invenio.py index c8603546..8aeca716 100644 --- a/src/hermes/commands/deposit/invenio.py +++ b/src/hermes/commands/deposit/invenio.py @@ -23,7 +23,7 @@ from hermes.model.path import ContextPath from hermes.utils import hermes_user_agent -_DEFAULT_LICENSES_API_PATH = "api/licenses" +_DEFAULT_LICENSES_API_PATH = "api/vocabulary/licenses" _DEFAULT_COMMUNITIES_API_PATH = "api/communities" _DEFAULT_DEPOSITIONS_API_PATH = "api/deposit/depositions" @@ -551,21 +551,22 @@ def _get_license_identifier(ctx: CodeMetaContext, license_api_url: str): "Licenses of type 'CreativeWork' are not supported." ) - parsed_url = urlparse(license_url) - url_path = parsed_url.path.rstrip("/") - license_id = url_path.split("/")[-1].lower() - - print(f"DEBUG: License is said to be {license_url} -> {license_id}") - + # Fetch full list of licenses available... maybe we should cache this. response = requests.get( - f"{license_api_url}/{license_id}", headers={"User-Agent": hermes_user_agent} + f"{license_api_url}", headers={"User-Agent": hermes_user_agent} ) - if response.status_code == 404: - raise RuntimeError(f"Not a valid license identifier: {license_id}") - # Catch other problems response.raise_for_status() - return response.json()["id"] + for license_info in response.json()['hits']['hits']: + try: + if license_info['props']['url'] == license_url: + break + except KeyError: + continue + else: + raise RuntimeError(f"Not a valid license identifier: {license_url}") + + return license_info["id"] def _get_community_identifiers(ctx: CodeMetaContext, communities_api_url: str): From 098fd05d8d9ebc3a0603e6599415724653bc93ca Mon Sep 17 00:00:00 2001 From: "Meinel, Michael" Date: Mon, 30 Oct 2023 15:44:31 +0100 Subject: [PATCH 07/39] Using a fixed number to download all... this is hacky and not nice. 
Note: Still not working as we get SPDX URLs but the URLs on Zenodo / InvenioRDM are opensource.org URLs --- src/hermes/commands/deposit/invenio.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hermes/commands/deposit/invenio.py b/src/hermes/commands/deposit/invenio.py index 8aeca716..b1405d62 100644 --- a/src/hermes/commands/deposit/invenio.py +++ b/src/hermes/commands/deposit/invenio.py @@ -23,7 +23,7 @@ from hermes.model.path import ContextPath from hermes.utils import hermes_user_agent -_DEFAULT_LICENSES_API_PATH = "api/vocabulary/licenses" +_DEFAULT_LICENSES_API_PATH = "api/vocabularies/licenses" _DEFAULT_COMMUNITIES_API_PATH = "api/communities" _DEFAULT_DEPOSITIONS_API_PATH = "api/deposit/depositions" @@ -553,7 +553,7 @@ def _get_license_identifier(ctx: CodeMetaContext, license_api_url: str): # Fetch full list of licenses available... maybe we should cache this. response = requests.get( - f"{license_api_url}", headers={"User-Agent": hermes_user_agent} + f"{license_api_url}?size=1000", headers={"User-Agent": hermes_user_agent} ) response.raise_for_status() From 37eccd86f4a76536d7be297c96502375fbef7489 Mon Sep 17 00:00:00 2001 From: "Meinel, Michael" Date: Mon, 30 Oct 2023 16:15:24 +0100 Subject: [PATCH 08/39] Re-map SPDX-URLs to alternatives where possible --- src/hermes/commands/deposit/invenio.py | 31 ++++++++++++++++++++------ 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/src/hermes/commands/deposit/invenio.py b/src/hermes/commands/deposit/invenio.py index b1405d62..f1707666 100644 --- a/src/hermes/commands/deposit/invenio.py +++ b/src/hermes/commands/deposit/invenio.py @@ -557,14 +557,31 @@ def _get_license_identifier(ctx: CodeMetaContext, license_api_url: str): ) response.raise_for_status() - for license_info in response.json()['hits']['hits']: - try: - if license_info['props']['url'] == license_url: + valid_licenses = response.json() + def _search_license_info(_url): + for license_info in 
valid_licenses['hits']['hits']: + try: + if license_info['props']['url'] == _url: + return license_info + except KeyError: + continue + else: + return None + + license_info = _search_license_info(license_url) + if license_info is None and license_url.startswith('https://spdx.org/licenses/'): + response = requests.get(f"{license_url}.json", headers={"User-Agent": hermes_user_agent}) + response.raise_for_status() + + for license_cross_ref in response.json()['crossRef']: + if not license_cross_ref['isValid']: + continue + + license_info = _search_license_info(license_cross_ref["url"]) + if license_info is not None: break - except KeyError: - continue - else: - raise RuntimeError(f"Not a valid license identifier: {license_url}") + else: + raise RuntimeError(f"Could not resolve license URL {license_url} to a valid identifier.") return license_info["id"] From 72db107f233e3f88a11d4f846b9dfec850e62d03 Mon Sep 17 00:00:00 2001 From: "Meinel, Michael" Date: Mon, 30 Oct 2023 16:17:09 +0100 Subject: [PATCH 09/39] Fix pylint errors --- src/hermes/commands/deposit/invenio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hermes/commands/deposit/invenio.py b/src/hermes/commands/deposit/invenio.py index f1707666..fe328db9 100644 --- a/src/hermes/commands/deposit/invenio.py +++ b/src/hermes/commands/deposit/invenio.py @@ -556,8 +556,8 @@ def _get_license_identifier(ctx: CodeMetaContext, license_api_url: str): f"{license_api_url}?size=1000", headers={"User-Agent": hermes_user_agent} ) response.raise_for_status() - valid_licenses = response.json() + def _search_license_info(_url): for license_info in valid_licenses['hits']['hits']: try: From 2364bc34804776226557f06fce619be52b5145b3 Mon Sep 17 00:00:00 2001 From: "Meinel, Michael" Date: Tue, 31 Oct 2023 09:20:28 +0100 Subject: [PATCH 10/39] Try to guess the correct licenses identifier before doing the query journey. Thanks to @zyzzyxdonta for the hint. 
--- src/hermes/commands/deposit/invenio.py | 39 ++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/src/hermes/commands/deposit/invenio.py b/src/hermes/commands/deposit/invenio.py index fe328db9..4f7ea5c4 100644 --- a/src/hermes/commands/deposit/invenio.py +++ b/src/hermes/commands/deposit/invenio.py @@ -134,6 +134,7 @@ def create_initial_version(click_ctx: click.Context, ctx: CodeMetaContext): ) if not response.ok: + print(response.text) raise RuntimeError(f"Could not create initial deposit {deposit_url!r}") deposit = response.json() @@ -533,7 +534,7 @@ def _get_license_identifier(ctx: CodeMetaContext, license_api_url: str): Typically, Invenio instances offer licenses from https://opendefinition.org and https://spdx.org. However, it is possible to mint PIDs for custom licenses. - An API endpoint (usually ``/api/licenses``) can be used to check whether a given + An API endpoint (usually ``/api/vocabularies/licenses``) can be used to check whether a given license is supported by the Invenio instance. This function tries to retrieve the license by the identifier at the end of the license URL path. If this identifier does not exist on the Invenio instance, a :class:`RuntimeError` is raised. If no @@ -551,7 +552,39 @@ def _get_license_identifier(ctx: CodeMetaContext, license_api_url: str): "Licenses of type 'CreativeWork' are not supported." ) - # Fetch full list of licenses available... maybe we should cache this. + # First try: Look up license by assuming lower-case name is the correct identifier + parsed_url = urlparse(license_url) + url_path = parsed_url.path.rstrip("/") + license_id = url_path.split("/")[-1].lower() + + response = requests.get( + f"{license_api_url}/{license_id}", headers={"User-Agent": hermes_user_agent} + ) + if response.ok: + license_info = response.json() + + # Second try: Fetch full list of licenses available... maybe we should cache this. 
+ else: + license_info = _look_up_license_info(license_api_url, license_url) + + return license_info["id"] + + +def _look_up_license_info(license_api_url, license_url): + """Deliberately try to resolve the license URL to a valid InvenioRDM license information record from the + vocabulary. + + First, this method tries to find the license URL in the list of known license vocabulary (which is fetched each + time, ouch...). + + If the URL is not found (what is pretty probable by now, as CFFConvert produces SPDX-URLs while InvenioRDM still + relies on the overhauled opensource.org URLs), the SPDX information record is fetched and all valid cross references + are sought for. + + :param license_api_url: Base API endpoint for InvenioRDM license vocabulary queries. + :param license_url: The URL for the license we are search an identifier for. + :return: The vocabulary record that is provided by InvenioRDM. + """ response = requests.get( f"{license_api_url}?size=1000", headers={"User-Agent": hermes_user_agent} ) @@ -583,7 +616,7 @@ def _search_license_info(_url): else: raise RuntimeError(f"Could not resolve license URL {license_url} to a valid identifier.") - return license_info["id"] + return license_info def _get_community_identifiers(ctx: CodeMetaContext, communities_api_url: str): From 464375e8d4238cd05d98ccde2a654bb3592a355b Mon Sep 17 00:00:00 2001 From: "Meinel, Michael" Date: Wed, 1 Nov 2023 11:36:40 +0100 Subject: [PATCH 11/39] Move InvenioRDM adaptations into new plugin This is still based on C&P... should we extract the "common base" that is still valid and only migrate the relevant parts? Also: I stupidly changed all string occurrences... to also distinguish configurations and avoid accidential mis-use. 
--- hermes.toml | 10 +- pyproject.toml | 8 + src/hermes/commands/deposit/invenio.py | 67 +- src/hermes/commands/deposit/invenio_rdm.py | 717 +++++++++++++++++++++ 4 files changed, 737 insertions(+), 65 deletions(-) create mode 100644 src/hermes/commands/deposit/invenio_rdm.py diff --git a/hermes.toml b/hermes.toml index 1e31c736..2da8a88b 100644 --- a/hermes.toml +++ b/hermes.toml @@ -9,15 +9,15 @@ from = [ "cff", "git" ] validate = false [deposit] -mapping = "invenio" -target = "invenio" +mapping = "invenio_rdm" +target = "invenio_rdm" -[deposit.invenio] +[deposit.invenio_rdm] site_url = "https://sandbox.zenodo.org" communities = ["zenodo"] access_right = "open" -[deposit.invenio.api_paths] +[deposit.invenio_rdm.api_paths] depositions = "api/deposit/depositions" -licenses = "api/licenses" +licenses = "api/vocabularies/licenses" communities = "api/communities" diff --git a/pyproject.toml b/pyproject.toml index fa90f9b1..fa55422b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -88,34 +88,42 @@ git_add_branch = "hermes.commands.process.git:add_branch" [tool.poetry.plugins."hermes.deposit.prepare"] invenio = "hermes.commands.deposit.invenio:prepare" +invenio_rdm = "hermes.commands.deposit.invenio_rdm:prepare" file = "hermes.commands.deposit.file:dummy_noop" [tool.poetry.plugins."hermes.deposit.map"] invenio = "hermes.commands.deposit.invenio:map_metadata" +invenio_rdm = "hermes.commands.deposit.invenio_rdm:map_metadata" file = "hermes.commands.deposit.file:map_metadata" [tool.poetry.plugins."hermes.deposit.create_initial_version"] invenio = "hermes.commands.deposit.invenio:create_initial_version" +invenio_rdm = "hermes.commands.deposit.invenio_rdm:create_initial_version" file = "hermes.commands.deposit.file:dummy_noop" [tool.poetry.plugins."hermes.deposit.create_new_version"] invenio = "hermes.commands.deposit.invenio:create_new_version" +invenio_rdm = "hermes.commands.deposit.invenio_rdm:create_new_version" file = "hermes.commands.deposit.file:dummy_noop" 
[tool.poetry.plugins."hermes.deposit.update_metadata"] invenio = "hermes.commands.deposit.invenio:update_metadata" +invenio_rdm = "hermes.commands.deposit.invenio_rdm:update_metadata" file = "hermes.commands.deposit.file:dummy_noop" [tool.poetry.plugins."hermes.deposit.delete_artifacts"] invenio = "hermes.commands.deposit.invenio:delete_artifacts" +invenio_rdm = "hermes.commands.deposit.invenio_rdm:delete_artifacts" file = "hermes.commands.deposit.file:dummy_noop" [tool.poetry.plugins."hermes.deposit.upload_artifacts"] invenio = "hermes.commands.deposit.invenio:upload_artifacts" +invenio_rdm = "hermes.commands.deposit.invenio_rdm:upload_artifacts" file = "hermes.commands.deposit.file:dummy_noop" [tool.poetry.plugins."hermes.deposit.publish"] invenio = "hermes.commands.deposit.invenio:publish" +invenio_rdm = "hermes.commands.deposit.invenio_rdm:publish" file = "hermes.commands.deposit.file:publish" [tool.poetry.plugins."hermes.postprocess"] diff --git a/src/hermes/commands/deposit/invenio.py b/src/hermes/commands/deposit/invenio.py index 4f7ea5c4..bbc4ab28 100644 --- a/src/hermes/commands/deposit/invenio.py +++ b/src/hermes/commands/deposit/invenio.py @@ -23,7 +23,7 @@ from hermes.model.path import ContextPath from hermes.utils import hermes_user_agent -_DEFAULT_LICENSES_API_PATH = "api/vocabularies/licenses" +_DEFAULT_LICENSES_API_PATH = "api/licenses" _DEFAULT_COMMUNITIES_API_PATH = "api/communities" _DEFAULT_DEPOSITIONS_API_PATH = "api/deposit/depositions" @@ -134,7 +134,6 @@ def create_initial_version(click_ctx: click.Context, ctx: CodeMetaContext): ) if not response.ok: - print(response.text) raise RuntimeError(f"Could not create initial deposit {deposit_url!r}") deposit = response.json() @@ -534,7 +533,7 @@ def _get_license_identifier(ctx: CodeMetaContext, license_api_url: str): Typically, Invenio instances offer licenses from https://opendefinition.org and https://spdx.org. However, it is possible to mint PIDs for custom licenses. 
- An API endpoint (usually ``/api/vocabularies/licenses``) can be used to check whether a given + An API endpoint (usually ``/api/licenses``) can be used to check whether a given license is supported by the Invenio instance. This function tries to retrieve the license by the identifier at the end of the license URL path. If this identifier does not exist on the Invenio instance, a :class:`RuntimeError` is raised. If no @@ -552,71 +551,19 @@ def _get_license_identifier(ctx: CodeMetaContext, license_api_url: str): "Licenses of type 'CreativeWork' are not supported." ) - # First try: Look up license by assuming lower-case name is the correct identifier parsed_url = urlparse(license_url) url_path = parsed_url.path.rstrip("/") - license_id = url_path.split("/")[-1].lower() + license_id = url_path.split("/")[-1] response = requests.get( f"{license_api_url}/{license_id}", headers={"User-Agent": hermes_user_agent} ) - if response.ok: - license_info = response.json() - - # Second try: Fetch full list of licenses available... maybe we should cache this. - else: - license_info = _look_up_license_info(license_api_url, license_url) - - return license_info["id"] - - -def _look_up_license_info(license_api_url, license_url): - """Deliberately try to resolve the license URL to a valid InvenioRDM license information record from the - vocabulary. - - First, this method tries to find the license URL in the list of known license vocabulary (which is fetched each - time, ouch...). - - If the URL is not found (what is pretty probable by now, as CFFConvert produces SPDX-URLs while InvenioRDM still - relies on the overhauled opensource.org URLs), the SPDX information record is fetched and all valid cross references - are sought for. - - :param license_api_url: Base API endpoint for InvenioRDM license vocabulary queries. - :param license_url: The URL for the license we are search an identifier for. - :return: The vocabulary record that is provided by InvenioRDM. 
- """ - response = requests.get( - f"{license_api_url}?size=1000", headers={"User-Agent": hermes_user_agent} - ) + if response.status_code == 404: + raise RuntimeError(f"Not a valid license identifier: {license_id}") + # Catch other problems response.raise_for_status() - valid_licenses = response.json() - - def _search_license_info(_url): - for license_info in valid_licenses['hits']['hits']: - try: - if license_info['props']['url'] == _url: - return license_info - except KeyError: - continue - else: - return None - - license_info = _search_license_info(license_url) - if license_info is None and license_url.startswith('https://spdx.org/licenses/'): - response = requests.get(f"{license_url}.json", headers={"User-Agent": hermes_user_agent}) - response.raise_for_status() - - for license_cross_ref in response.json()['crossRef']: - if not license_cross_ref['isValid']: - continue - - license_info = _search_license_info(license_cross_ref["url"]) - if license_info is not None: - break - else: - raise RuntimeError(f"Could not resolve license URL {license_url} to a valid identifier.") - return license_info + return response.json()["id"] def _get_community_identifiers(ctx: CodeMetaContext, communities_api_url: str): diff --git a/src/hermes/commands/deposit/invenio_rdm.py b/src/hermes/commands/deposit/invenio_rdm.py new file mode 100644 index 00000000..e9206ed3 --- /dev/null +++ b/src/hermes/commands/deposit/invenio_rdm.py @@ -0,0 +1,717 @@ +# SPDX-FileCopyrightText: 2023 Helmholtz-Zentrum Dresden-Rossendorf (HZDR) +# +# SPDX-License-Identifier: Apache-2.0 + +# SPDX-FileContributor: David Pape +# SPDX-FileContributor: Oliver Bertuch +# SPDX-FileContributor: Michael Meinel + +import json +import logging +import typing as t +from datetime import date, datetime +from pathlib import Path +from urllib.parse import urlparse + +import click +import requests + +from hermes import config +from hermes.commands.deposit.error import DepositionUnauthorizedError +from hermes.error import 
MisconfigurationError +from hermes.model.context import CodeMetaContext +from hermes.model.path import ContextPath +from hermes.utils import hermes_user_agent + +_DEFAULT_LICENSES_API_PATH = "api/vocabularies/licenses" +_DEFAULT_COMMUNITIES_API_PATH = "api/communities" +_DEFAULT_DEPOSITIONS_API_PATH = "api/deposit/depositions" + + +def prepare(click_ctx: click.Context, ctx: CodeMetaContext): + """Prepare the deposition on an Invenio-based platform. + + In this function we do the following: + + - resolve the latest published version of this publication (if any) + - check whether the current version (given in the CodeMeta) was already published + - check whether we have a valid license identifier (if any) + - check wether the communities are valid (if configured) + - check access modalities (access right, access conditions, embargo data, existence + of license) + - check whether required configuration options are present + - check whether an auth token is given + - update ``ctx`` with metadata collected during the checks + """ + + if not click_ctx.params["auth_token"]: + raise DepositionUnauthorizedError("No auth token given for deposition platform") + + invenio_path = ContextPath.parse("deposit.invenio_rdm") + invenio_config = config.get("deposit").get("invenio_rdm", {}) + rec_id, rec_meta = _resolve_latest_invenio_id(ctx) + + version = ctx["codemeta"].get("version") + if rec_meta and (version == rec_meta.get("version")): + raise ValueError(f"Version {version} already deposited.") + + ctx.update(invenio_path['latestRecord'], {'id': rec_id, 'metadata': rec_meta}) + + site_url = invenio_config.get("site_url") + if site_url is None: + raise MisconfigurationError("deposit.invenio_rdm.site_url is not configured") + + licenses_api_path = invenio_config.get("api_paths", {}).get( + "licenses", _DEFAULT_LICENSES_API_PATH + ) + licenses_api_url = f"{site_url}/{licenses_api_path}" + license = _get_license_identifier(ctx, licenses_api_url) + ctx.update(invenio_path["license"], 
license) + + communities_api_path = invenio_config.get("api_paths", {}).get( + "communities", _DEFAULT_COMMUNITIES_API_PATH + ) + communities_api_url = f"{site_url}/{communities_api_path}" + communities = _get_community_identifiers(ctx, communities_api_url) + ctx.update(invenio_path["communities"], communities) + + access_right, embargo_date, access_conditions = _get_access_modalities(license) + ctx.update(invenio_path["access_right"], access_right) + ctx.update(invenio_path["embargo_date"], embargo_date) + ctx.update(invenio_path["access_conditions"], access_conditions) + + +def map_metadata(click_ctx: click.Context, ctx: CodeMetaContext): + """Map the harvested metadata onto the Invenio schema.""" + + deposition_metadata = _codemeta_to_invenio_deposition(ctx) + + metadata_path = ContextPath.parse("deposit.invenio_rdm.depositionMetadata") + ctx.update(metadata_path, deposition_metadata) + + # Store a snapshot of the mapped data within the cache, useful for analysis, debugging, etc + with open(ctx.get_cache("deposit", "invenio_rdm", create=True), 'w') as invenio_json: + json.dump(deposition_metadata, invenio_json, indent=' ') + + +def create_initial_version(click_ctx: click.Context, ctx: CodeMetaContext): + """Create an initial version of a publication. + + If a previous publication exists, this function does nothing, leaving the work for + :func:`create_new_version`. + """ + + invenio_path = ContextPath.parse("deposit.invenio_rdm") + invenio_ctx = ctx[invenio_path] + latest_record_id = invenio_ctx.get("latestRecord", {}).get("id") + + if latest_record_id is not None: + # A previous version exists. This means that we need to create a new version in + # the next step. Thus, there is nothing to do here. 
+ return + + if not click_ctx.params['initial']: + raise RuntimeError("Please use `--initial` to make an initial deposition.") + + _log = logging.getLogger("cli.deposit.invenio_rdm") + + invenio_config = config.get("deposit").get("invenio_rdm", {}) + site_url = invenio_config["site_url"] + depositions_api_path = invenio_config.get("api_paths", {}).get( + "depositions", _DEFAULT_DEPOSITIONS_API_PATH + ) + + deposition_metadata = invenio_ctx["depositionMetadata"] + + deposit_url = f"{site_url}/{depositions_api_path}" + response = requests.post( + deposit_url, + json={"metadata": deposition_metadata}, + headers={ + "User-Agent": hermes_user_agent, + "Authorization": f"Bearer {click_ctx.params['auth_token']}", + } + ) + + if not response.ok: + print(response.text) + raise RuntimeError(f"Could not create initial deposit {deposit_url!r}") + + deposit = response.json() + _log.debug("Created initial version deposit: %s", deposit["links"]["html"]) + with open(ctx.get_cache('deposit', 'deposit', create=True), 'w') as deposit_file: + json.dump(deposit, deposit_file, indent=4) + + ctx.update(invenio_path["links"]["bucket"], deposit["links"]["bucket"]) + ctx.update(invenio_path["links"]["publish"], deposit["links"]["publish"]) + + +def create_new_version(click_ctx: click.Context, ctx: CodeMetaContext): + """Create a new version of an existing publication. + + If no previous publication exists, this function does nothing because + :func:`create_initial_version` will have done the work. + """ + + invenio_path = ContextPath.parse("deposit.invenio_rdm") + invenio_ctx = ctx[invenio_path] + latest_record_id = invenio_ctx.get("latestRecord", {}).get("id") + + if latest_record_id is None: + # No previous version exists. This means that an initial version was created in + # the previous step. Thus, there is nothing to do here. 
+ return + + session = requests.Session() + session.headers = { + "User-Agent": hermes_user_agent, + "Authorization": f"Bearer {click_ctx.params['auth_token']}", + } + + invenio_config = config.get("deposit").get("invenio_rdm", {}) + site_url = invenio_config["site_url"] + depositions_api_path = invenio_config.get("api_paths", {}).get( + "depositions", _DEFAULT_DEPOSITIONS_API_PATH + ) + + # Get current deposit + deposit_url = f"{site_url}/{depositions_api_path}/{latest_record_id}" + response = session.get(deposit_url) + if not response.ok: + raise RuntimeError(f"Failed to get current deposit {deposit_url!r}") + + # Create a new version using the newversion action + deposit_url = response.json()["links"]["newversion"] + response = session.post(deposit_url) + if not response.ok: + raise RuntimeError(f"Could not create new version deposit {deposit_url!r}") + + # Store link to latest draft to be used in :func:`update_metadata`. + old_deposit = response.json() + ctx.update(invenio_path["links"]["latestDraft"], old_deposit['links']['latest_draft']) + + +def update_metadata(click_ctx: click.Context, ctx: CodeMetaContext): + """Update the metadata of a draft. + + If no draft is found in the context, it is assumed that no metadata has to be + updated (e.g. because an initial version was created already containing the + metadata). 
+ """ + + invenio_path = ContextPath.parse("deposit.invenio_rdm") + invenio_ctx = ctx[invenio_path] + draft_url = invenio_ctx.get("links", {}).get("latestDraft") + + if draft_url is None: + return + + _log = logging.getLogger("cli.deposit.invenio_rdm") + + deposition_metadata = invenio_ctx["depositionMetadata"] + + response = requests.put( + draft_url, + json={"metadata": deposition_metadata}, + headers={ + "User-Agent": hermes_user_agent, + "Authorization": f"Bearer {click_ctx.params['auth_token']}", + } + ) + + if not response.ok: + raise RuntimeError(f"Could not update metadata of draft {draft_url!r}") + + deposit = response.json() + _log.debug("Created new version deposit: %s", deposit["links"]["html"]) + with open(ctx.get_cache('deposit', 'deposit', create=True), 'w') as deposit_file: + json.dump(deposit, deposit_file, indent=4) + + ctx.update(invenio_path["links"]["bucket"], deposit["links"]["bucket"]) + ctx.update(invenio_path["links"]["publish"], deposit["links"]["publish"]) + + +def delete_artifacts(click_ctx: click.Context, ctx: CodeMetaContext): + """Delete existing file artifacts. + + This is done so that files which existed in an earlier publication but don't exist + any more, are removed. Otherwise they would cause an error because the didn't change + between versions. + """ + # TODO: This needs to be implemented! + pass + + +def upload_artifacts(click_ctx: click.Context, ctx: CodeMetaContext): + """Upload file artifacts to the deposit. + + We'll use the bucket API rather than the files API as it supports file sizes above + 100MB. The URL to the bucket of the deposit is taken from the context at + ``deposit.invenio.links.bucket``. 
+ """ + + bucket_url_path = ContextPath.parse("deposit.invenio_rdm.links.bucket") + bucket_url = ctx[bucket_url_path] + + session = requests.Session() + session.headers = { + "User-Agent": hermes_user_agent, + "Authorization": f"Bearer {click_ctx.params['auth_token']}", + } + + files: list[click.Path] = click_ctx.params["file"] + for path_arg in files: + path = Path(path_arg) + + # This should not happen, as Click shall not accept dirs as arguments already. Zero trust anyway. + if not path.is_file(): + raise ValueError("Any given argument to be included in the deposit must be a file.") + + with open(path, "rb") as file_content: + response = session.put( + f"{bucket_url}/{path.name}", + data=file_content + ) + if not response.ok: + raise RuntimeError(f"Could not upload file {path.name!r} into bucket {bucket_url!r}") + + # This can potentially be used to verify the checksum + # file_resource = response.json() + + +def publish(click_ctx: click.Context, ctx: CodeMetaContext): + """Publish the deposited record. + + This is done by doing a POST request to the publication URL stored in the context at + ``deposit.invenio.links.publish``. + """ + + _log = logging.getLogger("cli.deposit.invenio_rdm") + + publish_url_path = ContextPath.parse("deposit.invenio_rdm.links.publish") + publish_url = ctx[publish_url_path] + + response = requests.post( + publish_url, + headers={ + "User-Agent": hermes_user_agent, + "Authorization": f"Bearer {click_ctx.params['auth_token']}" + } + ) + + if not response.ok: + _log.debug(response.text) + raise RuntimeError(f"Could not publish deposit via {publish_url!r}") + + record = response.json() + _log.info("Published record: %s", record["links"]["record_html"]) + + +def _resolve_latest_invenio_id(ctx: CodeMetaContext) -> t.Tuple[str, dict]: + """ + Using the given configuration and metadata, figure out the latest record id. 
+ + If a record id is present as configuration ``deposit.invenio.record_id`` this one will be used to identify the + latest version of the record. Otherwise, if there is a doi present (either as configuration with key + ``deposit.invenio.doi`` or as a codemeta identifier, the DOI will be used to resolve the base record id. + + Anyway, the record id will always be used to resolve the latest version. + + If any of the resolution steps fail or produce an unexpected result, a ValueError will be thrown. + + :param ctx: The context for which the record id should be resolved. + :return: The Invenio record id and the metadata of the record + """ + + invenio_config = config.get('deposit').get('invenio_rdm', {}) + site_url = invenio_config.get('site_url') + if site_url is None: + raise MisconfigurationError("deposit.invenio_rdm.site_url is not configured") + + # Check if we configured an Invenio record ID (of the concept...) + record_id = invenio_config.get('record_id') + if record_id is None: + doi = invenio_config.get('doi') + if doi is None: + try: + # TODO: There might be more semantic in the codemeta.identifier... (also see schema.org) + identifier = ctx['codemeta.identifier'] + if identifier.startswith('https://doi.org/'): + doi = identifier[16:] + elif identifier.startswith('http://dx.doi.org/'): + doi = identifier[18:] + except KeyError: + pass + + if doi is not None: + # If we got a DOI, resolve it (using doi.org) into a Invenio URL ... and extract the record id. + record_id = _invenio_resolve_doi(site_url, doi) + + if record_id is not None: + # If we got a record id by now, resolve it using the Invenio API to the latests record. + return _invenio_resolve_record_id(site_url, record_id) + + return None, {} + + +def _invenio_resolve_doi(site_url, doi) -> str: + """ + Resolve an DOI to a Invenio URL and extract the record id. + + :param site_url: Root URL for the Invenio instance to use. 
+ :param doi: The DOI to be resolved (only the identifier *without* the ``https://doi.org/`` prefix). + :return: The record ID on the respective instance. + """ + + res = requests.get(f'https://doi.org/{doi}') + + # This is a mean hack due to DataCite answering a 404 with a 200 status + if res.url == 'https://datacite.org/404.html': + raise ValueError(f"Invalid DOI: {doi}") + + # Ensure the resolved record is on the correct instance + if not res.url.startswith(site_url): + raise ValueError(f"{res.url} is not on configured host {site_url}.") + + # Extract the record id as last part of the URL path + page_url = urlparse(res.url) + *_, record_id = page_url.path.split('/') + return record_id + + +def _invenio_resolve_record_id(site_url: str, record_id: str) -> t.Tuple[str, dict]: + """ + Find the latest version of a given record. + + :param site_url: Root URL for the Invenio instance to use. + :param record_id: The record that sould be resolved. + :return: The record id of the latest version for the requested record. + """ + res = requests.get(f"{site_url}/api/records/{record_id}") + if res.status_code != 200: + raise ValueError(f"Could not retrieve record from {res.url}: {res.text}") + + res_json = res.json() + res = requests.get(res_json['links']['latest']) + if res.status_code != 200: + raise ValueError(f"Could not retrieve record from {res.url}: {res.text}") + + res_json = res.json() + return res_json['id'], res_json['metadata'] + + +def _codemeta_to_invenio_deposition(ctx: CodeMetaContext) -> dict: + """The mapping logic. + + Functionality similar to this exists in the ``convert_codemeta`` package which uses + the crosswalk tables to do the mapping: + + .. code-block:: python + + invenio_metadata = convert_codemeta.crosswalk( + metadata, "codemeta", "Zenodo" + ) + + Unfortunately, this doesn't work well with additional metadata in the same dict, so + it is safer to provide our own implementation. 
+ + Currently, this function handles a lot of cases which we want to be able to + configure. A simple mapping from one JSON path to another is not enough. + + The metadata expected by Zenodo is described in the `Zenodo Developers guide + `_. Unfortunately, there doesn't seem + to be a schema one can download in order to validate these metadata. There might be + differences between Invenio-based platforms. + """ + + metadata = ctx["codemeta"] + license = ctx["deposit"]["invenio_rdm"]["license"] + communities = ctx["deposit"]["invenio_rdm"]["communities"] + access_right = ctx["deposit"]["invenio_rdm"]["access_right"] + embargo_date = ctx["deposit"]["invenio_rdm"]["embargo_date"] + access_conditions = ctx["deposit"]["invenio_rdm"]["access_conditions"] + + creators = [ + # TODO: Distinguish between @type "Person" and others + { + k: v for k, v in { + # TODO: This is ugly + "affiliation": author.get("affiliation", {"legalName": None}).get("legalName"), + # Invenio wants "family, given". author.get("name") might not have this format. + "name": f"{author.get('familyName')}, {author.get('givenName')}" + if author.get("familyName") and author.get("givenName") + else author.get("name"), + # Invenio expects the ORCID without the URL part + "orcid": author.get("@id", "").replace("https://orcid.org/", "") or None, + }.items() if v is not None + } + for author in metadata["author"] + ] + + # This is not used at the moment. See comment below in `deposition_metadata` dict. + contributors = [ # noqa: F841 + # TODO: Distinguish between @type "Person" and others + { + k: v for k, v in { + # TODO: This is ugly + "affiliation": contributor.get("affiliation", {"legalName": None}).get("legalName"), + # Invenio wants "family, given". contributor.get("name") might not have this format. 
+ "name": f"{contributor.get('familyName')}, {contributor.get('givenName')}" + if contributor.get("familyName") and contributor.get("givenName") + else contributor.get("name"), + # Invenio expects the ORCID without the URL part + "orcid": contributor.get("@id", "").replace("https://orcid.org/", "") or None, + # TODO: Many possibilities here. Get from config + "type": "ProjectMember", + }.items() if v is not None + } + # TODO: Filtering out "GitHub" should be done elsewhere + for contributor in metadata["contributor"] if contributor.get("name") != "GitHub" + ] + + # TODO: Use the fields currently set to `None`. + # Some more fields are available but they most likely don't relate to software + # publications targeted by hermes. + deposition_metadata = {k: v for k, v in { + # If upload_type is "publication"/"image", a publication_type/image_type must be + # specified. Since hermes targets software publications, this can be ignored and + # upload_type can be hard-coded to "software". + # TODO: Make this a constant maybe. + "upload_type": "software", + # IS0 8601-formatted date + # TODO: Maybe we want a different date? Then make this configurable. If not, + # this can be removed as it defaults to today. + "publication_date": date.today().isoformat(), + "title": metadata["name"], + "creators": creators, + # TODO: Use a real description here. Possible sources could be + # `tool.poetry.description` from pyproject.toml or `abstract` from + # CITATION.cff. This should then be stored in codemeta description field. + "description": metadata["name"], + "access_right": access_right, + "license": license, + "embargo_date": embargo_date, + "access_conditions": access_conditions, + # TODO: If a publisher already has assigned a DOI to the files we want to + # upload, it should be used here. In this case, Invenio will not give us a new + # one. Set "prereserve_doi" accordingly. + "doi": None, + # This prereserves a DOI that can then be added to the files before publishing + # them. 
+ # TODO: Use the DOI we get back from this. + "prereserve_doi": True, + # TODO: A good source for this could be `tool.poetry.keywords` in pyproject.toml. + "keywords": None, + "notes": None, + "related_identifiers": None, + # TODO: Use `contributors`. In the case of the hermes workflow itself, the + # contributors are currently all in `creators` already. So for now, we set this + # to `None`. Change this when relationship between authors and contributors can + # be specified in the processing step. + "contributors": None, + "references": None, + "communities": communities, + "grants": None, + "subjects": None, + "version": metadata.get('version'), + }.items() if v is not None} + + return deposition_metadata + + +def _get_license_identifier(ctx: CodeMetaContext, license_api_url: str): + """Get Invenio license representation from CodeMeta. + + The license to use is extracted from the ``license`` field in the + :class:`CodeMetaContext` and converted into an appropiate license identifier to be + passed to an Invenio instance. + + A license according to CodeMeta may be a URL (text) or a CreativeWork. This function + only handles URLs. If a ``license`` field is present in the CodeMeta and it is not + of type :class:`str`, a :class:`RuntimeError` is raised. + + Invenio instances take a license string which refers to a license identifier. + Typically, Invenio instances offer licenses from https://opendefinition.org and + https://spdx.org. However, it is possible to mint PIDs for custom licenses. + + An API endpoint (usually ``/api/vocabularies/licenses``) can be used to check whether a given + license is supported by the Invenio instance. This function tries to retrieve the + license by the identifier at the end of the license URL path. If this identifier + does not exist on the Invenio instance, a :class:`RuntimeError` is raised. If no + license is given in the CodeMeta, ``None`` is returned. 
+ """ + + license_url = ctx["codemeta"].get("license") + + if license_url is None: + return None + + if not isinstance(license_url, str): + raise RuntimeError( + "The given license in CodeMeta must be of type str. " + "Licenses of type 'CreativeWork' are not supported." + ) + + # First try: Look up license by assuming lower-case name is the correct identifier + parsed_url = urlparse(license_url) + url_path = parsed_url.path.rstrip("/") + license_id = url_path.split("/")[-1].lower() + + response = requests.get( + f"{license_api_url}/{license_id}", headers={"User-Agent": hermes_user_agent} + ) + if response.ok: + license_info = response.json() + + # Second try: Fetch full list of licenses available... maybe we should cache this. + else: + license_info = _look_up_license_info(license_api_url, license_url) + + return license_info["id"] + + +def _look_up_license_info(license_api_url, license_url): + """Deliberately try to resolve the license URL to a valid InvenioRDM license information record from the + vocabulary. + + First, this method tries to find the license URL in the list of known license vocabulary (which is fetched each + time, ouch...). + + If the URL is not found (what is pretty probable by now, as CFFConvert produces SPDX-URLs while InvenioRDM still + relies on the overhauled opensource.org URLs), the SPDX information record is fetched and all valid cross references + are sought for. + + :param license_api_url: Base API endpoint for InvenioRDM license vocabulary queries. + :param license_url: The URL for the license we are search an identifier for. + :return: The vocabulary record that is provided by InvenioRDM. 
+ """ + response = requests.get( + f"{license_api_url}?size=1000", headers={"User-Agent": hermes_user_agent} + ) + response.raise_for_status() + valid_licenses = response.json() + + def _search_license_info(_url): + for license_info in valid_licenses['hits']['hits']: + try: + if license_info['props']['url'] == _url: + return license_info + except KeyError: + continue + else: + return None + + license_info = _search_license_info(license_url) + if license_info is None and license_url.startswith('https://spdx.org/licenses/'): + response = requests.get(f"{license_url}.json", headers={"User-Agent": hermes_user_agent}) + response.raise_for_status() + + for license_cross_ref in response.json()['crossRef']: + if not license_cross_ref['isValid']: + continue + + license_info = _search_license_info(license_cross_ref["url"]) + if license_info is not None: + break + else: + raise RuntimeError(f"Could not resolve license URL {license_url} to a valid identifier.") + + return license_info + + +def _get_community_identifiers(ctx: CodeMetaContext, communities_api_url: str): + """Get Invenio community identifiers from config. + + This function gets the communities to be used for the deposition on an Invenio-based + site from the config and checks their validity against the site's API. If one of the + identifiers can not be found on the site, a :class:`MisconfigurationError` is + raised. 
+ """ + + communities = config.get("deposit").get("invenio_rdm", {}).get("communities") + if communities is None: + return None + + session = requests.Session() + session.headers = {"User-Agent": hermes_user_agent} + + community_ids = [] + for community_id in communities: + url = f"{communities_api_url}/{community_id}" + response = session.get(url) + if response.status_code == 404: + raise MisconfigurationError( + f"Not a valid community identifier: {community_id}" + ) + # Catch other problems + response.raise_for_status() + community_ids.append({"identifier": response.json()["id"]}) + + return community_ids + + +def _get_access_modalities(license): + """Get access right, embargo date and access conditions based on configuration and given license. + + This function implements the rules laid out in the `Zenodo developer documentation + `_: + + - ``access_right`` is a controlled vocabulary + - embargoed access depositions need an embargo date + - restricted access depositions need access conditions + - open and embargoed access depositions need a license + - closed access depositions have no further requirements + + This function also makes sure that the given embargo date can be parsed as an ISO + 8601 string representation and that the access rights are given as a string. 
+ """ + invenio_config = config.get("deposit").get("invenio_rdm", {}) + + access_right = invenio_config.get("access_right") + if access_right is None: + raise MisconfigurationError("deposit.invenio_rdm.access_right is not configured") + + access_right_options = ["open", "embargoed", "restricted", "closed"] + if access_right not in access_right_options: + raise MisconfigurationError( + "deposition.invenio_rdm.access_right must be one of: " + f"{', '.join(access_right_options)}" + ) + + embargo_date = invenio_config.get("embargo_date") + if access_right == "embargoed" and embargo_date is None: + raise MisconfigurationError( + f"With access_right {access_right}, " + "deposit.invenio_rdm.embargo_date must be configured" + ) + + if embargo_date is not None: + try: + datetime.fromisoformat(embargo_date) + except ValueError: + raise MisconfigurationError( + f"Could not parse deposit.invenio_rdm.embargo_date {embargo_date!r}. " + "Must be in ISO 8601 format." + ) + + access_conditions = invenio_config.get("access_conditions") + if access_right == "restricted" and access_conditions is None: + raise MisconfigurationError( + f"With access_right {access_right}, " + "deposit.invenio_rdm.access_conditions must be configured" + ) + + if access_conditions is not None and not isinstance(access_conditions, str): + raise MisconfigurationError( + "deposit.invenio_rdm.access_conditions must be a string (HTML is allowed)." + ) + + if license is None and access_right in ["open", "embargoed"]: + raise MisconfigurationError( + f"With access_right {access_right}, a license is required." 
+ ) + + if access_right == "closed": + pass + + return access_right, embargo_date, access_conditions From e9ac24652437d3cb1e416758c2ff73ab0cb4fcb7 Mon Sep 17 00:00:00 2001 From: David Pape Date: Wed, 1 Nov 2023 14:38:39 +0100 Subject: [PATCH 12/39] Use terminal-only logger for hermes clean --- src/hermes/commands/workflow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hermes/commands/workflow.py b/src/hermes/commands/workflow.py index 39b15b39..dccd6bd5 100644 --- a/src/hermes/commands/workflow.py +++ b/src/hermes/commands/workflow.py @@ -291,7 +291,7 @@ def clean(): """ Remove cached data. """ - audit_log = logging.getLogger('audit') + audit_log = logging.getLogger('cli') audit_log.info("# Cleanup") # Create Hermes context (i.e., all collected metadata for all stages...) From 895bdefa9c7ed09d94cadf6bc3ca3a3aecb0e3be Mon Sep 17 00:00:00 2001 From: David Pape Date: Wed, 1 Nov 2023 14:43:48 +0100 Subject: [PATCH 13/39] Shutdown logging before cleaning files This ensures that any and all opened logging files in the .hermes directory are closed, allowing us to safely remove that directory. --- src/hermes/commands/workflow.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/hermes/commands/workflow.py b/src/hermes/commands/workflow.py index dccd6bd5..c1ad6c59 100644 --- a/src/hermes/commands/workflow.py +++ b/src/hermes/commands/workflow.py @@ -293,6 +293,8 @@ def clean(): """ audit_log = logging.getLogger('cli') audit_log.info("# Cleanup") + # shut down logging so that .hermes/ can safely be removed + logging.shutdown() # Create Hermes context (i.e., all collected metadata for all stages...) 
ctx = HermesContext() From 17891469fa6d443b9e28ee9c5478935e3aa6892e Mon Sep 17 00:00:00 2001 From: "Meinel, Michael" Date: Thu, 2 Nov 2023 08:03:03 +0100 Subject: [PATCH 14/39] Remove setting that is no longer used --- hermes.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/hermes.toml b/hermes.toml index 2da8a88b..f6831be5 100644 --- a/hermes.toml +++ b/hermes.toml @@ -9,7 +9,6 @@ from = [ "cff", "git" ] validate = false [deposit] -mapping = "invenio_rdm" target = "invenio_rdm" [deposit.invenio_rdm] From 94a689d5b40dcdbc8d847e465dcdff4e41e9da92 Mon Sep 17 00:00:00 2001 From: "Meinel, Michael" Date: Thu, 2 Nov 2023 08:11:55 +0100 Subject: [PATCH 15/39] Use logging instead of printing --- src/hermes/commands/deposit/invenio_rdm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hermes/commands/deposit/invenio_rdm.py b/src/hermes/commands/deposit/invenio_rdm.py index e9206ed3..44127ef7 100644 --- a/src/hermes/commands/deposit/invenio_rdm.py +++ b/src/hermes/commands/deposit/invenio_rdm.py @@ -134,7 +134,7 @@ def create_initial_version(click_ctx: click.Context, ctx: CodeMetaContext): ) if not response.ok: - print(response.text) + _log.error("Webserver response: \n%s", response.text) raise RuntimeError(f"Could not create initial deposit {deposit_url!r}") deposit = response.json() From 4e2fcae45cdae87cf3aa194a20236fb17e7d9887 Mon Sep 17 00:00:00 2001 From: "Meinel, Michael" Date: Thu, 2 Nov 2023 08:17:11 +0100 Subject: [PATCH 16/39] Extend description of license look up to match the actual algorithm --- src/hermes/commands/deposit/invenio_rdm.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/hermes/commands/deposit/invenio_rdm.py b/src/hermes/commands/deposit/invenio_rdm.py index 44127ef7..4199a326 100644 --- a/src/hermes/commands/deposit/invenio_rdm.py +++ b/src/hermes/commands/deposit/invenio_rdm.py @@ -536,9 +536,16 @@ def _get_license_identifier(ctx: CodeMetaContext, license_api_url: str): An API 
endpoint (usually ``/api/vocabularies/licenses``) can be used to check whether a given license is supported by the Invenio instance. This function tries to retrieve the - license by the identifier at the end of the license URL path. If this identifier - does not exist on the Invenio instance, a :class:`RuntimeError` is raised. If no - license is given in the CodeMeta, ``None`` is returned. + license by lower-casing the identifier at the end of the license URL path. If this identifier + does not exist on the Invenio instance, all available licenses are fetched and the URL is sought + for in the results. However, this might again not lead to success (as Invenio still provides + the obsolete https://opensource.org URLs) but harvesters might provide the SPDX style URLs. + Hence, the license URL is checked whether it is pointing to https://spdx.org/licenses/ and if + this is the case, the license record from SPDX is fetched and all `crossRef` URLs that are flagged + `isValid` are again sought for in the full set of licenses. Only if this still fails, + a :class:`RuntimeError` is raised. + + If no license is given in the CodeMeta, ``None`` is returned. """ license_url = ctx["codemeta"].get("license") From 727206999083286e39a6c1aa481f0baff4ca8967 Mon Sep 17 00:00:00 2001 From: "Meinel, Michael" Date: Thu, 2 Nov 2023 12:12:21 +0100 Subject: [PATCH 17/39] Add post-processing plugin for InvenioRDM This might have been parametrized or implemented differently to distinguish between Invenio and InvenioRDM. However, this way works without further coding headache... 
--- pyproject.toml | 3 +- .../commands/postprocess/invenio_rdm.py | 29 +++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 src/hermes/commands/postprocess/invenio_rdm.py diff --git a/pyproject.toml b/pyproject.toml index fa55422b..7f13c0c7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -127,7 +127,8 @@ invenio_rdm = "hermes.commands.deposit.invenio_rdm:publish" file = "hermes.commands.deposit.file:publish" [tool.poetry.plugins."hermes.postprocess"] -config_record_id = "hermes.commands.postprocess.invenio:config_record_id" +config_invenio_record_id = "hermes.commands.postprocess.invenio:config_record_id" +config_invenio_rdm_record_id = "hermes.commands.postprocess.invenio_rdm:config_record_id" cff_doi = "hermes.commands.postprocess.invenio:cff_doi" [tool.taskipy.tasks] diff --git a/src/hermes/commands/postprocess/invenio_rdm.py b/src/hermes/commands/postprocess/invenio_rdm.py new file mode 100644 index 00000000..c14a3a07 --- /dev/null +++ b/src/hermes/commands/postprocess/invenio_rdm.py @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: 2023 German Aerospace Center (DLR) +# +# SPDX-License-Identifier: Apache-2.0 + +# SPDX-FileContributor: Michael Meinel +# SPDX-FileContributor: Stephan Druskat + +import json +import logging + +import toml +from ruamel import yaml + +from hermes import config + + +_log = logging.getLogger('deposit.invenio_rdm') + + +def config_record_id(ctx): + deposition_path = ctx.get_cache('deposit', 'deposit') + with deposition_path.open("r") as deposition_file: + deposition = json.load(deposition_file) + conf = config.get('hermes') + try: + conf['deposit']['invenio_rdm']['record_id'] = deposition['record_id'] + toml.dump(conf, open('hermes.toml', 'w')) + except KeyError: + raise RuntimeError("No deposit.invenio configuration available to store record id in") from None From 543117f82fb1600667463791ab0572689da836df Mon Sep 17 00:00:00 2001 From: "Meinel, Michael" Date: Thu, 2 Nov 2023 12:12:43 +0100 Subject: 
[PATCH 18/39] Adapt tutorial for Zenodo running on InvenioRDM --- docs/source/tutorials/automated-publication-with-ci.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/tutorials/automated-publication-with-ci.md b/docs/source/tutorials/automated-publication-with-ci.md index cb6cde95..7fa0cff6 100644 --- a/docs/source/tutorials/automated-publication-with-ci.md +++ b/docs/source/tutorials/automated-publication-with-ci.md @@ -73,14 +73,14 @@ Configure HERMES to: from = [ "git", "cff" ] [deposit] -target = "invenio" +target = "invenio_rdm" -[deposit.invenio] +[deposit.invenio_rdm] site_url = "https://sandbox.zenodo.org" access_right = "open" [postprocess] -execute = [ "config_record_id" ] +execute = [ "config_invenio_rdm_record_id" ] ``` Copy this file to the root directory of your repository and add it to version control: @@ -93,7 +93,7 @@ git push ```{note} If you decide to start from an existing `hermes.toml` (e.g., the one found in this repository), -be sure that there is no `record_id` value defined in the `deposit.invenio` section. +be sure that there is no `record_id` value defined in the `deposit.invenio_rdm` section. Otherwise, your deposition will fail as *hermes* would try to deposit a new version for the given record. 
``` From dc28a846f93a19d99bdedf429cf1f41a55aded40 Mon Sep 17 00:00:00 2001 From: "Meinel, Michael" Date: Thu, 2 Nov 2023 12:14:36 +0100 Subject: [PATCH 19/39] Fix pylint error --- src/hermes/commands/postprocess/invenio_rdm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/hermes/commands/postprocess/invenio_rdm.py b/src/hermes/commands/postprocess/invenio_rdm.py index c14a3a07..fc249132 100644 --- a/src/hermes/commands/postprocess/invenio_rdm.py +++ b/src/hermes/commands/postprocess/invenio_rdm.py @@ -9,7 +9,6 @@ import logging import toml -from ruamel import yaml from hermes import config From 2d676bfded6433ab923a40ac3acb62f17d0f2b91 Mon Sep 17 00:00:00 2001 From: "Meinel, Michael" Date: Thu, 2 Nov 2023 16:20:27 +0100 Subject: [PATCH 20/39] Add '_' to allowed key characters --- src/hermes/model/path.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hermes/model/path.py b/src/hermes/model/path.py index fe354633..036c07e6 100644 --- a/src/hermes/model/path.py +++ b/src/hermes/model/path.py @@ -29,7 +29,7 @@ class ContextPathGrammar: The pyparsing grammar for ContextGrammar paths. """ - key = pp.Word('@:' + pp.alphas) + key = pp.Word('@:_' + pp.alphas) index = pp.Word(pp.nums).set_parse_action(lambda tok: [int(tok[0])]) | pp.Char('*') field = key + (pp.Suppress('[') + index + pp.Suppress(']'))[...] path = field + (pp.Suppress('.') + field)[...] From a30968171fe7de6fcf4eeec5f855c10e7763e758 Mon Sep 17 00:00:00 2001 From: "Meinel, Michael" Date: Wed, 1 Nov 2023 11:36:40 +0100 Subject: [PATCH 21/39] Move InvenioRDM adaptations into new plugin This is still based on C&P... should we extract the "common base" that is still valid and only migrate the relevant parts? Also: I stupidly changed all string occurrences... to also distinguish configurations and avoid accidential mis-use. 
--- hermes.toml | 9 +- pyproject.toml | 41 ++ src/hermes/commands/deposit/invenio_rdm.py | 717 +++++++++++++++++++++ 3 files changed, 763 insertions(+), 4 deletions(-) create mode 100644 src/hermes/commands/deposit/invenio_rdm.py diff --git a/hermes.toml b/hermes.toml index bc7eb1e8..2da8a88b 100644 --- a/hermes.toml +++ b/hermes.toml @@ -9,14 +9,15 @@ from = [ "cff", "git" ] validate = false [deposit] -target = "invenio" +mapping = "invenio_rdm" +target = "invenio_rdm" -[deposit.invenio] +[deposit.invenio_rdm] site_url = "https://sandbox.zenodo.org" communities = ["zenodo"] access_right = "open" -[deposit.invenio.api_paths] +[deposit.invenio_rdm.api_paths] depositions = "api/deposit/depositions" -licenses = "api/licenses" +licenses = "api/vocabularies/licenses" communities = "api/communities" diff --git a/pyproject.toml b/pyproject.toml index f3d90e1d..4ffa8f23 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -89,6 +89,47 @@ git_add_branch = "hermes.commands.process.git:add_branch" [tool.poetry.plugins."hermes.deposit"] file = "hermes.commands.deposit.file:FileDepositPlugin" invenio = "hermes.commands.deposit.invenio:InvenioDepositPlugin" +invenio_rdm = "hermes.commands.deposit.invenio_rdm:InvenioDepositPlugin" + +[tool.poetry.plugins."hermes.deposit.prepare"] +invenio = "hermes.commands.deposit.invenio:prepare" +invenio_rdm = "hermes.commands.deposit.invenio_rdm:prepare" +file = "hermes.commands.deposit.file:dummy_noop" + +[tool.poetry.plugins."hermes.deposit.map"] +invenio = "hermes.commands.deposit.invenio:map_metadata" +invenio_rdm = "hermes.commands.deposit.invenio_rdm:map_metadata" +file = "hermes.commands.deposit.file:map_metadata" + +[tool.poetry.plugins."hermes.deposit.create_initial_version"] +invenio = "hermes.commands.deposit.invenio:create_initial_version" +invenio_rdm = "hermes.commands.deposit.invenio_rdm:create_initial_version" +file = "hermes.commands.deposit.file:dummy_noop" + +[tool.poetry.plugins."hermes.deposit.create_new_version"] +invenio 
= "hermes.commands.deposit.invenio:create_new_version" +invenio_rdm = "hermes.commands.deposit.invenio_rdm:create_new_version" +file = "hermes.commands.deposit.file:dummy_noop" + +[tool.poetry.plugins."hermes.deposit.update_metadata"] +invenio = "hermes.commands.deposit.invenio:update_metadata" +invenio_rdm = "hermes.commands.deposit.invenio_rdm:update_metadata" +file = "hermes.commands.deposit.file:dummy_noop" + +[tool.poetry.plugins."hermes.deposit.delete_artifacts"] +invenio = "hermes.commands.deposit.invenio:delete_artifacts" +invenio_rdm = "hermes.commands.deposit.invenio_rdm:delete_artifacts" +file = "hermes.commands.deposit.file:dummy_noop" + +[tool.poetry.plugins."hermes.deposit.upload_artifacts"] +invenio = "hermes.commands.deposit.invenio:upload_artifacts" +invenio_rdm = "hermes.commands.deposit.invenio_rdm:upload_artifacts" +file = "hermes.commands.deposit.file:dummy_noop" + +[tool.poetry.plugins."hermes.deposit.publish"] +invenio = "hermes.commands.deposit.invenio:publish" +invenio_rdm = "hermes.commands.deposit.invenio_rdm:publish" +file = "hermes.commands.deposit.file:publish" [tool.poetry.plugins."hermes.postprocess"] config_record_id = "hermes.commands.postprocess.invenio:config_record_id" diff --git a/src/hermes/commands/deposit/invenio_rdm.py b/src/hermes/commands/deposit/invenio_rdm.py new file mode 100644 index 00000000..e9206ed3 --- /dev/null +++ b/src/hermes/commands/deposit/invenio_rdm.py @@ -0,0 +1,717 @@ +# SPDX-FileCopyrightText: 2023 Helmholtz-Zentrum Dresden-Rossendorf (HZDR) +# +# SPDX-License-Identifier: Apache-2.0 + +# SPDX-FileContributor: David Pape +# SPDX-FileContributor: Oliver Bertuch +# SPDX-FileContributor: Michael Meinel + +import json +import logging +import typing as t +from datetime import date, datetime +from pathlib import Path +from urllib.parse import urlparse + +import click +import requests + +from hermes import config +from hermes.commands.deposit.error import DepositionUnauthorizedError +from hermes.error import 
MisconfigurationError +from hermes.model.context import CodeMetaContext +from hermes.model.path import ContextPath +from hermes.utils import hermes_user_agent + +_DEFAULT_LICENSES_API_PATH = "api/vocabularies/licenses" +_DEFAULT_COMMUNITIES_API_PATH = "api/communities" +_DEFAULT_DEPOSITIONS_API_PATH = "api/deposit/depositions" + + +def prepare(click_ctx: click.Context, ctx: CodeMetaContext): + """Prepare the deposition on an Invenio-based platform. + + In this function we do the following: + + - resolve the latest published version of this publication (if any) + - check whether the current version (given in the CodeMeta) was already published + - check whether we have a valid license identifier (if any) + - check wether the communities are valid (if configured) + - check access modalities (access right, access conditions, embargo data, existence + of license) + - check whether required configuration options are present + - check whether an auth token is given + - update ``ctx`` with metadata collected during the checks + """ + + if not click_ctx.params["auth_token"]: + raise DepositionUnauthorizedError("No auth token given for deposition platform") + + invenio_path = ContextPath.parse("deposit.invenio_rdm") + invenio_config = config.get("deposit").get("invenio_rdm", {}) + rec_id, rec_meta = _resolve_latest_invenio_id(ctx) + + version = ctx["codemeta"].get("version") + if rec_meta and (version == rec_meta.get("version")): + raise ValueError(f"Version {version} already deposited.") + + ctx.update(invenio_path['latestRecord'], {'id': rec_id, 'metadata': rec_meta}) + + site_url = invenio_config.get("site_url") + if site_url is None: + raise MisconfigurationError("deposit.invenio_rdm.site_url is not configured") + + licenses_api_path = invenio_config.get("api_paths", {}).get( + "licenses", _DEFAULT_LICENSES_API_PATH + ) + licenses_api_url = f"{site_url}/{licenses_api_path}" + license = _get_license_identifier(ctx, licenses_api_url) + ctx.update(invenio_path["license"], 
license) + + communities_api_path = invenio_config.get("api_paths", {}).get( + "communities", _DEFAULT_COMMUNITIES_API_PATH + ) + communities_api_url = f"{site_url}/{communities_api_path}" + communities = _get_community_identifiers(ctx, communities_api_url) + ctx.update(invenio_path["communities"], communities) + + access_right, embargo_date, access_conditions = _get_access_modalities(license) + ctx.update(invenio_path["access_right"], access_right) + ctx.update(invenio_path["embargo_date"], embargo_date) + ctx.update(invenio_path["access_conditions"], access_conditions) + + +def map_metadata(click_ctx: click.Context, ctx: CodeMetaContext): + """Map the harvested metadata onto the Invenio schema.""" + + deposition_metadata = _codemeta_to_invenio_deposition(ctx) + + metadata_path = ContextPath.parse("deposit.invenio_rdm.depositionMetadata") + ctx.update(metadata_path, deposition_metadata) + + # Store a snapshot of the mapped data within the cache, useful for analysis, debugging, etc + with open(ctx.get_cache("deposit", "invenio_rdm", create=True), 'w') as invenio_json: + json.dump(deposition_metadata, invenio_json, indent=' ') + + +def create_initial_version(click_ctx: click.Context, ctx: CodeMetaContext): + """Create an initial version of a publication. + + If a previous publication exists, this function does nothing, leaving the work for + :func:`create_new_version`. + """ + + invenio_path = ContextPath.parse("deposit.invenio_rdm") + invenio_ctx = ctx[invenio_path] + latest_record_id = invenio_ctx.get("latestRecord", {}).get("id") + + if latest_record_id is not None: + # A previous version exists. This means that we need to create a new version in + # the next step. Thus, there is nothing to do here. 
+ return + + if not click_ctx.params['initial']: + raise RuntimeError("Please use `--initial` to make an initial deposition.") + + _log = logging.getLogger("cli.deposit.invenio_rdm") + + invenio_config = config.get("deposit").get("invenio_rdm", {}) + site_url = invenio_config["site_url"] + depositions_api_path = invenio_config.get("api_paths", {}).get( + "depositions", _DEFAULT_DEPOSITIONS_API_PATH + ) + + deposition_metadata = invenio_ctx["depositionMetadata"] + + deposit_url = f"{site_url}/{depositions_api_path}" + response = requests.post( + deposit_url, + json={"metadata": deposition_metadata}, + headers={ + "User-Agent": hermes_user_agent, + "Authorization": f"Bearer {click_ctx.params['auth_token']}", + } + ) + + if not response.ok: + print(response.text) + raise RuntimeError(f"Could not create initial deposit {deposit_url!r}") + + deposit = response.json() + _log.debug("Created initial version deposit: %s", deposit["links"]["html"]) + with open(ctx.get_cache('deposit', 'deposit', create=True), 'w') as deposit_file: + json.dump(deposit, deposit_file, indent=4) + + ctx.update(invenio_path["links"]["bucket"], deposit["links"]["bucket"]) + ctx.update(invenio_path["links"]["publish"], deposit["links"]["publish"]) + + +def create_new_version(click_ctx: click.Context, ctx: CodeMetaContext): + """Create a new version of an existing publication. + + If no previous publication exists, this function does nothing because + :func:`create_initial_version` will have done the work. + """ + + invenio_path = ContextPath.parse("deposit.invenio_rdm") + invenio_ctx = ctx[invenio_path] + latest_record_id = invenio_ctx.get("latestRecord", {}).get("id") + + if latest_record_id is None: + # No previous version exists. This means that an initial version was created in + # the previous step. Thus, there is nothing to do here. 
+ return + + session = requests.Session() + session.headers = { + "User-Agent": hermes_user_agent, + "Authorization": f"Bearer {click_ctx.params['auth_token']}", + } + + invenio_config = config.get("deposit").get("invenio_rdm", {}) + site_url = invenio_config["site_url"] + depositions_api_path = invenio_config.get("api_paths", {}).get( + "depositions", _DEFAULT_DEPOSITIONS_API_PATH + ) + + # Get current deposit + deposit_url = f"{site_url}/{depositions_api_path}/{latest_record_id}" + response = session.get(deposit_url) + if not response.ok: + raise RuntimeError(f"Failed to get current deposit {deposit_url!r}") + + # Create a new version using the newversion action + deposit_url = response.json()["links"]["newversion"] + response = session.post(deposit_url) + if not response.ok: + raise RuntimeError(f"Could not create new version deposit {deposit_url!r}") + + # Store link to latest draft to be used in :func:`update_metadata`. + old_deposit = response.json() + ctx.update(invenio_path["links"]["latestDraft"], old_deposit['links']['latest_draft']) + + +def update_metadata(click_ctx: click.Context, ctx: CodeMetaContext): + """Update the metadata of a draft. + + If no draft is found in the context, it is assumed that no metadata has to be + updated (e.g. because an initial version was created already containing the + metadata). 
+ """ + + invenio_path = ContextPath.parse("deposit.invenio_rdm") + invenio_ctx = ctx[invenio_path] + draft_url = invenio_ctx.get("links", {}).get("latestDraft") + + if draft_url is None: + return + + _log = logging.getLogger("cli.deposit.invenio_rdm") + + deposition_metadata = invenio_ctx["depositionMetadata"] + + response = requests.put( + draft_url, + json={"metadata": deposition_metadata}, + headers={ + "User-Agent": hermes_user_agent, + "Authorization": f"Bearer {click_ctx.params['auth_token']}", + } + ) + + if not response.ok: + raise RuntimeError(f"Could not update metadata of draft {draft_url!r}") + + deposit = response.json() + _log.debug("Created new version deposit: %s", deposit["links"]["html"]) + with open(ctx.get_cache('deposit', 'deposit', create=True), 'w') as deposit_file: + json.dump(deposit, deposit_file, indent=4) + + ctx.update(invenio_path["links"]["bucket"], deposit["links"]["bucket"]) + ctx.update(invenio_path["links"]["publish"], deposit["links"]["publish"]) + + +def delete_artifacts(click_ctx: click.Context, ctx: CodeMetaContext): + """Delete existing file artifacts. + + This is done so that files which existed in an earlier publication but don't exist + any more, are removed. Otherwise they would cause an error because the didn't change + between versions. + """ + # TODO: This needs to be implemented! + pass + + +def upload_artifacts(click_ctx: click.Context, ctx: CodeMetaContext): + """Upload file artifacts to the deposit. + + We'll use the bucket API rather than the files API as it supports file sizes above + 100MB. The URL to the bucket of the deposit is taken from the context at + ``deposit.invenio.links.bucket``. 
+ """ + + bucket_url_path = ContextPath.parse("deposit.invenio_rdm.links.bucket") + bucket_url = ctx[bucket_url_path] + + session = requests.Session() + session.headers = { + "User-Agent": hermes_user_agent, + "Authorization": f"Bearer {click_ctx.params['auth_token']}", + } + + files: list[click.Path] = click_ctx.params["file"] + for path_arg in files: + path = Path(path_arg) + + # This should not happen, as Click shall not accept dirs as arguments already. Zero trust anyway. + if not path.is_file(): + raise ValueError("Any given argument to be included in the deposit must be a file.") + + with open(path, "rb") as file_content: + response = session.put( + f"{bucket_url}/{path.name}", + data=file_content + ) + if not response.ok: + raise RuntimeError(f"Could not upload file {path.name!r} into bucket {bucket_url!r}") + + # This can potentially be used to verify the checksum + # file_resource = response.json() + + +def publish(click_ctx: click.Context, ctx: CodeMetaContext): + """Publish the deposited record. + + This is done by doing a POST request to the publication URL stored in the context at + ``deposit.invenio.links.publish``. + """ + + _log = logging.getLogger("cli.deposit.invenio_rdm") + + publish_url_path = ContextPath.parse("deposit.invenio_rdm.links.publish") + publish_url = ctx[publish_url_path] + + response = requests.post( + publish_url, + headers={ + "User-Agent": hermes_user_agent, + "Authorization": f"Bearer {click_ctx.params['auth_token']}" + } + ) + + if not response.ok: + _log.debug(response.text) + raise RuntimeError(f"Could not publish deposit via {publish_url!r}") + + record = response.json() + _log.info("Published record: %s", record["links"]["record_html"]) + + +def _resolve_latest_invenio_id(ctx: CodeMetaContext) -> t.Tuple[str, dict]: + """ + Using the given configuration and metadata, figure out the latest record id. 
+ + If a record id is present as configuration ``deposit.invenio.record_id`` this one will be used to identify the + latest version of the record. Otherwise, if there is a doi present (either as configuration with key + ``deposit.invenio.doi`` or as a codemeta identifier, the DOI will be used to resolve the base record id. + + Anyway, the record id will always be used to resolve the latest version. + + If any of the resolution steps fail or produce an unexpected result, a ValueError will be thrown. + + :param ctx: The context for which the record id should be resolved. + :return: The Invenio record id and the metadata of the record + """ + + invenio_config = config.get('deposit').get('invenio_rdm', {}) + site_url = invenio_config.get('site_url') + if site_url is None: + raise MisconfigurationError("deposit.invenio_rdm.site_url is not configured") + + # Check if we configured an Invenio record ID (of the concept...) + record_id = invenio_config.get('record_id') + if record_id is None: + doi = invenio_config.get('doi') + if doi is None: + try: + # TODO: There might be more semantic in the codemeta.identifier... (also see schema.org) + identifier = ctx['codemeta.identifier'] + if identifier.startswith('https://doi.org/'): + doi = identifier[16:] + elif identifier.startswith('http://dx.doi.org/'): + doi = identifier[18:] + except KeyError: + pass + + if doi is not None: + # If we got a DOI, resolve it (using doi.org) into a Invenio URL ... and extract the record id. + record_id = _invenio_resolve_doi(site_url, doi) + + if record_id is not None: + # If we got a record id by now, resolve it using the Invenio API to the latests record. + return _invenio_resolve_record_id(site_url, record_id) + + return None, {} + + +def _invenio_resolve_doi(site_url, doi) -> str: + """ + Resolve an DOI to a Invenio URL and extract the record id. + + :param site_url: Root URL for the Invenio instance to use. 
+ :param doi: The DOI to be resolved (only the identifier *without* the ``https://doi.org/`` prefix). + :return: The record ID on the respective instance. + """ + + res = requests.get(f'https://doi.org/{doi}') + + # This is a mean hack due to DataCite answering a 404 with a 200 status + if res.url == 'https://datacite.org/404.html': + raise ValueError(f"Invalid DOI: {doi}") + + # Ensure the resolved record is on the correct instance + if not res.url.startswith(site_url): + raise ValueError(f"{res.url} is not on configured host {site_url}.") + + # Extract the record id as last part of the URL path + page_url = urlparse(res.url) + *_, record_id = page_url.path.split('/') + return record_id + + +def _invenio_resolve_record_id(site_url: str, record_id: str) -> t.Tuple[str, dict]: + """ + Find the latest version of a given record. + + :param site_url: Root URL for the Invenio instance to use. + :param record_id: The record that sould be resolved. + :return: The record id of the latest version for the requested record. + """ + res = requests.get(f"{site_url}/api/records/{record_id}") + if res.status_code != 200: + raise ValueError(f"Could not retrieve record from {res.url}: {res.text}") + + res_json = res.json() + res = requests.get(res_json['links']['latest']) + if res.status_code != 200: + raise ValueError(f"Could not retrieve record from {res.url}: {res.text}") + + res_json = res.json() + return res_json['id'], res_json['metadata'] + + +def _codemeta_to_invenio_deposition(ctx: CodeMetaContext) -> dict: + """The mapping logic. + + Functionality similar to this exists in the ``convert_codemeta`` package which uses + the crosswalk tables to do the mapping: + + .. code-block:: python + + invenio_metadata = convert_codemeta.crosswalk( + metadata, "codemeta", "Zenodo" + ) + + Unfortunately, this doesn't work well with additional metadata in the same dict, so + it is safer to provide our own implementation. 
+ + Currently, this function handles a lot of cases which we want to be able to + configure. A simple mapping from one JSON path to another is not enough. + + The metadata expected by Zenodo is described in the `Zenodo Developers guide + `_. Unfortunately, there doesn't seem + to be a schema one can download in order to validate these metadata. There might be + differences between Invenio-based platforms. + """ + + metadata = ctx["codemeta"] + license = ctx["deposit"]["invenio_rdm"]["license"] + communities = ctx["deposit"]["invenio_rdm"]["communities"] + access_right = ctx["deposit"]["invenio_rdm"]["access_right"] + embargo_date = ctx["deposit"]["invenio_rdm"]["embargo_date"] + access_conditions = ctx["deposit"]["invenio_rdm"]["access_conditions"] + + creators = [ + # TODO: Distinguish between @type "Person" and others + { + k: v for k, v in { + # TODO: This is ugly + "affiliation": author.get("affiliation", {"legalName": None}).get("legalName"), + # Invenio wants "family, given". author.get("name") might not have this format. + "name": f"{author.get('familyName')}, {author.get('givenName')}" + if author.get("familyName") and author.get("givenName") + else author.get("name"), + # Invenio expects the ORCID without the URL part + "orcid": author.get("@id", "").replace("https://orcid.org/", "") or None, + }.items() if v is not None + } + for author in metadata["author"] + ] + + # This is not used at the moment. See comment below in `deposition_metadata` dict. + contributors = [ # noqa: F841 + # TODO: Distinguish between @type "Person" and others + { + k: v for k, v in { + # TODO: This is ugly + "affiliation": contributor.get("affiliation", {"legalName": None}).get("legalName"), + # Invenio wants "family, given". contributor.get("name") might not have this format. 
+ "name": f"{contributor.get('familyName')}, {contributor.get('givenName')}" + if contributor.get("familyName") and contributor.get("givenName") + else contributor.get("name"), + # Invenio expects the ORCID without the URL part + "orcid": contributor.get("@id", "").replace("https://orcid.org/", "") or None, + # TODO: Many possibilities here. Get from config + "type": "ProjectMember", + }.items() if v is not None + } + # TODO: Filtering out "GitHub" should be done elsewhere + for contributor in metadata["contributor"] if contributor.get("name") != "GitHub" + ] + + # TODO: Use the fields currently set to `None`. + # Some more fields are available but they most likely don't relate to software + # publications targeted by hermes. + deposition_metadata = {k: v for k, v in { + # If upload_type is "publication"/"image", a publication_type/image_type must be + # specified. Since hermes targets software publications, this can be ignored and + # upload_type can be hard-coded to "software". + # TODO: Make this a constant maybe. + "upload_type": "software", + # IS0 8601-formatted date + # TODO: Maybe we want a different date? Then make this configurable. If not, + # this can be removed as it defaults to today. + "publication_date": date.today().isoformat(), + "title": metadata["name"], + "creators": creators, + # TODO: Use a real description here. Possible sources could be + # `tool.poetry.description` from pyproject.toml or `abstract` from + # CITATION.cff. This should then be stored in codemeta description field. + "description": metadata["name"], + "access_right": access_right, + "license": license, + "embargo_date": embargo_date, + "access_conditions": access_conditions, + # TODO: If a publisher already has assigned a DOI to the files we want to + # upload, it should be used here. In this case, Invenio will not give us a new + # one. Set "prereserve_doi" accordingly. + "doi": None, + # This prereserves a DOI that can then be added to the files before publishing + # them. 
+ # TODO: Use the DOI we get back from this. + "prereserve_doi": True, + # TODO: A good source for this could be `tool.poetry.keywords` in pyproject.toml. + "keywords": None, + "notes": None, + "related_identifiers": None, + # TODO: Use `contributors`. In the case of the hermes workflow itself, the + # contributors are currently all in `creators` already. So for now, we set this + # to `None`. Change this when relationship between authors and contributors can + # be specified in the processing step. + "contributors": None, + "references": None, + "communities": communities, + "grants": None, + "subjects": None, + "version": metadata.get('version'), + }.items() if v is not None} + + return deposition_metadata + + +def _get_license_identifier(ctx: CodeMetaContext, license_api_url: str): + """Get Invenio license representation from CodeMeta. + + The license to use is extracted from the ``license`` field in the + :class:`CodeMetaContext` and converted into an appropiate license identifier to be + passed to an Invenio instance. + + A license according to CodeMeta may be a URL (text) or a CreativeWork. This function + only handles URLs. If a ``license`` field is present in the CodeMeta and it is not + of type :class:`str`, a :class:`RuntimeError` is raised. + + Invenio instances take a license string which refers to a license identifier. + Typically, Invenio instances offer licenses from https://opendefinition.org and + https://spdx.org. However, it is possible to mint PIDs for custom licenses. + + An API endpoint (usually ``/api/vocabularies/licenses``) can be used to check whether a given + license is supported by the Invenio instance. This function tries to retrieve the + license by the identifier at the end of the license URL path. If this identifier + does not exist on the Invenio instance, a :class:`RuntimeError` is raised. If no + license is given in the CodeMeta, ``None`` is returned. 
+ """ + + license_url = ctx["codemeta"].get("license") + + if license_url is None: + return None + + if not isinstance(license_url, str): + raise RuntimeError( + "The given license in CodeMeta must be of type str. " + "Licenses of type 'CreativeWork' are not supported." + ) + + # First try: Look up license by assuming lower-case name is the correct identifier + parsed_url = urlparse(license_url) + url_path = parsed_url.path.rstrip("/") + license_id = url_path.split("/")[-1].lower() + + response = requests.get( + f"{license_api_url}/{license_id}", headers={"User-Agent": hermes_user_agent} + ) + if response.ok: + license_info = response.json() + + # Second try: Fetch full list of licenses available... maybe we should cache this. + else: + license_info = _look_up_license_info(license_api_url, license_url) + + return license_info["id"] + + +def _look_up_license_info(license_api_url, license_url): + """Deliberately try to resolve the license URL to a valid InvenioRDM license information record from the + vocabulary. + + First, this method tries to find the license URL in the list of known license vocabulary (which is fetched each + time, ouch...). + + If the URL is not found (what is pretty probable by now, as CFFConvert produces SPDX-URLs while InvenioRDM still + relies on the overhauled opensource.org URLs), the SPDX information record is fetched and all valid cross references + are sought for. + + :param license_api_url: Base API endpoint for InvenioRDM license vocabulary queries. + :param license_url: The URL for the license we are search an identifier for. + :return: The vocabulary record that is provided by InvenioRDM. 
+ """ + response = requests.get( + f"{license_api_url}?size=1000", headers={"User-Agent": hermes_user_agent} + ) + response.raise_for_status() + valid_licenses = response.json() + + def _search_license_info(_url): + for license_info in valid_licenses['hits']['hits']: + try: + if license_info['props']['url'] == _url: + return license_info + except KeyError: + continue + else: + return None + + license_info = _search_license_info(license_url) + if license_info is None and license_url.startswith('https://spdx.org/licenses/'): + response = requests.get(f"{license_url}.json", headers={"User-Agent": hermes_user_agent}) + response.raise_for_status() + + for license_cross_ref in response.json()['crossRef']: + if not license_cross_ref['isValid']: + continue + + license_info = _search_license_info(license_cross_ref["url"]) + if license_info is not None: + break + else: + raise RuntimeError(f"Could not resolve license URL {license_url} to a valid identifier.") + + return license_info + + +def _get_community_identifiers(ctx: CodeMetaContext, communities_api_url: str): + """Get Invenio community identifiers from config. + + This function gets the communities to be used for the deposition on an Invenio-based + site from the config and checks their validity against the site's API. If one of the + identifiers can not be found on the site, a :class:`MisconfigurationError` is + raised. 
+ """ + + communities = config.get("deposit").get("invenio_rdm", {}).get("communities") + if communities is None: + return None + + session = requests.Session() + session.headers = {"User-Agent": hermes_user_agent} + + community_ids = [] + for community_id in communities: + url = f"{communities_api_url}/{community_id}" + response = session.get(url) + if response.status_code == 404: + raise MisconfigurationError( + f"Not a valid community identifier: {community_id}" + ) + # Catch other problems + response.raise_for_status() + community_ids.append({"identifier": response.json()["id"]}) + + return community_ids + + +def _get_access_modalities(license): + """Get access right, embargo date and access conditions based on configuration and given license. + + This function implements the rules laid out in the `Zenodo developer documentation + `_: + + - ``access_right`` is a controlled vocabulary + - embargoed access depositions need an embargo date + - restricted access depositions need access conditions + - open and embargoed access depositions need a license + - closed access depositions have no further requirements + + This function also makes sure that the given embargo date can be parsed as an ISO + 8601 string representation and that the access rights are given as a string. 
+ """ + invenio_config = config.get("deposit").get("invenio_rdm", {}) + + access_right = invenio_config.get("access_right") + if access_right is None: + raise MisconfigurationError("deposit.invenio_rdm.access_right is not configured") + + access_right_options = ["open", "embargoed", "restricted", "closed"] + if access_right not in access_right_options: + raise MisconfigurationError( + "deposition.invenio_rdm.access_right must be one of: " + f"{', '.join(access_right_options)}" + ) + + embargo_date = invenio_config.get("embargo_date") + if access_right == "embargoed" and embargo_date is None: + raise MisconfigurationError( + f"With access_right {access_right}, " + "deposit.invenio_rdm.embargo_date must be configured" + ) + + if embargo_date is not None: + try: + datetime.fromisoformat(embargo_date) + except ValueError: + raise MisconfigurationError( + f"Could not parse deposit.invenio_rdm.embargo_date {embargo_date!r}. " + "Must be in ISO 8601 format." + ) + + access_conditions = invenio_config.get("access_conditions") + if access_right == "restricted" and access_conditions is None: + raise MisconfigurationError( + f"With access_right {access_right}, " + "deposit.invenio_rdm.access_conditions must be configured" + ) + + if access_conditions is not None and not isinstance(access_conditions, str): + raise MisconfigurationError( + "deposit.invenio_rdm.access_conditions must be a string (HTML is allowed)." + ) + + if license is None and access_right in ["open", "embargoed"]: + raise MisconfigurationError( + f"With access_right {access_right}, a license is required." 
+ ) + + if access_right == "closed": + pass + + return access_right, embargo_date, access_conditions From 27da8cce80a31a69474e28fe4224adec75207bfa Mon Sep 17 00:00:00 2001 From: "Meinel, Michael" Date: Thu, 2 Nov 2023 08:03:03 +0100 Subject: [PATCH 22/39] Remove setting that is no longer used --- hermes.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/hermes.toml b/hermes.toml index 2da8a88b..f6831be5 100644 --- a/hermes.toml +++ b/hermes.toml @@ -9,7 +9,6 @@ from = [ "cff", "git" ] validate = false [deposit] -mapping = "invenio_rdm" target = "invenio_rdm" [deposit.invenio_rdm] From 0167449915e2587e2dc8c63e5182e323019c0a9c Mon Sep 17 00:00:00 2001 From: "Meinel, Michael" Date: Thu, 2 Nov 2023 08:11:55 +0100 Subject: [PATCH 23/39] Use logging instead of printing --- src/hermes/commands/deposit/invenio_rdm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hermes/commands/deposit/invenio_rdm.py b/src/hermes/commands/deposit/invenio_rdm.py index e9206ed3..44127ef7 100644 --- a/src/hermes/commands/deposit/invenio_rdm.py +++ b/src/hermes/commands/deposit/invenio_rdm.py @@ -134,7 +134,7 @@ def create_initial_version(click_ctx: click.Context, ctx: CodeMetaContext): ) if not response.ok: - print(response.text) + _log.error("Webserver response: \n%s", response.text) raise RuntimeError(f"Could not create initial deposit {deposit_url!r}") deposit = response.json() From 8092acacbace3ce842e35ac77cea013f358fdeec Mon Sep 17 00:00:00 2001 From: "Meinel, Michael" Date: Thu, 2 Nov 2023 08:17:11 +0100 Subject: [PATCH 24/39] Extend description of license look up to match the actual algorithm --- src/hermes/commands/deposit/invenio_rdm.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/hermes/commands/deposit/invenio_rdm.py b/src/hermes/commands/deposit/invenio_rdm.py index 44127ef7..4199a326 100644 --- a/src/hermes/commands/deposit/invenio_rdm.py +++ b/src/hermes/commands/deposit/invenio_rdm.py @@ -536,9 +536,16 @@ def 
_get_license_identifier(ctx: CodeMetaContext, license_api_url: str): An API endpoint (usually ``/api/vocabularies/licenses``) can be used to check whether a given license is supported by the Invenio instance. This function tries to retrieve the - license by the identifier at the end of the license URL path. If this identifier - does not exist on the Invenio instance, a :class:`RuntimeError` is raised. If no - license is given in the CodeMeta, ``None`` is returned. + license by lower-casing the identifier at the end of the license URL path. If this identifier + does not exist on the Invenio instance, all available licenses are fetched and the URL is sought + for in the results. However, this might again not lead to success (as Invenio still provides + the obsolete https://opensource.org URLs) but harvesters might provide the SPDX style URLs. + Hence, the license URL is checked whether it is pointing to https://spdx.org/licenses/ and if + this is the case, the license record from SPDX is fetched and all `crossRef` URLs that are flagged + `isValid` are again sought for in the full set of licenses. Only if this still fails, + a :class:`RuntimeError` is raised. + + If no license is given in the CodeMeta, ``None`` is returned. """ license_url = ctx["codemeta"].get("license") From dad5d019bfe0c5df1f4fed8c417378e582df0c3f Mon Sep 17 00:00:00 2001 From: "Meinel, Michael" Date: Thu, 2 Nov 2023 16:20:27 +0100 Subject: [PATCH 25/39] Add '_' to allowed key characters --- src/hermes/model/path.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hermes/model/path.py b/src/hermes/model/path.py index fe354633..036c07e6 100644 --- a/src/hermes/model/path.py +++ b/src/hermes/model/path.py @@ -29,7 +29,7 @@ class ContextPathGrammar: The pyparsing grammar for ContextGrammar paths. 
""" - key = pp.Word('@:' + pp.alphas) + key = pp.Word('@:_' + pp.alphas) index = pp.Word(pp.nums).set_parse_action(lambda tok: [int(tok[0])]) | pp.Char('*') field = key + (pp.Suppress('[') + index + pp.Suppress(']'))[...] path = field + (pp.Suppress('.') + field)[...] From 7b3b8a298f6c58d30c6b6fd564468b4fff87b980 Mon Sep 17 00:00:00 2001 From: Sophie <133236526+SKernchen@users.noreply.github.com> Date: Thu, 19 Oct 2023 01:12:36 +0200 Subject: [PATCH 26/39] Add HERMES_PUSH_TOKEN --- docs/source/tutorials/automated-publication-with-ci.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/source/tutorials/automated-publication-with-ci.md b/docs/source/tutorials/automated-publication-with-ci.md index 2ff58d96..4c56e7f7 100644 --- a/docs/source/tutorials/automated-publication-with-ci.md +++ b/docs/source/tutorials/automated-publication-with-ci.md @@ -167,6 +167,11 @@ and activate the option "Allow GitHub Actions to create and approve pull request Copy the Zenodo sandbox token you just created into a new [GitLab CI variable](https://docs.gitlab.com/ee/ci/variables/#for-a-project) called `ZENODO_TOKEN`. +For Gitlab you also need the HERMES Push Token. That Token gives access to the project in order for HERMES to create Merge Requests. +Therefore, you [create an access token in your project](https://docs.gitlab.com/ee/user/project/settings/project_access_tokens.html#create-a-project-access-token). +The Token needs to have at least the `developer` role and `write` access (e.g. write_repository scope). +Then you create a Gitlab CI variable with the token called `HERMES_PUSH_TOKEN`. + Copy the [template file for GitLab to Zenodo Sandbox publication](https://github.com/hermes-hmc/ci-templates/blob/main/gitlab/hermes-ci.yml) into your project to `.gitlab/hermes-ci.yml`. 
From 7f5762f79c4426fd5f81763b791e622a05796bf2 Mon Sep 17 00:00:00 2001 From: Sophie <133236526+SKernchen@users.noreply.github.com> Date: Thu, 19 Oct 2023 13:49:28 +0200 Subject: [PATCH 27/39] Note about protection of variables --- docs/source/tutorials/automated-publication-with-ci.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/source/tutorials/automated-publication-with-ci.md b/docs/source/tutorials/automated-publication-with-ci.md index 4c56e7f7..adf86cc2 100644 --- a/docs/source/tutorials/automated-publication-with-ci.md +++ b/docs/source/tutorials/automated-publication-with-ci.md @@ -172,6 +172,13 @@ Therefore, you [create an access token in your project](https://docs.gitlab.com/ The Token needs to have at least the `developer` role and `write` access (e.g. write_repository scope). Then you create a Gitlab CI variable with the token called `HERMES_PUSH_TOKEN`. +```{note} +The two Gitlab CI Variables include sensitive and powerful information. +Therefore you should at least select the flag `Mask variable` when creating. +If possible you should also select the flag `Protect variable` and define all branches `hermes/*` as +protected branch. +``` + Copy the [template file for GitLab to Zenodo Sandbox publication](https://github.com/hermes-hmc/ci-templates/blob/main/gitlab/hermes-ci.yml) into your project to `.gitlab/hermes-ci.yml`. 
From 8cd0801e2d9709b46cf8cd61e53e169e1a516e3d Mon Sep 17 00:00:00 2001 From: Sophie <133236526+SKernchen@users.noreply.github.com> Date: Thu, 19 Oct 2023 13:58:08 +0200 Subject: [PATCH 28/39] Add File Contributor Sophie Kernchen --- docs/source/tutorials/automated-publication-with-ci.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/tutorials/automated-publication-with-ci.md b/docs/source/tutorials/automated-publication-with-ci.md index adf86cc2..cb6cde95 100644 --- a/docs/source/tutorials/automated-publication-with-ci.md +++ b/docs/source/tutorials/automated-publication-with-ci.md @@ -8,6 +8,7 @@ SPDX-License-Identifier: CC-BY-SA-4.0 SPDX-FileContributor: Oliver Bertuch SPDX-FileContributor: Michael Meinel SPDX-FileContributor: Stephan Druskat +SPDX-FileContributor: Sophie Kernchen --> # Set up automatic software publishing From aa80bb7441386cb002ad75c13cae68b0cb2a347f Mon Sep 17 00:00:00 2001 From: David Pape Date: Wed, 1 Nov 2023 14:38:39 +0100 Subject: [PATCH 29/39] Use terminal-only logger for hermes clean --- src/hermes/commands/workflow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hermes/commands/workflow.py b/src/hermes/commands/workflow.py index 655d97e3..1e8ef5b6 100644 --- a/src/hermes/commands/workflow.py +++ b/src/hermes/commands/workflow.py @@ -261,7 +261,7 @@ def clean(): """ Remove cached data. """ - audit_log = logging.getLogger('audit') + audit_log = logging.getLogger('cli') audit_log.info("# Cleanup") # Create Hermes context (i.e., all collected metadata for all stages...) From 9c694185e59ee9d62684acb5a9f7cadc2761c497 Mon Sep 17 00:00:00 2001 From: David Pape Date: Wed, 1 Nov 2023 14:43:48 +0100 Subject: [PATCH 30/39] Shutdown logging before cleaning files This ensures that any and all opened logging files in the .hermes directory are closed, allowing us to safely remove that directory. 
--- src/hermes/commands/workflow.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/hermes/commands/workflow.py b/src/hermes/commands/workflow.py index 1e8ef5b6..bfa2bc0f 100644 --- a/src/hermes/commands/workflow.py +++ b/src/hermes/commands/workflow.py @@ -263,6 +263,8 @@ def clean(): """ audit_log = logging.getLogger('cli') audit_log.info("# Cleanup") + # shut down logging so that .hermes/ can safely be removed + logging.shutdown() # Create Hermes context (i.e., all collected metadata for all stages...) ctx = HermesContext() From 34f683c146a01e59d350226e363f473fae755f55 Mon Sep 17 00:00:00 2001 From: "Meinel, Michael" Date: Thu, 2 Nov 2023 12:12:21 +0100 Subject: [PATCH 31/39] Add post-processing plugin for InvenioRDM This might have been parametrized or implemented differently to distinguish between Invenio and InvenioRDM. However, this way works without further coding headache... --- pyproject.toml | 3 +- .../commands/postprocess/invenio_rdm.py | 29 +++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 src/hermes/commands/postprocess/invenio_rdm.py diff --git a/pyproject.toml b/pyproject.toml index 4ffa8f23..8ab0e25f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -132,7 +132,8 @@ invenio_rdm = "hermes.commands.deposit.invenio_rdm:publish" file = "hermes.commands.deposit.file:publish" [tool.poetry.plugins."hermes.postprocess"] -config_record_id = "hermes.commands.postprocess.invenio:config_record_id" +config_invenio_record_id = "hermes.commands.postprocess.invenio:config_record_id" +config_invenio_rdm_record_id = "hermes.commands.postprocess.invenio_rdm:config_record_id" cff_doi = "hermes.commands.postprocess.invenio:cff_doi" [tool.taskipy.tasks] diff --git a/src/hermes/commands/postprocess/invenio_rdm.py b/src/hermes/commands/postprocess/invenio_rdm.py new file mode 100644 index 00000000..c14a3a07 --- /dev/null +++ b/src/hermes/commands/postprocess/invenio_rdm.py @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: 2023 
German Aerospace Center (DLR) +# +# SPDX-License-Identifier: Apache-2.0 + +# SPDX-FileContributor: Michael Meinel +# SPDX-FileContributor: Stephan Druskat + +import json +import logging + +import toml +from ruamel import yaml + +from hermes import config + + +_log = logging.getLogger('deposit.invenio_rdm') + + +def config_record_id(ctx): + deposition_path = ctx.get_cache('deposit', 'deposit') + with deposition_path.open("r") as deposition_file: + deposition = json.load(deposition_file) + conf = config.get('hermes') + try: + conf['deposit']['invenio_rdm']['record_id'] = deposition['record_id'] + toml.dump(conf, open('hermes.toml', 'w')) + except KeyError: + raise RuntimeError("No deposit.invenio configuration available to store record id in") from None From 4344f12f83b5dced8b4c7109cca7bc4375f698d7 Mon Sep 17 00:00:00 2001 From: "Meinel, Michael" Date: Thu, 2 Nov 2023 12:12:43 +0100 Subject: [PATCH 32/39] Adapt tutorial for Zenodo running on InvenioRDM --- docs/source/tutorials/automated-publication-with-ci.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/tutorials/automated-publication-with-ci.md b/docs/source/tutorials/automated-publication-with-ci.md index cb6cde95..7fa0cff6 100644 --- a/docs/source/tutorials/automated-publication-with-ci.md +++ b/docs/source/tutorials/automated-publication-with-ci.md @@ -73,14 +73,14 @@ Configure HERMES to: from = [ "git", "cff" ] [deposit] -target = "invenio" +target = "invenio_rdm" -[deposit.invenio] +[deposit.invenio_rdm] site_url = "https://sandbox.zenodo.org" access_right = "open" [postprocess] -execute = [ "config_record_id" ] +execute = [ "config_invenio_rdm_record_id" ] ``` Copy this file to the root directory of your repository and add it to version control: @@ -93,7 +93,7 @@ git push ```{note} If you decide to start from an existing `hermes.toml` (e.g., the one found in this repository), -be sure that there is no `record_id` value defined in the `deposit.invenio` section. 
+be sure that there is no `record_id` value defined in the `deposit.invenio_rdm` section. Otherwise, your deposition will fail as *hermes* would try to deposit a new version for the given record. ``` From bc2858af746c10985619910aef4ceb643279ed5d Mon Sep 17 00:00:00 2001 From: "Meinel, Michael" Date: Thu, 2 Nov 2023 12:14:36 +0100 Subject: [PATCH 33/39] Fix pylint error --- src/hermes/commands/postprocess/invenio_rdm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/hermes/commands/postprocess/invenio_rdm.py b/src/hermes/commands/postprocess/invenio_rdm.py index c14a3a07..fc249132 100644 --- a/src/hermes/commands/postprocess/invenio_rdm.py +++ b/src/hermes/commands/postprocess/invenio_rdm.py @@ -9,7 +9,6 @@ import logging import toml -from ruamel import yaml from hermes import config From 4eabe80020ad22ee9293a099b2eb186442089f92 Mon Sep 17 00:00:00 2001 From: "Meinel, Michael" Date: Wed, 15 Nov 2023 10:18:16 +0100 Subject: [PATCH 34/39] Start of refactoring... --- src/hermes/model/context.py | 31 -- src/hermes/model/linked_data.py | 425 ++++++++++++++++++++++++++ test/pydalos_test/__init__.py | 0 test/pydalos_test/test_linked_data.py | 0 4 files changed, 425 insertions(+), 31 deletions(-) create mode 100644 src/hermes/model/linked_data.py create mode 100644 test/pydalos_test/__init__.py create mode 100644 test/pydalos_test/test_linked_data.py diff --git a/src/hermes/model/context.py b/src/hermes/model/context.py index 9bc0e35d..03a193cd 100644 --- a/src/hermes/model/context.py +++ b/src/hermes/model/context.py @@ -57,24 +57,6 @@ def __init__(self, project_dir: t.Optional[Path] = None): self._errors = [] self.contexts = {self.hermes_lod_context} - def __getitem__(self, key: ContextPath | str) -> t.Any: - """ - Access a single entry from the context. - - :param key: The path to the item that should be retrieved. - Can be in dotted syntax or as a :class:`ContextPath` instance. - :return: The value stored under the given key. 
- """ - if isinstance(key, str): - key = ContextPath.parse(key) - return key.get_from(self._data) - - def keys(self) -> t.List[ContextPath]: - """ - Get all the keys for the data stored in this context. - """ - return [ContextPath.parse(k) for k in self._data.keys()] - def init_cache(self, *path: str) -> Path: """ Initialize a cache directory if not present. @@ -126,19 +108,6 @@ def update(self, _key: str, _value: t.Any, **kwargs: t.Any): pass - def get_data(self, - data: t.Optional[dict] = None, - path: t.Optional['ContextPath'] = None, - tags: t.Optional[dict] = None) -> dict: - if data is None: - data = {} - if path is not None: - data.update({str(path): path.get_from(self._data)}) - else: - for key in self.keys(): - data.update({str(key): key.get_from(self._data)}) - return data - def error(self, ep: EntryPoint, error: Exception): """ Add an error that occurred during processing to the error log. diff --git a/src/hermes/model/linked_data.py b/src/hermes/model/linked_data.py new file mode 100644 index 00000000..73538db5 --- /dev/null +++ b/src/hermes/model/linked_data.py @@ -0,0 +1,425 @@ +# SPDX-FileCopyrightText: 2022 German Aerospace Center (DLR) +# +# SPDX-License-Identifier: Apache-2.0 + +# SPDX-FileContributor: Michael Meinel + +import datetime +import pathlib +import traceback +import json +import logging +import shutil +import typing as t + +from pathlib import Path +from importlib.metadata import EntryPoint + +from hermes.model import errors +from hermes.model.path import ContextPath +from hermes.model.errors import HermesValidationError + + +_log = logging.getLogger(__name__) + +class HermesData: + """ + The HermesContext stores the metadata for a certain project. + + As there are different views of the metadata in the different stages, + some stages use a special subclass of this context: + + - The *harvest* stages uses :class:`HermesHarvestContext`. 
+ """ + + default_timestamp = datetime.datetime.now().isoformat(timespec='seconds') + hermes_name = "hermes" + hermes_cache_name = "." + hermes_name + hermes_lod_context = (hermes_name, "https://software-metadata.pub/ns/hermes/") + + def __init__(self, project_dir: t.Optional[Path] = None): + """ + Create a new context for the given project dir. + + :param project_dir: The root directory of the project. + If nothing is given, the current working directory is used. + """ + + #: Base dir for the hermes metadata cache (default is `.hermes` in the project root). + self.hermes_dir = Path(project_dir or '.') / self.hermes_cache_name + + self._caches = {} + self._data = {} + self._errors = [] + self.contexts = {self.hermes_lod_context} + + def __getitem__(self, key: ContextPath | str) -> t.Any: + """ + Access a single entry from the context. + + :param key: The path to the item that should be retrieved. + Can be in dotted syntax or as a :class:`ContextPath` instance. + :return: The value stored under the given key. + """ + raise NotImplementedError() + + def keys(self) -> t.List[ContextPath]: + """ + Get all the keys for the data stored in this context. + """ + return [ContextPath.parse(k) for k in self._data.keys()] + + def init_cache(self, *path: str) -> Path: + """ + Initialize a cache directory if not present. + + :param path: The (local) path to identify the requested cache. + :return: The path to the requested cache file. + """ + cache_dir = self.hermes_dir.joinpath(*path) + cache_dir.mkdir(parents=True, exist_ok=True) + return cache_dir + + def get_cache(self, *path: str, create: bool = False) -> Path: + """ + Retrieve a cache file for a given *path*. + + This method returns an appropriate path to a file but does not make any assertions about the format, encoding, + or whether the file should be exists. + However, it is capable to create the enclosing directory (if you specify `create = True`). + + :param path: The (local) path to identify the requested cache. 
+ :param create: Select whether the directory should be created. + :return: The path to the requested cache file. + """ + + if path in self._caches: + return self._caches[path] + + *subdir, name = path + if create: + cache_dir = self.init_cache(*subdir) + else: + cache_dir = self.hermes_dir.joinpath(*subdir) + + data_file = cache_dir / (name + '.json') + self._caches[path] = data_file + + return data_file + + def update(self, _key: str, _value: t.Any, **kwargs: t.Any): + """ + Store a new value for a given key to the context. + + :param _key: The key may be a dotted name for a metadata attribute to store. + :param _value: The value that should be stored for the key. + :param kwargs: Additional information about the value. + This can be used to trace back the original value. + If `_ep` is given, it is treated as an entry point name that triggered the update. + """ + + pass + + def get_data(self, + data: t.Optional[dict] = None, + path: t.Optional['ContextPath'] = None, + tags: t.Optional[dict] = None) -> dict: + if data is None: + data = {} + if path is not None: + data.update({str(path): path.get_from(self._data)}) + else: + for key in self.keys(): + data.update({str(key): key.get_from(self._data)}) + return data + + def error(self, ep: EntryPoint, error: Exception): + """ + Add an error that occurred during processing to the error log. + + :param ep: The entry point that produced the error. + :param error: The exception that was thrown due to the error. + """ + + self._errors.append((ep, error)) + + def purge_caches(self) -> None: + """ + Delete `.hermes` cache-directory if it exsis. + """ + + if self.hermes_dir.exists(): + shutil.rmtree(self.hermes_dir) + + def add_context(self, new_context: tuple) -> None: + """ + Add a new linked data context to the harvest context. 
+ + :param new_context: The new context as tuple (context name, context URI) + """ + self.contexts.add(new_context) + + +class HermesHarvestData(HermesData): + """ + A specialized context for use in *harvest* stage. + + Each harvester has its own context that is cached to :py:attr:`HermesContext.hermes_dir` `/harvest/EP_NAME`. + + This special context is implemented as a context manager that loads the cached data upon entering the context. + When the context is left, recorded metadata is stored in a cache file possible errors are propagated to the + parent context. + """ + + def __init__(self, ep: EntryPoint, config: dict = None): + """ + Initialize a new harvesting context. + + :param base: The base HermesContext that should receive the results of the harvesting. + :param ep: The entry point that implements the harvester using this context. + :param config: Configuration for the given harvester. + """ + + super().__init__() + + self._ep = ep + self._log = logging.getLogger(f'harvest.{self._ep.name}') + self.config = config or {} + + def load_cache(self): + """ + Load the cached data from the :py:attr:`HermesContext.hermes_dir`. + """ + + data_file = self.get_cache('harvest', self._ep.name) + if data_file.is_file(): + self._log.debug("Loading cache from %s...", data_file) + self._data = json.load(data_file.open('r')) + + contexts_file = self.get_cache('harvest', self._ep.name + '_contexts') + if contexts_file.is_file(): + self._log.debug("Loading contexts from %s...", contexts_file) + contexts = json.load(contexts_file.open('r')) + for context in contexts: + self.contexts.add((tuple(context))) + + def store_cache(self): + """ + Store the collected data to the :py:attr:`HermesContext.hermes_dir`. 
+ """ + + data_file = self.get_cache('harvest', self._ep.name, create=True) + self._log.debug("Writing cache to %s...", data_file) + json.dump(self._data, data_file.open('w'), indent=2) + + if self.contexts: + contexts_file = self.get_cache('harvest', self._ep.name + '_contexts', create=True) + self._log.debug("Writing contexts to %s...", contexts_file) + json.dump(list(self.contexts), contexts_file.open('w'), indent=2) + + def update(self, _key: str, _value: t.Any, **kwargs: t.Any): + """ + The updates are added to a list of values. + A value is only replaced if the `_key` and all `kwargs` match. + + .. code:: python + + # 'value 2' will be added (twice) + ctx.update('key', 'value 1', spam='eggs') + ctx.update('key', 'value 2', foo='bar') + ctx.update('key', 'value 2', foo='bar', spam='eggs') + + # 'value 2' will replace 'value 1' + ctx.update('key', 'value 1', spam='eggs') + ctx.update('key', 'value 2', spam='eggs') + + This way, the harvester can fully specify the source and only override values that are from the same origin + (e.g., if the data changed between two runs). + + See :py:meth:`HermesContext.update` for more information. 
+ """ + + timestamp = kwargs.pop('timestamp', self.default_timestamp) + harvester = kwargs.pop('harvester', self._ep.name) + + if _key not in self._data: + self._data[_key] = [] + + for entry in self._data[_key]: + value, tag = entry + tag_timestamp = tag.pop('timestamp') + tag_harvester = tag.pop('harvester') + + if tag == kwargs: + self._log.debug("Update %s: %s -> %s (%s)", _key, str(value), _value, str(tag)) + entry[0] = _value + tag['timestamp'] = timestamp + tag['harvester'] = harvester + break + + tag['timestamp'] = tag_timestamp + tag['harvester'] = tag_harvester + + else: + kwargs['timestamp'] = timestamp + kwargs['harvester'] = harvester + self._data[_key].append([_value, kwargs]) + + def _update_key_from(self, _key: ContextPath, _value: t.Any, **kwargs): + if isinstance(_value, dict): + for key, value in _value.items(): + self._update_key_from(_key[key], value, **kwargs) + + elif isinstance(_value, (list, tuple)): + for index, value in enumerate(_value): + self._update_key_from(_key[index], value, **kwargs) + + else: + self.update(str(_key), _value, **kwargs) + + def update_from(self, data: t.Dict[str, t.Any], **kwargs: t.Any): + """ + Bulk-update multiple values. + + If the value for a certain key is again a collection, the key will be expanded: + + .. code:: python + + ctx.update_from({'arr': ['foo', 'bar'], 'author': {'name': 'Monty Python', 'email': 'eggs@spam.xxx'}}) + + will eventually result in the following calls: + + .. code:: python + + ctx.update('arr[0]', 'foo') + ctx.update('arr[1]', 'bar') + ctx.update('author.name', 'Monty Python') + ctx.update('author.email', 'eggs@spam.xxx') + + :param data: The data that should be updated (as mapping with strings as keys). + :param kwargs: Additional information about the value (see :py:meth:`HermesContext.update` for details). 
+ """ + + for key, value in data.items(): + self._update_key_from(ContextPath(key), value, **kwargs) + + def error(self, ep: EntryPoint, error: Exception): + """ + See :py:meth:`HermesContext.error` + """ + + ep = ep or self._ep + self._base.error(ep, error) + + def _check_values(self, path, values): + (value, tag), *values = values + for alt_value, alt_tag in values: + if value != alt_value: + raise ValueError(f'{path}') + return value, tag + + def get_data(self, + data: t.Optional[dict] = None, + path: t.Optional['ContextPath'] = None, + tags: t.Optional[dict] = None) -> dict: + """ + Retrieve the data from a given path. + + This method can be used to extract data and whole sub-trees from the context. + If you want a complete copy of the data, you can also call this method without giving a path. + + :param data: Optional a target dictionary where the data is stored. If not given, a new one is created. + :param path: The path to extract data from. + :param tags: An optional dictionary to collect the tags that belong to the extracted data. + The full path will be used as key for this dictionary. + :return: The extracted data (i.e., the `data` parameter if it was given). + """ + if data is None: + data = {} + for key, values in self._data.items(): + key = ContextPath.parse(key) + if path is None or key in path: + value, tag = self._check_values(key, values) + try: + key.update(data, value, tags, **tag) + if tags is not None and tag: + if str(key) in tags: + tags[str(key)].update(tag) + else: + tags[str(key)] = tag + except errors.MergeError as e: + self.error(self._ep, e) + return data + + def finish(self): + """ + Calling this method will lead to further processors not handling the context anymore. 
+ """ + self._data.clear() + + +class CodeMetaData(HermesData): + _PRIMARY_ATTR = { + 'author': ('@id', 'email', 'name'), + } + + _CODEMETA_CONTEXT_URL = "https://doi.org/10.5063/schema/codemeta-2.0" + + def __init__(self, project_dir: pathlib.Path | None = None): + super().__init__(project_dir) + self.tags = {} + + def merge_from(self, other: HermesHarvestData): + other.get_data(self._data, tags=self.tags) + + def merge_contexts_from(self, other: HermesHarvestData): + """ + Merges any linked data contexts from a harvesting context into the instance's set of contexts. + + :param other: The :py:class:`HermesHarvestContext` to merge the linked data contexts from + """ + if other.contexts: + for context in other.contexts: + self.contexts.add(context) + + def update(self, _key: ContextPath, _value: t.Any, tags: t.Dict[str, t.Dict] | None = None): + if _key._item == '*': + _item_path, _item, _path = _key.resolve(self._data, query=_value, create=True) + if tags: + _tags = {k[len(str(_key) + '.'):]: t for k, t in tags.items() if ContextPath.parse(k) in _key} + else: + _tags = {} + _path._set_item(_item, _path, _value, **_tags) + if tags is not None and _tags: + for k, v in _tags.items(): + if not v: + continue + + if _key: + tag_key = str(_key) + '.' + k + else: + tag_key = k + tags[tag_key] = v + else: + _key.update(self._data, _value, tags) + + def find_key(self, item, other): + data = item.get_from(self._data) + + for i, node in enumerate(data): + match = [(k, node[k]) for k in self._PRIMARY_ATTR.get(str(item), ('@id',)) if k in node] + if any(other.get(k, None) == v for k, v in match): + return item[i] + return None + + def prepare_codemeta(self): + """ + Updates the linked data contexts, where the CodeMeta context is the default context, + and any additional contexts are named contexts. + Also sets the type to 'SoftwareSourceCode'. 
+ """ + if self.contexts: + self.update(ContextPath('@context'), [self._CODEMETA_CONTEXT_URL, dict(self.contexts)]) + else: + self.update(ContextPath('@context'), self._CODEMETA_CONTEXT_URL) + self.update(ContextPath('@type'), 'SoftwareSourceCode') diff --git a/test/pydalos_test/__init__.py b/test/pydalos_test/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/test/pydalos_test/test_linked_data.py b/test/pydalos_test/test_linked_data.py new file mode 100644 index 00000000..e69de29b From 54a69fa39462d22908599a34be59fa180242ce04 Mon Sep 17 00:00:00 2001 From: "Meinel, Michael" Date: Wed, 15 Nov 2023 10:18:57 +0100 Subject: [PATCH 35/39] Test for stuff that will never end up in hermes --- test/pydalos_test/test_linked_data.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/test/pydalos_test/test_linked_data.py b/test/pydalos_test/test_linked_data.py index e69de29b..d9091c86 100644 --- a/test/pydalos_test/test_linked_data.py +++ b/test/pydalos_test/test_linked_data.py @@ -0,0 +1,22 @@ +from pydalos.linked_data import LDTerm + + +def test_ldterm_equal(): + assert LDTerm('https://schema.org/Thing') == LDTerm('https://schema.org/Thing') + assert LDTerm('https://schema.org/Thing') == 'https://schema.org/Thing' + assert 'https://schema.org/Thing' == LDTerm('https://schema.org/Thing') + + +def test_ldterm_not_equal(): + assert LDTerm('https://schema.org/Thing') != LDTerm('https://schema.org/Taxon') + assert LDTerm('https://schema.org/Thing') != 'https://schema.org/Taxon' + assert 'https://schema.org/Thing' != LDTerm('https://schema.org/Taxon') + + +def test_ldterm_dict_key(): + data = { + LDTerm('https://schema.org/Thing'): 'spam' + } + + assert 'spam' == data[LDTerm('https://schema.org/Thing')] + assert 'spam' == data['https://schema.org/Thing'] From 5043844c6a17fedf0c03d9fbc8e665fa44a6178f Mon Sep 17 00:00:00 2001 From: "Meinel, Michael" Date: Wed, 15 Nov 2023 15:03:22 +0100 Subject: [PATCH 36/39] Modifications to the data 
model to have meta-metadata in-place --- src/hermes/commands/workflow.py | 23 +++-- src/hermes/model/linked_data.py | 147 +++++++++++++++++--------------- src/hermes/model/path.py | 5 +- 3 files changed, 94 insertions(+), 81 deletions(-) diff --git a/src/hermes/commands/workflow.py b/src/hermes/commands/workflow.py index bfa2bc0f..5173612d 100644 --- a/src/hermes/commands/workflow.py +++ b/src/hermes/commands/workflow.py @@ -18,7 +18,7 @@ from hermes import config from hermes.commands.deposit.base import BaseDepositPlugin from hermes.error import MisconfigurationError -from hermes.model.context import HermesContext, HermesHarvestContext, CodeMetaContext +from hermes.model.linked_data import HermesData, HermesHarvestData, CodeMetaData from hermes.model.errors import MergeError from hermes.model.path import ContextPath @@ -34,7 +34,7 @@ def harvest(click_ctx: click.Context): audit_log.info("# Metadata harvesting") # Create Hermes context (i.e., all collected metadata for all stages...) - ctx = HermesContext() + ctx = HermesData() # Initialize the harvest cache directory here to indicate the step ran ctx.init_cache("harvest") @@ -55,11 +55,8 @@ def harvest(click_ctx: click.Context): _log.debug(". 
Loading harvester from %s", harvester.value) harvest = harvester.load() - with HermesHarvestContext(ctx, harvester, harvest_config.get(harvester.name, {})) as harvest_ctx: + with HermesHarvestData(harvester, harvest_config.get(harvester.name, {})) as harvest_ctx: harvest(click_ctx, harvest_ctx) - for _key, ((_value, _tag), *_trace) in harvest_ctx._data.items(): - if any(v != _value and t == _tag for v, t in _trace): - raise MergeError(_key, None, _value) _log.info('') audit_log.info('') @@ -76,7 +73,7 @@ def process(click_ctx: click.Context): audit_log = logging.getLogger('audit') audit_log.info("# Metadata processing") - ctx = CodeMetaContext() + ctx = CodeMetaData() if not (ctx.hermes_dir / "harvest").exists(): _log.error("You must run the harvest command before process") @@ -95,7 +92,7 @@ def process(click_ctx: click.Context): harvester, *_ = harvesters audit_log.info("## Process data from %s", harvester.name) - harvest_context = HermesHarvestContext(ctx, harvester, {}) + harvest_context = HermesHarvestData(harvester, {}) try: harvest_context.load_cache() # when the harvest step ran, but there is no cache file, this is a serious flaw @@ -129,7 +126,7 @@ def process(click_ctx: click.Context): ctx.prepare_codemeta() with open(ctx.get_cache("process", ctx.hermes_name, create=True), 'w') as codemeta_file: - json.dump(ctx._data, codemeta_file, indent=2) + json.dump(ctx.data, codemeta_file, indent=2) logging.shutdown() @@ -137,7 +134,7 @@ def process(click_ctx: click.Context): @click.group(invoke_without_command=True) @click.pass_context def curate(click_ctx: click.Context): - ctx = CodeMetaContext() + ctx = CodeMetaData() process_output = ctx.hermes_dir / 'process' / (ctx.hermes_name + ".json") if not process_output.is_file(): @@ -173,7 +170,7 @@ def deposit(click_ctx: click.Context, initial, auth_token, file): click.echo("Metadata deposition") _log = logging.getLogger("cli.deposit") - ctx = CodeMetaContext() + ctx = CodeMetaData() codemeta_file = 
ctx.get_cache("curate", ctx.hermes_name) if not codemeta_file.exists(): @@ -231,7 +228,7 @@ def postprocess(click_ctx: click.Context): audit_log = logging.getLogger('audit') audit_log.info("# Post-processing") - ctx = CodeMetaContext() + ctx = CodeMetaData() if not (ctx.hermes_dir / "deposit").exists(): _log.error("You must run the deposit command before post-process") @@ -267,5 +264,5 @@ def clean(): logging.shutdown() # Create Hermes context (i.e., all collected metadata for all stages...) - ctx = HermesContext() + ctx = HermesData() ctx.purge_caches() diff --git a/src/hermes/model/linked_data.py b/src/hermes/model/linked_data.py index 73538db5..17c1bae9 100644 --- a/src/hermes/model/linked_data.py +++ b/src/hermes/model/linked_data.py @@ -37,6 +37,10 @@ class HermesData: hermes_cache_name = "." + hermes_name hermes_lod_context = (hermes_name, "https://software-metadata.pub/ns/hermes/") + _METADATA_TERMS = [ + 'timestamp', 'harvester', 'local_path', 'uri', + ] + def __init__(self, project_dir: t.Optional[Path] = None): """ Create a new context for the given project dir. @@ -49,7 +53,7 @@ def __init__(self, project_dir: t.Optional[Path] = None): self.hermes_dir = Path(project_dir or '.') / self.hermes_cache_name self._caches = {} - self._data = {} + self.data = {} self._errors = [] self.contexts = {self.hermes_lod_context} @@ -61,13 +65,16 @@ def __getitem__(self, key: ContextPath | str) -> t.Any: Can be in dotted syntax or as a :class:`ContextPath` instance. :return: The value stored under the given key. """ - raise NotImplementedError() + if not key is ContextPath: + key = ContextPath.parse(key) + data = key.get_from(self.data) + return data def keys(self) -> t.List[ContextPath]: """ Get all the keys for the data stored in this context. 
""" - return [ContextPath.parse(k) for k in self._data.keys()] + yield from (k for k in self.data.keys() if k not in ('@metadata', '@alternatives')) def init_cache(self, *path: str) -> Path: """ @@ -117,21 +124,41 @@ def update(self, _key: str, _value: t.Any, **kwargs: t.Any): This can be used to trace back the original value. If `_ep` is given, it is treated as an entry point name that triggered the update. """ - - pass - - def get_data(self, - data: t.Optional[dict] = None, - path: t.Optional['ContextPath'] = None, - tags: t.Optional[dict] = None) -> dict: - if data is None: - data = {} - if path is not None: - data.update({str(path): path.get_from(self._data)}) + path = ContextPath.parse(_key) + metadata = { + k: kwargs.pop(k) + for k in self._METADATA_TERMS + if k in kwargs + } + if kwargs: + metadata['custom'] = kwargs.copy() + + if path.parent is None: + target = self.data else: - for key in self.keys(): - data.update({str(key): key.get_from(self._data)}) - return data + target = path.parent.get_from(self.data) + + def _set_value(t, k, v): + if isinstance(v, dict): + if isinstance(t, list): + t.append({}) + else: + t[k] = {} + t = t[k] + for k, v in v.items(): + _set_value(t, k, v) + elif isinstance(v, (list, tuple)): + t[k] = [] + t = t[k] + for i, v in enumerate(v): + _set_value(t, i, v) + else: + t[k] = { + '@value': v, + '@metadata': metadata + } + + _set_value(target, path._item, _value) def error(self, ep: EntryPoint, error: Exception): """ @@ -194,7 +221,7 @@ def load_cache(self): data_file = self.get_cache('harvest', self._ep.name) if data_file.is_file(): self._log.debug("Loading cache from %s...", data_file) - self._data = json.load(data_file.open('r')) + self.data = json.load(data_file.open('r')) contexts_file = self.get_cache('harvest', self._ep.name + '_contexts') if contexts_file.is_file(): @@ -210,7 +237,7 @@ def store_cache(self): data_file = self.get_cache('harvest', self._ep.name, create=True) self._log.debug("Writing cache to %s...", 
data_file) - json.dump(self._data, data_file.open('w'), indent=2) + json.dump(self.data, data_file.open('w'), indent=2) if self.contexts: contexts_file = self.get_cache('harvest', self._ep.name + '_contexts', create=True) @@ -239,43 +266,15 @@ def update(self, _key: str, _value: t.Any, **kwargs: t.Any): See :py:meth:`HermesContext.update` for more information. """ - timestamp = kwargs.pop('timestamp', self.default_timestamp) - harvester = kwargs.pop('harvester', self._ep.name) - - if _key not in self._data: - self._data[_key] = [] - - for entry in self._data[_key]: - value, tag = entry - tag_timestamp = tag.pop('timestamp') - tag_harvester = tag.pop('harvester') - - if tag == kwargs: - self._log.debug("Update %s: %s -> %s (%s)", _key, str(value), _value, str(tag)) - entry[0] = _value - tag['timestamp'] = timestamp - tag['harvester'] = harvester - break - - tag['timestamp'] = tag_timestamp - tag['harvester'] = tag_harvester - - else: - kwargs['timestamp'] = timestamp - kwargs['harvester'] = harvester - self._data[_key].append([_value, kwargs]) - - def _update_key_from(self, _key: ContextPath, _value: t.Any, **kwargs): - if isinstance(_value, dict): - for key, value in _value.items(): - self._update_key_from(_key[key], value, **kwargs) + metadata = { + 'timestamp': kwargs.pop('timestamp', self.default_timestamp), + 'harvester': kwargs.pop('harvester', self._ep.name), + } - elif isinstance(_value, (list, tuple)): - for index, value in enumerate(_value): - self._update_key_from(_key[index], value, **kwargs) + if kwargs: + metadata.update(kwargs) - else: - self.update(str(_key), _value, **kwargs) + super().update(_key, _value, **metadata) def update_from(self, data: t.Dict[str, t.Any], **kwargs: t.Any): """ @@ -301,7 +300,7 @@ def update_from(self, data: t.Dict[str, t.Any], **kwargs: t.Any): """ for key, value in data.items(): - self._update_key_from(ContextPath(key), value, **kwargs) + self.update(key, value, **kwargs) def error(self, ep: EntryPoint, error: Exception): 
""" @@ -309,14 +308,14 @@ def error(self, ep: EntryPoint, error: Exception): """ ep = ep or self._ep - self._base.error(ep, error) + super().error(ep, error) def _check_values(self, path, values): - (value, tag), *values = values - for alt_value, alt_tag in values: - if value != alt_value: - raise ValueError(f'{path}') - return value, tag + if isinstance(values, dict) and '@value' in values: + return values['@value'], values.get('@metadata', {}) + else: + return values, {} + raise ValueError(f'{path}') def get_data(self, data: t.Optional[dict] = None, @@ -336,7 +335,7 @@ def get_data(self, """ if data is None: data = {} - for key, values in self._data.items(): + for key, values in self.data.items(): key = ContextPath.parse(key) if path is None or key in path: value, tag = self._check_values(key, values) @@ -351,11 +350,25 @@ def get_data(self, self.error(self._ep, e) return data + def __enter__(self): + self.load_cache() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.store_cache() + if exc_type is not None and issubclass(exc_type, HermesValidationError): + exc = traceback.TracebackException(exc_type, exc_val, exc_tb) + self._base.error(self._ep, exc) + self._log.warning("%s: %s", + exc_type, + ' '.join(map(str, exc_val.args))) + return True + def finish(self): """ Calling this method will lead to further processors not handling the context anymore. 
""" - self._data.clear() + self.data.clear() class CodeMetaData(HermesData): @@ -370,7 +383,7 @@ def __init__(self, project_dir: pathlib.Path | None = None): self.tags = {} def merge_from(self, other: HermesHarvestData): - other.get_data(self._data, tags=self.tags) + other.get_data(self.data, tags=self.tags) def merge_contexts_from(self, other: HermesHarvestData): """ @@ -384,7 +397,7 @@ def merge_contexts_from(self, other: HermesHarvestData): def update(self, _key: ContextPath, _value: t.Any, tags: t.Dict[str, t.Dict] | None = None): if _key._item == '*': - _item_path, _item, _path = _key.resolve(self._data, query=_value, create=True) + _item_path, _item, _path = _key.resolve(self.data, query=_value, create=True) if tags: _tags = {k[len(str(_key) + '.'):]: t for k, t in tags.items() if ContextPath.parse(k) in _key} else: @@ -401,10 +414,10 @@ def update(self, _key: ContextPath, _value: t.Any, tags: t.Dict[str, t.Dict] | N tag_key = k tags[tag_key] = v else: - _key.update(self._data, _value, tags) + _key.update(self.data, _value, tags) def find_key(self, item, other): - data = item.get_from(self._data) + data = item.get_from(self.data) for i, node in enumerate(data): match = [(k, node[k]) for k in self._PRIMARY_ATTR.get(str(item), ('@id',)) if k in node] diff --git a/src/hermes/model/path.py b/src/hermes/model/path.py index 036c07e6..4fa5fa69 100644 --- a/src/hermes/model/path.py +++ b/src/hermes/model/path.py @@ -151,6 +151,9 @@ def __eq__(self, other: 'ContextPath') -> bool: This match includes semantics for wildcards. Items that access `'*'` will automatically match everything (except for None). """ + if isinstance(other, str): + other = ContextPath.parse(other) + return ( other is not None and (self._item == other._item or self._item == '*' or other._item == '*') @@ -344,7 +347,7 @@ def get_from(self, target: dict | list) -> t.Any: :return: The value stored at path. 
""" prefix, target, path = self.resolve(target) - return self._get_item(target, path) + return self._get_item(target, prefix) def update(self, target: t.Dict[str, t.Any] | t.List, value: t.Any, tags: t.Optional[dict] = None, **kwargs): """ From 68884ea40024586064cc55e7e0d4b710c91937fe Mon Sep 17 00:00:00 2001 From: "Meinel, Michael" Date: Wed, 15 Nov 2023 15:44:42 +0100 Subject: [PATCH 37/39] Break stuff by deleting the old data model --- src/hermes/model/context.py | 415 ------------------------------------ 1 file changed, 415 deletions(-) delete mode 100644 src/hermes/model/context.py diff --git a/src/hermes/model/context.py b/src/hermes/model/context.py deleted file mode 100644 index 03a193cd..00000000 --- a/src/hermes/model/context.py +++ /dev/null @@ -1,415 +0,0 @@ -# SPDX-FileCopyrightText: 2022 German Aerospace Center (DLR) -# -# SPDX-License-Identifier: Apache-2.0 - -# SPDX-FileContributor: Michael Meinel - -import datetime -import pathlib -import traceback -import json -import logging -import shutil -import typing as t - -from pathlib import Path -from importlib.metadata import EntryPoint - -from hermes.model import errors -from hermes.model.path import ContextPath -from hermes.model.errors import HermesValidationError - - -_log = logging.getLogger(__name__) - - -ContextPath.init_merge_strategies() - - -class HermesContext: - """ - The HermesContext stores the metadata for a certain project. - - As there are different views of the metadata in the different stages, - some stages use a special subclass of this context: - - - The *harvest* stages uses :class:`HermesHarvestContext`. - """ - - default_timestamp = datetime.datetime.now().isoformat(timespec='seconds') - hermes_name = "hermes" - hermes_cache_name = "." + hermes_name - hermes_lod_context = (hermes_name, "https://software-metadata.pub/ns/hermes/") - - def __init__(self, project_dir: t.Optional[Path] = None): - """ - Create a new context for the given project dir. 
- - :param project_dir: The root directory of the project. - If nothing is given, the current working directory is used. - """ - - #: Base dir for the hermes metadata cache (default is `.hermes` in the project root). - self.hermes_dir = Path(project_dir or '.') / self.hermes_cache_name - - self._caches = {} - self._data = {} - self._errors = [] - self.contexts = {self.hermes_lod_context} - - def init_cache(self, *path: str) -> Path: - """ - Initialize a cache directory if not present. - - :param path: The (local) path to identify the requested cache. - :return: The path to the requested cache file. - """ - cache_dir = self.hermes_dir.joinpath(*path) - cache_dir.mkdir(parents=True, exist_ok=True) - return cache_dir - - def get_cache(self, *path: str, create: bool = False) -> Path: - """ - Retrieve a cache file for a given *path*. - - This method returns an appropriate path to a file but does not make any assertions about the format, encoding, - or whether the file should be exists. - However, it is capable to create the enclosing directory (if you specify `create = True`). - - :param path: The (local) path to identify the requested cache. - :param create: Select whether the directory should be created. - :return: The path to the requested cache file. - """ - - if path in self._caches: - return self._caches[path] - - *subdir, name = path - if create: - cache_dir = self.init_cache(*subdir) - else: - cache_dir = self.hermes_dir.joinpath(*subdir) - - data_file = cache_dir / (name + '.json') - self._caches[path] = data_file - - return data_file - - def update(self, _key: str, _value: t.Any, **kwargs: t.Any): - """ - Store a new value for a given key to the context. - - :param _key: The key may be a dotted name for a metadata attribute to store. - :param _value: The value that should be stored for the key. - :param kwargs: Additional information about the value. - This can be used to trace back the original value. 
- If `_ep` is given, it is treated as an entry point name that triggered the update. - """ - - pass - - def error(self, ep: EntryPoint, error: Exception): - """ - Add an error that occurred during processing to the error log. - - :param ep: The entry point that produced the error. - :param error: The exception that was thrown due to the error. - """ - - self._errors.append((ep, error)) - - def purge_caches(self) -> None: - """ - Delete `.hermes` cache-directory if it exsis. - """ - - if self.hermes_dir.exists(): - shutil.rmtree(self.hermes_dir) - - def add_context(self, new_context: tuple) -> None: - """ - Add a new linked data context to the harvest context. - - :param new_context: The new context as tuple (context name, context URI) - """ - self.contexts.add(new_context) - - -class HermesHarvestContext(HermesContext): - """ - A specialized context for use in *harvest* stage. - - Each harvester has its own context that is cached to :py:attr:`HermesContext.hermes_dir` `/harvest/EP_NAME`. - - This special context is implemented as a context manager that loads the cached data upon entering the context. - When the context is left, recorded metadata is stored in a cache file possible errors are propagated to the - parent context. - """ - - def __init__(self, base: HermesContext, ep: EntryPoint, config: dict = None): - """ - Initialize a new harvesting context. - - :param base: The base HermesContext that should receive the results of the harvesting. - :param ep: The entry point that implements the harvester using this context. - :param config: Configuration for the given harvester. - """ - - super().__init__() - - self._base = base - self._ep = ep - self._log = logging.getLogger(f'harvest.{self._ep.name}') - self.config = config or {} - - def load_cache(self): - """ - Load the cached data from the :py:attr:`HermesContext.hermes_dir`. 
- """ - - data_file = self._base.get_cache('harvest', self._ep.name) - if data_file.is_file(): - self._log.debug("Loading cache from %s...", data_file) - self._data = json.load(data_file.open('r')) - - contexts_file = self._base.get_cache('harvest', self._ep.name + '_contexts') - if contexts_file.is_file(): - self._log.debug("Loading contexts from %s...", contexts_file) - contexts = json.load(contexts_file.open('r')) - for context in contexts: - self.contexts.add((tuple(context))) - - def store_cache(self): - """ - Store the collected data to the :py:attr:`HermesContext.hermes_dir`. - """ - - data_file = self.get_cache('harvest', self._ep.name, create=True) - self._log.debug("Writing cache to %s...", data_file) - json.dump(self._data, data_file.open('w'), indent=2) - - if self.contexts: - contexts_file = self.get_cache('harvest', self._ep.name + '_contexts', create=True) - self._log.debug("Writing contexts to %s...", contexts_file) - json.dump(list(self.contexts), contexts_file.open('w'), indent=2) - - def __enter__(self): - self.load_cache() - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - self.store_cache() - if exc_type is not None and issubclass(exc_type, HermesValidationError): - exc = traceback.TracebackException(exc_type, exc_val, exc_tb) - self._base.error(self._ep, exc) - self._log.warning("%s: %s", - exc_type, - ' '.join(map(str, exc_val.args))) - return True - - def update(self, _key: str, _value: t.Any, **kwargs: t.Any): - """ - The updates are added to a list of values. - A value is only replaced if the `_key` and all `kwargs` match. - - .. 
code:: python - - # 'value 2' will be added (twice) - ctx.update('key', 'value 1', spam='eggs') - ctx.update('key', 'value 2', foo='bar') - ctx.update('key', 'value 2', foo='bar', spam='eggs') - - # 'value 2' will replace 'value 1' - ctx.update('key', 'value 1', spam='eggs') - ctx.update('key', 'value 2', spam='eggs') - - This way, the harvester can fully specify the source and only override values that are from the same origin - (e.g., if the data changed between two runs). - - See :py:meth:`HermesContext.update` for more information. - """ - - timestamp = kwargs.pop('timestamp', self.default_timestamp) - harvester = kwargs.pop('harvester', self._ep.name) - - if _key not in self._data: - self._data[_key] = [] - - for entry in self._data[_key]: - value, tag = entry - tag_timestamp = tag.pop('timestamp') - tag_harvester = tag.pop('harvester') - - if tag == kwargs: - self._log.debug("Update %s: %s -> %s (%s)", _key, str(value), _value, str(tag)) - entry[0] = _value - tag['timestamp'] = timestamp - tag['harvester'] = harvester - break - - tag['timestamp'] = tag_timestamp - tag['harvester'] = tag_harvester - - else: - kwargs['timestamp'] = timestamp - kwargs['harvester'] = harvester - self._data[_key].append([_value, kwargs]) - - def _update_key_from(self, _key: ContextPath, _value: t.Any, **kwargs): - if isinstance(_value, dict): - for key, value in _value.items(): - self._update_key_from(_key[key], value, **kwargs) - - elif isinstance(_value, (list, tuple)): - for index, value in enumerate(_value): - self._update_key_from(_key[index], value, **kwargs) - - else: - self.update(str(_key), _value, **kwargs) - - def update_from(self, data: t.Dict[str, t.Any], **kwargs: t.Any): - """ - Bulk-update multiple values. - - If the value for a certain key is again a collection, the key will be expanded: - - .. 
code:: python - - ctx.update_from({'arr': ['foo', 'bar'], 'author': {'name': 'Monty Python', 'email': 'eggs@spam.xxx'}}) - - will eventually result in the following calls: - - .. code:: python - - ctx.update('arr[0]', 'foo') - ctx.update('arr[1]', 'bar') - ctx.update('author.name', 'Monty Python') - ctx.update('author.email', 'eggs@spam.xxx') - - :param data: The data that should be updated (as mapping with strings as keys). - :param kwargs: Additional information about the value (see :py:meth:`HermesContext.update` for details). - """ - - for key, value in data.items(): - self._update_key_from(ContextPath(key), value, **kwargs) - - def error(self, ep: EntryPoint, error: Exception): - """ - See :py:meth:`HermesContext.error` - """ - - ep = ep or self._ep - self._base.error(ep, error) - - def _check_values(self, path, values): - (value, tag), *values = values - for alt_value, alt_tag in values: - if value != alt_value: - raise ValueError(f'{path}') - return value, tag - - def get_data(self, - data: t.Optional[dict] = None, - path: t.Optional['ContextPath'] = None, - tags: t.Optional[dict] = None) -> dict: - """ - Retrieve the data from a given path. - - This method can be used to extract data and whole sub-trees from the context. - If you want a complete copy of the data, you can also call this method without giving a path. - - :param data: Optional a target dictionary where the data is stored. If not given, a new one is created. - :param path: The path to extract data from. - :param tags: An optional dictionary to collect the tags that belong to the extracted data. - The full path will be used as key for this dictionary. - :return: The extracted data (i.e., the `data` parameter if it was given). 
- """ - if data is None: - data = {} - for key, values in self._data.items(): - key = ContextPath.parse(key) - if path is None or key in path: - value, tag = self._check_values(key, values) - try: - key.update(data, value, tags, **tag) - if tags is not None and tag: - if str(key) in tags: - tags[str(key)].update(tag) - else: - tags[str(key)] = tag - except errors.MergeError as e: - self.error(self._ep, e) - return data - - def finish(self): - """ - Calling this method will lead to further processors not handling the context anymore. - """ - self._data.clear() - - -class CodeMetaContext(HermesContext): - _PRIMARY_ATTR = { - 'author': ('@id', 'email', 'name'), - } - - _CODEMETA_CONTEXT_URL = "https://doi.org/10.5063/schema/codemeta-2.0" - - def __init__(self, project_dir: pathlib.Path | None = None): - super().__init__(project_dir) - self.tags = {} - - def merge_from(self, other: HermesHarvestContext): - other.get_data(self._data, tags=self.tags) - - def merge_contexts_from(self, other: HermesHarvestContext): - """ - Merges any linked data contexts from a harvesting context into the instance's set of contexts. - - :param other: The :py:class:`HermesHarvestContext` to merge the linked data contexts from - """ - if other.contexts: - for context in other.contexts: - self.contexts.add(context) - - def update(self, _key: ContextPath, _value: t.Any, tags: t.Dict[str, t.Dict] | None = None): - if _key._item == '*': - _item_path, _item, _path = _key.resolve(self._data, query=_value, create=True) - if tags: - _tags = {k[len(str(_key) + '.'):]: t for k, t in tags.items() if ContextPath.parse(k) in _key} - else: - _tags = {} - _path._set_item(_item, _path, _value, **_tags) - if tags is not None and _tags: - for k, v in _tags.items(): - if not v: - continue - - if _key: - tag_key = str(_key) + '.' 
+ k - else: - tag_key = k - tags[tag_key] = v - else: - _key.update(self._data, _value, tags) - - def find_key(self, item, other): - data = item.get_from(self._data) - - for i, node in enumerate(data): - match = [(k, node[k]) for k in self._PRIMARY_ATTR.get(str(item), ('@id',)) if k in node] - if any(other.get(k, None) == v for k, v in match): - return item[i] - return None - - def prepare_codemeta(self): - """ - Updates the linked data contexts, where the CodeMeta context is the default context, - and any additional contexts are named contexts. - Also sets the type to 'SoftwareSourceCode'. - """ - if self.contexts: - self.update(ContextPath('@context'), [self._CODEMETA_CONTEXT_URL, dict(self.contexts)]) - else: - self.update(ContextPath('@context'), self._CODEMETA_CONTEXT_URL) - self.update(ContextPath('@type'), 'SoftwareSourceCode') From 68732505ca4cad1a83a709a884b34d60d4652f76 Mon Sep 17 00:00:00 2001 From: "Meinel, Michael" Date: Wed, 15 Nov 2023 15:51:15 +0100 Subject: [PATCH 38/39] Resolve pylint and reuse errors --- src/hermes/commands/workflow.py | 1 - src/hermes/model/linked_data.py | 3 ++- test/pydalos_test/__init__.py | 0 test/pydalos_test/test_linked_data.py | 22 ---------------------- 4 files changed, 2 insertions(+), 24 deletions(-) delete mode 100644 test/pydalos_test/__init__.py delete mode 100644 test/pydalos_test/test_linked_data.py diff --git a/src/hermes/commands/workflow.py b/src/hermes/commands/workflow.py index 5173612d..1ef4cad5 100644 --- a/src/hermes/commands/workflow.py +++ b/src/hermes/commands/workflow.py @@ -19,7 +19,6 @@ from hermes.commands.deposit.base import BaseDepositPlugin from hermes.error import MisconfigurationError from hermes.model.linked_data import HermesData, HermesHarvestData, CodeMetaData -from hermes.model.errors import MergeError from hermes.model.path import ContextPath diff --git a/src/hermes/model/linked_data.py b/src/hermes/model/linked_data.py index 17c1bae9..ba60a6f9 100644 --- a/src/hermes/model/linked_data.py 
+++ b/src/hermes/model/linked_data.py @@ -22,6 +22,7 @@ _log = logging.getLogger(__name__) + class HermesData: """ The HermesContext stores the metadata for a certain project. @@ -65,7 +66,7 @@ def __getitem__(self, key: ContextPath | str) -> t.Any: Can be in dotted syntax or as a :class:`ContextPath` instance. :return: The value stored under the given key. """ - if not key is ContextPath: + if not isinstance(key, ContextPath): key = ContextPath.parse(key) data = key.get_from(self.data) return data diff --git a/test/pydalos_test/__init__.py b/test/pydalos_test/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/test/pydalos_test/test_linked_data.py b/test/pydalos_test/test_linked_data.py deleted file mode 100644 index d9091c86..00000000 --- a/test/pydalos_test/test_linked_data.py +++ /dev/null @@ -1,22 +0,0 @@ -from pydalos.linked_data import LDTerm - - -def test_ldterm_equal(): - assert LDTerm('https://schema.org/Thing') == LDTerm('https://schema.org/Thing') - assert LDTerm('https://schema.org/Thing') == 'https://schema.org/Thing' - assert 'https://schema.org/Thing' == LDTerm('https://schema.org/Thing') - - -def test_ldterm_not_equal(): - assert LDTerm('https://schema.org/Thing') != LDTerm('https://schema.org/Taxon') - assert LDTerm('https://schema.org/Thing') != 'https://schema.org/Taxon' - assert 'https://schema.org/Thing' != LDTerm('https://schema.org/Taxon') - - -def test_ldterm_dict_key(): - data = { - LDTerm('https://schema.org/Thing'): 'spam' - } - - assert 'spam' == data[LDTerm('https://schema.org/Thing')] - assert 'spam' == data['https://schema.org/Thing'] From 915c3d5b9fb3f9df92fe243828161c25f6bb68e1 Mon Sep 17 00:00:00 2001 From: "Meinel, Michael" Date: Wed, 15 Nov 2023 16:59:40 +0100 Subject: [PATCH 39/39] Drop tests for old data model --- test/hermes_test/model/test_base_context.py | 38 ----- .../model/test_codemeta_context.py | 71 --------- .../hermes_test/model/test_harvest_context.py | 139 ------------------ 3 files changed, 
248 deletions(-) delete mode 100644 test/hermes_test/model/test_base_context.py delete mode 100644 test/hermes_test/model/test_codemeta_context.py delete mode 100644 test/hermes_test/model/test_harvest_context.py diff --git a/test/hermes_test/model/test_base_context.py b/test/hermes_test/model/test_base_context.py deleted file mode 100644 index bdf016b7..00000000 --- a/test/hermes_test/model/test_base_context.py +++ /dev/null @@ -1,38 +0,0 @@ -# SPDX-FileCopyrightText: 2022 German Aerospace Center (DLR) -# -# SPDX-License-Identifier: Apache-2.0 - -# SPDX-FileContributor: Michael Meinel - -from pathlib import Path - -from hermes.model.context import HermesContext - - -def test_context_hermes_dir_default(): - ctx = HermesContext() - assert ctx.hermes_dir == Path('.') / '.hermes' - - -def test_context_hermes_dir_custom(): - ctx = HermesContext('spam') - assert ctx.hermes_dir == Path('spam') / '.hermes' - - -def test_context_get_cache_default(): - ctx = HermesContext() - assert ctx.get_cache('spam', 'eggs') == Path('.') / '.hermes' / 'spam' / 'eggs.json' - - -def test_context_get_cache_cached(): - ctx = HermesContext() - ctx._caches[('spam', 'eggs')] = Path('spam_and_eggs') - assert ctx.get_cache('spam', 'eggs') == Path('spam_and_eggs') - - -def test_context_get_cache_create(tmpdir): - ctx = HermesContext(tmpdir) - subdir = Path(tmpdir) / '.hermes' / 'spam' - - assert ctx.get_cache('spam', 'eggs', create=True) == subdir / 'eggs.json' - assert subdir.exists() diff --git a/test/hermes_test/model/test_codemeta_context.py b/test/hermes_test/model/test_codemeta_context.py deleted file mode 100644 index b9a5ec3e..00000000 --- a/test/hermes_test/model/test_codemeta_context.py +++ /dev/null @@ -1,71 +0,0 @@ -# SPDX-FileCopyrightText: 2023 German Aerospace Center (DLR) -# -# SPDX-License-Identifier: Apache-2.0 - -# SPDX-FileContributor: Stephan Druskat - -import pytest -from unittest.mock import Mock - -from hermes.model.context import CodeMetaContext, HermesHarvestContext - - 
-@pytest.fixture -def mock_ep(): - ep = Mock() - ep.name = 'mock_name' - return ep - - -@pytest.fixture -def _context(): - return 'foo', 'bar' - - -@pytest.fixture -def _codemeta_context(): - return CodeMetaContext() - - -@pytest.fixture -def _data(_codemeta_context): - return { - '@context': [ - 'https://doi.org/10.5063/schema/codemeta-2.0', - {'hermes': 'https://software-metadata.pub/ns/hermes/'}], - '@type': 'SoftwareSourceCode' - } - - -@pytest.fixture -def _data_with_contexts(_codemeta_context): - return { - '@type': 'SoftwareSourceCode', - '@context': [ - 'https://doi.org/10.5063/schema/codemeta-2.0', - {'foo': 'bar', - 'hermes': 'https://software-metadata.pub/ns/hermes/'} - ] - } - - -def test_merge_contexts_from(mock_ep, _context, _codemeta_context): - assert _codemeta_context.contexts == {_codemeta_context.hermes_lod_context} - other = HermesHarvestContext(None, mock_ep) - other.contexts.add(_context) - _codemeta_context.merge_contexts_from(other) - assert _codemeta_context.contexts == {_context, _codemeta_context.hermes_lod_context} - - -def test_prepare_codemeta(_codemeta_context, _context, _data): - assert not _codemeta_context.keys() - _codemeta_context.prepare_codemeta() - assert _codemeta_context.get_data() == _data - - -def test_prepare_codemeta_with_contexts(_codemeta_context, _context, _data_with_contexts): - assert not _codemeta_context.keys() - assert _codemeta_context.contexts == {_codemeta_context.hermes_lod_context} - _codemeta_context.add_context(_context) - _codemeta_context.prepare_codemeta() - assert _codemeta_context.get_data() == _data_with_contexts diff --git a/test/hermes_test/model/test_harvest_context.py b/test/hermes_test/model/test_harvest_context.py deleted file mode 100644 index 7f6eeb62..00000000 --- a/test/hermes_test/model/test_harvest_context.py +++ /dev/null @@ -1,139 +0,0 @@ -# SPDX-FileCopyrightText: 2022 German Aerospace Center (DLR) -# -# SPDX-License-Identifier: Apache-2.0 - -# SPDX-FileContributor: Michael Meinel - 
-from importlib.metadata import EntryPoint - -import pytest - -from hermes.model.context import HermesContext, HermesHarvestContext - - -@pytest.fixture -def harvest_ctx(request: pytest.FixtureRequest): - ctx = HermesContext() - return HermesHarvestContext( - ctx, - EntryPoint(name=request.function.__name__, group='hermes.harvest', value='hermes_test:ctx') - ) - - -def test_context_default(harvest_ctx): - harvest_ctx.update('spam', 'eggs', test=True) - - assert harvest_ctx._data['spam'] == [ - ['eggs', {'test': True, - 'timestamp': HermesContext.default_timestamp, - 'harvester': 'test_context_default'}] - ] - - -def test_context_update_append(harvest_ctx): - harvest_ctx.update('spam', 'noodles', index=0) - harvest_ctx.update('spam', 'eggs', index=1) - - assert harvest_ctx._data['spam'] == [ - ['noodles', {'index': 0, - 'timestamp': HermesContext.default_timestamp, - 'harvester': 'test_context_update_append'}], - ['eggs', {'index': 1, - 'timestamp': HermesContext.default_timestamp, - 'harvester': 'test_context_update_append'}] - ] - - -def test_context_update_replace(harvest_ctx): - harvest_ctx.update('spam', 'noodles', test=True) - harvest_ctx.update('spam', 'eggs', test=True) - - assert harvest_ctx._data['spam'] == [ - ['eggs', {'test': True, - 'timestamp': HermesContext.default_timestamp, - 'harvester': 'test_context_update_replace'}] - ] - - -def test_context_bulk_flat(harvest_ctx): - harvest_ctx.update_from({ - 'ans': 42, - 'spam': 'eggs' - }, test=True) - - assert harvest_ctx._data['ans'] == [ - [42, {'test': True, - 'timestamp': HermesContext.default_timestamp, - 'harvester': 'test_context_bulk_flat'}] - ] - assert harvest_ctx._data['spam'] == [ - ['eggs', {'test': True, - 'timestamp': HermesContext.default_timestamp, - 'harvester': 'test_context_bulk_flat'}] - ] - - -def test_context_bulk_complex(harvest_ctx): - harvest_ctx.update_from({ - 'ans': 42, - 'author': [ - {'name': 'Monty Python', 'email': 'eggs@spam.io'}, - {'name': 'Herr Mes'}, - ] - }, 
test=True) - - assert harvest_ctx._data['ans'] == [ - [42, {'test': True, - 'timestamp': HermesContext.default_timestamp, - 'harvester': 'test_context_bulk_complex'}] - ] - assert harvest_ctx._data['author[0].name'] == [ - ['Monty Python', {'test': True, - 'timestamp': HermesContext.default_timestamp, - 'harvester': 'test_context_bulk_complex'}] - ] - assert harvest_ctx._data['author[0].email'] == [ - ['eggs@spam.io', {'test': True, - 'timestamp': HermesContext.default_timestamp, - 'harvester': 'test_context_bulk_complex'}] - ] - assert harvest_ctx._data['author[1].name'] == [ - ['Herr Mes', {'test': True, - 'timestamp': HermesContext.default_timestamp, - 'harvester': 'test_context_bulk_complex'}] - ] - - -def test_context_bulk_replace(harvest_ctx): - harvest_ctx.update('author[0].name', 'Monty Python', test=True) - harvest_ctx.update_from({'author': [{'name': 'Herr Mes', 'email': 'eggs@spam.io'}]}, test=True) - - assert harvest_ctx._data['author[0].name'] == [ - ['Herr Mes', {'test': True, - 'timestamp': HermesContext.default_timestamp, - 'harvester': 'test_context_bulk_replace'}] - ] - assert harvest_ctx._data['author[0].email'] == [ - ['eggs@spam.io', {'test': True, - 'timestamp': HermesContext.default_timestamp, - 'harvester': 'test_context_bulk_replace'}] - ] - - -def test_context_bulk_append(harvest_ctx): - harvest_ctx.update('author[0].name', 'Monty Python', index=0) - harvest_ctx.update_from({'author': [{'name': 'Herr Mes', 'email': 'eggs@spam.io'}]}, index=1) - - assert harvest_ctx._data['author[0].name'] == [ - ['Monty Python', {'index': 0, - 'timestamp': HermesContext.default_timestamp, - 'harvester': 'test_context_bulk_append'}], - ['Herr Mes', {'index': 1, - 'timestamp': HermesContext.default_timestamp, - 'harvester': 'test_context_bulk_append'}] - ] - assert harvest_ctx._data['author[0].email'] == [ - ['eggs@spam.io', {'index': 1, - 'timestamp': HermesContext.default_timestamp, - 'harvester': 'test_context_bulk_append'}] - ]