Skip to content

Commit

Permalink
Implements the harvest and process steps
Browse files Browse the repository at this point in the history
  • Loading branch information
SKernchen committed Mar 4, 2024
1 parent 156e918 commit dc781ed
Show file tree
Hide file tree
Showing 9 changed files with 83 additions and 45 deletions.
2 changes: 1 addition & 1 deletion hermes.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# SPDX-License-Identifier: CC0-1.0

[harvest]
sources = [ "cff", "git" ]
sources = [ "cff", "git" ] # ordered priority (first one is most important)

[harvest.cff]
enable_validation = false
Expand Down
4 changes: 2 additions & 2 deletions src/hermes/commands/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,8 +130,8 @@ def load_settings(self, args: argparse.Namespace):
"""Load settings from the configuration file (passed in from command line)."""

toml_data = toml.load(args.path / args.config)
root_settings = HermesCommand.settings_class.model_validate(toml_data)
self.settings = getattr(root_settings, self.command_name)
self.root_settings = HermesCommand.settings_class.model_validate(toml_data)
self.settings = getattr(self.root_settings, self.command_name)

def patch_settings(self, args: argparse.Namespace):
"""Process command line options for the settings."""
Expand Down
10 changes: 9 additions & 1 deletion src/hermes/commands/curate/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,22 @@

import argparse

from pydantic import BaseModel

from hermes.commands.base import HermesCommand


class CurateSettings(BaseModel):
    """Settings for the curate step (no options defined yet).

    NOTE(review): previous docstring said "deposition settings", which looks
    like a copy-paste from the deposit settings class.
    """

    pass


class HermesCurateCommand(HermesCommand):
""" Curate the unified metadata before deposition. """

command_name = "curate"
settings_class = None
settings_class = CurateSettings

def init_command_parser(self, command_parser: argparse.ArgumentParser) -> None:
pass
Expand Down
17 changes: 14 additions & 3 deletions src/hermes/commands/harvest/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
from pydantic import BaseModel

from hermes.commands.base import HermesCommand, HermesPlugin
from hermes.model.errors import HermesValidationError
from hermes.model.context import HermesContext, HermesHarvestContext
from hermes.model.errors import HermesValidationError, MergeError


class HermesHarvestPlugin(HermesPlugin):
Expand All @@ -36,13 +37,23 @@ class HermesHarvestCommand(HermesCommand):

def __call__(self, args: argparse.Namespace) -> None:
self.args = args
ctx = HermesContext()

# Initialize the harvest cache directory here to indicate the step ran
ctx.init_cache("harvest")

for plugin_name in self.settings.sources:
try:
plugin_func = self.plugins[plugin_name]()
harvested_data = plugin_func(self)
harvested_data, local_path = plugin_func(self)
print(harvested_data)
# TODO: store harvested data for later use
with HermesHarvestContext(
ctx, plugin_name
) as harvest_ctx:
harvest_ctx.update_from(harvested_data, local_path=local_path)
for _key, ((_value, _tag), *_trace) in harvest_ctx._data.items():
if any(v != _value and t == _tag for v, t in _trace):
raise MergeError(_key, None, _value)

except KeyError:
self.log.error("Plugin '%s' not found.", plugin_name)
Expand Down
2 changes: 1 addition & 1 deletion src/hermes/commands/harvest/cff.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def __call__(self, command: HermesHarvestCommand) -> None:
# TODO Replace the following temp patch for #112 once there is a new cffconvert version with cffconvert#309
codemeta_dict = self._patch_author_emails(cff_dict, codemeta_dict)

return codemeta_dict
return codemeta_dict, str(cff_file)

def _load_cff_from_file(self, cff_data: str) -> t.Any:
yaml = YAML(typ='safe')
Expand Down
2 changes: 1 addition & 1 deletion src/hermes/commands/harvest/codemeta.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def __call__(self, command: HermesHarvestCommand):
raise HermesValidationError(codemeta_file)

codemeta = json.loads(codemeta_str)
return codemeta
return codemeta, str(codemeta_file)

def _validate(self, codemeta_file: pathlib.Path) -> bool:
with open(codemeta_file, "r") as fi:
Expand Down
55 changes: 49 additions & 6 deletions src/hermes/commands/process/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,28 +4,71 @@

# SPDX-FileContributor: Michael Meinel

import argparse
import argparse, sys, json

from pydantic import BaseModel

from hermes.commands.base import HermesCommand, HermesPlugin
from hermes.model.errors import HermesValidationError
from hermes.model.context import HermesContext, CodeMetaContext
from hermes.commands.harvest.base import HarvestSettings
from hermes.model.context import HermesHarvestContext, CodeMetaContext


class HermesProcessPlugin(HermesPlugin):
    """Base class for process-step plugins; defines no shared behavior yet."""

    pass


class ProcessSettings(BaseModel):
    """Settings for the process step (no options defined yet).

    NOTE(review): previous docstring said "deposition settings", which looks
    like a copy-paste from the deposit settings class.
    """

    pass


class HermesProcessCommand(HermesCommand):
    """Process the collected metadata into a common dataset.

    Reads the per-harvester caches written by the harvest step, merges them
    into one :class:`CodeMetaContext` in reversed priority order (so the
    most important source, listed first in the configuration, is merged last
    and wins), reports merge conflicts, and writes the merged tags and data
    into the ``process`` cache directory.
    """

    command_name = "process"
    settings_class = ProcessSettings

    def __call__(self, args: argparse.Namespace) -> None:
        self.args = args
        ctx = CodeMetaContext()

        # The harvest step creates this cache directory; its absence means
        # harvest has not run yet, so processing cannot proceed.
        if not (ctx.hermes_dir / "harvest").exists():
            self.log.error("You must run the harvest command before process")
            sys.exit(1)

        # Iterate in reversed order for priority handling: later merges
        # override earlier ones. Iterate a reversed *copy* so the shared
        # settings list in self.root_settings is not mutated in place
        # (the previous in-place .reverse() changed it for later readers).
        for harvester in reversed(list(self.root_settings.harvest.sources)):
            self.log.info("## Process data from %s", harvester)

            harvest_context = HermesHarvestContext(ctx, harvester, {})
            try:
                harvest_context.load_cache()
            except FileNotFoundError:
                # The harvest step ran but this harvester left no cache file;
                # skip it and continue with the remaining sources.
                self.log.warning("No output data from harvester %s found, skipping", harvester)
                continue

            ctx.merge_from(harvest_context)
            ctx.merge_contexts_from(harvest_context)

        # Report merge conflicts, but do not abort — the merged data is
        # still written out below.
        if ctx._errors:
            self.log.error('!!! warning "Errors during merge"')

            for ep, error in ctx._errors:
                self.log.info("  - %s: %s", ep.name, error)

        # Persist the collected tags for later pipeline steps.
        tags_path = ctx.get_cache('process', 'tags', create=True)
        with tags_path.open('w') as tags_file:
            json.dump(ctx.tags, tags_file, indent=2)

        ctx.prepare_codemeta()

        # Store the merged code metadata as the result of the process step.
        with open(ctx.get_cache("process", ctx.hermes_name, create=True), 'w') as codemeta_file:
            json.dump(ctx._data, codemeta_file, indent=2)


24 changes: 0 additions & 24 deletions src/hermes/logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,30 +59,6 @@
'logging': _logging_config,
}


def configure(settings: hermes.settings.HermesSettings, working_path: pathlib.Path):
"""
Load the configuration from the given path as global hermes configuration.
:param config_path: The path to a TOML file containing HERMES configuration.
"""
if 'hermes' in _config and _config['hermes']:
return

# Load sane default paths for log files before potentially overwritting via configuration
_config['logging']['handlers']['logfile']['filename'] = \
working_path / HermesContext.hermes_cache_name / "hermes.log"
_config['logging']['handlers']['auditfile']['filename'] = \
working_path / HermesContext.hermes_cache_name / "audit.log"

_config['hermes'] = settings
global config
config = settings
_config['logging'] = settings.logging if settings.logging != {} else _config['logging']

# Might be a good idea to move somewhere else (see comment for _logging_config)?


_loggers = {}


Expand Down
12 changes: 6 additions & 6 deletions src/hermes/model/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,20 +194,20 @@ def __init__(self, base: HermesContext, ep: EntryPoint, config: dict = None):

self._base = base
self._ep = ep
self._log = logging.getLogger(f'harvest.{self._ep.name}')
self._log = logging.getLogger(f'harvest.{self._ep}')
self.config = config or {}

def load_cache(self):
"""
Load the cached data from the :py:attr:`HermesContext.hermes_dir`.
"""

data_file = self._base.get_cache('harvest', self._ep.name)
data_file = self._base.get_cache('harvest', self._ep)
if data_file.is_file():
self._log.debug("Loading cache from %s...", data_file)
self._data = json.load(data_file.open('r'))

contexts_file = self._base.get_cache('harvest', self._ep.name + '_contexts')
contexts_file = self._base.get_cache('harvest', self._ep + '_contexts')
if contexts_file.is_file():
self._log.debug("Loading contexts from %s...", contexts_file)
contexts = json.load(contexts_file.open('r'))
Expand All @@ -219,12 +219,12 @@ def store_cache(self):
Store the collected data to the :py:attr:`HermesContext.hermes_dir`.
"""

data_file = self.get_cache('harvest', self._ep.name, create=True)
data_file = self.get_cache('harvest', self._ep, create=True)
self._log.debug("Writing cache to %s...", data_file)
json.dump(self._data, data_file.open('w'), indent=2)

if self.contexts:
contexts_file = self.get_cache('harvest', self._ep.name + '_contexts', create=True)
contexts_file = self.get_cache('harvest', self._ep + '_contexts', create=True)
self._log.debug("Writing contexts to %s...", contexts_file)
json.dump(list(self.contexts), contexts_file.open('w'), indent=2)

Expand Down Expand Up @@ -265,7 +265,7 @@ def update(self, _key: str, _value: t.Any, **kwargs: t.Any):
"""

timestamp = kwargs.pop('timestamp', self.default_timestamp)
harvester = kwargs.pop('harvester', self._ep.name)
harvester = kwargs.pop('harvester', self._ep)

if _key not in self._data:
self._data[_key] = []
Expand Down

0 comments on commit dc781ed

Please sign in to comment.