Skip to content

Commit

Permalink
Modifications to the data model to have meta-metadata in-place
Browse files (browse the repository at this point in the history)
  • Loading branch information
led02 committed Nov 15, 2023
1 parent d89868c commit 5043844
Show file tree
Hide file tree
Showing 3 changed files with 94 additions and 81 deletions.
23 changes: 10 additions & 13 deletions src/hermes/commands/workflow.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -18,7 +18,7 @@
from hermes import config
from hermes.commands.deposit.base import BaseDepositPlugin
from hermes.error import MisconfigurationError
from hermes.model.context import HermesContext, HermesHarvestContext, CodeMetaContext
from hermes.model.linked_data import HermesData, HermesHarvestData, CodeMetaData
from hermes.model.errors import MergeError
from hermes.model.path import ContextPath

Expand All @@ -34,7 +34,7 @@ def harvest(click_ctx: click.Context):
audit_log.info("# Metadata harvesting")

# Create Hermes context (i.e., all collected metadata for all stages...)
ctx = HermesContext()
ctx = HermesData()

# Initialize the harvest cache directory here to indicate the step ran
ctx.init_cache("harvest")
Expand All @@ -55,11 +55,8 @@ def harvest(click_ctx: click.Context):
_log.debug(". Loading harvester from %s", harvester.value)
harvest = harvester.load()

with HermesHarvestContext(ctx, harvester, harvest_config.get(harvester.name, {})) as harvest_ctx:
with HermesHarvestData(harvester, harvest_config.get(harvester.name, {})) as harvest_ctx:
harvest(click_ctx, harvest_ctx)
for _key, ((_value, _tag), *_trace) in harvest_ctx._data.items():
if any(v != _value and t == _tag for v, t in _trace):
raise MergeError(_key, None, _value)

_log.info('')
audit_log.info('')
Expand All @@ -76,7 +73,7 @@ def process(click_ctx: click.Context):
audit_log = logging.getLogger('audit')
audit_log.info("# Metadata processing")

ctx = CodeMetaContext()
ctx = CodeMetaData()

if not (ctx.hermes_dir / "harvest").exists():
_log.error("You must run the harvest command before process")
Expand All @@ -95,7 +92,7 @@ def process(click_ctx: click.Context):
harvester, *_ = harvesters
audit_log.info("## Process data from %s", harvester.name)

harvest_context = HermesHarvestContext(ctx, harvester, {})
harvest_context = HermesHarvestData(harvester, {})
try:
harvest_context.load_cache()
# when the harvest step ran, but there is no cache file, this is a serious flaw
Expand Down Expand Up @@ -129,15 +126,15 @@ def process(click_ctx: click.Context):
ctx.prepare_codemeta()

with open(ctx.get_cache("process", ctx.hermes_name, create=True), 'w') as codemeta_file:
json.dump(ctx._data, codemeta_file, indent=2)
json.dump(ctx.data, codemeta_file, indent=2)

logging.shutdown()


@click.group(invoke_without_command=True)
@click.pass_context
def curate(click_ctx: click.Context):
ctx = CodeMetaContext()
ctx = CodeMetaData()
process_output = ctx.hermes_dir / 'process' / (ctx.hermes_name + ".json")

if not process_output.is_file():
Expand Down Expand Up @@ -173,7 +170,7 @@ def deposit(click_ctx: click.Context, initial, auth_token, file):
click.echo("Metadata deposition")
_log = logging.getLogger("cli.deposit")

ctx = CodeMetaContext()
ctx = CodeMetaData()

codemeta_file = ctx.get_cache("curate", ctx.hermes_name)
if not codemeta_file.exists():
Expand Down Expand Up @@ -231,7 +228,7 @@ def postprocess(click_ctx: click.Context):
audit_log = logging.getLogger('audit')
audit_log.info("# Post-processing")

ctx = CodeMetaContext()
ctx = CodeMetaData()

if not (ctx.hermes_dir / "deposit").exists():
_log.error("You must run the deposit command before post-process")
Expand Down Expand Up @@ -267,5 +264,5 @@ def clean():
logging.shutdown()

# Create Hermes context (i.e., all collected metadata for all stages...)
ctx = HermesContext()
ctx = HermesData()
ctx.purge_caches()
147 changes: 80 additions & 67 deletions src/hermes/model/linked_data.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -37,6 +37,10 @@ class HermesData:
hermes_cache_name = "." + hermes_name
hermes_lod_context = (hermes_name, "https://software-metadata.pub/ns/hermes/")

_METADATA_TERMS = [
'timestamp', 'harvester', 'local_path', 'uri',
]

def __init__(self, project_dir: t.Optional[Path] = None):
"""
Create a new context for the given project dir.
Expand All @@ -49,7 +53,7 @@ def __init__(self, project_dir: t.Optional[Path] = None):
self.hermes_dir = Path(project_dir or '.') / self.hermes_cache_name

self._caches = {}
self._data = {}
self.data = {}
self._errors = []
self.contexts = {self.hermes_lod_context}

Expand All @@ -61,13 +65,16 @@ def __getitem__(self, key: ContextPath | str) -> t.Any:
Can be in dotted syntax or as a :class:`ContextPath` instance.
:return: The value stored under the given key.
"""
raise NotImplementedError()
if not key is ContextPath:
key = ContextPath.parse(key)
data = key.get_from(self.data)
return data

def keys(self) -> t.List[ContextPath]:
"""
Get all the keys for the data stored in this context.
"""
return [ContextPath.parse(k) for k in self._data.keys()]
yield from (k for k in self.data.keys() if k not in ('@metadata', '@alternatives'))

def init_cache(self, *path: str) -> Path:
"""
Expand Down Expand Up @@ -117,21 +124,41 @@ def update(self, _key: str, _value: t.Any, **kwargs: t.Any):
This can be used to trace back the original value.
If `_ep` is given, it is treated as an entry point name that triggered the update.
"""

pass

def get_data(self,
data: t.Optional[dict] = None,
path: t.Optional['ContextPath'] = None,
tags: t.Optional[dict] = None) -> dict:
if data is None:
data = {}
if path is not None:
data.update({str(path): path.get_from(self._data)})
path = ContextPath.parse(_key)
metadata = {
k: kwargs.pop(k)
for k in self._METADATA_TERMS
if k in kwargs
}
if kwargs:
metadata['custom'] = kwargs.copy()

if path.parent is None:
target = self.data
else:
for key in self.keys():
data.update({str(key): key.get_from(self._data)})
return data
target = path.parent.get_from(self.data)

def _set_value(t, k, v):
    """
    Store ``v`` under ``k`` in ``t``, annotating every leaf value.

    Dicts and lists/tuples are rebuilt recursively as plain dicts and lists.
    Every other value is stored as ``{'@value': ..., '@metadata': ...}``,
    where ``metadata`` is taken from the enclosing scope.

    :param t: The dict or list that receives the value.
    :param k: The key (for dicts) or index (for lists) to store the value at.
    :param v: The value to store; may be arbitrarily nested.
    """

    def _put(container, key, item):
        # Freshly created lists are filled front to back, so the index one
        # past the end means "append". Plain item assignment on a list only
        # works for indices that already exist and would raise IndexError.
        if isinstance(container, list) and key == len(container):
            container.append(item)
        else:
            container[key] = item
        return item

    if isinstance(v, dict):
        node = _put(t, k, {})
        for sub_key, sub_value in v.items():
            _set_value(node, sub_key, sub_value)
    elif isinstance(v, (list, tuple)):
        node = _put(t, k, [])
        for index, element in enumerate(v):
            _set_value(node, index, element)
    else:
        _put(t, k, {
            '@value': v,
            '@metadata': metadata,
        })

_set_value(target, path._item, _value)

def error(self, ep: EntryPoint, error: Exception):
"""
Expand Down Expand Up @@ -194,7 +221,7 @@ def load_cache(self):
data_file = self.get_cache('harvest', self._ep.name)
if data_file.is_file():
self._log.debug("Loading cache from %s...", data_file)
self._data = json.load(data_file.open('r'))
self.data = json.load(data_file.open('r'))

contexts_file = self.get_cache('harvest', self._ep.name + '_contexts')
if contexts_file.is_file():
Expand All @@ -210,7 +237,7 @@ def store_cache(self):

data_file = self.get_cache('harvest', self._ep.name, create=True)
self._log.debug("Writing cache to %s...", data_file)
json.dump(self._data, data_file.open('w'), indent=2)
json.dump(self.data, data_file.open('w'), indent=2)

if self.contexts:
contexts_file = self.get_cache('harvest', self._ep.name + '_contexts', create=True)
Expand Down Expand Up @@ -239,43 +266,15 @@ def update(self, _key: str, _value: t.Any, **kwargs: t.Any):
See :py:meth:`HermesData.update` for more information.
"""

timestamp = kwargs.pop('timestamp', self.default_timestamp)
harvester = kwargs.pop('harvester', self._ep.name)

if _key not in self._data:
self._data[_key] = []

for entry in self._data[_key]:
value, tag = entry
tag_timestamp = tag.pop('timestamp')
tag_harvester = tag.pop('harvester')

if tag == kwargs:
self._log.debug("Update %s: %s -> %s (%s)", _key, str(value), _value, str(tag))
entry[0] = _value
tag['timestamp'] = timestamp
tag['harvester'] = harvester
break

tag['timestamp'] = tag_timestamp
tag['harvester'] = tag_harvester

else:
kwargs['timestamp'] = timestamp
kwargs['harvester'] = harvester
self._data[_key].append([_value, kwargs])

def _update_key_from(self, _key: ContextPath, _value: t.Any, **kwargs):
if isinstance(_value, dict):
for key, value in _value.items():
self._update_key_from(_key[key], value, **kwargs)
metadata = {
'timestamp': kwargs.pop('timestamp', self.default_timestamp),
'harvester': kwargs.pop('harvester', self._ep.name),
}

elif isinstance(_value, (list, tuple)):
for index, value in enumerate(_value):
self._update_key_from(_key[index], value, **kwargs)
if kwargs:
metadata.update(kwargs)

else:
self.update(str(_key), _value, **kwargs)
super().update(_key, _value, **metadata)

def update_from(self, data: t.Dict[str, t.Any], **kwargs: t.Any):
"""
Expand All @@ -301,22 +300,22 @@ def update_from(self, data: t.Dict[str, t.Any], **kwargs: t.Any):
"""

for key, value in data.items():
self._update_key_from(ContextPath(key), value, **kwargs)
self.update(key, value, **kwargs)

def error(self, ep: EntryPoint, error: Exception):
"""
See :py:meth:`HermesData.error`
"""

ep = ep or self._ep
self._base.error(ep, error)
super().error(ep, error)

def _check_values(self, path, values):
(value, tag), *values = values
for alt_value, alt_tag in values:
if value != alt_value:
raise ValueError(f'{path}')
return value, tag
if isinstance(values, dict) and '@value' in values:
return values['@value'], values.get('@metadata', {})
else:
return values, {}
raise ValueError(f'{path}')

def get_data(self,
data: t.Optional[dict] = None,
Expand All @@ -336,7 +335,7 @@ def get_data(self,
"""
if data is None:
data = {}
for key, values in self._data.items():
for key, values in self.data.items():
key = ContextPath.parse(key)
if path is None or key in path:
value, tag = self._check_values(key, values)
Expand All @@ -351,11 +350,25 @@ def get_data(self,
self.error(self._ep, e)
return data

def __enter__(self):
    """
    Enter the harvest context.

    Loads the cache via :py:meth:`load_cache` before the harvester runs.

    :return: This instance, to be bound by the ``with`` statement.
    """
    self.load_cache()
    return self

def __exit__(self, exc_type, exc_val, exc_tb):
    """
    Leave the harvest context.

    The cache is always written via :py:meth:`store_cache`, even if the
    body of the ``with`` statement raised.

    :param exc_type: Class of the exception raised in the body, or `None`.
    :param exc_val: The exception instance, or `None`.
    :param exc_tb: The traceback of the exception, or `None`.
    :return: `True` if a :class:`HermesValidationError` was recorded and
             should be suppressed, otherwise `None` so that any other
             exception propagates.
    """
    self.store_cache()

    # NOTE(review): the flattened source does not show whether the original
    # `return True` sat inside the `if`; suppressing only validation errors
    # is assumed here -- confirm.
    if exc_type is None or not issubclass(exc_type, HermesValidationError):
        return None

    exc = traceback.TracebackException(exc_type, exc_val, exc_tb)
    # Report through this object's own error(), as get_data() does. The
    # former `_base` wrapper is presumably gone, since the constructor no
    # longer receives a base context -- TODO confirm.
    self.error(self._ep, exc)
    self._log.warning("%s: %s",
                      exc_type,
                      ' '.join(map(str, exc_val.args)))
    return True

def finish(self):
"""
Calling this method will lead to further processors not handling the context anymore.
"""
self._data.clear()
self.data.clear()


class CodeMetaData(HermesData):
Expand All @@ -370,7 +383,7 @@ def __init__(self, project_dir: pathlib.Path | None = None):
self.tags = {}

def merge_from(self, other: HermesHarvestData):
other.get_data(self._data, tags=self.tags)
other.get_data(self.data, tags=self.tags)

def merge_contexts_from(self, other: HermesHarvestData):
"""
Expand All @@ -384,7 +397,7 @@ def merge_contexts_from(self, other: HermesHarvestData):

def update(self, _key: ContextPath, _value: t.Any, tags: t.Dict[str, t.Dict] | None = None):
if _key._item == '*':
_item_path, _item, _path = _key.resolve(self._data, query=_value, create=True)
_item_path, _item, _path = _key.resolve(self.data, query=_value, create=True)
if tags:
_tags = {k[len(str(_key) + '.'):]: t for k, t in tags.items() if ContextPath.parse(k) in _key}
else:
Expand All @@ -401,10 +414,10 @@ def update(self, _key: ContextPath, _value: t.Any, tags: t.Dict[str, t.Dict] | N
tag_key = k
tags[tag_key] = v
else:
_key.update(self._data, _value, tags)
_key.update(self.data, _value, tags)

def find_key(self, item, other):
data = item.get_from(self._data)
data = item.get_from(self.data)

for i, node in enumerate(data):
match = [(k, node[k]) for k in self._PRIMARY_ATTR.get(str(item), ('@id',)) if k in node]
Expand Down
5 changes: 4 additions & 1 deletion src/hermes/model/path.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -151,6 +151,9 @@ def __eq__(self, other: 'ContextPath') -> bool:
This match includes semantics for wildcards.
Items that access `'*'` will automatically match everything (except for None).
"""
if isinstance(other, str):
other = ContextPath.parse(other)

return (
other is not None
and (self._item == other._item or self._item == '*' or other._item == '*')
Expand Down Expand Up @@ -344,7 +347,7 @@ def get_from(self, target: dict | list) -> t.Any:
:return: The value stored at path.
"""
prefix, target, path = self.resolve(target)
return self._get_item(target, path)
return self._get_item(target, prefix)

def update(self, target: t.Dict[str, t.Any] | t.List, value: t.Any, tags: t.Optional[dict] = None, **kwargs):
"""
Expand Down

0 comments on commit 5043844

Please sign in to comment.