From fcd147d10f7e6ee9e03122555c20fec2f72d2e23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Lindstr=C3=B6m?= Date: Mon, 27 Feb 2023 14:38:40 +0100 Subject: [PATCH] Overcome backwards xl-id form by handling both This adds the "normal" form (used in XL) as a sameAs to generated records if they are to have the backwards form mistakenly minted by this repository. An upper timestamp per dataset is used to check whether its member records are to have the backwards form. (Eventually we want to "garbage collect" this backwards form from XL, to ensure they don't "squat" on XL ids in the future.) --- common.py | 1 + lxltools/datacompiler.py | 60 ++++++++++++++++++++++++++++++---------- lxltools/lxlslug.py | 5 ++-- syscore.py | 5 ++-- 4 files changed, 52 insertions(+), 19 deletions(-) diff --git a/common.py b/common.py index 7ad3199db..9d97a0b45 100644 --- a/common.py +++ b/common.py @@ -11,6 +11,7 @@ context="sys/context/base.jsonld", system_base_iri="", union="common.jsonld.lines", + last_backwards_id_time="2022-10-14T16:26:16Z" ) if __name__ == "__main__": diff --git a/lxltools/datacompiler.py b/lxltools/datacompiler.py index d94be9e25..34c72e90d 100644 --- a/lxltools/datacompiler.py +++ b/lxltools/datacompiler.py @@ -33,7 +33,8 @@ def __init__(self, *, context=None, record_thing_link='mainEntity', system_base_iri=None, - union='all.jsonld.lines'): + union='all.jsonld.lines', + last_backwards_id_time=None): self.datasets_description = datasets_description self.datasets = {} self.current_ds_resources = set() @@ -49,6 +50,11 @@ def __init__(self, *, self.current_ds_file = None self.no_records = False + self.last_backwards_id_time = ( + timeutil.w3c_dtz_to_ms(last_backwards_id_time) + if isinstance(last_backwards_id_time, str) + else None) + if datasets_description: self._handlers_from_datasets_description(datasets_description) @@ -155,7 +161,8 @@ def _compile_dataset(self, name, result): data = self.to_jsonld(data) ds_url = urljoin(self.dataset_id, name) - 
self._create_dataset_description(ds_url, ds_created_ms, ds_modified_ms) + self._create_dataset_description( + ds_url, ds_created_ms, ds_modified_ms, ds_created_ms=ds_created_ms) base_id = urljoin(self.dataset_id, base) @@ -172,10 +179,6 @@ def _compile_dataset(self, name, result): modified_ms = None fpath = urlparse(nodeid).path[1:] - if self.no_records: - self.write(node, fpath) - continue - meta = node.pop('meta', None) if meta: if 'created' in meta: @@ -189,10 +192,25 @@ def _compile_dataset(self, name, result): node, created_ms, modified_ms, - datasets=[self.dataset_id, ds_url]) - self.write(desc, fpath) + datasets=[self.dataset_id, ds_url], + ds_created_ms=ds_created_ms) + + # Keep sameAs "forwards" form in meta even if no_records is used + if self.no_records: + meta = meta or {} + sameas = meta.setdefault('sameAs', []) + rec = desc['@graph'][0] + if 'sameAs' in rec: + sameas.append({"@id": rec['@id']}) + for same in rec.get('sameAs', []): + sameas.append(same) + node['meta'] = meta + self.write(node, fpath) + else: + self.write(desc, fpath) - def _create_dataset_description(self, ds_url, created_ms, modified_ms=None, label=None): + def _create_dataset_description(self, ds_url, created_ms, modified_ms=None, + label=None, ds_created_ms=None): if not label: label = ds_url.rsplit('/', 1)[-1] ds = { @@ -211,7 +229,7 @@ def _create_dataset_description(self, ds_url, created_ms, modified_ms=None, labe return desc = self._to_node_description(ds, created_ms, modified_ms, - datasets={self.dataset_id, ds_url}) + datasets={self.dataset_id, ds_url}, ds_created_ms=ds_created_ms) record = desc['@graph'][0] if self.tool_id: @@ -220,14 +238,16 @@ def _create_dataset_description(self, ds_url, created_ms, modified_ms=None, labe self.write(desc, ds_path) def _to_node_description(self, node, created_ms, - modified_ms=None, datasets=None): + modified_ms=None, datasets=None, ds_created_ms=None): assert self.record_thing_link not in node node_id = node['@id'] record = OrderedDict() record['@type'] = 
'Record' - record['@id'] = self.generate_record_id(created_ms, node_id) + + self.set_record_id(record, created_ms, node_id, ds_created_ms) + record[self.record_thing_link] = {'@id': node_id} # Add provenance @@ -241,9 +261,19 @@ def _to_node_description(self, node, created_ms, return {'@graph': items} - def generate_record_id(self, created_ms, node_id): - # FIXME: backwards_form=created_ms < 2015 - slug = lxlslug.librisencode(created_ms, lxlslug.checksum(node_id)) + def set_record_id(self, record, created_ms, node_id, ds_created_ms=None): + if ds_created_ms is None: + ds_created_ms = created_ms + backwards_form = self.last_backwards_id_time is not None and ds_created_ms < self.last_backwards_id_time + # TODO: use normal form and keep backwards_form as sameAs until "GC:able"? + record['@id'] = self.generate_record_id(created_ms, node_id, backwards_form) + if backwards_form: + record['sameAs'] = [{'@id': self.generate_record_id(created_ms, node_id)}] + + def generate_record_id(self, created_ms, node_id, backwards_form=False): + slug = lxlslug.librisencode( + created_ms, lxlslug.checksum(node_id), backwards_form=backwards_form + ) return urljoin(self.system_base_iri, slug) def write(self, node, name): diff --git a/lxltools/lxlslug.py b/lxltools/lxlslug.py index ee514925f..dc0d17a0b 100755 --- a/lxltools/lxlslug.py +++ b/lxltools/lxlslug.py @@ -32,9 +32,10 @@ def rotate(c): def checksum(data): return crc32(data.encode('utf-8')) & 0xffffffff -def librisencode(a, b): +def librisencode(a, b, backwards_form=True): + form = reversed if backwards_form else lambda x: x alphabet = lower_consonants_numbers - timepart = "".join(reversed(caesarize(alphabet, tobase(alphabet, a)))) + timepart = "".join(form(caesarize(alphabet, tobase(alphabet, a)))) codepart = tobase(alphabet, b) codelen = len(codepart) if codelen < 7: diff --git a/syscore.py b/syscore.py index 246ae2451..8bd7ba977 100644 --- a/syscore.py +++ b/syscore.py @@ -31,7 +31,8 @@ def _get_repo_version(): context='sys/context/base.jsonld', 
record_thing_link='mainEntity', system_base_iri='', - union='syscore.jsonld.lines') + union='syscore.jsonld.lines', + last_backwards_id_time='2022-11-20T00:00:00Z') @compiler.handler @@ -145,7 +146,7 @@ def _insert_record(graph, created_ms, dataset_id): record = {'@type': 'SystemRecord'} record[compiler.record_thing_link] = {'@id': entity['@id']} graph.insert(0, record) - record['@id'] = compiler.generate_record_id(created_ms, entity['@id']) + compiler.set_record_id(record, created_ms, entity['@id']) record['inDataset'] = [{'@id': compiler.dataset_id}, {'@id': dataset_id}]