Skip to content

Commit

Permalink
Overcome backwards xl-id form by handling both
Browse files Browse the repository at this point in the history
This adds the "normal" form (used in XL) as a sameAs to generated
records if they are to have the backwards form mistakenly minted by this
repository.

An upper timestamp per dataset is used to check whether its member
records are to have the backwards form.

(Eventually we want to "garbage collect" this backwards form from XL, to
ensure it doesn't "squat" on XL ids in the future.)
  • Loading branch information
niklasl committed Feb 28, 2023
1 parent 1722acc commit 195d085
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 22 deletions.
1 change: 1 addition & 0 deletions common.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
context="sys/context/base.jsonld",
system_base_iri="",
union="common.jsonld.lines",
last_backwards_id_time="2022-10-14T16:26:16Z"
)

if __name__ == "__main__":
Expand Down
60 changes: 45 additions & 15 deletions lxltools/datacompiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ def __init__(self, *,
context=None,
record_thing_link='mainEntity',
system_base_iri=None,
union='all.jsonld.lines'):
union='all.jsonld.lines',
last_backwards_id_time=None):
self.datasets_description = datasets_description
self.datasets = {}
self.current_ds_resources = set()
Expand All @@ -49,6 +50,11 @@ def __init__(self, *,
self.current_ds_file = None
self.no_records = False

self.last_backwards_id_time = (
timeutil.w3c_dtz_to_ms(last_backwards_id_time)
if isinstance(last_backwards_id_time, str)
else None)

if datasets_description:
self._handlers_from_datasets_description(datasets_description)

Expand Down Expand Up @@ -155,7 +161,8 @@ def _compile_dataset(self, name, result):
data = self.to_jsonld(data)

ds_url = urljoin(self.dataset_id, name)
self._create_dataset_description(ds_url, ds_created_ms, ds_modified_ms)
self._create_dataset_description(
ds_url, ds_created_ms, ds_created_ms=ds_created_ms)

base_id = urljoin(self.dataset_id, base)

Expand All @@ -172,10 +179,6 @@ def _compile_dataset(self, name, result):
modified_ms = None
fpath = urlparse(nodeid).path[1:]

if self.no_records:
self.write(node, fpath)
continue

meta = node.pop('meta', None)
if meta:
if 'created' in meta:
Expand All @@ -189,10 +192,25 @@ def _compile_dataset(self, name, result):
node,
created_ms,
modified_ms,
datasets=[self.dataset_id, ds_url])
self.write(desc, fpath)
datasets=[self.dataset_id, ds_url],
ds_created_ms=ds_created_ms)

# Keep sameAs "forwards" form in meta even if no_records is used
if self.no_records:
meta = meta or {}
sameas = meta.setdefault('sameAs', [])
rec = desc['@graph'][0]
if 'sameAs' in rec:
sameas.append({"@id": rec['@id']})
for same in rec.get('sameAs', []):
sameas.append(same)
node['meta'] = meta
self.write(node, fpath)
else:
self.write(desc, fpath)

def _create_dataset_description(self, ds_url, created_ms, modified_ms=None, label=None):
def _create_dataset_description(self, ds_url, created_ms, modified_ms=None,
label=None, ds_created_ms=None):
if not label:
label = ds_url.rsplit('/', 1)[-1]
ds = {
Expand All @@ -211,7 +229,7 @@ def _create_dataset_description(self, ds_url, created_ms, modified_ms=None, labe
return

desc = self._to_node_description(ds, created_ms, modified_ms,
datasets={self.dataset_id, ds_url})
datasets={self.dataset_id, ds_url}, ds_created_ms=ds_created_ms)

record = desc['@graph'][0]
if self.tool_id:
Expand All @@ -220,14 +238,16 @@ def _create_dataset_description(self, ds_url, created_ms, modified_ms=None, labe
self.write(desc, ds_path)

def _to_node_description(self, node, created_ms,
modified_ms=None, datasets=None):
modified_ms=None, datasets=None, ds_created_ms=None):
assert self.record_thing_link not in node

node_id = node['@id']

record = OrderedDict()
record['@type'] = 'Record'
record['@id'] = self.generate_record_id(created_ms, node_id)

self.set_record_id(record, created_ms, node_id, ds_created_ms)

record[self.record_thing_link] = {'@id': node_id}

# Add provenance
Expand All @@ -241,9 +261,19 @@ def _to_node_description(self, node, created_ms,

return {'@graph': items}

def generate_record_id(self, created_ms, node_id):
# FIXME: backwards_form=created_ms < 2015
slug = lxlslug.librisencode(created_ms, lxlslug.checksum(node_id))
def set_record_id(self, record, created_ms, node_id, ds_created_ms=None):
    """Set ``record['@id']`` and, for backwards-form ids, a ``sameAs`` link.

    The backwards id form is used when the dataset timestamp is earlier
    than ``self.last_backwards_id_time``. In that case the normal-form id
    is also stored as the single entry of ``record['sameAs']``.

    :param record: dict describing the record; updated in place.
    :param created_ms: creation timestamp passed on to ``generate_record_id``.
    :param node_id: id of the described node, passed on to
        ``generate_record_id``.
    :param ds_created_ms: creation timestamp of the containing dataset,
        compared against the cutoff. Falls back to ``created_ms`` when None.
    """
    if ds_created_ms is None:
        ds_created_ms = created_ms
    # Without a configured cutoff the backwards form never applies. The
    # explicit None check also avoids the TypeError that ordering an int
    # against None raises on Python 3.
    cutoff = self.last_backwards_id_time
    backwards_form = cutoff is not None and ds_created_ms < cutoff
    # TODO: use normal form and keep backwards_form as sameAs until "GC:able"?
    record['@id'] = self.generate_record_id(created_ms, node_id, backwards_form)
    if backwards_form:
        record['sameAs'] = [{'@id': self.generate_record_id(created_ms, node_id)}]

def generate_record_id(self, created_ms, node_id, backwards_form=False):
    """Return a record id built from a timestamp and a node id.

    A slug is produced by ``lxlslug.librisencode`` from ``created_ms`` and
    the ``lxlslug.checksum`` of ``node_id``, then joined onto
    ``self.system_base_iri``. ``backwards_form`` is forwarded unchanged to
    ``librisencode``.
    """
    node_checksum = lxlslug.checksum(node_id)
    slug = lxlslug.librisencode(
        created_ms, node_checksum, backwards_form=backwards_form)
    return urljoin(self.system_base_iri, slug)

def write(self, node, name):
Expand Down
12 changes: 7 additions & 5 deletions lxltools/lxlslug.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#!/usr/bin/env python
from __future__ import unicode_literals, print_function

from typing import Any
from zlib import crc32
import string
import time
Expand Down Expand Up @@ -32,9 +31,12 @@ def rotate(c):
def checksum(data):
return crc32(data.encode('utf-8')) & 0xffffffff

def librisencode(a, b):
def librisencode(a, b, backwards_form=False):
alphabet = lower_consonants_numbers
timepart = "".join(reversed(caesarize(alphabet, tobase(alphabet, a))))
chars = caesarize(alphabet, tobase(alphabet, a))
if backwards_form:
chars = reversed(chars)
timepart = "".join(chars)
codepart = tobase(alphabet, b)
codelen = len(codepart)
if codelen < 7:
Expand All @@ -53,7 +55,7 @@ def librisencode(a, b):
print("Usage: %s TIMESTAMP IDENTIFIER" % (cmd), file=sys.stderr)
exit(1)

timestamp = args.pop(0)
timestamp: Any = args.pop(0)
identifiers = args

try:
Expand Down
5 changes: 3 additions & 2 deletions syscore.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ def _get_repo_version():
context='sys/context/base.jsonld',
record_thing_link='mainEntity',
system_base_iri='',
union='syscore.jsonld.lines')
union='syscore.jsonld.lines',
last_backwards_id_time='2022-11-20T00:00:00Z')


@compiler.handler
Expand Down Expand Up @@ -145,7 +146,7 @@ def _insert_record(graph, created_ms, dataset_id):
record = {'@type': 'SystemRecord'}
record[compiler.record_thing_link] = {'@id': entity['@id']}
graph.insert(0, record)
record['@id'] = compiler.generate_record_id(created_ms, entity['@id'])
compiler.set_record_id(record, created_ms, entity['@id'])
record['inDataset'] = [{'@id': compiler.dataset_id}, {'@id': dataset_id}]


Expand Down

0 comments on commit 195d085

Please sign in to comment.