Skip to content

Commit

Permalink
Overcome backwards xl-id form by handling both
Browse files Browse the repository at this point in the history
This adds the "normal" form (used in XL) as a sameAs to generated
records if they are to have the backwards form mistakenly minted by this
repository.

An upper timestamp is compared against each dataset's creation time to
check whether its member records are to have the backwards form.

(Eventually we want to "garbage collect" this backwards form from XL, to
ensure they don't "squat" on XL id:s in the future.)
  • Loading branch information
niklasl committed Feb 27, 2023
1 parent f399f23 commit fcd147d
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 19 deletions.
1 change: 1 addition & 0 deletions common.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
context="sys/context/base.jsonld",
system_base_iri="",
union="common.jsonld.lines",
last_backwards_id_time="2022-10-14T16:26:16Z"
)

if __name__ == "__main__":
Expand Down
60 changes: 45 additions & 15 deletions lxltools/datacompiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ def __init__(self, *,
context=None,
record_thing_link='mainEntity',
system_base_iri=None,
union='all.jsonld.lines'):
union='all.jsonld.lines',
last_backwards_id_time=None):
self.datasets_description = datasets_description
self.datasets = {}
self.current_ds_resources = set()
Expand All @@ -49,6 +50,11 @@ def __init__(self, *,
self.current_ds_file = None
self.no_records = False

self.last_backwards_id_time = (
timeutil.w3c_dtz_to_ms(last_backwards_id_time)
if isinstance(last_backwards_id_time, str)
else None)

if datasets_description:
self._handlers_from_datasets_description(datasets_description)

Expand Down Expand Up @@ -155,7 +161,8 @@ def _compile_dataset(self, name, result):
data = self.to_jsonld(data)

ds_url = urljoin(self.dataset_id, name)
self._create_dataset_description(ds_url, ds_created_ms, ds_modified_ms)
self._create_dataset_description(
ds_url, ds_created_ms, ds_created_ms=ds_created_ms)

base_id = urljoin(self.dataset_id, base)

Expand All @@ -172,10 +179,6 @@ def _compile_dataset(self, name, result):
modified_ms = None
fpath = urlparse(nodeid).path[1:]

if self.no_records:
self.write(node, fpath)
continue

meta = node.pop('meta', None)
if meta:
if 'created' in meta:
Expand All @@ -189,10 +192,25 @@ def _compile_dataset(self, name, result):
node,
created_ms,
modified_ms,
datasets=[self.dataset_id, ds_url])
self.write(desc, fpath)
datasets=[self.dataset_id, ds_url],
ds_created_ms=ds_created_ms)

# Keep the "forwards" form as sameAs in meta even if no_records is used
if self.no_records:
meta = meta or {}
sameas = meta.setdefault('sameAs', [])
rec = desc['@graph'][0]
if 'sameAs' in rec:
sameas.append({"@id": rec['@id']})
for same in rec.get('sameAs', []):
sameas.append(same)
node['meta'] = meta
self.write(node, fpath)
else:
self.write(desc, fpath)

def _create_dataset_description(self, ds_url, created_ms, modified_ms=None, label=None):
def _create_dataset_description(self, ds_url, created_ms, modified_ms=None,
label=None, ds_created_ms=None):
if not label:
label = ds_url.rsplit('/', 1)[-1]
ds = {
Expand All @@ -211,7 +229,7 @@ def _create_dataset_description(self, ds_url, created_ms, modified_ms=None, labe
return

desc = self._to_node_description(ds, created_ms, modified_ms,
datasets={self.dataset_id, ds_url})
datasets={self.dataset_id, ds_url}, ds_created_ms=ds_created_ms)

record = desc['@graph'][0]
if self.tool_id:
Expand All @@ -220,14 +238,16 @@ def _create_dataset_description(self, ds_url, created_ms, modified_ms=None, labe
self.write(desc, ds_path)

def _to_node_description(self, node, created_ms,
modified_ms=None, datasets=None):
modified_ms=None, datasets=None, ds_created_ms=None):
assert self.record_thing_link not in node

node_id = node['@id']

record = OrderedDict()
record['@type'] = 'Record'
record['@id'] = self.generate_record_id(created_ms, node_id)

self.set_record_id(record, created_ms, node_id, ds_created_ms)

record[self.record_thing_link] = {'@id': node_id}

# Add provenance
Expand All @@ -241,9 +261,19 @@ def _to_node_description(self, node, created_ms,

return {'@graph': items}

def generate_record_id(self, created_ms, node_id):
# FIXME: backwards_form=created_ms < 2015
slug = lxlslug.librisencode(created_ms, lxlslug.checksum(node_id))
def set_record_id(self, record, created_ms, node_id, ds_created_ms=None):
    """Set record['@id'], minting the backwards slug form for old datasets.

    Records belonging to datasets created before ``last_backwards_id_time``
    historically got ids with a reversed ("backwards") time part. For those,
    keep the backwards form as ``@id`` and add the normal form as a sameAs
    alias so both resolve.

    Args:
        record: mutable record dict; receives '@id' and possibly 'sameAs'.
        created_ms: record creation time in epoch milliseconds.
        node_id: IRI of the described thing (checksummed into the slug).
        ds_created_ms: dataset creation time in epoch milliseconds;
            defaults to ``created_ms`` when not given.
    """
    if ds_created_ms is None:
        ds_created_ms = created_ms
    # last_backwards_id_time is None when no cutoff was configured (see
    # __init__); in that case no record gets the backwards form. Guarding
    # here avoids a TypeError from comparing an int against None.
    backwards_form = (
        self.last_backwards_id_time is not None
        and ds_created_ms < self.last_backwards_id_time
    )
    # TODO: use normal form and keep backwards_form as sameAs until "GC:able"?
    record['@id'] = self.generate_record_id(created_ms, node_id, backwards_form)
    if backwards_form:
        record['sameAs'] = [{'@id': self.generate_record_id(created_ms, node_id)}]

def generate_record_id(self, created_ms, node_id, backwards_form=False):
    """Mint a record IRI from the creation time and a checksum of node_id.

    When backwards_form is true, the historical reversed time-part encoding
    is used (see lxlslug.librisencode).
    """
    code = lxlslug.checksum(node_id)
    slug = lxlslug.librisencode(created_ms, code,
                                backwards_form=backwards_form)
    return urljoin(self.system_base_iri, slug)

def write(self, node, name):
Expand Down
5 changes: 3 additions & 2 deletions lxltools/lxlslug.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,10 @@ def rotate(c):
def checksum(data):
    """Return the unsigned 32-bit CRC-32 checksum of data's UTF-8 encoding."""
    encoded = data.encode('utf-8')
    # Mask to guarantee an unsigned 32-bit result regardless of platform
    # or Python version quirks.
    return crc32(encoded) & 0xFFFFFFFF

def librisencode(a, b):
def librisencode(a, b, backwards_form=True):
form = reversed if backwards_form else lambda x: x
alphabet = lower_consonants_numbers
timepart = "".join(reversed(caesarize(alphabet, tobase(alphabet, a))))
timepart = "".join(form(caesarize(alphabet, tobase(alphabet, a))))
codepart = tobase(alphabet, b)
codelen = len(codepart)
if codelen < 7:
Expand Down
5 changes: 3 additions & 2 deletions syscore.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ def _get_repo_version():
context='sys/context/base.jsonld',
record_thing_link='mainEntity',
system_base_iri='',
union='syscore.jsonld.lines')
union='syscore.jsonld.lines',
last_backwards_id_time='2022-11-20T00:00:00Z')


@compiler.handler
Expand Down Expand Up @@ -145,7 +146,7 @@ def _insert_record(graph, created_ms, dataset_id):
record = {'@type': 'SystemRecord'}
record[compiler.record_thing_link] = {'@id': entity['@id']}
graph.insert(0, record)
record['@id'] = compiler.generate_record_id(created_ms, entity['@id'])
compiler.set_record_id(record, created_ms, entity['@id'])
record['inDataset'] = [{'@id': compiler.dataset_id}, {'@id': dataset_id}]


Expand Down

0 comments on commit fcd147d

Please sign in to comment.