Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

load: add file creation date #214

Merged
merged 1 commit into from
Feb 25, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 37 additions & 7 deletions cds_migrator_kit/rdm/records/load/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
# the terms of the MIT License; see LICENSE file for more details.

"""CDS-RDM migration load module."""
import datetime
import json
import logging
import os
Expand Down Expand Up @@ -162,12 +163,27 @@ def _after_publish_update_dois(self, identity, record, entry):
_draft = current_rdm_records_service.edit(identity, record["id"])
current_rdm_records_service.publish(identity, _draft["id"])

def _after_publish_update_created(self, record, entry, version):
    """Update created timestamp post publish.

    The `created` timestamp is forced after publish, as the service
    overrides it otherwise. The value is chosen as follows:

    1. The record creation date coming from the legacy system.
    2. The version's `publication_date` (which stores the file creation
       date) when the version has files and is not the first one, or when
       the record creation date is today, meaning it was missing in the
       legacy data.
    3. If that publication date is not available, the record creation
       date (possibly today's date) is kept instead of failing.

    :param record: published record; `record._record` is updated and committed.
    :param entry: migration entry; `entry["record"]["created"]` and the
        versions' `publication_date` must expose a `.datetime` attribute.
    :param version: key of the published version in `entry["versions"]`.
    """
    creation_date = entry["record"]["created"].datetime

    version_data = entry.get("versions", {}).get(version, {})
    # May be absent (e.g. no data for this version); handled below.
    publication_date = version_data.get("publication_date")

    # A creation date equal to today means the legacy value was missing.
    created_today = creation_date.date() == datetime.date.today()
    # Subsequent versions with files use the file creation date.
    later_version_with_files = bool(version_data.get("files")) and version != 1

    if (later_version_with_files or created_today) and publication_date is not None:
        creation_date = publication_date.datetime

    record._record.model.created = creation_date
    record._record.commit()

def _after_publish_mint_recid(self, record, entry, version):
Expand All @@ -178,11 +194,25 @@ def _after_publish_mint_recid(self, record, entry, version):
# but then we get a double redirection
legacy_recid_minter(legacy_recid, record._record.parent.model.id)

def _after_publish_update_files_created(self, record, entry, version):
    """Force the legacy creation date on every file of a published version.

    Looks up the files listed for `version` in `entry["versions"]` and, for
    each of them, sets the `created` timestamp of the matching file entry of
    the record to the parsed `creation_date`, then commits that file entry.
    Nothing happens when the version has no file data.
    """
    files_by_name = entry.get("versions", {}).get(version, {}).get("files", {})
    for legacy_file in files_by_name.values():
        file_record = record._record.files.entries[legacy_file["key"]]
        legacy_created = arrow.get(legacy_file["creation_date"])
        file_record.model.created = legacy_created.datetime
        file_record.commit()

def _after_publish(self, identity, published_record, entry, version):
    """Run fixes after record publish.

    Applies, in order: the DOI update, the record `created` timestamp fix,
    the legacy recid minting and the files `created` timestamp fix, then
    commits the database session once.
    """
    self._after_publish_update_dois(identity, published_record, entry)
    # `version` is required to pick the right creation date for the version.
    self._after_publish_update_created(published_record, entry, version)
    self._after_publish_mint_recid(published_record, entry, version)
    self._after_publish_update_files_created(published_record, entry, version)
    db.session.commit()

def _pre_publish(self, identity, entry, version, draft):
Expand Down
22 changes: 20 additions & 2 deletions cds_migrator_kit/rdm/records/transform/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def _created(self, json_entry):
try:
return arrow.get(json_entry["_created"])
except KeyError:
return datetime.date.today().isoformat()
return arrow.get(datetime.date.today().isoformat())

def _updated(self, record_dump):
"""Returns the creation date of the record."""
Expand Down Expand Up @@ -435,16 +435,34 @@ def field_beams(record_json, custom_fields_dict):
)
return custom_fields

def _verify_creation_date(self, entry, json_data):
    """Ensure a creation date can be determined for the record.

    A record is accepted when the entry has files or when the record JSON
    carries a `_created` value. When both are missing, the record is flagged
    for manual import.

    :raises ManualImportRequired: if there are no files and no creation date.
    """
    if entry.get("files") or json_data.get("_created"):
        # At least one source for the creation date exists.
        return

    raise ManualImportRequired(
        message="Record missing creation date",
        field="validation",
        stage="transform",
        description="Record has no files and no creation date",
        recid=entry["recid"],
        priority="warning",
        value=None,
        subfield=None,
    )

def transform(self, entry):
"""Transform a record single entry."""
record_dump = CDSRecordDump(
entry,
)

migration_logger = RDMJsonLogger()

record_dump.prepare_revisions()
timestamp, json_data = record_dump.latest_revision
self._verify_creation_date(entry, json_data)
migration_logger.add_record(json_data)
record_json_output = {
"created": self._created(json_data),
Expand Down
Loading
Loading