From be8af494f37009fe3a957bb71a3aeb00dfae90ca Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= <51744858+olovy@users.noreply.github.com>
Date: Mon, 25 Nov 2024 14:21:58 +0100
Subject: [PATCH] feat(emm): Use AS terms in full dump (#1529)

---
 .gitignore                        |  1 +
 emm/exampleclient/client.py       | 22 ++++++++----
 emm/src/main/java/whelk/Dump.java | 59 ++++++++++++++++++++++++-------
 3 files changed, 63 insertions(+), 19 deletions(-)

diff --git a/.gitignore b/.gitignore
index 48e5e4e1cb..a6a28be55c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,6 +11,7 @@ Session.vim
 /*.properties
 /*.properties-*
 !gradle.properties
+emm/exampleclient/libris-cache.sqlite3
 
 # Java/Gradle Artifacts
 /.gradle
diff --git a/emm/exampleclient/client.py b/emm/exampleclient/client.py
index cfffd5f3b3..3dd4669554 100644
--- a/emm/exampleclient/client.py
+++ b/emm/exampleclient/client.py
@@ -26,7 +26,7 @@
 # embedded within another entity, or somewhere else)?
 # If so, we must update that entity wherever we keep it.
 #
-# This is all we need to do to keap our cache up date in perpetuity.
+# This is all we need to do to keep our cache up to date in perpetuity.
 #
 # For this example client we are going to keep knowledge graphs in an SQLITE
 # table called 'entities'. These knowledge graphs will always consist of an
@@ -121,7 +121,7 @@ def download_entity(url):
     req = Request(url)
     req.add_header('accept', 'application/json+ld')
     try:
-        return json.load(urlopen(req))["@graph"][1]
+        return get_main_entity(json.load(urlopen(req)))
     except:
         return None
@@ -218,8 +218,9 @@ def load_dump(connection):
     dumpCreationTime = None
     while next_url:
         with urllib.request.urlopen(next_url) as response:
+            print(f"Getting {next_url}")
             data = json.load(response)
-            dumpCreationTimeOnPage = data["creationTime"]
+            dumpCreationTimeOnPage = data["startTime"]
             if (dumpCreationTime and dumpCreationTime != dumpCreationTimeOnPage):
                 print(" DUMP INVALIDATED WHILE DOWNLOADING, TODO: DEAL WITH THIS ")
             dumpCreationTime = dumpCreationTimeOnPage
@@ -227,8 +228,12 @@ def load_dump(connection):
                 next_url = data["next"]
             else:
                 next_url = None
-            if "entities" in data:
-                for entity in data["entities"]:
+            if "items" in data:
+                for item in data["items"]:
+                    if "@graph" not in item: # skip @context
+                        continue
+
+                    entity = get_main_entity(item)
                     embellish(entity, connection)
                     ingest_entity(entity, connection)
     cursor = connection.cursor()
@@ -244,6 +249,11 @@ def load_dump(connection):
     connection.commit()
 
 
+def get_main_entity(named_graph):
+    # FIXME? relying on XL convention @graph[0] = Record, @graph[1] = Main entity
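+    # A sketch of the assumed input shape (illustrative only):
+    #   {"@graph": [{...the Record...}, {...the main entity...}]}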
FROM state", (item["published"],)) diff = result.fetchone()[0] if (float(diff) >= 0.0): - print(f"{item["published"]} is before our last taken update, stop here.") + print(f"{item['published']} is before our last taken update, stop here.") next_url = None break handle_activity(connection, item) diff --git a/emm/src/main/java/whelk/Dump.java b/emm/src/main/java/whelk/Dump.java index 86e61ba86f..079b515835 100644 --- a/emm/src/main/java/whelk/Dump.java +++ b/emm/src/main/java/whelk/Dump.java @@ -27,6 +27,7 @@ import java.time.temporal.ChronoUnit; import java.util.ArrayList; import java.util.LinkedHashMap; +import java.util.List; import java.util.Map; import java.util.Set; @@ -170,26 +171,39 @@ private static void sendDumpPageResponse(Whelk whelk, String apiBaseUrl, String BasicFileAttributes attributes = Files.readAttributes(dumpFilePath, BasicFileAttributes.class); Instant dumpCreationTime = attributes.creationTime().toInstant(); - sendFormattedResponse(whelk, apiBaseUrl, dump, recordIdsOnPage, res, offsetLines + EmmChangeSet.TARGET_HITS_PER_PAGE, totalEntityCount, dumpCreationTime); + sendFormattedResponse(whelk, apiBaseUrl, dump, recordIdsOnPage, res, offsetLines, totalEntityCount, dumpCreationTime); } - private static void sendFormattedResponse(Whelk whelk, String apiBaseUrl, String dump, ArrayList recordIdsOnPage, HttpServletResponse res, long nextLineOffset, Long totalEntityCount, Instant dumpCreationTime) throws IOException{ + private static void sendFormattedResponse(Whelk whelk, String apiBaseUrl, String dump, ArrayList recordIdsOnPage, HttpServletResponse res, long offset, Long totalEntityCount, Instant dumpCreationTime) throws IOException{ var responseObject = new LinkedHashMap<>(); - responseObject.put("creationTime", ZonedDateTime.ofInstant(dumpCreationTime, ZoneOffset.UTC).toString()); + responseObject.put(JsonLd.CONTEXT_KEY, "https://www.w3.org/ns/activitystreams"); + responseObject.put(JsonLd.ID_KEY, apiBaseUrl+"?dump="+dump+"&offset="+offset); + responseObject.put("type", "CollectionPage"); + responseObject.put("startTime", ZonedDateTime.ofInstant(dumpCreationTime, ZoneOffset.UTC).toString()); if (totalEntityCount == null) - responseObject.put("status", "generating"); + responseObject.put("_status", "generating"); else { - responseObject.put("status", "done"); - responseObject.put("totalEntityCount", totalEntityCount); + responseObject.put("_status", "done"); + responseObject.put("totalItems", totalEntityCount); } - if (totalEntityCount == null || nextLineOffset < totalEntityCount) { - responseObject.put("next", apiBaseUrl+"?dump="+dump+"&offset="+nextLineOffset); + long nextOffset = offset + EmmChangeSet.TARGET_HITS_PER_PAGE; + if (totalEntityCount == null || nextOffset < totalEntityCount) { + responseObject.put("next", apiBaseUrl+"?dump="+dump+"&offset="+nextOffset); + } + + var items = new ArrayList<>(EmmChangeSet.TARGET_HITS_PER_PAGE); + responseObject.put("items", items); + + var contextDoc = contextDoc(whelk); + if (offset == 0) { + items.add(Map.of( + JsonLd.ID_KEY, contextDoc.getRecordIdentifiers().getFirst(), + JsonLd.CONTEXT_KEY, contextDoc.data.get(JsonLd.CONTEXT_KEY) + )); } - var entitiesList = new ArrayList<>(EmmChangeSet.TARGET_HITS_PER_PAGE); - responseObject.put("entities", entitiesList); Map idsAndRecords = whelk.bulkLoad(recordIdsOnPage); for (Document doc : idsAndRecords.values()) { @@ -207,15 +221,16 @@ private static void sendFormattedResponse(Whelk whelk, String apiBaseUrl, String logger.warn("Bad instance? 
" + itemOf); continue; } + // TODO just put instance as its own graph in items? var itemOfPath = new ArrayList<>(); itemOfPath.add("@graph"); itemOfPath.add(1); itemOfPath.add("itemOf"); // unggh.. doc._set(itemOfPath, instance.getThing(), doc.data); - entitiesList.add(doc.getThing()); - } + items.add(wrapDoc(doc, contextDoc)); + } // For normal categories else { - entitiesList.add(doc.getThing()); + items.add(wrapDoc(doc, contextDoc)); } } @@ -223,6 +238,24 @@ private static void sendFormattedResponse(Whelk whelk, String apiBaseUrl, String HttpTools.sendResponse(res, responseObject, JSON_CONTENT_TYPE); } + private static Object wrapDoc(Document doc, Document contextDoc) { + var context = new ArrayList<>(); + context.add(null); + context.add(contextDoc.getRecordIdentifiers().getFirst()); + return Map.of( + JsonLd.ID_KEY, doc.getRecordIdentifiers().getFirst(), + JsonLd.CONTEXT_KEY, context, + JsonLd.GRAPH_KEY, doc.data.get(JsonLd.GRAPH_KEY) + ); + } + + private static Document contextDoc(Whelk whelk) { + // FIXME whelk load by IRI + var docs = whelk.bulkLoad(List.of(whelk.getSystemContextUri())); + assert docs.size() == 1; + return docs.entrySet().stream().findFirst().get().getValue(); + } + private static void invalidateIfOld(Path dumpFilePath) { try { if (!Files.exists(dumpFilePath))