Feature/emm #1516

Merged: 23 commits, Nov 21, 2024

Commits (23):
349ef2a  First attempt at getting paged EMM data, not complete. (jannistsiroyannis, Oct 30, 2024)
2807edd  Progress. (jannistsiroyannis, Oct 31, 2024)
782f0cb  Starting to look a little like EMM. (jannistsiroyannis, Oct 31, 2024)
087ae95  Add a missing file. (jannistsiroyannis, Oct 31, 2024)
2cfae28  Fix DB index usage. (jannistsiroyannis, Oct 31, 2024)
dfc0c8c  Add the entry point response. (jannistsiroyannis, Nov 1, 2024)
73f48b9  Work in progress, dumps (jannistsiroyannis, Nov 4, 2024)
48b7b3f  Read dumps, but don't send em yet. (jannistsiroyannis, Nov 5, 2024)
6c99a10  Looks like somewhat working dumps (jannistsiroyannis, Nov 5, 2024)
675055e  No categories for feed. (jannistsiroyannis, Nov 11, 2024)
bd5882e  Add a few basic dump categories. (jannistsiroyannis, Nov 11, 2024)
ae381e5  Embedd instances when dumping itemAndInstance-categories. (jannistsiroyannis, Nov 11, 2024)
db0280c  Type categories for EMM-dumps. (jannistsiroyannis, Nov 13, 2024)
b72529d  Add dump IDs (jannistsiroyannis, Nov 13, 2024)
2d81979  Beginnings of an EMM client. (jannistsiroyannis, Nov 18, 2024)
71b40b7  emm client, write changes on update. (jannistsiroyannis, Nov 19, 2024)
702d9a5  embellish in emm client (jannistsiroyannis, Nov 19, 2024)
9b08017  somewhat working emm client. (jannistsiroyannis, Nov 19, 2024)
c1d9b18  Do a little cleanup of the EMM client. (jannistsiroyannis, Nov 19, 2024)
cda8b8f  Fix EMM client embellish. (jannistsiroyannis, Nov 20, 2024)
b087eb9  Add create/delete handling in EMM client. (jannistsiroyannis, Nov 20, 2024)
3ea4abe  Remove temporary hack from EMM client. (jannistsiroyannis, Nov 20, 2024)
43c8e31  Minor EMM cleanups. (jannistsiroyannis, Nov 21, 2024)
Changes from 1 commit: Add dump IDs
jannistsiroyannis committed Nov 13, 2024 (verified signature; the key has expired)
commit b72529df6cfe785e846f9e6a6dcf75f550d04d05
36 changes: 28 additions & 8 deletions emm/src/main/java/whelk/Dump.java
@@ -12,7 +12,10 @@
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
+import java.nio.file.attribute.BasicFileAttributes;
 import java.sql.*;
+import java.time.Instant;
+import java.time.temporal.ChronoUnit;
 import java.util.*;

 import static whelk.util.Jackson.mapper;
@@ -54,7 +57,7 @@ private static void sendDumpIndexResponse(String apiBaseUrl, HttpServletResponse
 categoriesList.add(allCategory);

 HashMap libraryCategory = new HashMap();
-libraryCategory.put("url", apiBaseUrl+"?dump=itemAndInstance-X&offset=0");
+libraryCategory.put("url", apiBaseUrl+"?dump=itemAndInstance:X&offset=0");
 libraryCategory.put("description", "These categories represent the Items and Instances held by a particular library. " +
 "The relevant library-code (sigel) for which you want data must replace the X in the category URL.");
 categoriesList.add(libraryCategory);
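As a usage illustration only: a client substitutes a real library code (sigel) for the X in the category URL before following it. The base URL and the sigel "S" in the sketch below are placeholders, not values taken from this PR.

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

public class DumpCategoryExample {
    public static void main(String[] args) throws Exception {
        // Hypothetical base URL and sigel; replace with real values for your environment.
        String url = "https://example.org/api/emm?dump=itemAndInstance:S&offset=0";

        HttpClient client = HttpClient.newHttpClient();
        HttpRequest request = HttpRequest.newBuilder(URI.create(url)).GET().build();
        HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());

        // The body is expected to be one JSON page of the dump (see sendFormattedResponse below).
        System.out.println(response.body());
    }
}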
@@ -127,12 +130,15 @@ private static void sendDumpPageResponse(Whelk whelk, String apiBaseUrl, String
 logger.error("Failed reading dumpfile: " + dumpFilePath, e);
 }

-sendFormattedResponse(whelk, apiBaseUrl, dump, recordIdsOnPage, res, offsetLines + EmmChangeSet.TARGET_HITS_PER_PAGE, totalEntityCount);
+BasicFileAttributes attributes = Files.readAttributes(dumpFilePath, BasicFileAttributes.class);
+String dumpId = ""+(dumpFilePath.toString() + attributes.creationTime().toInstant().toEpochMilli()).hashCode();
+sendFormattedResponse(whelk, apiBaseUrl, dump, recordIdsOnPage, res, offsetLines + EmmChangeSet.TARGET_HITS_PER_PAGE, totalEntityCount, dumpId);
 }

-private static void sendFormattedResponse(Whelk whelk, String apiBaseUrl, String dump, ArrayList<String> recordIdsOnPage, HttpServletResponse res, long nextLineOffset, Long totalEntityCount) throws IOException{
+private static void sendFormattedResponse(Whelk whelk, String apiBaseUrl, String dump, ArrayList<String> recordIdsOnPage, HttpServletResponse res, long nextLineOffset, Long totalEntityCount, String dumpId) throws IOException{
 HashMap responseObject = new HashMap();

+responseObject.put("id", dumpId);
 if (totalEntityCount == null)
 responseObject.put("status", "generating");
 else {
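A side note on the dump id introduced here: it is derived from the dump file's path plus its creation time, so the id presumably stays stable while one and the same dump file is being served and changes once the dump is regenerated. The sketch below merely restates that derivation in isolation against a throwaway temp file (a placeholder, not a real dump).

import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.attribute.BasicFileAttributes;

public class DumpIdSketch {
    // Mirrors the derivation in the diff: hash the dump file's path together with its creation time.
    // A regenerated dump file gets a new creation time and therefore a new id.
    static String dumpIdFor(Path dumpFilePath) throws Exception {
        BasicFileAttributes attributes = Files.readAttributes(dumpFilePath, BasicFileAttributes.class);
        return "" + (dumpFilePath.toString() + attributes.creationTime().toInstant().toEpochMilli()).hashCode();
    }

    public static void main(String[] args) throws Exception {
        Path p = Files.createTempFile("emm-dump-sketch", ".ids"); // placeholder file, not a real dump
        System.out.println(dumpIdFor(p));
        Files.delete(p);
    }
}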
@@ -149,10 +155,10 @@ private static void sendFormattedResponse(Whelk whelk, String apiBaseUrl, String
 Map<String, Document> idsAndRecords = whelk.bulkLoad(recordIdsOnPage);
 for (Document doc : idsAndRecords.values()) {

-// Here is a bit of SPECIALIZED treatment only for the itemAndInstance-categories. These should
+// Here is a bit of SPECIALIZED treatment only for the itemAndInstance:categories. These should
 // include not only the Item (which is the root node for this category), but also the linked Instance.
 // Without this, a client must individually GET every single Instance in their dataset, which scales poorly.
-if (dump.startsWith("itemAndInstance-")) {
+if (dump.startsWith("itemAndInstance:")) {
 String itemOf = doc.getHoldingFor();
 if (itemOf == null) {
 logger.warn("Holding of nothing? " + doc.getId());
@@ -183,7 +189,21 @@ private static void sendFormattedResponse(Whelk whelk, String apiBaseUrl, String
 }

 private static void invalidateIfOld(Path dumpFilePath) {
-// TODO
+try {
+if (!Files.exists(dumpFilePath))
+return;
+
+BasicFileAttributes attributes = Files.readAttributes(dumpFilePath, BasicFileAttributes.class);
+if (attributes.creationTime().toInstant().isBefore(Instant.now().minus(5, ChronoUnit.DAYS))) {
+Files.delete(dumpFilePath);
+}
+} catch (IOException e) {
+// These exceptions are caught here due to the (theoretical) risk of file-access race conditions.
+// For example, a dump could pass the too-old threshold while one thread is still reading it,
+// and another thread could then see the dump as too old and try to delete it.
+// Just log this sort of thing and carry on.
+logger.info("Failed to invalidate (delete) EMM dump: " + dumpFilePath, e);
+}
 }

 private static void generateDump(Whelk whelk, String dump, Path dumpFilePath) {
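invalidateIfOld is private, so it cannot be called from outside this class; the snippet below is only a standalone illustration of the same creation-time check, using a temp file as a stand-in for a dump. The five-day threshold is taken from the diff; everything else is assumed for the example.

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.attribute.BasicFileAttributes;
import java.time.Instant;
import java.time.temporal.ChronoUnit;

public class DumpExpirySketch {
    // Same check as invalidateIfOld above: a dump whose creation time lies more than
    // five days in the past is considered stale and eligible for deletion.
    static boolean isStale(Path dumpFilePath) throws IOException {
        BasicFileAttributes attributes = Files.readAttributes(dumpFilePath, BasicFileAttributes.class);
        return attributes.creationTime().toInstant().isBefore(Instant.now().minus(5, ChronoUnit.DAYS));
    }

    public static void main(String[] args) throws IOException {
        Path p = Files.createTempFile("emm-dump-sketch", ".ids"); // freshly created, so not stale
        System.out.println("stale? " + isStale(p));
        Files.delete(p);
    }
}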
@@ -196,7 +216,7 @@ private static void generateDump(Whelk whelk, String dump, Path dumpFilePath) {

 if (dump.equals("all")) {
 preparedStatement = getAllDumpStatement(connection);
-} else if (dump.startsWith("itemAndInstance-")) {
+} else if (dump.startsWith("itemAndInstance:")) {
 preparedStatement = getLibraryXDumpStatement(connection, dump.substring(16));
 } else if (dump.startsWith("type:")) {
 preparedStatement = getTypeXStatement(connection, whelk, dump.substring(5));
@@ -212,7 +232,7 @@ private static void generateDump(Whelk whelk, String dump, Path dumpFilePath) {
 try (ResultSet resultSet = p.executeQuery()) {
 while (resultSet.next()) {

-// Each line must be exactly 17 bytes long, including the (unix-)line break.
+// Each line must be exactly 17 bytes long, including the (unix) line break.
 String id = String.format("%-16s\n", resultSet.getString(1));
 dumpFileWriter.write(id);
 }
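The fixed 17-byte line width presumably exists so that a page of ids can be located by seeking straight to lineNumber * 17 rather than scanning the file; that is an inference from the offset handling above, not something the PR states. A minimal sketch of reading one id by line number under that assumption, with a temp file standing in for a dump:

import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;

public class FixedWidthIdReader {
    private static final int LINE_BYTES = 17; // 16 bytes of space-padded id + 1 byte unix newline

    // Reads the record id on the given zero-based line, assuming every line is exactly
    // 17 bytes, as written by generateDump above.
    static String readIdAt(Path dumpFilePath, long lineNumber) throws IOException {
        try (RandomAccessFile raf = new RandomAccessFile(dumpFilePath.toFile(), "r")) {
            raf.seek(lineNumber * LINE_BYTES);
            byte[] line = new byte[LINE_BYTES];
            raf.readFully(line);
            return new String(line, StandardCharsets.UTF_8).trim();
        }
    }

    public static void main(String[] args) throws IOException {
        Path p = Files.createTempFile("emm-dump-sketch", ".ids");
        Files.writeString(p, String.format("%-16s\n%-16s\n", "id-0", "id-1"));
        System.out.println(readIdAt(p, 1)); // prints "id-1"
        Files.delete(p);
    }
}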