Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add profile negotiation for EMM dump data. #1539

Merged
merged 2 commits into from
Jan 9, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 47 additions & 19 deletions emm/src/main/java/whelk/Dump.java
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ public class Dump {
private static final int GZIP_BUF_SIZE = 64 * 1024;
private static final String ND_JSON_LD_GZ_EXT = ".ndjsonld.gz";

public static void sendDumpResponse(Whelk whelk, String apiBaseUrl, HttpServletRequest req, HttpServletResponse res) throws IOException, SQLException {
public static void sendDumpResponse(Whelk whelk, TargetVocabMapper targetVocabMapper, String apiBaseUrl, HttpServletRequest req, HttpServletResponse res) throws IOException {
String selection = req.getParameter("selection");

if (selection == null) {
Expand All @@ -86,6 +86,16 @@ public static void sendDumpResponse(Whelk whelk, String apiBaseUrl, HttpServletR
return;
}

String profile = req.getParameter("profile"); // May be null, meaning default (kbv)
Document profileDoc = null;
if (profile != null) {
profileDoc = whelk.getStorage().getDocumentByIri(profile);
if (profileDoc == null) {
logger.info("Bad profile requested for EMM dump: {}", profile);
profile = null;
}
}

String tmpDir = System.getProperty("java.io.tmpdir");
Path dumpsPath = Paths.get(tmpDir, "dumps");
Files.createDirectories(dumpsPath);
Expand All @@ -97,10 +107,10 @@ public static void sendDumpResponse(Whelk whelk, String apiBaseUrl, HttpServletR
}

if (isDownload) {
sendDumpDownloadResponse(whelk, dumpFilePath, res);
sendDumpDownloadResponse(whelk, targetVocabMapper, profile, profileDoc, dumpFilePath, res);
} else {
long offsetNumeric = Long.parseLong(offset);
sendDumpPageResponse(whelk, apiBaseUrl, selection, dumpFilePath, offsetNumeric, res);
sendDumpPageResponse(whelk, targetVocabMapper, profile, profileDoc, apiBaseUrl, selection, dumpFilePath, offsetNumeric, res);
}
}

Expand Down Expand Up @@ -149,7 +159,7 @@ private static void sendDumpIndexResponse(String apiBaseUrl, HttpServletResponse
HttpTools.sendResponse(res, responseObject, JSON_CONTENT_TYPE);
}

private static void sendDumpPageResponse(Whelk whelk, String apiBaseUrl, String dump, Path dumpFilePath, long offsetLines, HttpServletResponse res) throws IOException {
private static void sendDumpPageResponse(Whelk whelk, TargetVocabMapper targetVocabMapper, String profile, Document profileDoc, String apiBaseUrl, String dump, Path dumpFilePath, long offsetLines, HttpServletResponse res) throws IOException {
ArrayList<String> recordIdsOnPage = new ArrayList<>(EmmChangeSet.TARGET_HITS_PER_PAGE);
Long totalEntityCount = null;

Expand Down Expand Up @@ -212,10 +222,10 @@ private static void sendDumpPageResponse(Whelk whelk, String apiBaseUrl, String

BasicFileAttributes attributes = Files.readAttributes(dumpFilePath, BasicFileAttributes.class);
Instant dumpCreationTime = attributes.creationTime().toInstant();
sendFormattedResponse(whelk, apiBaseUrl, dump, recordIdsOnPage, res, offsetLines, totalEntityCount, dumpCreationTime);
sendFormattedResponse(whelk, targetVocabMapper, profile, profileDoc, apiBaseUrl, dump, recordIdsOnPage, res, offsetLines, totalEntityCount, dumpCreationTime);
}

private static void sendFormattedResponse(Whelk whelk, String apiBaseUrl, String dump, ArrayList<String> recordIdsOnPage, HttpServletResponse res, long offset, Long totalEntityCount, Instant dumpCreationTime) throws IOException{
private static void sendFormattedResponse(Whelk whelk, TargetVocabMapper targetVocabMapper, String profile, Document profileDoc, String apiBaseUrl, String dump, ArrayList<String> recordIdsOnPage, HttpServletResponse res, long offset, Long totalEntityCount, Instant dumpCreationTime) throws IOException{
var responseObject = new LinkedHashMap<>();

responseObject.put(JsonLd.CONTEXT_KEY, "https://www.w3.org/ns/activitystreams");
Expand All @@ -242,7 +252,12 @@ private static void sendFormattedResponse(Whelk whelk, String apiBaseUrl, String
var items = new ArrayList<>(EmmChangeSet.TARGET_HITS_PER_PAGE);
responseObject.put("items", items);

var contextDoc = contextDoc(whelk);
Document contextDoc = null;
if (profileDoc != null)
contextDoc = profileDoc;
else {
contextDoc = contextDoc(whelk);
}
if (offset == 0) {
items.add(wrapContextDoc(contextDoc));
}
Expand Down Expand Up @@ -272,28 +287,33 @@ private static void sendFormattedResponse(Whelk whelk, String apiBaseUrl, String
itemOfPath.add("@graph"); itemOfPath.add(1); itemOfPath.add("itemOf"); // unggh..
doc._set(itemOfPath, instance.getThing(), doc.data);

items.add(wrapDoc(doc, contextDoc));
items.add(formatDoc(doc, contextDoc, targetVocabMapper, profile, profileDoc));
}
// For normal categories
else {
items.add(wrapDoc(doc, contextDoc));
items.add(formatDoc(doc, contextDoc, targetVocabMapper, profile, profileDoc));
}

}

HttpTools.sendResponse(res, responseObject, JSON_CONTENT_TYPE);
}

private static void sendDumpDownloadResponse(Whelk whelk, Path dumpFilePath, HttpServletResponse res) {
private static void sendDumpDownloadResponse(Whelk whelk, TargetVocabMapper targetVocabMapper, String profile, Document profileDoc, Path dumpFilePath, HttpServletResponse res) {
String filename = Unicode.stripSuffix(dumpFilePath.getFileName().toString(), ".dump") + ND_JSON_LD_GZ_EXT;
res.setHeader("Content-Disposition", "attachment; filename=" + filename);
res.setHeader("Content-Type", "application/octet-stream");

int batchSize = EmmChangeSet.TARGET_HITS_PER_PAGE;
try (GZIPOutputStream os = new GZIPOutputStream(new BufferedOutputStream(res.getOutputStream()), GZIP_BUF_SIZE)) {
res.flushBuffer();

var contextDoc = contextDoc(whelk);

Document contextDoc = null;
if (profileDoc != null)
contextDoc = profileDoc;
else {
contextDoc = contextDoc(whelk);
}
writeJsonLdLine(wrapContextDoc(contextDoc), os);

// Has the dump not begun being written yet ?
Expand Down Expand Up @@ -325,26 +345,26 @@ private static void sendDumpDownloadResponse(Whelk whelk, Path dumpFilePath, Htt
batch.add(line.trim());

if (batch.size() >= batchSize) {
writeJsonLdLines(whelk, batch, contextDoc, os);
writeJsonLdLines(whelk, targetVocabMapper, profile, profileDoc, batch, contextDoc, os);
batch = new ArrayList<>(batchSize);
}
}
writeJsonLdLines(whelk, batch, contextDoc, os);
writeJsonLdLines(whelk, targetVocabMapper, profile, profileDoc, batch, contextDoc, os);
res.flushBuffer();
}
} catch (Exception e) {
logger.info("Error sending dump download: {}", e.getMessage());
}
}

private static void writeJsonLdLines(Whelk whelk, Collection<String> ids, Document contextDoc, OutputStream os) throws IOException {
private static void writeJsonLdLines(Whelk whelk, TargetVocabMapper targetVocabMapper, String profile, Document profileDoc, Collection<String> ids, Document contextDoc, OutputStream os) throws IOException {
Map<String, Document> idsAndRecords = whelk.bulkLoad(ids);
for (Document doc : idsAndRecords.values()) {
if (doc.getDeleted()) {
continue;
}

writeJsonLdLine(wrapDoc(doc, contextDoc), os);
writeJsonLdLine(formatDoc(doc, contextDoc, targetVocabMapper, profile, profileDoc), os);
}
os.flush();
}
Expand All @@ -357,15 +377,23 @@ private static void writeJsonLdLine(Object object, OutputStream os) throws IOExc
os.write("\n".getBytes(StandardCharsets.UTF_8));
}

private static Object wrapDoc(Document doc, Document contextDoc) {
private static Object formatDoc(Document doc, Document contextDoc, TargetVocabMapper targetVocabMapper, String profile, Document profileDoc) {
var context = new ArrayList<>();
context.add(null);
context.add(contextDoc.getRecordIdentifiers().getFirst());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe that when a profile is selected we should point to that context here.
Same with the first document of the dump. It should be the profile context instead of the system context.

See e.g. @context
https://libris-qa.kb.se/fxql7jqr38b1dkf/data.jsonld?profile=https%3A%2F%2Fid.kb.se%2Fsys%2Fcontext%2Ftarget%2Fsdo-w3c&embellished=false

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Very good comments! I believe I've addressed the problems now. Take another look, if I got the context-stuff right?

return Map.of(

Document formattedDoc = doc; // Will be replaced if there's a profile
if (profile != null && profileDoc != null) {
formattedDoc = new Document((Map) targetVocabMapper.applyTargetVocabularyMap(profile, profileDoc.data, doc.data));
}

Map data = Map.of(
JsonLd.ID_KEY, doc.getRecordIdentifiers().getFirst(),
JsonLd.CONTEXT_KEY, context,
JsonLd.GRAPH_KEY, doc.data.get(JsonLd.GRAPH_KEY)
JsonLd.GRAPH_KEY, formattedDoc.data.get(JsonLd.GRAPH_KEY)
);

return data;
}

private static Object wrapContextDoc(Document contextDoc) {
Expand Down
5 changes: 4 additions & 1 deletion emm/src/main/java/whelk/EmmServlet.java
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,14 @@
public class EmmServlet extends HttpServlet {
private final Logger logger = LogManager.getLogger(this.getClass());
private final Whelk whelk;
private final TargetVocabMapper targetVocabMapper;

public static final String AS2_CONTENT_TYPE = "application/activity+json";

public EmmServlet() {
whelk = Whelk.createLoadedCoreWhelk();
Document contextDocument = whelk.getStorage().getDocumentByIri(whelk.getSystemContextUri());
targetVocabMapper = new TargetVocabMapper(whelk.getJsonld(), contextDocument.data);
}

public void init() {
Expand All @@ -31,7 +34,7 @@ public void doGet(HttpServletRequest req, HttpServletResponse res) {
String apiBaseUrl = req.getRequestURL().toString();

if (req.getServletPath() != null && req.getServletPath().endsWith("/full")) {
Dump.sendDumpResponse(whelk, apiBaseUrl, req, res);
Dump.sendDumpResponse(whelk, targetVocabMapper, apiBaseUrl, req, res);
return;
}
String until = req.getParameter("until");
Expand Down
Loading