From a3d06090a14116b14a1438a02f8fb9759c054fda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Olov=20Ylinenp=C3=A4=C3=A4?= <51744858+olovy@users.noreply.github.com> Date: Thu, 14 Nov 2024 09:51:41 +0100 Subject: [PATCH] Serve bulk reports (#1517) * Add bulk:execution with document counts and links to reports * Serve bulk whelktool reports from housekeeping --- .../main/groovy/whelk/HouseKeepingServer.java | 56 +++++++++++- .../groovy/whelk/datatool/WhelkTool.groovy | 15 ++-- .../whelk/datatool/bulkchange/BulkJob.java | 87 +++++++++++++++++-- .../datatool/bulkchange/BulkJobDocument.java | 37 ++++++++ .../datatool/bulkchange/BulkPreviewJob.java | 4 +- .../java/whelk/datatool/util/IdLoader.java | 1 - 6 files changed, 185 insertions(+), 15 deletions(-) diff --git a/housekeeping/src/main/groovy/whelk/HouseKeepingServer.java b/housekeeping/src/main/groovy/whelk/HouseKeepingServer.java index b8f43a6bfe..54048228d0 100644 --- a/housekeeping/src/main/groovy/whelk/HouseKeepingServer.java +++ b/housekeeping/src/main/groovy/whelk/HouseKeepingServer.java @@ -1,12 +1,34 @@ package whelk; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.eclipse.jetty.ee8.servlet.DefaultServlet; +import org.eclipse.jetty.ee8.servlet.FilterHolder; import org.eclipse.jetty.ee8.servlet.ServletContextHandler; import org.eclipse.jetty.ee8.servlet.ServletHolder; import org.eclipse.jetty.server.Server; +import whelk.datatool.bulkchange.BulkJob; import whelk.housekeeping.BulkChangePreviewAPI; import whelk.housekeeping.WebInterface; +import whelk.util.WhelkFactory; +import whelk.util.http.HttpTools; + +import javax.servlet.DispatcherType; +import javax.servlet.FilterChain; +import javax.servlet.ServletException; +import javax.servlet.http.HttpFilter; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; +import java.io.IOException; +import java.nio.file.Paths; +import java.util.EnumSet; + +import static whelk.datatool.bulkchange.BulkJob.BULK_CONTEXT_PATH; +import static whelk.datatool.bulkchange.BulkJob.BULK_REPORTS_PATH; public class HouseKeepingServer extends XlServer { + private final static Logger log = LogManager.getLogger(HouseKeepingServer.class); + @Override protected void configureHandlers(Server server) { ServletContextHandler context = new ServletContextHandler(); @@ -17,11 +39,43 @@ protected void configureHandlers(Server server) { ServletHolder holder = new ServletHolder(WebInterface.class); holder.setInitOrder(0); context.addServlet(holder, "/"); - context.addServlet(BulkChangePreviewAPI.class, "/_bulk-change/*"); + context.addServlet(BulkChangePreviewAPI.class, BULK_CONTEXT_PATH + "/*"); + + serveBulkReports(context); serveStaticContent(context); } + private static void serveBulkReports(ServletContextHandler context) { + String dir; + try { + dir = BulkJob.reportBaseDir(WhelkFactory.getSingletonWhelk()).getCanonicalPath(); + } catch (IOException e) { + throw new RuntimeException(e); + } + + ServletHolder holder = new ServletHolder("bulk-reports", DefaultServlet.class); + holder.setInitParameter("resourceBase", dir); + holder.setInitParameter("dirAllowed", "false"); + holder.setInitParameter("pathInfoOnly", "true"); + var path = BULK_REPORTS_PATH + "/*"; + context.addServlet(holder, path); + + context.addFilter(new FilterHolder(new HttpFilter() { + @Override + protected void doFilter(HttpServletRequest req, HttpServletResponse res, FilterChain chain) throws IOException, ServletException { + var filePath = req.getPathInfo(); + if (filePath != null && BulkJob.FORBIDDEN_REPORTS.contains(Paths.get(filePath).getFileName().toString())) { + HttpTools.sendError(res, 403, "Forbidden"); + } else { + super.doFilter(req, res, chain); + } + } + }), path, EnumSet.of(DispatcherType.REQUEST)); + + log.info("Serving {} on {}", dir, path); + } + public static void main(String[] args) throws Exception { new HouseKeepingServer().run(); } diff --git a/whelktool/src/main/groovy/whelk/datatool/WhelkTool.groovy b/whelktool/src/main/groovy/whelk/datatool/WhelkTool.groovy index 566bbb812f..d50e8062cd 100644 --- a/whelktool/src/main/groovy/whelk/datatool/WhelkTool.groovy +++ b/whelktool/src/main/groovy/whelk/datatool/WhelkTool.groovy @@ -47,6 +47,11 @@ class WhelkTool { static final int DEFAULT_BATCH_SIZE = 500 public static final int DEFAULT_FETCH_SIZE = 100 static final int DEFAULT_STATS_NUM_IDS = 3 + public static final String MAIN_LOG_NAME = "MAIN.txt" + public static final String ERROR_LOG_NAME = "ERRORS.txt" + public static final String MODIFIED_LOG_NAME = "MODIFIED.txt" + public static final String CREATED_LOG_NAME = "CREATED.txt" + public static final String DELETED_LOG_NAME = "DELETED.txt" Whelk whelk IdLoader idLoader @@ -114,14 +119,14 @@ class WhelkTool { this.defaultChangedBy = script.scriptJobUri this.reportsDir = reportsDir reportsDir.mkdirs() - mainLog = new PrintWriter(new File(reportsDir, "MAIN.txt")) - errorLog = new PrintWriter(new File(reportsDir, "ERRORS.txt")) + mainLog = new PrintWriter(new File(reportsDir, MAIN_LOG_NAME)) + errorLog = new PrintWriter(new File(reportsDir, ERROR_LOG_NAME)) - def modifiedLogFile = new File(reportsDir, "MODIFIED.txt") + def modifiedLogFile = new File(reportsDir, MODIFIED_LOG_NAME) modifiedLog = new PrintWriter(modifiedLogFile) - def createdLogFile = new File(reportsDir, "CREATED.txt") + def createdLogFile = new File(reportsDir, CREATED_LOG_NAME) createdLog = new PrintWriter(createdLogFile) - def deletedLogFile = new File(reportsDir, "DELETED.txt") + def deletedLogFile = new File(reportsDir, DELETED_LOG_NAME) deletedLog = new PrintWriter(deletedLogFile) try { diff --git a/whelktool/src/main/java/whelk/datatool/bulkchange/BulkJob.java b/whelktool/src/main/java/whelk/datatool/bulkchange/BulkJob.java index 68da78ece3..9ea2c43312 100644 --- a/whelktool/src/main/java/whelk/datatool/bulkchange/BulkJob.java +++ b/whelktool/src/main/java/whelk/datatool/bulkchange/BulkJob.java @@ -6,17 +6,31 @@ import whelk.component.PostgreSQLComponent; import whelk.datatool.Script; import whelk.datatool.WhelkTool; +import whelk.datatool.bulkchange.BulkJobDocument.Status; import java.io.File; +import java.io.FileNotFoundException; import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; import java.time.LocalDateTime; import java.time.ZoneId; +import java.time.ZonedDateTime; import java.time.format.DateTimeFormatter; import java.time.temporal.ChronoUnit; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Set; import java.util.concurrent.atomic.AtomicBoolean; import java.util.function.Consumer; +import java.util.stream.Stream; import static whelk.Document.HASH_IT; +import static whelk.datatool.WhelkTool.CREATED_LOG_NAME; +import static whelk.datatool.WhelkTool.DELETED_LOG_NAME; +import static whelk.datatool.WhelkTool.MAIN_LOG_NAME; +import static whelk.datatool.WhelkTool.MODIFIED_LOG_NAME; import static whelk.datatool.bulkchange.BulkJobDocument.JOB_TYPE; import static whelk.datatool.bulkchange.BulkJobDocument.Status.Completed; import static whelk.datatool.bulkchange.BulkJobDocument.Status.Failed; @@ -27,16 +41,23 @@ public class BulkJob implements Runnable { private static final Logger logger = Logger.getLogger(BulkJob.class); + + public static final String BULK_CONTEXT_PATH = "/_bulk-change"; + public static final String BULK_REPORTS_PATH = BULK_CONTEXT_PATH + "/reports"; + public static final Set FORBIDDEN_REPORTS = Set.of(MAIN_LOG_NAME); + protected static final String REPORTS_DIR = "bulk-change-reports"; protected final String id; protected final String systemId; protected final Whelk whelk; + protected final String executionId; public BulkJob(Whelk whelk, String id) { this.whelk = whelk; this.id = id; this.systemId = stripSuffix(stripPrefix(id, Document.getBASE_URI().toString()), Document.HASH_IT); + this.executionId = executionId(systemId); } @Override @@ -61,12 +82,61 @@ public void run() { tool.run(); - storeUpdate(doc -> doc.setStatus(Completed)); + if ((tool.getErrorDetected() != null)) { + finish(Failed); + } else { + finish(Completed); + } } catch (Exception e) { // TODO logger.error(e); System.err.println(e); - storeUpdate(doc -> doc.setStatus(Failed)); + finish(Failed); + } + } + + private void finish(Status status) { + storeUpdate(doc -> { + doc.setStatus(status); + doc.addExecution( + ZonedDateTime.now(ZoneId.systemDefault()), + status, + filteredReports(), + lineCount(CREATED_LOG_NAME), + lineCount(MODIFIED_LOG_NAME), + lineCount(DELETED_LOG_NAME)); + }); + } + + private long lineCount(String reportName) { + try (Stream stream = Files.lines(new File(reportDir(), reportName).toPath(), StandardCharsets.UTF_8)) { + return stream.count(); + } catch(FileNotFoundException ignored) { + return 0; + } catch (IOException e) { + logger.warn("Could not get line count", e); + return 0; + } + } + + private List filteredReports() { + try { + var files = reportDir().listFiles(pathname -> + pathname.isFile() && !FORBIDDEN_REPORTS.contains(pathname.getName()) + ); + + if (files == null) { + throw new IOException("Could not list files"); + } + + var path = BULK_REPORTS_PATH + "/" + reportDir().getName() + "/"; + return Arrays.stream(files) + .filter(f -> lineCount(f.getName()) > 0) + .map(f -> path + f.getName()) + .toList(); + } catch (IOException e) { + logger.warn(e.getMessage(), e); + return Collections.emptyList(); } } @@ -86,7 +156,8 @@ protected WhelkTool buildWhelkTool(BulkJobDocument jobDoc) throws IOException { Script script = jobDoc.getSpecification().getScript(bulkJobThingId); - WhelkTool tool = new WhelkTool(whelk, script, reportDir(systemId), WhelkTool.getDEFAULT_STATS_NUM_IDS()); + WhelkTool tool = new WhelkTool(whelk, script, reportDir(), WhelkTool.getDEFAULT_STATS_NUM_IDS()); + // TODO for now setting changedBy only works for loud changes (!minorChange in PostgreSQLComponent) tool.setDefaultChangedBy(jobDoc.getChangeAgentId()); tool.setAllowLoud(jobDoc.shouldUpdateModifiedTimestamp()); @@ -108,11 +179,15 @@ private void storeUpdate(Consumer updater) { whelk.storeAtomicUpdate(systemId, minorUpdate, writeIdenticalVersions, changedIn, changedBy, updateAgent); } - protected File reportDir(String baseName) throws IOException { - return new File(new File(whelk.getLogRoot(), REPORTS_DIR), reportLeafDir(baseName)); + protected File reportDir() throws IOException { + return new File(reportBaseDir(whelk), executionId); + } + + public static File reportBaseDir(Whelk whelk) throws IOException { + return new File(whelk.getLogRoot(), REPORTS_DIR); } - protected static String reportLeafDir(String baseName) { + protected static String executionId(String baseName) { String now = LocalDateTime .now(ZoneId.systemDefault()) .truncatedTo(ChronoUnit.SECONDS) diff --git a/whelktool/src/main/java/whelk/datatool/bulkchange/BulkJobDocument.java b/whelktool/src/main/java/whelk/datatool/bulkchange/BulkJobDocument.java index 0950a05746..25bece03df 100644 --- a/whelktool/src/main/java/whelk/datatool/bulkchange/BulkJobDocument.java +++ b/whelktool/src/main/java/whelk/datatool/bulkchange/BulkJobDocument.java @@ -6,11 +6,15 @@ import whelk.util.DocumentUtil; import whelk.util.JsonLdKey; +import java.time.ZonedDateTime; +import java.time.format.DateTimeFormatter; import java.util.Collections; +import java.util.HashMap; import java.util.List; import java.util.Map; import static whelk.JsonLd.ID_KEY; +import static whelk.JsonLd.asList; import static whelk.util.JsonLdKey.fromKey; // All terms are defined in https://github.com/libris/definitions/blob/develop/source/vocab/platform.ttl @@ -66,12 +70,20 @@ public String key() { public static final String DEPRECATE_KEY = "bulk:deprecate"; public static final String SCRIPT_KEY = "bulk:script"; public static final String RDF_VALUE = "value"; + public static final String EXECUTION_KEY = "bulk:execution"; + public static final String EXECUTION_TYPE = "bulk:Execution"; + public static final String REPORT_KEY = "bulk:report"; + public static final String END_TIME_KEY = "endTime"; + public static final String NUM_CREATED_KEY = "bulk:numCreated"; + public static final String NUM_UPDATED_KEY = "bulk:numUpdated"; + public static final String NUM_DELETED_KEY = "bulk:numDeleted"; private static final List STATUS_PATH = List.of(JsonLd.GRAPH_KEY, 1, STATUS_KEY); private static final List UPDATE_TIMESTAMP_PATH = List.of(JsonLd.GRAPH_KEY, 1, SHOULD_UPDATE_TIMESTAMP_KEY); private static final List LABELS_PATH = List.of(JsonLd.GRAPH_KEY, 1, LABEL_KEY, "*"); private static final List COMMENTS_PATH = List.of(JsonLd.GRAPH_KEY, 1, COMMENT_KEY, "*"); private static final List SPECIFICATION_PATH = List.of(JsonLd.GRAPH_KEY, 1, CHANGE_SPEC_KEY); + private static final List EXECUTION_PATH = List.of(JsonLd.GRAPH_KEY, 1, EXECUTION_KEY); public BulkJobDocument(Document doc) { this(doc.data); @@ -109,6 +121,31 @@ public Map getSpecificationRaw() { return get(data, SPECIFICATION_PATH); } + @SuppressWarnings("unchecked") + public void addExecution(ZonedDateTime endTime, Status status, List reportPaths, + long numCreated, long numUpdated, long numDeleted) { + var e = new HashMap<>(Map.of( + JsonLd.TYPE_KEY, EXECUTION_TYPE, + REPORT_KEY, reportPaths.stream().map(s -> Map.of(ID_KEY, s)).toList(), + END_TIME_KEY, endTime.format(DateTimeFormatter.ISO_OFFSET_DATE_TIME), + STATUS_KEY, status.key() + )); + + if (numCreated > 0) { + e.put(NUM_CREATED_KEY, numCreated); + } + if (numUpdated > 0) { + e.put(NUM_UPDATED_KEY, numUpdated); + } + if (numDeleted > 0) { + e.put(NUM_DELETED_KEY, numDeleted); + } + + var executions = asList(get(data, EXECUTION_PATH)); + executions.add(e); + _set(EXECUTION_PATH, executions, data); + } + public Specification getSpecification() { Map spec = getSpecificationRaw(); if (spec == null) { diff --git a/whelktool/src/main/java/whelk/datatool/bulkchange/BulkPreviewJob.java b/whelktool/src/main/java/whelk/datatool/bulkchange/BulkPreviewJob.java index 1416490edb..f45095c6a7 100644 --- a/whelktool/src/main/java/whelk/datatool/bulkchange/BulkPreviewJob.java +++ b/whelktool/src/main/java/whelk/datatool/bulkchange/BulkPreviewJob.java @@ -76,7 +76,7 @@ public void run() { } @Override - protected File reportDir(String baseName) throws IOException { - return new File(Files.createTempDirectory(REPORTS_DIR).toFile(), reportLeafDir(baseName)); + protected File reportDir() throws IOException { + return new File(Files.createTempDirectory(REPORTS_DIR).toFile(), executionId); } } diff --git a/whelktool/src/main/java/whelk/datatool/util/IdLoader.java b/whelktool/src/main/java/whelk/datatool/util/IdLoader.java index dedd00cf6f..1069adba37 100644 --- a/whelktool/src/main/java/whelk/datatool/util/IdLoader.java +++ b/whelktool/src/main/java/whelk/datatool/util/IdLoader.java @@ -7,7 +7,6 @@ import java.io.InputStreamReader; import java.net.URI; import java.net.URISyntaxException; -import java.sql.Array; import java.sql.Connection; import java.sql.PreparedStatement; import java.sql.ResultSet;