Skip to content

Commit

Permalink
Skip and log invalid docs (#1518)
Browse files Browse the repository at this point in the history
  • Loading branch information
kwahlin authored Nov 19, 2024
1 parent ac9ceba commit 4a2d8e1
Show file tree
Hide file tree
Showing 5 changed files with 87 additions and 37 deletions.
38 changes: 24 additions & 14 deletions whelktool/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,20 +22,30 @@ Options
```
$ java -jar build/libs/whelktool.jar --help
usage: whelktool [options] <SCRIPT>
-a,--allow-loud Allow scripts to do loud modifications.
-d,--dry-run Do not save any modifications.
-h,--help Print this help message and exit.
-I,--skip-index Do not index any changes, only write to
storage.
-l,--limit <LIMIT> Amount of documents to process.
-n,--stats-num-ids <arg> Number of ids to print per entry in
STATISTICS.txt.
-r,--report <REPORT-DIR> Directory where reports are written (defaults
to "reports").
-s,--step Change one document at a time, prompting to
continue.
-T,--no-threads Do not use threads to parallellize batch
processing.
-a,--allow-loud Allow scripts to do loud
modifications.
-d,--dry-run Do not save any modifications.
-h,--help Print this help message and exit.
-I,--skip-index Do not index any changes, only write
to storage.
-idchg,--allow-id-removal [UNSAFE] Allow script to remove
document ids, e.g. sameAs.
-l,--limit <LIMIT> Amount of documents to process.
-n,--stats-num-ids <arg> Number of ids to print per entry in
STATISTICS.txt.
-p,--parameters <PARAMETER-FILE> Path to JSON file with parameters to
script
-r,--report <REPORT-DIR> Directory where reports are written
(defaults to "reports").
-s,--step Change one document at a time,
prompting to continue.
-T,--no-threads Do not use threads to parallellize
batch processing.
-t,--num-threads <N> Override default number of threads
(32).
-v,--validation <MODE> [UNSAFE] Set JSON-LD validation mode.
Defaults to ON. Possible values:
ON/OFF/LOG_ONLY
```

* Use `--dry-run` to test your script without writing any changes.
Expand Down
72 changes: 51 additions & 21 deletions whelktool/src/main/groovy/whelk/datatool/WhelkTool.groovy
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
package whelk.datatool

import com.google.common.util.concurrent.MoreExecutors
import groovy.transform.Immutable
import org.apache.logging.log4j.Logger
import org.codehaus.groovy.jsr223.GroovyScriptEngineImpl
import whelk.Document
import whelk.IdGenerator
import whelk.JsonLd
import whelk.JsonLdValidator
import whelk.Whelk
import whelk.datatool.form.ModifiedThing
import whelk.datatool.form.Transform
import whelk.datatool.util.IdLoader
import whelk.exception.StaleUpdateException
Expand All @@ -21,12 +19,10 @@ import whelk.util.DocumentUtil
import whelk.util.LegacyIntegrationTools
import whelk.util.Statistics

import javax.print.Doc
import javax.script.Bindings
import javax.script.CompiledScript
import javax.script.ScriptEngineManager
import javax.script.SimpleBindings
import java.nio.charset.StandardCharsets
import java.time.ZonedDateTime
import java.util.concurrent.ConcurrentHashMap
import java.util.concurrent.ConcurrentLinkedQueue
Expand All @@ -41,7 +37,6 @@ import java.util.concurrent.TimeUnit
import java.util.concurrent.atomic.AtomicInteger

import static java.util.concurrent.TimeUnit.SECONDS
import static whelk.JsonLd.RECORD_KEY
import static whelk.util.Jackson.mapper

class WhelkTool {
Expand All @@ -50,6 +45,7 @@ class WhelkTool {
static final int DEFAULT_STATS_NUM_IDS = 3
public static final String MAIN_LOG_NAME = "MAIN.txt"
public static final String ERROR_LOG_NAME = "ERRORS.txt"
public static final String FAILED_LOG_NAME = "FAILED.txt"
public static final String MODIFIED_LOG_NAME = "MODIFIED.txt"
public static final String CREATED_LOG_NAME = "CREATED.txt"
public static final String DELETED_LOG_NAME = "DELETED.txt"
Expand All @@ -68,6 +64,8 @@ class WhelkTool {
File reportsDir
PrintWriter mainLog
PrintWriter errorLog

PrintWriter failedLog
PrintWriter modifiedLog
PrintWriter createdLog
PrintWriter deletedLog
Expand All @@ -90,7 +88,14 @@ class WhelkTool {

boolean allowLoud
boolean allowIdRemoval
boolean skipValidation = false

enum ValidationMode {
ON,
OFF,
LOG_ONLY
}

ValidationMode validationMode = ValidationMode.ON

Throwable errorDetected

Expand Down Expand Up @@ -123,7 +128,7 @@ class WhelkTool {
reportsDir.mkdirs()
mainLog = new PrintWriter(new File(reportsDir, MAIN_LOG_NAME))
errorLog = new PrintWriter(new File(reportsDir, ERROR_LOG_NAME))

failedLog = new PrintWriter(new File(reportsDir, FAILED_LOG_NAME))
def modifiedLogFile = new File(reportsDir, MODIFIED_LOG_NAME)
modifiedLog = new PrintWriter(modifiedLogFile)
def createdLogFile = new File(reportsDir, CREATED_LOG_NAME)
Expand Down Expand Up @@ -550,8 +555,8 @@ class WhelkTool {
doc.setGenerationDate(new Date())
doc.setGenerationProcess(item.generationProcess ?: script.scriptJobUri)

if (!skipValidation) {
validateJsonLd(doc)
if (validationMode in [ValidationMode.ON, ValidationMode.LOG_ONLY] && !validateJsonLd(doc)) {
return
}

if (recordChanges) {
Expand All @@ -570,8 +575,8 @@ class WhelkTool {
doc.setGenerationDate(new Date())
doc.setGenerationProcess(item.generationProcess ?: script.scriptJobUri)

if (!skipValidation) {
validateJsonLd(doc)
if (validationMode in [ValidationMode.ON, ValidationMode.LOG_ONLY] && !validateJsonLd(doc)) {
return
}

if (recordChanges) {
Expand All @@ -585,11 +590,19 @@ class WhelkTool {
createdLog.println(doc.shortId)
}

private void validateJsonLd(Document doc) {
private boolean validateJsonLd(Document doc) {
List<JsonLdValidator.Error> errors = validator.validate(doc.data, doc.getLegacyCollection(whelk.jsonld))
if (errors) {
throw new Exception("Invalid JSON-LD. Errors: ${errors.collect{ it.toMap() }}")
String msg = "Invalid JSON-LD in document ${doc.completeId}. Errors: ${errors.collect { it.toMap() }}"
if (validationMode == ValidationMode.ON) {
throw new Exception(msg)
} else if (validationMode == ValidationMode.LOG_ONLY) {
failedLog.println(doc.shortId)
errorLog.println(msg)
return true
}
}
return false
}

private boolean confirmNextStep(String inJsonStr, Document doc) {
Expand Down Expand Up @@ -694,7 +707,7 @@ class WhelkTool {
if (limit > -1) log " limit: $limit"
if (allowLoud) log " allowLoud"
if (allowIdRemoval) log " allowIdRemoval"
if (skipValidation) log " skipValidation"
log " validation: ${validationMode.name()}"
log()

bindings = createMainBindings()
Expand Down Expand Up @@ -760,7 +773,8 @@ class WhelkTool {
cli.l(longOpt: 'limit', args: 1, argName: 'LIMIT', 'Amount of documents to process.')
cli.a(longOpt: 'allow-loud', 'Allow scripts to do loud modifications.')
cli.idchg(longOpt: 'allow-id-removal', '[UNSAFE] Allow script to remove document ids, e.g. sameAs.')
cli.sv(longOpt: 'skip-validation', '[UNSAFE] Skip JSON-LD validation before saving to database.')
cli.v(longOpt: 'validation', args: 1, argName: 'MODE', '[UNSAFE] Set JSON-LD validation mode. Defaults to ON.' +
' Possible values: ON/OFF/LOG_ONLY')
cli.n(longOpt: 'stats-num-ids', args: 1, 'Number of ids to print per entry in STATISTICS.txt.')
cli.p(longOpt: 'parameters', args: 1, argName: 'PARAMETER-FILE', 'Path to JSON file with parameters to script')

Expand All @@ -778,9 +792,8 @@ class WhelkTool {
try {
script = new FileScript(scriptPath)

String paramPath = options.p
if (paramPath) {
script.setParameters(mapper.readValue(new File(paramPath).getText("UTF-8"), Map))
if (options.p) {
script.setParameters(mapper.readValue(new File(options.p).getText("UTF-8"), Map))
}
}
catch (IOException e) {
Expand All @@ -793,11 +806,24 @@ class WhelkTool {
tool.dryRun = options.d
tool.stepWise = options.s
tool.noThreads = options.T
tool.numThreads = options.t ? Integer.parseInt(options.t) : -1
tool.limit = options.l ? Integer.parseInt(options.l) : -1
tool.allowLoud = options.a
tool.allowIdRemoval = options.idchg
tool.skipValidation = options.sv
try {
if (options.t) {
tool.numThreads = Integer.parseInt(options.t)
}
if (options.l) {
tool.limit = Integer.parseInt(options.l)
}
if (options.v) {
tool.validationMode = parseValidationMode(options.v)
}
} catch (IllegalArgumentException ignored) {
System.err.println("Invalid argument(s)")
cli.usage()
System.exit(1)
}

try {
tool.run()
} catch (Exception e) {
Expand All @@ -806,6 +832,10 @@ class WhelkTool {
}
}

private static ValidationMode parseValidationMode(String arg) throws IllegalArgumentException {
return ValidationMode.valueOf(arg.toUpperCase())
}

void recordChange(Document before, Document after, int number) {
//if (recordingLimit >= 0 && number > recordingLimit) {
if (recordingLimit >= 0 && recordedChanges.size() > recordingLimit) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,10 @@
import static whelk.Document.HASH_IT;
import static whelk.datatool.WhelkTool.CREATED_LOG_NAME;
import static whelk.datatool.WhelkTool.DELETED_LOG_NAME;
import static whelk.datatool.WhelkTool.FAILED_LOG_NAME;
import static whelk.datatool.WhelkTool.MAIN_LOG_NAME;
import static whelk.datatool.WhelkTool.MODIFIED_LOG_NAME;
import static whelk.datatool.WhelkTool.ValidationMode.LOG_ONLY;
import static whelk.datatool.bulkchange.BulkJobDocument.JOB_TYPE;
import static whelk.datatool.bulkchange.BulkJobDocument.Status.Completed;
import static whelk.datatool.bulkchange.BulkJobDocument.Status.Failed;
Expand Down Expand Up @@ -105,7 +107,8 @@ private void finish(Status status) {
filteredReports(),
lineCount(CREATED_LOG_NAME),
lineCount(MODIFIED_LOG_NAME),
lineCount(DELETED_LOG_NAME));
lineCount(DELETED_LOG_NAME),
lineCount(FAILED_LOG_NAME));
});
}

Expand Down Expand Up @@ -163,6 +166,7 @@ protected WhelkTool buildWhelkTool(BulkJobDocument jobDoc) throws IOException {
tool.setDefaultChangedBy(jobDoc.getChangeAgentId());
tool.setAllowLoud(jobDoc.shouldUpdateModifiedTimestamp());
tool.setNoThreads(false);
tool.setValidationMode(LOG_ONLY);

return tool;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ public String key() {
public static final String NUM_CREATED_KEY = "bulk:numCreated";
public static final String NUM_UPDATED_KEY = "bulk:numUpdated";
public static final String NUM_DELETED_KEY = "bulk:numDeleted";
public static final String NUM_FAILED_KEY = "bulk:numFailed";

private static final List<Object> STATUS_PATH = List.of(JsonLd.GRAPH_KEY, 1, STATUS_KEY);
private static final List<Object> UPDATE_TIMESTAMP_PATH = List.of(JsonLd.GRAPH_KEY, 1, SHOULD_UPDATE_TIMESTAMP_KEY);
Expand Down Expand Up @@ -125,7 +126,7 @@ public Map<String, Object> getSpecificationRaw() {

@SuppressWarnings("unchecked")
public void addExecution(ZonedDateTime endTime, Status status, List<String> reportPaths,
long numCreated, long numUpdated, long numDeleted) {
long numCreated, long numUpdated, long numDeleted, long numFailed) {
var e = new HashMap<>(Map.of(
JsonLd.TYPE_KEY, EXECUTION_TYPE,
REPORT_KEY, reportPaths.stream().map(s -> Map.of(ID_KEY, s)).toList(),
Expand All @@ -142,6 +143,9 @@ public void addExecution(ZonedDateTime endTime, Status status, List<String> repo
if (numDeleted > 0) {
e.put(NUM_DELETED_KEY, numDeleted);
}
if (numFailed > 0) {
e.put(NUM_FAILED_KEY, numFailed);
}

var executions = asList(get(data, EXECUTION_PATH));
executions.add(e);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import java.nio.file.Files;
import java.util.List;

import static whelk.datatool.WhelkTool.ValidationMode.LOG_ONLY;
import static whelk.datatool.bulkchange.BulkJobDocument.JOB_TYPE;

public class BulkPreviewJob extends BulkJob {
Expand All @@ -30,6 +31,7 @@ public BulkPreviewJob(Whelk whelk, String id) throws IOException {
tool.setDryRun(true);
tool.setRecordChanges(true);
tool.setRecordingLimit(RECORD_MAX_ITEMS);
tool.setValidationMode(LOG_ONLY);
tool.setLogger(log);
}

Expand Down

0 comments on commit 4a2d8e1

Please sign in to comment.