Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Skip and log invalid docs #1518

Merged
merged 9 commits into from
Nov 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 24 additions & 14 deletions whelktool/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,20 +22,30 @@ Options
```
$ java -jar build/libs/whelktool.jar --help
usage: whelktool [options] <SCRIPT>
-a,--allow-loud Allow scripts to do loud modifications.
-d,--dry-run Do not save any modifications.
-h,--help Print this help message and exit.
-I,--skip-index Do not index any changes, only write to
storage.
-l,--limit <LIMIT> Amount of documents to process.
-n,--stats-num-ids <arg> Number of ids to print per entry in
STATISTICS.txt.
-r,--report <REPORT-DIR> Directory where reports are written (defaults
to "reports").
-s,--step Change one document at a time, prompting to
continue.
-T,--no-threads Do not use threads to parallellize batch
processing.
-a,--allow-loud Allow scripts to do loud
modifications.
-d,--dry-run Do not save any modifications.
-h,--help Print this help message and exit.
-I,--skip-index Do not index any changes, only write
to storage.
-idchg,--allow-id-removal [UNSAFE] Allow script to remove
document ids, e.g. sameAs.
-l,--limit <LIMIT> Amount of documents to process.
-n,--stats-num-ids <arg> Number of ids to print per entry in
STATISTICS.txt.
-p,--parameters <PARAMETER-FILE> Path to JSON file with parameters to
script
-r,--report <REPORT-DIR> Directory where reports are written
(defaults to "reports").
-s,--step Change one document at a time,
prompting to continue.
-T,--no-threads Do not use threads to parallellize
batch processing.
-t,--num-threads <N> Override default number of threads
(32).
-v,--validation <MODE> [UNSAFE] Set JSON-LD validation mode.
Defaults to ON. Possible values:
ON/OFF/LOG_ONLY
```

* Use `--dry-run` to test your script without writing any changes.
Expand Down
72 changes: 51 additions & 21 deletions whelktool/src/main/groovy/whelk/datatool/WhelkTool.groovy
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
package whelk.datatool

import com.google.common.util.concurrent.MoreExecutors
import groovy.transform.Immutable
import org.apache.logging.log4j.Logger
import org.codehaus.groovy.jsr223.GroovyScriptEngineImpl
import whelk.Document
import whelk.IdGenerator
import whelk.JsonLd
import whelk.JsonLdValidator
import whelk.Whelk
import whelk.datatool.form.ModifiedThing
import whelk.datatool.form.Transform
import whelk.datatool.util.IdLoader
import whelk.exception.StaleUpdateException
Expand All @@ -21,12 +19,10 @@ import whelk.util.DocumentUtil
import whelk.util.LegacyIntegrationTools
import whelk.util.Statistics

import javax.print.Doc
import javax.script.Bindings
import javax.script.CompiledScript
import javax.script.ScriptEngineManager
import javax.script.SimpleBindings
import java.nio.charset.StandardCharsets
import java.time.ZonedDateTime
import java.util.concurrent.ConcurrentHashMap
import java.util.concurrent.ConcurrentLinkedQueue
Expand All @@ -41,7 +37,6 @@ import java.util.concurrent.TimeUnit
import java.util.concurrent.atomic.AtomicInteger

import static java.util.concurrent.TimeUnit.SECONDS
import static whelk.JsonLd.RECORD_KEY
import static whelk.util.Jackson.mapper

class WhelkTool {
Expand All @@ -50,6 +45,7 @@ class WhelkTool {
static final int DEFAULT_STATS_NUM_IDS = 3
public static final String MAIN_LOG_NAME = "MAIN.txt"
public static final String ERROR_LOG_NAME = "ERRORS.txt"
public static final String FAILED_LOG_NAME = "FAILED.txt"
public static final String MODIFIED_LOG_NAME = "MODIFIED.txt"
public static final String CREATED_LOG_NAME = "CREATED.txt"
public static final String DELETED_LOG_NAME = "DELETED.txt"
Expand All @@ -68,6 +64,8 @@ class WhelkTool {
File reportsDir
PrintWriter mainLog
PrintWriter errorLog

PrintWriter failedLog
PrintWriter modifiedLog
PrintWriter createdLog
PrintWriter deletedLog
Expand All @@ -90,7 +88,14 @@ class WhelkTool {

boolean allowLoud
boolean allowIdRemoval
boolean skipValidation = false

enum ValidationMode {
ON,
OFF,
LOG_ONLY
}

ValidationMode validationMode = ValidationMode.ON

Throwable errorDetected

Expand Down Expand Up @@ -123,7 +128,7 @@ class WhelkTool {
reportsDir.mkdirs()
mainLog = new PrintWriter(new File(reportsDir, MAIN_LOG_NAME))
errorLog = new PrintWriter(new File(reportsDir, ERROR_LOG_NAME))

failedLog = new PrintWriter(new File(reportsDir, FAILED_LOG_NAME))
def modifiedLogFile = new File(reportsDir, MODIFIED_LOG_NAME)
modifiedLog = new PrintWriter(modifiedLogFile)
def createdLogFile = new File(reportsDir, CREATED_LOG_NAME)
Expand Down Expand Up @@ -562,8 +567,8 @@ class WhelkTool {
doc.setGenerationDate(new Date())
doc.setGenerationProcess(item.generationProcess ?: script.scriptJobUri)

if (!skipValidation) {
validateJsonLd(doc)
if (validationMode in [ValidationMode.ON, ValidationMode.LOG_ONLY] && !validateJsonLd(doc)) {
return
}

if (recordChanges) {
Expand All @@ -582,8 +587,8 @@ class WhelkTool {
doc.setGenerationDate(new Date())
doc.setGenerationProcess(item.generationProcess ?: script.scriptJobUri)

if (!skipValidation) {
validateJsonLd(doc)
if (validationMode in [ValidationMode.ON, ValidationMode.LOG_ONLY] && !validateJsonLd(doc)) {
return
}

if (recordChanges) {
Expand All @@ -597,11 +602,19 @@ class WhelkTool {
createdLog.println(doc.shortId)
}

private void validateJsonLd(Document doc) {
private boolean validateJsonLd(Document doc) {
List<JsonLdValidator.Error> errors = validator.validate(doc.data, doc.getLegacyCollection(whelk.jsonld))
if (errors) {
throw new Exception("Invalid JSON-LD. Errors: ${errors.collect{ it.toMap() }}")
String msg = "Invalid JSON-LD in document ${doc.completeId}. Errors: ${errors.collect { it.toMap() }}"
if (validationMode == ValidationMode.ON) {
throw new Exception(msg)
} else if (validationMode == ValidationMode.LOG_ONLY) {
failedLog.println(doc.shortId)
errorLog.println(msg)
return true
}
}
return false
}

private boolean confirmNextStep(String inJsonStr, Document doc) {
Expand Down Expand Up @@ -707,7 +720,7 @@ class WhelkTool {
if (limit > -1) log " limit: $limit"
if (allowLoud) log " allowLoud"
if (allowIdRemoval) log " allowIdRemoval"
if (skipValidation) log " skipValidation"
log " validation: ${validationMode.name()}"
log()

bindings = createMainBindings()
Expand Down Expand Up @@ -773,7 +786,8 @@ class WhelkTool {
cli.l(longOpt: 'limit', args: 1, argName: 'LIMIT', 'Amount of documents to process.')
cli.a(longOpt: 'allow-loud', 'Allow scripts to do loud modifications.')
cli.idchg(longOpt: 'allow-id-removal', '[UNSAFE] Allow script to remove document ids, e.g. sameAs.')
cli.sv(longOpt: 'skip-validation', '[UNSAFE] Skip JSON-LD validation before saving to database.')
cli.v(longOpt: 'validation', args: 1, argName: 'MODE', '[UNSAFE] Set JSON-LD validation mode. Defaults to ON.' +
' Possible values: ON/OFF/LOG_ONLY')
cli.n(longOpt: 'stats-num-ids', args: 1, 'Number of ids to print per entry in STATISTICS.txt.')
cli.p(longOpt: 'parameters', args: 1, argName: 'PARAMETER-FILE', 'Path to JSON file with parameters to script')

Expand All @@ -791,9 +805,8 @@ class WhelkTool {
try {
script = new FileScript(scriptPath)

String paramPath = options.p
if (paramPath) {
script.setParameters(mapper.readValue(new File(paramPath).getText("UTF-8"), Map))
if (options.p) {
script.setParameters(mapper.readValue(new File(options.p).getText("UTF-8"), Map))
}
}
catch (IOException e) {
Expand All @@ -806,11 +819,24 @@ class WhelkTool {
tool.dryRun = options.d
tool.stepWise = options.s
tool.noThreads = options.T
tool.numThreads = options.t ? Integer.parseInt(options.t) : -1
tool.limit = options.l ? Integer.parseInt(options.l) : -1
tool.allowLoud = options.a
tool.allowIdRemoval = options.idchg
tool.skipValidation = options.sv
try {
if (options.t) {
tool.numThreads = Integer.parseInt(options.t)
}
if (options.l) {
tool.limit = Integer.parseInt(options.l)
}
if (options.v) {
tool.validationMode = parseValidationMode(options.v)
}
} catch (IllegalArgumentException ignored) {
System.err.println("Invalid argument(s)")
cli.usage()
System.exit(1)
}

try {
tool.run()
} catch (Exception e) {
Expand All @@ -819,6 +845,10 @@ class WhelkTool {
}
}

private static ValidationMode parseValidationMode(String arg) throws IllegalArgumentException {
return ValidationMode.valueOf(arg.toUpperCase())
}

void recordChange(Document before, Document after, int number) {
//if (recordingLimit >= 0 && number > recordingLimit) {
if (recordingLimit >= 0 && recordedChanges.size() > recordingLimit) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,10 @@
import static whelk.Document.HASH_IT;
import static whelk.datatool.WhelkTool.CREATED_LOG_NAME;
import static whelk.datatool.WhelkTool.DELETED_LOG_NAME;
import static whelk.datatool.WhelkTool.FAILED_LOG_NAME;
import static whelk.datatool.WhelkTool.MAIN_LOG_NAME;
import static whelk.datatool.WhelkTool.MODIFIED_LOG_NAME;
import static whelk.datatool.WhelkTool.ValidationMode.LOG_ONLY;
import static whelk.datatool.bulkchange.BulkJobDocument.JOB_TYPE;
import static whelk.datatool.bulkchange.BulkJobDocument.Status.Completed;
import static whelk.datatool.bulkchange.BulkJobDocument.Status.Failed;
Expand Down Expand Up @@ -105,7 +107,8 @@ private void finish(Status status) {
filteredReports(),
lineCount(CREATED_LOG_NAME),
lineCount(MODIFIED_LOG_NAME),
lineCount(DELETED_LOG_NAME));
lineCount(DELETED_LOG_NAME),
lineCount(FAILED_LOG_NAME));
});
}

Expand Down Expand Up @@ -163,6 +166,7 @@ protected WhelkTool buildWhelkTool(BulkJobDocument jobDoc) throws IOException {
tool.setDefaultChangedBy(jobDoc.getChangeAgentId());
tool.setAllowLoud(jobDoc.shouldUpdateModifiedTimestamp());
tool.setNoThreads(false);
tool.setValidationMode(LOG_ONLY);

return tool;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ public String key() {
public static final String NUM_CREATED_KEY = "bulk:numCreated";
public static final String NUM_UPDATED_KEY = "bulk:numUpdated";
public static final String NUM_DELETED_KEY = "bulk:numDeleted";
public static final String NUM_FAILED_KEY = "bulk:numFailed";

private static final List<Object> STATUS_PATH = List.of(JsonLd.GRAPH_KEY, 1, STATUS_KEY);
private static final List<Object> UPDATE_TIMESTAMP_PATH = List.of(JsonLd.GRAPH_KEY, 1, SHOULD_UPDATE_TIMESTAMP_KEY);
Expand Down Expand Up @@ -123,7 +124,7 @@ public Map<String, Object> getSpecificationRaw() {

@SuppressWarnings("unchecked")
public void addExecution(ZonedDateTime endTime, Status status, List<String> reportPaths,
long numCreated, long numUpdated, long numDeleted) {
long numCreated, long numUpdated, long numDeleted, long numFailed) {
var e = new HashMap<>(Map.of(
JsonLd.TYPE_KEY, EXECUTION_TYPE,
REPORT_KEY, reportPaths.stream().map(s -> Map.of(ID_KEY, s)).toList(),
Expand All @@ -140,6 +141,9 @@ public void addExecution(ZonedDateTime endTime, Status status, List<String> repo
if (numDeleted > 0) {
e.put(NUM_DELETED_KEY, numDeleted);
}
if (numFailed > 0) {
e.put(NUM_FAILED_KEY, numFailed);
}

var executions = asList(get(data, EXECUTION_PATH));
executions.add(e);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import java.nio.file.Files;
import java.util.List;

import static whelk.datatool.WhelkTool.ValidationMode.LOG_ONLY;
import static whelk.datatool.bulkchange.BulkJobDocument.JOB_TYPE;

public class BulkPreviewJob extends BulkJob {
Expand All @@ -30,6 +31,7 @@ public BulkPreviewJob(Whelk whelk, String id) throws IOException {
tool.setDryRun(true);
tool.setRecordChanges(true);
tool.setRecordingLimit(RECORD_MAX_ITEMS);
tool.setValidationMode(LOG_ONLY);
tool.setLogger(log);
}

Expand Down
Loading