Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Skip and log invalid docs #1518

Merged
merged 9 commits into from
Nov 19, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 24 additions & 14 deletions whelktool/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,20 +22,30 @@ Options
```
$ java -jar build/libs/whelktool.jar --help
usage: whelktool [options] <SCRIPT>
-a,--allow-loud Allow scripts to do loud modifications.
-d,--dry-run Do not save any modifications.
-h,--help Print this help message and exit.
-I,--skip-index Do not index any changes, only write to
storage.
-l,--limit <LIMIT> Amount of documents to process.
-n,--stats-num-ids <arg> Number of ids to print per entry in
STATISTICS.txt.
-r,--report <REPORT-DIR> Directory where reports are written (defaults
to "reports").
-s,--step Change one document at a time, prompting to
continue.
-T,--no-threads Do not use threads to parallellize batch
processing.
-a,--allow-loud Allow scripts to do loud
modifications.
-d,--dry-run Do not save any modifications.
-h,--help Print this help message and exit.
-I,--skip-index Do not index any changes, only write
to storage.
-idchg,--allow-id-removal [UNSAFE] Allow script to remove
document ids, e.g. sameAs.
-l,--limit <LIMIT> Amount of documents to process.
-n,--stats-num-ids <arg> Number of ids to print per entry in
STATISTICS.txt.
-p,--parameters <PARAMETER-FILE> Path to JSON file with parameters to
script
-r,--report <REPORT-DIR> Directory where reports are written
(defaults to "reports").
-s,--step Change one document at a time,
prompting to continue.
-T,--no-threads Do not use threads to parallellize
batch processing.
-t,--num-threads <N> Override default number of threads
(32).
-v,--validation <MODE> [UNSAFE] Set JSON-LD validation mode.
Defaults to ON. Possible values:
ON/OFF/LOG-ONLY
kwahlin marked this conversation as resolved.
Show resolved Hide resolved
```

* Use `--dry-run` to test your script without writing any changes.
Expand Down
46 changes: 34 additions & 12 deletions whelktool/src/main/groovy/whelk/datatool/WhelkTool.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ import whelk.IdGenerator
import whelk.JsonLd
import whelk.JsonLdValidator
import whelk.Whelk
import whelk.datatool.form.ModifiedThing
import whelk.datatool.form.Transform
import whelk.datatool.util.IdLoader
import whelk.exception.StaleUpdateException
Expand All @@ -21,12 +20,10 @@ import whelk.util.DocumentUtil
import whelk.util.LegacyIntegrationTools
import whelk.util.Statistics

import javax.print.Doc
import javax.script.Bindings
import javax.script.CompiledScript
import javax.script.ScriptEngineManager
import javax.script.SimpleBindings
import java.nio.charset.StandardCharsets
import java.time.ZonedDateTime
import java.util.concurrent.ConcurrentHashMap
import java.util.concurrent.ConcurrentLinkedQueue
Expand All @@ -41,7 +38,6 @@ import java.util.concurrent.TimeUnit
import java.util.concurrent.atomic.AtomicInteger

import static java.util.concurrent.TimeUnit.SECONDS
import static whelk.JsonLd.RECORD_KEY
import static whelk.util.Jackson.mapper

class WhelkTool {
Expand All @@ -50,6 +46,7 @@ class WhelkTool {
static final int DEFAULT_STATS_NUM_IDS = 3
public static final String MAIN_LOG_NAME = "MAIN.txt"
public static final String ERROR_LOG_NAME = "ERRORS.txt"
public static final String INVALID_LOG_NAME = "INVALID.txt"
public static final String MODIFIED_LOG_NAME = "MODIFIED.txt"
public static final String CREATED_LOG_NAME = "CREATED.txt"
public static final String DELETED_LOG_NAME = "DELETED.txt"
Expand All @@ -68,6 +65,8 @@ class WhelkTool {
File reportsDir
PrintWriter mainLog
PrintWriter errorLog

PrintWriter invalidLog
PrintWriter modifiedLog
PrintWriter createdLog
PrintWriter deletedLog
Expand All @@ -90,7 +89,14 @@ class WhelkTool {

boolean allowLoud
boolean allowIdRemoval
boolean skipValidation = false

/**
 * How JSON-LD validation results are handled before a document is saved.
 *
 * ON       - the document is validated; any validation error aborts with an exception.
 * OFF      - the document is not validated.
 * LOG_ONLY - the document is validated; errors are written to the logs and
 *            no exception is thrown.
 */
enum ValidationMode { ON, OFF, LOG_ONLY }

ValidationMode validationMode = ValidationMode.ON

Throwable errorDetected

Expand Down Expand Up @@ -123,7 +129,7 @@ class WhelkTool {
reportsDir.mkdirs()
mainLog = new PrintWriter(new File(reportsDir, MAIN_LOG_NAME))
errorLog = new PrintWriter(new File(reportsDir, ERROR_LOG_NAME))

invalidLog = new PrintWriter(new File(reportsDir, INVALID_LOG_NAME))
def modifiedLogFile = new File(reportsDir, MODIFIED_LOG_NAME)
modifiedLog = new PrintWriter(modifiedLogFile)
def createdLogFile = new File(reportsDir, CREATED_LOG_NAME)
Expand Down Expand Up @@ -562,7 +568,7 @@ class WhelkTool {
doc.setGenerationDate(new Date())
doc.setGenerationProcess(item.generationProcess ?: script.scriptJobUri)

if (!skipValidation) {
if (validationMode in [ValidationMode.ON, ValidationMode.LOG_ONLY]) {
validateJsonLd(doc)
}

Expand All @@ -582,7 +588,7 @@ class WhelkTool {
doc.setGenerationDate(new Date())
doc.setGenerationProcess(item.generationProcess ?: script.scriptJobUri)

if (!skipValidation) {
if (validationMode in [ValidationMode.ON, ValidationMode.LOG_ONLY]) {
validateJsonLd(doc)
}

Expand All @@ -600,7 +606,13 @@ class WhelkTool {
/**
 * Runs the JSON-LD validator on the given document and reacts to any errors
 * according to the current validationMode.
 *
 * With ValidationMode.ON an exception carrying the error details is thrown.
 * With ValidationMode.LOG_ONLY the document's short id is written to the
 * invalid log and the full message to the error log; nothing is thrown.
 * For any other mode the errors are ignored.
 *
 * @param doc the document to validate
 * @throws Exception if the document has validation errors and the mode is ON
 */
private void validateJsonLd(Document doc) {
    List<JsonLdValidator.Error> violations = validator.validate(doc.data, doc.getLegacyCollection(whelk.jsonld))
    if (!violations) {
        // Nothing to report: the validator returned no errors.
        return
    }

    String report = "Invalid JSON-LD in document ${doc.completeId}. Errors: ${violations.collect { it.toMap() }}"
    switch (validationMode) {
        case ValidationMode.ON:
            throw new Exception(report)
        case ValidationMode.LOG_ONLY:
            // One id per line in the invalid log; the details go to the error log.
            invalidLog.println(doc.shortId)
            errorLog.println(report)
            break
    }
}

Expand Down Expand Up @@ -707,7 +719,7 @@ class WhelkTool {
if (limit > -1) log " limit: $limit"
if (allowLoud) log " allowLoud"
if (allowIdRemoval) log " allowIdRemoval"
if (skipValidation) log " skipValidation"
log " validation: ${validationMode.name()}"
log()

bindings = createMainBindings()
Expand Down Expand Up @@ -773,7 +785,8 @@ class WhelkTool {
cli.l(longOpt: 'limit', args: 1, argName: 'LIMIT', 'Amount of documents to process.')
cli.a(longOpt: 'allow-loud', 'Allow scripts to do loud modifications.')
cli.idchg(longOpt: 'allow-id-removal', '[UNSAFE] Allow script to remove document ids, e.g. sameAs.')
cli.sv(longOpt: 'skip-validation', '[UNSAFE] Skip JSON-LD validation before saving to database.')
cli.v(longOpt: 'validation', args: 1, argName: 'MODE', '[UNSAFE] Set JSON-LD validation mode. Defaults to ON.' +
' Possible values: ON/OFF/LOG-ONLY')
cli.n(longOpt: 'stats-num-ids', args: 1, 'Number of ids to print per entry in STATISTICS.txt.')
cli.p(longOpt: 'parameters', args: 1, argName: 'PARAMETER-FILE', 'Path to JSON file with parameters to script')

Expand Down Expand Up @@ -810,7 +823,7 @@ class WhelkTool {
tool.limit = options.l ? Integer.parseInt(options.l) : -1
tool.allowLoud = options.a
tool.allowIdRemoval = options.idchg
tool.skipValidation = options.sv
tool.validationMode = parseValidationMode(options.v) ?: tool.validationMode
try {
tool.run()
} catch (Exception e) {
Expand All @@ -819,6 +832,15 @@ class WhelkTool {
}
}

/**
 * Resolves the textual value of the --validation option to a ValidationMode.
 *
 * Matching is case-insensitive, ignores surrounding whitespace and treats
 * '-' and '_' as equivalent, so both "LOG-ONLY" (the spelling shown in the
 * option's help text) and "LOG_ONLY" (the enum constant name) are accepted.
 *
 * NOTE(review): the call site passes the raw CLI option value; confirm that
 * an absent option reaches this method as null and not as a non-String value.
 *
 * @param arg the raw option value; may be null
 * @return the matching mode, or null if arg is null or matches no mode
 */
private static ValidationMode parseValidationMode(String arg) {
    if (arg == null) {
        return null
    }
    // Locale.ROOT keeps the upper-casing independent of the platform locale
    // (e.g. Turkish dotted/dotless i would otherwise break "log_only").
    String wanted = arg.trim().toUpperCase(Locale.ROOT).replace('-', '_')
    return ValidationMode.values().find { it.name() == wanted }
}

void recordChange(Document before, Document after, int number) {
//if (recordingLimit >= 0 && number > recordingLimit) {
if (recordingLimit >= 0 && recordedChanges.size() > recordingLimit) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import static whelk.datatool.WhelkTool.DELETED_LOG_NAME;
import static whelk.datatool.WhelkTool.MAIN_LOG_NAME;
import static whelk.datatool.WhelkTool.MODIFIED_LOG_NAME;
import static whelk.datatool.WhelkTool.ValidationMode.LOG_ONLY;
import static whelk.datatool.bulkchange.BulkJobDocument.JOB_TYPE;
import static whelk.datatool.bulkchange.BulkJobDocument.Status.Completed;
import static whelk.datatool.bulkchange.BulkJobDocument.Status.Failed;
Expand Down Expand Up @@ -163,6 +164,7 @@ protected WhelkTool buildWhelkTool(BulkJobDocument jobDoc) throws IOException {
tool.setDefaultChangedBy(jobDoc.getChangeAgentId());
tool.setAllowLoud(jobDoc.shouldUpdateModifiedTimestamp());
tool.setNoThreads(false);
tool.setValidationMode(LOG_ONLY);

return tool;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import java.nio.file.Files;
import java.util.List;

import static whelk.datatool.WhelkTool.ValidationMode.LOG_ONLY;
import static whelk.datatool.bulkchange.BulkJobDocument.JOB_TYPE;

public class BulkPreviewJob extends BulkJob {
Expand All @@ -30,6 +31,7 @@ public BulkPreviewJob(Whelk whelk, String id) throws IOException {
tool.setDryRun(true);
tool.setRecordChanges(true);
tool.setRecordingLimit(RECORD_MAX_ITEMS);
tool.setValidationMode(LOG_ONLY);
tool.setLogger(log);
}

Expand Down
Loading