Skip to content

Commit

Permalink
Add bulk script for removing topic subdivisions (#1513)
Browse files Browse the repository at this point in the history
  • Loading branch information
olovy authored Nov 11, 2024
1 parent d45be52 commit 6f75910
Show file tree
Hide file tree
Showing 6 changed files with 105 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import java.io.IOException;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
Expand Down Expand Up @@ -179,6 +178,7 @@ private Map<?,?> makePreviewChangeSet(RecordedChange recordChange) {
var id = (String) recordCopy.get(ID_KEY);

var result = getChangeSetsMap(beforeDoc, afterDoc, id);

((Map<String,Object>) DocumentUtil.getAtPath(result, List.of("changeSets", 0))).put("version",
beforeDoc.getThing());
((Map<String,Object>) DocumentUtil.getAtPath(result, List.of("changeSets", 1))).put("version",
Expand Down
7 changes: 7 additions & 0 deletions whelktool/src/main/groovy/whelk/datatool/WhelkTool.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import javax.script.Bindings
import javax.script.CompiledScript
import javax.script.ScriptEngineManager
import javax.script.SimpleBindings
import java.nio.charset.StandardCharsets
import java.time.ZonedDateTime
import java.util.concurrent.ConcurrentHashMap
import java.util.concurrent.ConcurrentLinkedQueue
Expand Down Expand Up @@ -762,6 +763,7 @@ class WhelkTool {
cli.idchg(longOpt: 'allow-id-removal', '[UNSAFE] Allow script to remove document ids, e.g. sameAs.')
cli.sv(longOpt: 'skip-validation', '[UNSAFE] Skip JSON-LD validation before saving to database.')
cli.n(longOpt: 'stats-num-ids', args: 1, 'Number of ids to print per entry in STATISTICS.txt.')
cli.p(longOpt: 'parameters', args: 1, argName: 'PARAMETER-FILE', 'Path to JSON file with parameters to script')

def options = cli.parse(args)
if (options.h) {
Expand All @@ -776,6 +778,11 @@ class WhelkTool {
Script script = null
try {
script = new FileScript(scriptPath)

String paramPath = options.p
if (paramPath) {
script.setParameters(mapper.readValue(new File(paramPath).getText("UTF-8"), Map))
}
}
catch (IOException e) {
System.err.println("Could not load script [$scriptPath] : $e")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ protected WhelkTool buildWhelkTool(BulkJobDocument jobDoc) throws IOException {
var bulkJobThingId = stripSuffix(id, HASH_IT) + HASH_IT;

Script script = jobDoc.getSpecification().getScript(bulkJobThingId);

WhelkTool tool = new WhelkTool(whelk, script, reportDir(systemId), WhelkTool.getDEFAULT_STATS_NUM_IDS());
// TODO for now setting changedBy only works for loud changes (!minorChange in PostgreSQLComponent)
tool.setDefaultChangedBy(jobDoc.getChangeAgentId());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ public enum SpecType implements JsonLdKey {
Update("bulk:Update"),
Delete("bulk:Delete"),
Create("bulk:Create"),
Merge("bulk:Merge");
Merge("bulk:Merge"),
Other("bulk:Other");

private final String key;

Expand Down Expand Up @@ -63,6 +64,7 @@ public String key() {
public static final String LABEL_KEY = "label";
public static final String KEEP_KEY = "bulk:keep";
public static final String DEPRECATE_KEY = "bulk:deprecate";
public static final String SCRIPT_KEY = "bulk:script";

private static final List<Object> STATUS_PATH = List.of(JsonLd.GRAPH_KEY, 1, STATUS_KEY);
private static final List<Object> UPDATE_TIMESTAMP_PATH = List.of(JsonLd.GRAPH_KEY, 1, SHOULD_UPDATE_TIMESTAMP_KEY);
Expand Down Expand Up @@ -128,6 +130,10 @@ public Specification getSpecification() {
get(spec, List.of(DEPRECATE_KEY, "*", ID_KEY), Collections.emptyList()),
get(spec, List.of(KEEP_KEY, ID_KEY), "")
);
case SpecType.Other -> new Specification.Other(
get(spec, SCRIPT_KEY, null),
spec
);
case null -> throw new ModelValidationException(String.format("Bad %s %s: %s",
CHANGE_SPEC_KEY, JsonLd.TYPE_KEY, specType));
};
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package whelk.datatool.bulkchange;

import com.google.common.collect.Maps;
import org.apache.commons.io.IOUtils;
import whelk.Document;
import whelk.Whelk;
Expand All @@ -11,6 +12,7 @@
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

Expand All @@ -21,7 +23,7 @@
import static whelk.datatool.bulkchange.BulkJobDocument.DEPRECATE_KEY;
import static whelk.datatool.bulkchange.BulkJobDocument.TARGET_FORM_KEY;

public sealed interface Specification permits Specification.Create, Specification.Delete, Specification.Merge, Specification.Update {
public sealed interface Specification permits Specification.Create, Specification.Delete, Specification.Merge, Specification.Update, Specification.Other {

Script getScript(String bulkJobId);

Expand Down Expand Up @@ -123,6 +125,26 @@ public Script getScript(String bulkJobId) {
}
}

/**
 * Specification that runs one of a fixed set of named scripts loaded from the classpath.
 *
 * @param name       script name without the ".groovy" suffix; may be null when the
 *                   specification lacks a script name, in which case {@link #getScript} rejects it
 * @param parameters candidate script parameters; only keys allow-listed for the
 *                   named script are handed to the script
 */
record Other(String name, Map<String, ?> parameters) implements Specification {
    // Allow-list: script name -> parameter keys that script is permitted to receive
    private static final Map<String, List<String>> ALLOWED_SCRIPTS_PARAMS = Map.of(
            "removeTopicSubdivision", List.of(DEPRECATE_KEY, KEEP_KEY)
    );

    /**
     * Builds the script for this specification with its allow-listed parameters set.
     *
     * @param bulkJobId id passed on to the created {@link Script}
     * @return the script with filtered parameters applied
     * @throws IllegalArgumentException if {@code name} is null or not an allow-listed script
     */
    @Override
    public Script getScript(String bulkJobId) {
        // Maps created with Map.of(...) throw NullPointerException on null lookups,
        // so a missing script name must be checked before consulting the allow-list.
        List<String> allowedParams = name == null ? null : ALLOWED_SCRIPTS_PARAMS.get(name);
        if (allowedParams == null) {
            throw new IllegalArgumentException("Script " + name + " not supported");
        }

        Script s = new Script(loadClasspathScriptSource(name + ".groovy"), bulkJobId);

        // Copy only the allow-listed keys that are actually present in the specification
        Map<Object, Object> params = new HashMap<>();
        for (String key : allowedParams) {
            if (parameters.containsKey(key)) {
                params.put(key, parameters.get(key));
            }
        }
        s.setParameters(params);
        return s;
    }
}

private static String loadClasspathScriptSource(String scriptName) {
String path = "bulk-change-scripts/" + scriptName;
try (InputStream scriptStream = Specification.class.getClassLoader().getResourceAsStream(path)) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/**
 * Remove all uses of a certain TopicSubdivision within ComplexSubject entities.
 * The TopicSubdivision itself is not removed, only its usages.
 * A ComplexSubject left with a single term is replaced by that term;
 * a ComplexSubject left with no terms is removed.
 *
 * Parameters:
 * bulk:deprecate - The subdivision(s) to be removed
 * bulk:keep - If specified, add this regular Topic to :subject instead
 */

import whelk.JsonLd
import whelk.util.DocumentUtil

import static whelk.JsonLd.ID_KEY
import static whelk.datatool.bulkchange.BulkJobDocument.DEPRECATE_KEY
import static whelk.datatool.bulkchange.BulkJobDocument.KEEP_KEY

// Script parameters: link(s) to the subdivision(s) being removed and an optional replacement link
List subdivisionLinks = asList(parameters.get(DEPRECATE_KEY))
Map replacementLink = parameters.get(KEEP_KEY)

subdivisionLinks.each { subdivisionLink ->
    selectByIds([subdivisionLink[ID_KEY]]) { subdivisionDoc ->
        selectByIds(subdivisionDoc.getDependers()) { dependerDoc ->
            Map mainEntity = dependerDoc.graph[1] as Map

            // Dependers whose main entity is itself a ComplexSubject are left untouched
            if (mainEntity[JsonLd.TYPE_KEY] == 'ComplexSubject') {
                return
            }

            def changed = DocumentUtil.traverse(mainEntity) { node, nodePath ->
                boolean isComplexSubject = node instanceof Map && node[JsonLd.TYPE_KEY] == 'ComplexSubject'
                if (!isComplexSubject) {
                    return DocumentUtil.NOP
                }

                var terms = asList(node.get('termComponentList'))
                if (!(subdivisionLink in terms)) {
                    return DocumentUtil.NOP
                }

                // TODO? add way to do this with an op? SplitReplace? [Replace, Insert]?
                // Add the replacement link to the list holding this ComplexSubject, unless already there
                if (replacementLink && nodePath.size() > 1) {
                    var container = DocumentUtil.getAtPath(mainEntity, nodePath.dropRight(1))
                    if (container instanceof List && !container.contains(replacementLink)) {
                        container.add(replacementLink)
                    }
                }

                return mapSubject(node, terms, subdivisionLink)
            }

            if (changed) {
                dependerDoc.scheduleSave(loud: isLoudAllowed)
            }
        }
    }
}

/**
 * Decides what to do with a ComplexSubject once the deprecated link is taken
 * out of its term component list.
 *
 * No terms left: the subject is removed. One term left: the subject is replaced
 * by that term. Otherwise the subject is replaced by a copy carrying the reduced list.
 *
 * @param subject the ComplexSubject map being rewritten
 * @param termComponentList the subject's current terms
 * @param deprecateLink the link to drop from the terms
 * @return the operation to apply in place of the subject
 */
static DocumentUtil.Operation mapSubject(Map subject, termComponentList, deprecateLink) {
    var remainingTerms = termComponentList.findAll { term -> term != deprecateLink }
    int remainingCount = remainingTerms.size()

    if (remainingCount == 0) {
        return new DocumentUtil.Remove()
    } else if (remainingCount == 1) {
        return new DocumentUtil.Replace(remainingTerms.first())
    }

    // Copy so the original subject map is not mutated
    Map reducedSubject = new HashMap(subject)
    reducedSubject.put('termComponentList', remainingTerms)
    return new DocumentUtil.Replace(reducedSubject)
}

0 comments on commit 6f75910

Please sign in to comment.