More flexible subdivision job (#1520)

- Enable bulk remove/replace blank topic subdivisions
- Allow removing any :Subdivision subtype
- Allow removing a combination of multiple :Subdivision within a ComplexSubject
- Allow removing both linked and/or blank :Subdivision
- Allow adding any :Subject subtype
- Allow adding either linked or blank :Subject
libris · Nov 18, 2024 · 9177045 · 9177045
1 parent 4eaf8c8
commit 9177045
Showing 12 changed files with 159 additions and 137 deletions.
diff --git a/whelk-core/src/main/groovy/whelk/component/SparqlQueryClient.groovy b/whelk-core/src/main/groovy/whelk/component/SparqlQueryClient.groovy
@@ -7,7 +7,9 @@ import org.apache.jena.query.QueryExecutionFactory
 import org.apache.jena.query.ResultSet
 import whelk.Document
 import whelk.JsonLd
+import whelk.converter.JsonLdToTrigSerializer
 
+import static java.nio.charset.StandardCharsets.UTF_8
 import static trld.trig.Serializer.collectPrefixes
 
 @Log

diff --git a/whelk-core/src/main/groovy/whelk/converter/JsonLDTurtleConverter.groovy b/whelk-core/src/main/groovy/whelk/converter/JsonLDTurtleConverter.groovy
@@ -3,10 +3,8 @@ package whelk.converter
 import groovy.util.logging.Log4j2 as Log
 import whelk.JsonLd
 import whelk.Whelk
-import whelk.component.PostgreSQLComponent
-import whelk.util.PropertyLoader
 
-import static whelk.util.Jackson.mapper
+import static java.nio.charset.StandardCharsets.UTF_8
 
 @Log
 class JsonLDTurtleConverter implements FormatConverter {
@@ -20,7 +18,25 @@ class JsonLDTurtleConverter implements FormatConverter {
     }
 
     Map convert(Map source, String id) { // id is not used in the lines shown here
-        def bytes = JsonLdToTrigSerializer.toTurtle(null, source, base).toByteArray()
-        return [(JsonLd.NON_JSON_CONTENT_KEY) : (new String(bytes, "UTF-8"))]
+        return [(JsonLd.NON_JSON_CONTENT_KEY) : _toTurtle(source, null, base, false)] // null context, prefix lines kept (skipPrelude = false)
+    }
+
+    static String toTurtle(Map source, Map context, boolean skipPrelude) { // Serializes source as Turtle using the given context
+        return _toTurtle(source, context, null, skipPrelude) // base is passed as null; skipPrelude = true drops the prefix lines
+    }
+
+    private static String _toTurtle(Map source, Map context, base, boolean skipPrelude) { // Shared implementation behind convert() and toTurtle()
+        def bytes = JsonLdToTrigSerializer.toTurtle(context, source, base).toByteArray()
+        def s = new String(bytes, UTF_8) // decode the serializer output as UTF-8
+        // Add skip prelude flag in trld.trig.SerializerState.serialize?
+        return skipPrelude ? withoutPrefixes(s) : s // prefix lines are stripped after serialization, only when requested
+    }
+
+    private static String withoutPrefixes(String ttl) { // Returns ttl without the lines that start with 'prefix'
+        return ttl.readLines()
+                .split { it.startsWith('prefix') } // Groovy split(Closure) yields [matching lines, non-matching lines]
+                .get(1) // keep the non-matching lines, i.e. everything that is not a prefix line
+                .join('\n')
+                .trim()
     }
 }
diff --git a/whelk-core/src/main/groovy/whelk/filter/LinkFinder.groovy b/whelk-core/src/main/groovy/whelk/filter/LinkFinder.groovy
@@ -19,6 +19,12 @@ class LinkFinder {
 
     static String ENTITY_QUERY
 
+    /*
+    Non-primary ids appearing in these paths should be kept as is upon normalization, i.e. they should *not* be
+    replaced by their primary id.
+     */
+    private static Set<String> RETAIN_NON_PRIMARY_IDS = ['bulk:changeSpec.bulk:deprecate'] as Set
+
     LinkFinder(PostgreSQLComponent pgsql) {
         postgres = pgsql
         ENTITY_QUERY = """SELECT ids2.iri AS thingUri
@@ -115,8 +121,6 @@ class LinkFinder {
     }
 
     private void replaceSameAsLinksWithPrimaries(Map data, List path = []) {
-        def exceptedPaths = ['bulk:changeSpec.bulk:deprecate'] as Set
-
         // If this is a link (an object containing _only_ an id)
         String id = data.get("@id")
         if (id != null && data.keySet().size() == 1) {
@@ -127,7 +131,7 @@ class LinkFinder {
             )
                     .findAll { it instanceof String }
                     .join('.')
-            if (exceptedPaths.contains(normalizedPath)) {
+            if (RETAIN_NON_PRIMARY_IDS.contains(normalizedPath)) {
                 return
             }
             String primaryId = lookupPrimaryId(id, normalizedPath)

diff --git a/whelktool/src/main/groovy/whelk/datatool/WhelkTool.gdsl b/whelktool/src/main/groovy/whelk/datatool/WhelkTool.gdsl
@@ -41,10 +41,6 @@ contributor(ctx) {
     method name:"selectByIds", params:['ids':Collection<String>, 'process':DC, 'batchSize':int], type:void
     method name:"selectByIds", params:['ids':Collection<String>, 'process':DC, 'silent':boolean], type:void
     method name:"selectByIds", params:['ids':Collection<String>, 'process':DC], type:void
-    method name:"selectByForm", params:['form':Map, 'process':DC, 'batchSize':int, 'silent':boolean], type:void
-    method name:"selectByForm", params:['form':Map, 'process':DC, 'batchSize':int], type:void
-    method name:"selectByForm", params:['form':Map, 'process':DC, 'silent':boolean], type:void
-    method name:"selectByForm", params:['form':Map, 'process':DC], type:void
     method name:"selectBySqlWhere", params:['whereClause':String, 'batchSize':int, 'silent':boolean, 'process':DC], type:void
     method name:"selectBySqlWhere", params:['whereClause':String, 'batchSize':int, 'process':DC], type:void
     method name:"selectBySqlWhere", params:['whereClause':String, 'silent':boolean, 'process':DC], type:void

diff --git a/whelktool/src/main/groovy/whelk/datatool/WhelkTool.groovy b/whelktool/src/main/groovy/whelk/datatool/WhelkTool.groovy
@@ -191,18 +191,6 @@ class WhelkTool {
                 batchSize, [1: idItems, 2: collection])
     }
 
-    void selectByForm(Map form, Closure process,
-                     int batchSize = DEFAULT_BATCH_SIZE, boolean silent = false) {
-        if (!silent) {
-            log "Select by form"
-        }
-
-        var sparqlPattern = new Transform.MatchForm(form, whelk).getSparqlPattern(whelk.jsonld.context)
-        var ids = whelk.sparqlQueryClient.queryIdsByPattern(sparqlPattern)
-
-        selectByIds(ids, process, batchSize, silent)
-    }
-
     DocumentItem create(Map data) {
         Document doc = new Document(data)
         doc.deepReplaceId(Document.BASE_URI.toString() + IdGenerator.generate())
@@ -671,7 +659,6 @@ class WhelkTool {
         bindings.put("selectByIds", this.&selectByIds)
         bindings.put("selectByIdsAndCollection", this.&selectByIdsAndCollection)
         bindings.put("selectBySqlWhere", this.&selectBySqlWhere)
-        bindings.put("selectByForm", this.&selectByForm)
         bindings.put("selectFromIterable", this.&selectFromIterable)
         bindings.put("create", this.&create)
         bindings.put("queryIds", this.&queryIds)

diff --git a/whelktool/src/main/groovy/whelk/datatool/form/Transform.groovy b/whelktool/src/main/groovy/whelk/datatool/form/Transform.groovy
@@ -4,19 +4,19 @@ import groovy.transform.Memoized
 import whelk.Document
 import whelk.JsonLd
 import whelk.Whelk
-import whelk.converter.JsonLdToTrigSerializer
 import whelk.datatool.util.DocumentComparator
 import whelk.datatool.util.IdLoader
 import whelk.util.DocumentUtil
 
-import static java.nio.charset.StandardCharsets.UTF_8
+import static whelk.JsonLd.GRAPH_KEY
 import static whelk.JsonLd.ID_KEY
 import static whelk.JsonLd.RECORD_KEY
 import static whelk.JsonLd.RECORD_TYPE
 import static whelk.JsonLd.THING_KEY
 import static whelk.JsonLd.TYPE_KEY
 import static whelk.JsonLd.asList
 import static whelk.component.SparqlQueryClient.GRAPH_VAR
+import static whelk.converter.JsonLDTurtleConverter.toTurtle
 import static whelk.util.DocumentUtil.getAtPath
 import static whelk.util.LegacyIntegrationTools.getMarcCollectionInHierarchy
 
@@ -180,11 +180,9 @@ class Transform {
         thing[ID_KEY] = getThingTmpId()
         record[THING_KEY] = [(ID_KEY): getThingTmpId()]
 
-        def ttl = ((ByteArrayOutputStream) JsonLdToTrigSerializer.toTurtle(context, [record, thing]))
-                .toByteArray()
-                .with { new String(it, UTF_8) }
-        // Add skip prelude flag to JsonLdToTrigSerializer.toTurtle?
-                .with { withoutPrefixes(it) }
+        Map graph = [(GRAPH_KEY): [record, thing]]
+
+        String ttl = toTurtle(graph, context, true)
 
         return insertTypeMappings(insertIdMappings(insertVars(ttl)))
     }
@@ -254,14 +252,6 @@ class Transform {
                 : "?${bNodeId.replace('#', '')}"
     }
 
-    private static String withoutPrefixes(String ttl) {
-        ttl.readLines()
-                .split { it.startsWith('prefix') }
-                .get(1)
-                .join('\n')
-                .trim()
-    }
-
     Map<String, Set<String>> collectNodeIdMappings(Whelk whelk) {
         return collectNodeIdMappings(matchForm, whelk)
     }

diff --git a/whelktool/src/main/java/whelk/datatool/bulkchange/BulkJobDocument.java b/whelktool/src/main/java/whelk/datatool/bulkchange/BulkJobDocument.java
@@ -69,6 +69,8 @@ public String key() {
     public static final String ADD_KEY = "bulk:add";
     public static final String KEEP_KEY = "bulk:keep";
     public static final String DEPRECATE_KEY = "bulk:deprecate";
+    public static final String REMOVE_SUBDIVISION_KEY = "bulk:removeSubdivision";
+    public static final String ADD_SUBJECT_KEY = "bulk:addSubject";
     public static final String SCRIPT_KEY = "bulk:script";
     public static final String EXECUTION_KEY = "bulk:execution";
     public static final String EXECUTION_TYPE = "bulk:Execution";

diff --git a/whelktool/src/main/java/whelk/datatool/bulkchange/Specification.java b/whelktool/src/main/java/whelk/datatool/bulkchange/Specification.java
@@ -11,17 +11,17 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
-import java.util.Collection;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 
 import static whelk.JsonLd.GRAPH_KEY;
 import static whelk.JsonLd.RECORD_KEY;
-import static whelk.datatool.bulkchange.BulkJobDocument.ADD_KEY;
+import static whelk.datatool.bulkchange.BulkJobDocument.ADD_SUBJECT_KEY;
 import static whelk.datatool.bulkchange.BulkJobDocument.KEEP_KEY;
 import static whelk.datatool.bulkchange.BulkJobDocument.MATCH_FORM_KEY;
 import static whelk.datatool.bulkchange.BulkJobDocument.DEPRECATE_KEY;
+import static whelk.datatool.bulkchange.BulkJobDocument.REMOVE_SUBDIVISION_KEY;
 import static whelk.datatool.bulkchange.BulkJobDocument.TARGET_FORM_KEY;
 
 public sealed interface Specification permits Specification.Create, Specification.Delete, Specification.Merge, Specification.Update, Specification.Other {
@@ -49,6 +49,10 @@ public Script getScript(String bulkJobId) {
             return s;
         }
 
+        public List<String> findIds(Whelk whelk) { // Ids returned by the SPARQL query built from this specification's Transform
+            return queryIds(getTransform(whelk), whelk);
+        }
+
         @SuppressWarnings("unchecked")
         public boolean modify(Document doc, Whelk whelk) {
             Map<String, Object> thing = doc.getThing();
@@ -95,7 +99,11 @@ public boolean matches(Document doc, Whelk whelk) {
             return getMatchForm(whelk).matches(thing);
         }
 
-        public Transform.MatchForm getMatchForm(Whelk whelk) {
+        public List<String> findIds(Whelk whelk) { // Ids returned by the SPARQL query built from this specification's match form
+            return queryIds(getMatchForm(whelk), whelk);
+        }
+
+        private Transform.MatchForm getMatchForm(Whelk whelk) {
             if (matchFormObj == null) {
                 matchFormObj = new Transform.MatchForm(matchForm, whelk);
             }
@@ -128,7 +136,7 @@ public Script getScript(String bulkJobId) {
 
     record Other(String name, Map<String, ?> parameters) implements Specification {
         private static final Map<String, List<String>> ALLOWED_SCRIPTS_PARAMS = Map.of(
-                "removeTopicSubdivision", List.of(DEPRECATE_KEY, ADD_KEY)
+                "removeSubdivision", List.of(REMOVE_SUBDIVISION_KEY, ADD_SUBJECT_KEY)
         );
 
         @Override
@@ -155,4 +163,9 @@ private static String loadClasspathScriptSource(String scriptName) {
             throw new RuntimeException(e);
         }
     }
+
+    private static List<String> queryIds(Transform transform, Whelk whelk) { // Shared helper for the findIds methods of the specifications
+        return whelk.getSparqlQueryClient()
+                .queryIdsByPattern(transform.getSparqlPattern(whelk.getJsonld().context)); // the pattern is built from the transform using the JSON-LD context
+    }
 }
diff --git a/whelktool/src/main/resources/bulk-change-scripts/delete.groovy b/whelktool/src/main/resources/bulk-change-scripts/delete.groovy
@@ -5,9 +5,11 @@ import static whelk.datatool.bulkchange.BulkJobDocument.MATCH_FORM_KEY
 Map matchForm = parameters.get(MATCH_FORM_KEY)
 
 Specification.Delete delete = new Specification.Delete(matchForm)
+List<String> ids = delete.findIds(getWhelk()) // candidate ids now come from the specification's own query
 
-selectByForm(matchForm) {
+selectByIds(ids) {
     if(delete.matches(it.doc, it.whelk)) { // each candidate is re-checked against the match form before deletion
         it.scheduleDelete(loud: isLoudAllowed)
     }
-}
+}
+
diff --git a/whelktool/src/main/resources/bulk-change-scripts/removeSubdivision.groovy b/whelktool/src/main/resources/bulk-change-scripts/removeSubdivision.groovy
@@ -0,0 +1,99 @@
+/**
+ * Remove the given Subdivision(s) from every ComplexSubject whose termComponentList contains all of them.
+ * The Subdivision itself is not removed, only the usages.
+ *
+ * Parameters:
+ * bulk:removeSubdivision - The subdivision(s) to be removed
+ * bulk:addSubject - If specified, add this regular Subject to :subject instead
+ */
+
+import whelk.JsonLd
+import whelk.Whelk
+import whelk.util.DocumentUtil
+
+import static whelk.JsonLd.GRAPH_KEY
+import static whelk.JsonLd.ID_KEY
+import static whelk.JsonLd.asList
+import static whelk.converter.JsonLDTurtleConverter.toTurtle
+import static whelk.datatool.bulkchange.BulkJobDocument.ADD_SUBJECT_KEY
+import static whelk.datatool.bulkchange.BulkJobDocument.REMOVE_SUBDIVISION_KEY
+
+List<Map> removeSubdivision = asList(parameters.get(REMOVE_SUBDIVISION_KEY)) // subdivision(s) to remove; presumably asList wraps a single value in a list - confirm
+Map addSubject = parameters.get(ADD_SUBJECT_KEY) // optional replacement subject; may be absent
+
+def process = { doc -> // Handler applied to every selected document: rewrites matching ComplexSubjects and schedules a save
+    Map thing = doc.graph[1] as Map // second @graph entry; presumably the main entity - confirm
+
+    if (thing[JsonLd.TYPE_KEY] == 'ComplexSubject') { // a document that is itself a ComplexSubject is left untouched
+        return
+    }
+
+    Set<List> modifiedListPaths = [] as Set // paths of lists that directly contained a matching ComplexSubject
+    def modified = DocumentUtil.traverse(thing) { value, path ->
+        if (value instanceof Map && value[JsonLd.TYPE_KEY] == 'ComplexSubject') {
+            var t = asList(value.get('termComponentList'))
+            if (t.containsAll(removeSubdivision)) { // match only when every requested subdivision is present
+                var parentPath = path.size() > 1 ? path.dropRight(1) : null
+                if (parentPath) {
+                    var parent = DocumentUtil.getAtPath(thing, parentPath)
+                    if (parent instanceof List) { // NOTE(review): addSubject is only added when the ComplexSubject sits directly in a list
+                        modifiedListPaths.add(parentPath)
+                        if (addSubject) {
+                            parent.add(addSubject) // may be added once per match; duplicates are dropped below
+                        }
+                    }
+                }
+
+                return mapSubject(value, t, removeSubdivision)
+            }
+        }
+        return DocumentUtil.NOP // every other value is returned as NOP
+    }
+
+    // Remove duplicates
+    modifiedListPaths.each {
+        var obj = DocumentUtil.getAtPath(thing, it)
+        if (obj instanceof List) {
+            obj.unique(true) // true = remove duplicates in place
+        }
+    }
+
+    if (modified) {
+        doc.scheduleSave(loud: isLoudAllowed)
+    }
+}
+
+Set<String> ids = Collections.synchronizedSet([] as Set<String>) // filled from inside selectByIds closures; NOTE(review): presumably synchronized because those may run concurrently
+removeSubdivision.each { subdivision ->
+    if (subdivision[ID_KEY]) { // linked subdivision: collect the ids of everything that depends on it
+        selectByIds([subdivision[ID_KEY]]) { obsoleteSubdivision ->
+            ids.addAll(obsoleteSubdivision.getDependers())
+        }
+    } else { // blank subdivision: query ids with a pattern built from the subdivision itself
+        Whelk whelk = getWhelk()
+        ids.addAll(whelk.sparqlQueryClient.queryIdsByPattern(asTurtle((Map) subdivision, whelk.jsonld.context)))
+    }
+}
+
+selectByIds(ids) { // run the handler over every collected id
+    process(it)
+}
+
+static DocumentUtil.Operation mapSubject(Map subject, termComponentList, removeSubdivision) { // Decides what replaces a matched ComplexSubject
+    var t2 = termComponentList.findAll { !removeSubdivision.contains(it) } // the components that remain after removal
+    if (t2.size() == 0) {
+        return new DocumentUtil.Remove() // nothing left: remove the subject
+    }
+    if (t2.size() == 1) {
+        return new DocumentUtil.Replace(t2.first()) // one left: the lone component replaces the ComplexSubject
+    }
+
+    Map result = new HashMap(subject) // shallow copy, so the passed-in map is not modified here
+    result.termComponentList = t2
+    return new DocumentUtil.Replace(result)
+}
+
+static String asTurtle(Map thing, Map context) { // Turtle for thing, with prefix lines skipped; used as the query pattern for blank subdivisions
+    Map graph = [(GRAPH_KEY): [[:], thing]] // empty map in the first @graph slot, thing in the second
+    return toTurtle(graph, context, true)
+}