From db625ec5afd1faa6d266863ec5316b429a80a403 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Tue, 19 Nov 2024 13:52:37 +0100 Subject: [PATCH 01/13] Require equal inScheme in ComplexSubject and Subdivision --- .../bulk-change-scripts/removeSubdivision.groovy | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/whelktool/src/main/resources/bulk-change-scripts/removeSubdivision.groovy b/whelktool/src/main/resources/bulk-change-scripts/removeSubdivision.groovy index b08a02873e..4fcbfd6dd0 100644 --- a/whelktool/src/main/resources/bulk-change-scripts/removeSubdivision.groovy +++ b/whelktool/src/main/resources/bulk-change-scripts/removeSubdivision.groovy @@ -18,7 +18,12 @@ import static whelk.converter.JsonLDTurtleConverter.toTurtle import static whelk.datatool.bulkchange.BulkJobDocument.ADD_SUBJECT_KEY import static whelk.datatool.bulkchange.BulkJobDocument.REMOVE_SUBDIVISION_KEY -List removeSubdivision = asList(parameters.get(REMOVE_SUBDIVISION_KEY)) +String inScheme +List removeSubdivision = asList(parameters.get(REMOVE_SUBDIVISION_KEY)).collect { + Map copy = new HashMap((Map) it) + inScheme = copy.remove('inScheme') + return copy +} Map addSubject = parameters.get(ADD_SUBJECT_KEY) def process = { doc -> @@ -32,7 +37,7 @@ def process = { doc -> def modified = DocumentUtil.traverse(thing) { value, path -> if (value instanceof Map && value[JsonLd.TYPE_KEY] == 'ComplexSubject') { var t = asList(value.get('termComponentList')) - if (t.containsAll(removeSubdivision)) { + if (inScheme == value['inScheme'] && t.containsAll(removeSubdivision)) { var parentPath = path.size() > 1 ? path.dropRight(1) : null if (parentPath) { var parent = DocumentUtil.getAtPath(thing, parentPath) From 80382f20436d13e68ea60ce845304feafc87a532 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Tue, 19 Nov 2024 14:54:14 +0100 Subject: [PATCH 02/13] Copy inScheme from ComplexSubject into replacement Subject --- .../bulk-change-scripts/removeSubdivision.groovy | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/whelktool/src/main/resources/bulk-change-scripts/removeSubdivision.groovy b/whelktool/src/main/resources/bulk-change-scripts/removeSubdivision.groovy index 4fcbfd6dd0..acc57a1351 100644 --- a/whelktool/src/main/resources/bulk-change-scripts/removeSubdivision.groovy +++ b/whelktool/src/main/resources/bulk-change-scripts/removeSubdivision.groovy @@ -84,16 +84,20 @@ selectByIds(ids) { process(it) } -static DocumentUtil.Operation mapSubject(Map subject, termComponentList, removeSubdivision) { +static DocumentUtil.Operation mapSubject(Map complexSubject, termComponentList, removeSubdivision) { var t2 = termComponentList.findAll { !removeSubdivision.contains(it) } if (t2.size() == 0) { return new DocumentUtil.Remove() } if (t2.size() == 1) { - return new DocumentUtil.Replace(t2.first()) + def remaining = t2.first() + if (complexSubject['inScheme']) { + remaining['inScheme'] = complexSubject['inScheme'] + } + return new DocumentUtil.Replace(remaining) } - Map result = new HashMap(subject) + Map result = new HashMap(complexSubject) result.termComponentList = t2 return new DocumentUtil.Replace(result) } From 863a14dec5908d5fb5b88e556eaed3f3cd374b6a Mon Sep 17 00:00:00 2001 From: kwahlin Date: Tue, 19 Nov 2024 17:47:58 +0100 Subject: [PATCH 03/13] Check inScheme equality only if given in blank Subdivision --- .../main/resources/bulk-change-scripts/removeSubdivision.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/whelktool/src/main/resources/bulk-change-scripts/removeSubdivision.groovy b/whelktool/src/main/resources/bulk-change-scripts/removeSubdivision.groovy index acc57a1351..5555eec5c9 100644 --- a/whelktool/src/main/resources/bulk-change-scripts/removeSubdivision.groovy +++ b/whelktool/src/main/resources/bulk-change-scripts/removeSubdivision.groovy @@ -37,7 +37,7 @@ def process = { doc -> def modified = DocumentUtil.traverse(thing) { value, path -> if (value instanceof Map && value[JsonLd.TYPE_KEY] == 'ComplexSubject') { var t = asList(value.get('termComponentList')) - if (inScheme == value['inScheme'] && t.containsAll(removeSubdivision)) { + if ((!inScheme || inScheme == value['inScheme']) && t.containsAll(removeSubdivision)) { var parentPath = path.size() > 1 ? path.dropRight(1) : null if (parentPath) { var parent = DocumentUtil.getAtPath(thing, parentPath) From c1056e50d5e5e84ae9749742f728b56e1054c0ba Mon Sep 17 00:00:00 2001 From: kwahlin Date: Tue, 19 Nov 2024 18:34:42 +0100 Subject: [PATCH 04/13] Select only records containing a combination of given subdivisions to remove --- .../bulk-change-scripts/removeSubdivision.groovy | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/whelktool/src/main/resources/bulk-change-scripts/removeSubdivision.groovy b/whelktool/src/main/resources/bulk-change-scripts/removeSubdivision.groovy index 5555eec5c9..355270cd5f 100644 --- a/whelktool/src/main/resources/bulk-change-scripts/removeSubdivision.groovy +++ b/whelktool/src/main/resources/bulk-change-scripts/removeSubdivision.groovy @@ -7,14 +7,14 @@ * bulk:addSubject - If specified, add this regular Subject to :subject instead */ + import whelk.JsonLd import whelk.Whelk import whelk.util.DocumentUtil -import static whelk.JsonLd.GRAPH_KEY import static whelk.JsonLd.ID_KEY import static whelk.JsonLd.asList -import static whelk.converter.JsonLDTurtleConverter.toTurtle +import static whelk.converter.JsonLDTurtleConverter.toTurtleData import static whelk.datatool.bulkchange.BulkJobDocument.ADD_SUBJECT_KEY import static whelk.datatool.bulkchange.BulkJobDocument.REMOVE_SUBDIVISION_KEY @@ -68,15 +68,15 @@ def process = { doc -> } } -Set ids = Collections.synchronizedSet([] as Set) +Set ids = [] as Set removeSubdivision.each { subdivision -> if (subdivision[ID_KEY]) { selectByIds([subdivision[ID_KEY]]) { obsoleteSubdivision -> - ids.addAll(obsoleteSubdivision.getDependers()) + ids = ids.intersect(obsoleteSubdivision.getDependers()) as Set } } else { Whelk whelk = getWhelk() - ids.addAll(whelk.sparqlQueryClient.queryIdsByPattern(asTurtle((Map) subdivision, whelk.jsonld.context))) + ids = ids.intersect(whelk.sparqlQueryClient.queryIdsByPattern(toTurtleData((Map) subdivision, whelk.jsonld.context))) } } @@ -100,9 +100,4 @@ static DocumentUtil.Operation mapSubject(Map complexSubject, termComponentList, Map result = new HashMap(complexSubject) result.termComponentList = t2 return new DocumentUtil.Replace(result) -} - -static String asTurtle(Map thing, Map context) { - Map graph = [(GRAPH_KEY): [[:], thing]] - return toTurtle(graph, context, true) } \ No newline at end of file From 01a609244e22bddc6bbaa86919c68c438bb3df99 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Tue, 19 Nov 2024 18:35:00 +0100 Subject: [PATCH 05/13] Refactor turtle conversion methods --- .../whelk/converter/JsonLDTurtleConverter.groovy | 13 ++++++------- .../groovy/whelk/datatool/form/Transform.groovy | 4 ++-- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/whelk-core/src/main/groovy/whelk/converter/JsonLDTurtleConverter.groovy b/whelk-core/src/main/groovy/whelk/converter/JsonLDTurtleConverter.groovy index 7508f1779d..f315089c97 100644 --- a/whelk-core/src/main/groovy/whelk/converter/JsonLDTurtleConverter.groovy +++ b/whelk-core/src/main/groovy/whelk/converter/JsonLDTurtleConverter.groovy @@ -18,18 +18,17 @@ class JsonLDTurtleConverter implements FormatConverter { } Map convert(Map source, String id) { - return [(JsonLd.NON_JSON_CONTENT_KEY) : _toTurtle(source, null, base, false)] + return [(JsonLd.NON_JSON_CONTENT_KEY) : toTurtle(source, null, base)] } - static String toTurtle(Map source, Map context, boolean skipPrelude) { - return _toTurtle(source, context, null, skipPrelude) + static String toTurtleData(source, Map context) { + // Add skip prelude flag in trld.trig.SerializerState.serialize? + return withoutPrefixes(toTurtle(source, context, null)) } - private static String _toTurtle(Map source, Map context, base, boolean skipPrelude) { + private static String toTurtle(source, Map context, base) { def bytes = JsonLdToTrigSerializer.toTurtle(context, source, base).toByteArray() - def s = new String(bytes, UTF_8) - // Add skip prelude flag in trld.trig.SerializerState.serialize? - return skipPrelude ? withoutPrefixes(s) : s + return new String(bytes, UTF_8) } private static String withoutPrefixes(String ttl) { diff --git a/whelktool/src/main/groovy/whelk/datatool/form/Transform.groovy b/whelktool/src/main/groovy/whelk/datatool/form/Transform.groovy index dbb9dab809..e8f6e2d1af 100644 --- a/whelktool/src/main/groovy/whelk/datatool/form/Transform.groovy +++ b/whelktool/src/main/groovy/whelk/datatool/form/Transform.groovy @@ -16,7 +16,7 @@ import static whelk.JsonLd.THING_KEY import static whelk.JsonLd.TYPE_KEY import static whelk.JsonLd.asList import static whelk.component.SparqlQueryClient.GRAPH_VAR -import static whelk.converter.JsonLDTurtleConverter.toTurtle +import static whelk.converter.JsonLDTurtleConverter.toTurtleData import static whelk.util.DocumentUtil.getAtPath import static whelk.util.LegacyIntegrationTools.getMarcCollectionInHierarchy @@ -182,7 +182,7 @@ class Transform { Map graph = [(GRAPH_KEY): [record, thing]] - String ttl = toTurtle(graph, context, true) + String ttl = toTurtleData(graph, context) return insertTypeMappings(insertIdMappings(insertVars(ttl))) } From 21aaa42e958167a44c24c08a643546fb5da97349 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Tue, 19 Nov 2024 18:52:32 +0100 Subject: [PATCH 06/13] Restore selectByForm --- .../src/main/groovy/whelk/datatool/WhelkTool.gdsl | 4 ++++ .../main/groovy/whelk/datatool/WhelkTool.groovy | 13 +++++++++++++ .../whelk/datatool/bulkchange/Specification.java | 15 +-------------- .../resources/bulk-change-scripts/delete.groovy | 6 ++---- .../resources/bulk-change-scripts/update.groovy | 3 +-- 5 files changed, 21 insertions(+), 20 deletions(-) diff --git a/whelktool/src/main/groovy/whelk/datatool/WhelkTool.gdsl b/whelktool/src/main/groovy/whelk/datatool/WhelkTool.gdsl index e4fd1ee111..715345b04b 100644 --- a/whelktool/src/main/groovy/whelk/datatool/WhelkTool.gdsl +++ b/whelktool/src/main/groovy/whelk/datatool/WhelkTool.gdsl @@ -41,6 +41,10 @@ contributor(ctx) { method name:"selectByIds", params:['ids':Collection, 'process':DC, 'batchSize':int], type:void method name:"selectByIds", params:['ids':Collection, 'process':DC, 'silent':boolean], type:void method name:"selectByIds", params:['ids':Collection, 'process':DC], type:void + method name:"selectByForm", params:['form':Map, 'process':DC, 'batchSize':int, 'silent':boolean], type:void + method name:"selectByForm", params:['form':Map, 'process':DC, 'batchSize':int], type:void + method name:"selectByForm", params:['form':Map, 'process':DC, 'silent':boolean], type:void + method name:"selectByForm", params:['form':Map, 'process':DC], type:void method name:"selectBySqlWhere", params:['whereClause':String, 'batchSize':int, 'silent':boolean, 'process':DC], type:void method name:"selectBySqlWhere", params:['whereClause':String, 'batchSize':int, 'process':DC], type:void method name:"selectBySqlWhere", params:['whereClause':String, 'silent':boolean, 'process':DC], type:void diff --git a/whelktool/src/main/groovy/whelk/datatool/WhelkTool.groovy b/whelktool/src/main/groovy/whelk/datatool/WhelkTool.groovy index ad01135e1d..0d92f53716 100644 --- a/whelktool/src/main/groovy/whelk/datatool/WhelkTool.groovy +++ b/whelktool/src/main/groovy/whelk/datatool/WhelkTool.groovy @@ -196,6 +196,18 @@ class WhelkTool { batchSize, [1: idItems, 2: collection]) } + void selectByForm(Map form, Closure process, + int batchSize = DEFAULT_BATCH_SIZE, boolean silent = false) { + if (!silent) { + log "Select by form" + } + + var sparqlPattern = new Transform.MatchForm(form, whelk).getSparqlPattern(whelk.jsonld.context) + var ids = whelk.sparqlQueryClient.queryIdsByPattern(sparqlPattern) + + selectByIds(ids, process, batchSize, silent) + } + DocumentItem create(Map data) { Document doc = new Document(data) doc.deepReplaceId(Document.BASE_URI.toString() + IdGenerator.generate()) @@ -677,6 +689,7 @@ class WhelkTool { bindings.put("selectByIds", this.&selectByIds) bindings.put("selectByIdsAndCollection", this.&selectByIdsAndCollection) bindings.put("selectBySqlWhere", this.&selectBySqlWhere) + bindings.put("selectByForm", this.&selectByForm) bindings.put("selectFromIterable", this.&selectFromIterable) bindings.put("create", this.&create) bindings.put("queryIds", this.&queryIds) diff --git a/whelktool/src/main/java/whelk/datatool/bulkchange/Specification.java b/whelktool/src/main/java/whelk/datatool/bulkchange/Specification.java index 1c998e2eb5..cb1a49138b 100644 --- a/whelktool/src/main/java/whelk/datatool/bulkchange/Specification.java +++ b/whelktool/src/main/java/whelk/datatool/bulkchange/Specification.java @@ -49,10 +49,6 @@ public Script getScript(String bulkJobId) { return s; } - public List findIds(Whelk whelk) { - return queryIds(getTransform(whelk), whelk); - } - @SuppressWarnings("unchecked") public boolean modify(Document doc, Whelk whelk) { Map thing = doc.getThing(); @@ -99,11 +95,7 @@ public boolean matches(Document doc, Whelk whelk) { return getMatchForm(whelk).matches(thing); } - public List findIds(Whelk whelk) { - return queryIds(getMatchForm(whelk), whelk); - } - - private Transform.MatchForm getMatchForm(Whelk whelk) { + public Transform.MatchForm getMatchForm(Whelk whelk) { if (matchFormObj == null) { matchFormObj = new Transform.MatchForm(matchForm, whelk); } @@ -163,9 +155,4 @@ private static String loadClasspathScriptSource(String scriptName) { throw new RuntimeException(e); } } - - private static List queryIds(Transform transform, Whelk whelk) { - return whelk.getSparqlQueryClient() - .queryIdsByPattern(transform.getSparqlPattern(whelk.getJsonld().context)); - } } diff --git a/whelktool/src/main/resources/bulk-change-scripts/delete.groovy b/whelktool/src/main/resources/bulk-change-scripts/delete.groovy index dbacb144ac..55301856c3 100644 --- a/whelktool/src/main/resources/bulk-change-scripts/delete.groovy +++ b/whelktool/src/main/resources/bulk-change-scripts/delete.groovy @@ -5,11 +5,9 @@ import static whelk.datatool.bulkchange.BulkJobDocument.MATCH_FORM_KEY Map matchForm = parameters.get(MATCH_FORM_KEY) Specification.Delete delete = new Specification.Delete(matchForm) -List ids = delete.findIds(getWhelk()) -selectByIds(ids) { +selectByForm(matchForm) { if(delete.matches(it.doc, it.whelk)) { it.scheduleDelete(loud: isLoudAllowed) } -} - +} \ No newline at end of file diff --git a/whelktool/src/main/resources/bulk-change-scripts/update.groovy b/whelktool/src/main/resources/bulk-change-scripts/update.groovy index 616d87cdef..217c2fa14d 100644 --- a/whelktool/src/main/resources/bulk-change-scripts/update.groovy +++ b/whelktool/src/main/resources/bulk-change-scripts/update.groovy @@ -7,9 +7,8 @@ Map matchForm = parameters.get(MATCH_FORM_KEY) Map targetForm = parameters.get(TARGET_FORM_KEY) Specification.Update update = new Specification.Update(matchForm, targetForm) -List ids = update.findIds(getWhelk()) -selectByIds(ids) { +selectByForm(matchForm) { if(update.modify(it.doc, it.whelk)) { it.scheduleSave(loud: isLoudAllowed) } From 1e8c92e224eb98f3346e8b8d7d7207f186b7afb9 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Tue, 19 Nov 2024 19:11:01 +0100 Subject: [PATCH 07/13] Move logic from Specification into scripts --- .../datatool/bulkchange/Specification.java | 54 ++----------------- .../bulk-change-scripts/delete.groovy | 14 +++-- .../bulk-change-scripts/update.groovy | 26 +++++++-- 3 files changed, 36 insertions(+), 58 deletions(-) diff --git a/whelktool/src/main/java/whelk/datatool/bulkchange/Specification.java b/whelktool/src/main/java/whelk/datatool/bulkchange/Specification.java index cb1a49138b..ff6abc78b9 100644 --- a/whelktool/src/main/java/whelk/datatool/bulkchange/Specification.java +++ b/whelktool/src/main/java/whelk/datatool/bulkchange/Specification.java @@ -28,17 +28,7 @@ public sealed interface Specification permits Specification.Create, Specificatio Script getScript(String bulkJobId); - final class Update implements Specification { - private final Map matchForm; - private final Map targetForm; - - private Transform transform; - - public Update(Map matchForm, Map targetForm) { - this.matchForm = matchForm; - this.targetForm = targetForm; - } - + record Update(Map matchForm, Map targetForm) implements Specification { @Override public Script getScript(String bulkJobId) { Script s = new Script(loadClasspathScriptSource("update.groovy"), bulkJobId); @@ -49,36 +39,12 @@ public Script getScript(String bulkJobId) { return s; } - @SuppressWarnings("unchecked") - public boolean modify(Document doc, Whelk whelk) { - Map thing = doc.getThing(); - thing.put(RECORD_KEY, doc.getRecord()); - - var m = new ModifiedThing(thing, getTransform(whelk), whelk.getJsonld().repeatableTerms); - - ((List>) doc.data.get(GRAPH_KEY)).set(0, (Map) m.getAfter().remove(RECORD_KEY)); - ((List>) doc.data.get(GRAPH_KEY)).set(1, m.getAfter()); - - return m.isModified(); - } - public Transform getTransform(Whelk whelk) { - if (transform == null) { - transform = new Transform(matchForm, targetForm, whelk); - } - return transform; + return new Transform(matchForm, targetForm, whelk); } } - final class Delete implements Specification { - private final Map matchForm; - - private Transform.MatchForm matchFormObj; - - public Delete(Map matchForm) { - this.matchForm = matchForm; - } - + record Delete(Map matchForm) implements Specification { @Override public Script getScript(String bulkJobId) { Script s = new Script(loadClasspathScriptSource("delete.groovy"), bulkJobId); @@ -87,20 +53,6 @@ public Script getScript(String bulkJobId) { )); return s; } - - @SuppressWarnings("unchecked") - public boolean matches(Document doc, Whelk whelk) { - Map thing = doc.clone().getThing(); - thing.put(RECORD_KEY, doc.getRecord()); - return getMatchForm(whelk).matches(thing); - } - - public Transform.MatchForm getMatchForm(Whelk whelk) { - if (matchFormObj == null) { - matchFormObj = new Transform.MatchForm(matchForm, whelk); - } - return matchFormObj; - } } record Create(Map targetForm) implements Specification { diff --git a/whelktool/src/main/resources/bulk-change-scripts/delete.groovy b/whelktool/src/main/resources/bulk-change-scripts/delete.groovy index 55301856c3..2650b02c9a 100644 --- a/whelktool/src/main/resources/bulk-change-scripts/delete.groovy +++ b/whelktool/src/main/resources/bulk-change-scripts/delete.groovy @@ -1,13 +1,21 @@ -import whelk.datatool.bulkchange.Specification +import whelk.Document +import whelk.datatool.form.Transform +import static whelk.JsonLd.RECORD_KEY import static whelk.datatool.bulkchange.BulkJobDocument.MATCH_FORM_KEY Map matchForm = parameters.get(MATCH_FORM_KEY) -Specification.Delete delete = new Specification.Delete(matchForm) +Transform.MatchForm mf = new Transform.MatchForm(matchForm, getWhelk()) selectByForm(matchForm) { - if(delete.matches(it.doc, it.whelk)) { + if(mf.matches(getFramedThing(it.doc))) { it.scheduleDelete(loud: isLoudAllowed) } +} + +private static Map getFramedThing(Document doc) { + Map thing = doc.clone().getThing(); + thing.put(RECORD_KEY, doc.getRecord()); + return thing } \ No newline at end of file diff --git a/whelktool/src/main/resources/bulk-change-scripts/update.groovy b/whelktool/src/main/resources/bulk-change-scripts/update.groovy index 217c2fa14d..17e7bb347d 100644 --- a/whelktool/src/main/resources/bulk-change-scripts/update.groovy +++ b/whelktool/src/main/resources/bulk-change-scripts/update.groovy @@ -1,15 +1,33 @@ -import whelk.datatool.bulkchange.Specification +import whelk.Document +import whelk.Whelk +import whelk.datatool.form.ModifiedThing +import whelk.datatool.form.Transform +import static whelk.JsonLd.GRAPH_KEY +import static whelk.JsonLd.RECORD_KEY import static whelk.datatool.bulkchange.BulkJobDocument.MATCH_FORM_KEY import static whelk.datatool.bulkchange.BulkJobDocument.TARGET_FORM_KEY Map matchForm = parameters.get(MATCH_FORM_KEY) Map targetForm = parameters.get(TARGET_FORM_KEY) -Specification.Update update = new Specification.Update(matchForm, targetForm) +Transform transform = new Transform(matchForm, targetForm, getWhelk()) selectByForm(matchForm) { - if(update.modify(it.doc, it.whelk)) { + if(modify(transform, it.doc, it.whelk)) { it.scheduleSave(loud: isLoudAllowed) } -} \ No newline at end of file +} + +static boolean modify(Transform tf, Document doc, Whelk whelk) { + Map thing = doc.getThing(); + thing.put(RECORD_KEY, doc.getRecord()); + + var m = new ModifiedThing(thing, tf, whelk.getJsonld().repeatableTerms); + + ((List>) doc.data.get(GRAPH_KEY)).set(0, (Map) m.getAfter().remove(RECORD_KEY)); + ((List>) doc.data.get(GRAPH_KEY)).set(1, m.getAfter()); + + return m.isModified(); +} + From b8e580aa6f69b7b2e995605eb92fa88dd6e0495f Mon Sep 17 00:00:00 2001 From: kwahlin Date: Tue, 19 Nov 2024 19:16:14 +0100 Subject: [PATCH 08/13] Make selectByForm take a Transform object as argument --- .../src/main/groovy/whelk/datatool/WhelkTool.gdsl | 10 ++++++---- .../src/main/groovy/whelk/datatool/WhelkTool.groovy | 4 ++-- .../main/resources/bulk-change-scripts/delete.groovy | 2 +- .../main/resources/bulk-change-scripts/update.groovy | 2 +- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/whelktool/src/main/groovy/whelk/datatool/WhelkTool.gdsl b/whelktool/src/main/groovy/whelk/datatool/WhelkTool.gdsl index 715345b04b..ed13cde535 100644 --- a/whelktool/src/main/groovy/whelk/datatool/WhelkTool.gdsl +++ b/whelktool/src/main/groovy/whelk/datatool/WhelkTool.gdsl @@ -8,6 +8,8 @@ package whelk.datatool +import whelk.datatool.form.Transform + String ITEM = 'whelk.datatool.DocumentItem' String DC = "Closure<$ITEM>" @@ -41,10 +43,10 @@ contributor(ctx) { method name:"selectByIds", params:['ids':Collection, 'process':DC, 'batchSize':int], type:void method name:"selectByIds", params:['ids':Collection, 'process':DC, 'silent':boolean], type:void method name:"selectByIds", params:['ids':Collection, 'process':DC], type:void - method name:"selectByForm", params:['form':Map, 'process':DC, 'batchSize':int, 'silent':boolean], type:void - method name:"selectByForm", params:['form':Map, 'process':DC, 'batchSize':int], type:void - method name:"selectByForm", params:['form':Map, 'process':DC, 'silent':boolean], type:void - method name:"selectByForm", params:['form':Map, 'process':DC], type:void + method name:"selectByForm", params:['form':Transform, 'process':DC, 'batchSize':int, 'silent':boolean], type:void + method name:"selectByForm", params:['form':Transform, 'process':DC, 'batchSize':int], type:void + method name:"selectByForm", params:['form':Transform, 'process':DC, 'silent':boolean], type:void + method name:"selectByForm", params:['form':Transform, 'process':DC], type:void method name:"selectBySqlWhere", params:['whereClause':String, 'batchSize':int, 'silent':boolean, 'process':DC], type:void method name:"selectBySqlWhere", params:['whereClause':String, 'batchSize':int, 'process':DC], type:void method name:"selectBySqlWhere", params:['whereClause':String, 'silent':boolean, 'process':DC], type:void diff --git a/whelktool/src/main/groovy/whelk/datatool/WhelkTool.groovy b/whelktool/src/main/groovy/whelk/datatool/WhelkTool.groovy index 0d92f53716..4a972b1108 100644 --- a/whelktool/src/main/groovy/whelk/datatool/WhelkTool.groovy +++ b/whelktool/src/main/groovy/whelk/datatool/WhelkTool.groovy @@ -196,13 +196,13 @@ class WhelkTool { batchSize, [1: idItems, 2: collection]) } - void selectByForm(Map form, Closure process, + void selectByForm(Transform transform, Closure process, int batchSize = DEFAULT_BATCH_SIZE, boolean silent = false) { if (!silent) { log "Select by form" } - var sparqlPattern = new Transform.MatchForm(form, whelk).getSparqlPattern(whelk.jsonld.context) + var sparqlPattern = transform.getSparqlPattern(whelk.jsonld.context) var ids = whelk.sparqlQueryClient.queryIdsByPattern(sparqlPattern) selectByIds(ids, process, batchSize, silent) diff --git a/whelktool/src/main/resources/bulk-change-scripts/delete.groovy b/whelktool/src/main/resources/bulk-change-scripts/delete.groovy index 2650b02c9a..4fd40366d2 100644 --- a/whelktool/src/main/resources/bulk-change-scripts/delete.groovy +++ b/whelktool/src/main/resources/bulk-change-scripts/delete.groovy @@ -8,7 +8,7 @@ Map matchForm = parameters.get(MATCH_FORM_KEY) Transform.MatchForm mf = new Transform.MatchForm(matchForm, getWhelk()) -selectByForm(matchForm) { +selectByForm(mf) { if(mf.matches(getFramedThing(it.doc))) { it.scheduleDelete(loud: isLoudAllowed) } diff --git a/whelktool/src/main/resources/bulk-change-scripts/update.groovy b/whelktool/src/main/resources/bulk-change-scripts/update.groovy index 17e7bb347d..f4b3adef78 100644 --- a/whelktool/src/main/resources/bulk-change-scripts/update.groovy +++ b/whelktool/src/main/resources/bulk-change-scripts/update.groovy @@ -13,7 +13,7 @@ Map targetForm = parameters.get(TARGET_FORM_KEY) Transform transform = new Transform(matchForm, targetForm, getWhelk()) -selectByForm(matchForm) { +selectByForm(transform) { if(modify(transform, it.doc, it.whelk)) { it.scheduleSave(loud: isLoudAllowed) } From 2d8ec63170da164d9a94cdb114b6e9124aae19ee Mon Sep 17 00:00:00 2001 From: kwahlin Date: Wed, 20 Nov 2024 08:09:44 +0100 Subject: [PATCH 09/13] Refine querying combinations of blank subdivision --- .../removeSubdivision.groovy | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/whelktool/src/main/resources/bulk-change-scripts/removeSubdivision.groovy b/whelktool/src/main/resources/bulk-change-scripts/removeSubdivision.groovy index 355270cd5f..5be2f57636 100644 --- a/whelktool/src/main/resources/bulk-change-scripts/removeSubdivision.groovy +++ b/whelktool/src/main/resources/bulk-change-scripts/removeSubdivision.groovy @@ -69,16 +69,23 @@ def process = { doc -> } Set ids = [] as Set -removeSubdivision.each { subdivision -> - if (subdivision[ID_KEY]) { - selectByIds([subdivision[ID_KEY]]) { obsoleteSubdivision -> - ids = ids.intersect(obsoleteSubdivision.getDependers()) as Set - } - } else { - Whelk whelk = getWhelk() - ids = ids.intersect(whelk.sparqlQueryClient.queryIdsByPattern(toTurtleData((Map) subdivision, whelk.jsonld.context))) +def (linked, blank) = removeSubdivision.split { it[ID_KEY] } +linked.each { l -> + selectByIds(linked.collect { it[ID_KEY] }) { + ids = ids.intersect(it.getDependers()) as Set } } +if (!blank.isEmpty()) { + Whelk whelk = getWhelk() + /* + Querying records containing the given combination of blank subdivisions is very slow so we have to run a separate + query for each subdivision. However the maximum number of results from a Sparql query is 100k so if we just take the + intersection of each result we risk missing some records. Better to just save the result with least hits. + */ + blank.collect { whelk.sparqlQueryClient.queryIdsByPattern(toTurtleData((Map) it, whelk.jsonld.context)) } + .min { it.size() } + .with { ids = ids.intersect(it) } +} selectByIds(ids) { process(it) From 8eaea6b1a1f40e0f00f01d03826d0236432fff80 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Wed, 20 Nov 2024 09:10:32 +0100 Subject: [PATCH 10/13] (WIP) Refactor: Break out MatchForm --- .../whelk/datatool/form/MatchForm.groovy | 307 ++++++++++++++++++ .../whelk/datatool/form/MatchFormSpec.groovy | 178 ++++++++++ 2 files changed, 485 insertions(+) create mode 100644 whelktool/src/main/groovy/whelk/datatool/form/MatchForm.groovy create mode 100644 whelktool/src/test/groovy/whelk/datatool/form/MatchFormSpec.groovy diff --git a/whelktool/src/main/groovy/whelk/datatool/form/MatchForm.groovy b/whelktool/src/main/groovy/whelk/datatool/form/MatchForm.groovy new file mode 100644 index 0000000000..2939874562 --- /dev/null +++ b/whelktool/src/main/groovy/whelk/datatool/form/MatchForm.groovy @@ -0,0 +1,307 @@ +package whelk.datatool.form + +import groovy.transform.Memoized +import whelk.Document +import whelk.JsonLd +import whelk.Whelk +import whelk.datatool.util.DocumentComparator +import whelk.datatool.util.IdLoader +import whelk.util.DocumentUtil + +import static whelk.JsonLd.GRAPH_KEY +import static whelk.JsonLd.ID_KEY +import static whelk.JsonLd.RECORD_KEY +import static whelk.JsonLd.RECORD_TYPE +import static whelk.JsonLd.THING_KEY +import static whelk.JsonLd.TYPE_KEY +import static whelk.JsonLd.asList +import static whelk.component.SparqlQueryClient.GRAPH_VAR +import static whelk.converter.JsonLDTurtleConverter.toTurtleData +import static whelk.util.DocumentUtil.getAtPath +import static whelk.util.LegacyIntegrationTools.getMarcCollectionInHierarchy + +class MatchForm { + private static final DocumentComparator comparator = new DocumentComparator() + + public static final String MATCHING_MODE = 'bulk:matchingMode' + public static final String HAS_ID = 'bulk:hasId' + public static final String BNODE_ID = 'bulk:formBlankNodeId' + public static final String ANY_TYPE = "bulk:Any" + public static final String SUBTYPES = "bulk:Subtypes" + public static final String EXACT = 'bulk:Exact' + + private static final String VALUE = 'value' + private static final String VALUE_FROM = 'bulk:valueFrom' + private static final String ANY_OF = 'bulk:AnyOf' + private static final String HAS_BASE_TYPE_TMP = '_hasBaseTypeTmp' + + Map form + + Map nodeIdToMatchFormPath + Map> nodeIdMappings + Map> baseTypeMappings + + MatchForm(Map form, Whelk whelk) { + this.form = form + this.nodeIdToMatchFormPath = collectNodeIdToPath(form) + this.nodeIdMappings = collectNodeIdMappings(whelk) + this.baseTypeMappings = collectBaseTypeMappings(whelk?.jsonld) + } + + MatchForm(Map matchForm) { + this(matchForm, null) + } + + MatchForm() {} + + Map> collectNodeIdMappings(Whelk whelk) { + return collectNodeIdMappings(form, whelk) + } + + static Map> collectNodeIdMappings(Map form, Whelk whelk) { + Map> nodeIdMappings = [:] + + IdLoader idLoader = whelk ? new IdLoader(whelk.storage) : null + + DocumentUtil.traverse(form) { node, path -> + if (!(node instanceof Map)) { + return + } + def anyOf = asList(node[HAS_ID]).find { it[TYPE_KEY] == ANY_OF } + if (!anyOf) { + return + } + def ids = (anyOf[VALUE] ?: (anyOf[VALUE_FROM] ? IdLoader.fromFile((String) anyOf[VALUE_FROM][ID_KEY]) : [])) as Set + if (ids) { + String nodeId = node[BNODE_ID] + + def (iris, shortIds) = ids.split(JsonLd::looksLikeIri) + if (shortIds.isEmpty()) { + nodeIdMappings[nodeId] = iris + return + } + + if (!idLoader) { + nodeIdMappings[nodeId] = iris + shortIds.collect { Document.BASE_URI.toString() + it + Document.HASH_IT } + return + } + + def nodeType = node[TYPE_KEY] + def marcCollection = nodeType ? getMarcCollectionInHierarchy((String) nodeType, whelk.jsonld) : null + def xlShortIds = idLoader.collectXlShortIds(shortIds as List, marcCollection) + def parentProp = dropIndexes(path).reverse()[1] + def isInRange = { type -> whelk.jsonld.getInRange(type).contains(parentProp) } + // TODO: Fix hardcoding + def isRecord = whelk.jsonld.isInstanceOf(node, "AdminMetadata") + || isInRange(RECORD_TYPE) + || isInRange("AdminMetadata") + + nodeIdMappings[nodeId] = iris + xlShortIds.collect { + Document.BASE_URI.toString() + it + (isRecord ? "" : Document.HASH_IT) + } + + return new DocumentUtil.Nop() + } + } + + return nodeIdMappings + } + + static Map collectNodeIdToPath(Map form) { + Map nodeIdToPath = [:] + DocumentUtil.findKey(form, BNODE_ID) { nodeId, path -> + nodeIdToPath[(String) nodeId] = path.dropRight(1) + return new DocumentUtil.Nop() + } + return nodeIdToPath + } + + Map> collectBaseTypeMappings(JsonLd jsonLd) { + Map> mappings = [:] + + if (jsonLd == null) { + return mappings + } + + DocumentUtil.traverse(form) { node, path -> + if (node instanceof Map && node.containsKey(MATCHING_MODE) && ((List) node[MATCHING_MODE]).contains(SUBTYPES)) { + def baseType = (String) node[TYPE_KEY] + Set subTypes = getSubtypes(baseType, jsonLd) as Set + mappings[baseType] = subTypes + return new DocumentUtil.Nop() + } + } + + return mappings + } + + static List dropIndexes(List path) { + return path.findAll { it instanceof String } as List + } + + String getSparqlPattern(Map context) { + Map thing = getSparqlPreparedForm() + Map record = (Map) thing.remove(RECORD_KEY) ?: [:] + + record[ID_KEY] = getRecordTmpId() + thing[ID_KEY] = getThingTmpId() + record[THING_KEY] = [(ID_KEY): getThingTmpId()] + + Map graph = [(GRAPH_KEY): [record, thing]] + + String ttl = toTurtleData(graph, context) + + return insertTypeMappings(insertIdMappings(insertVars(ttl))) + } + + private Map getSparqlPreparedForm() { + Map matchFormCopy = (Map) Document.deepCopy(form) + + DocumentUtil.traverse(matchFormCopy) { node, path -> + if (node instanceof Map) { + def bNodeId = node.remove(BNODE_ID) + if (!bNodeId) return + node.remove(HAS_ID) + if (node[TYPE_KEY] == ANY_TYPE) { + node.remove(TYPE_KEY) + } + if (asList(node.remove(MATCHING_MODE)).contains(SUBTYPES)) { + def baseType = node.remove(TYPE_KEY) + node[HAS_BASE_TYPE_TMP] = baseType + } + if (nodeIdMappings.containsKey(bNodeId)) { + node[ID_KEY] = bNodeId + } + return new DocumentUtil.Nop() + } + if (asList(node).isEmpty()) { + return new DocumentUtil.Replace([:]) + } + } + + return matchFormCopy + } + + private String insertVars(String ttl) { + def substitutions = [ + ("<" + getThingTmpId() + ">") : getVar(getThingTmpId()), + ("<" + getRecordTmpId() + ">"): getVar(getRecordTmpId()) + ] + + baseTypeMappings.keySet().each { baseType -> + substitutions.put(":$HAS_BASE_TYPE_TMP \"$baseType\"".toString(), "a ?" + baseType) + } + + nodeIdMappings.keySet().each { _id -> + substitutions.put("<" + _id + ">", getVar(_id)) + } + + return ttl.replace(substitutions) + } + + + private String insertTypeMappings(String sparqlPattern) { + def valuesClause = baseTypeMappings.collect { baseType, subTypes -> + "VALUES ?$baseType { ${([baseType] + subTypes).collect { ":$it" }.join(" ")} }\n" + }.join() + return valuesClause + sparqlPattern + } + + private String insertIdMappings(String sparqlPattern) { + def valuesClauses = nodeIdMappings.collect { _id, ids -> + "VALUES ${getVar(_id)} { ${ids.collect { "<$it>" }.join(" ")} }\n" + }.join() + return valuesClauses + sparqlPattern + } + + String getVar(String bNodeId) { + return bNodeId == getRecordTmpId() + ? "?$GRAPH_VAR" + : "?${bNodeId.replace('#', '')}" + } + + @Memoized + private static Set getSubtypes(String type, JsonLd jsonLd) { + return jsonLd.getSubClasses(type) + } + + private String getThingTmpId() { + return form[BNODE_ID] + } + + private String getRecordTmpId() { + return getAtPath(form, [RECORD_KEY, BNODE_ID], "TEMP_ID") + } + + boolean matches(Object node) { + return matches(form, node) + } + + boolean matches(Object matchForm, Object node) { + return comparator.isSubset(["x": matchForm], ["x": node], this::mapMatches) + } + + boolean mapMatches(Map matchForm, Map bNode) { + if (matchForm == null || bNode == null) { + return false + } + matchForm = new LinkedHashMap(matchForm) + def match = asList(matchForm[MATCHING_MODE]) + if (match.contains(EXACT)) { + return exactMatches(matchForm, bNode) + } + if (match.contains(SUBTYPES)) { + String aType = matchForm[TYPE_KEY] + String bType = bNode[TYPE_KEY] + if (!(baseTypeMappings[aType] + aType).contains(bType)) { + return false + } else { + matchForm.remove(TYPE_KEY) + } + } + matchForm.remove(MATCHING_MODE) + if (matchForm[TYPE_KEY] == ANY_TYPE) { + matchForm.remove(TYPE_KEY) + } + def ids = nodeIdMappings[matchForm.remove(BNODE_ID)] + if (ids && !ids.contains(bNode[ID_KEY])) { + return false + } + matchForm.remove(HAS_ID) + if (matchForm.size() > bNode.size()) { + return false + } + return comparator.isSubset(matchForm, bNode, this::mapMatches) + } + + private boolean exactMatches(Map matchForm, Map bNode) { + if (matchForm == null || bNode == null) { + return false + } + matchForm = new HashMap(matchForm) + bNode = new HashMap(bNode) + if (asList(matchForm.remove(MATCHING_MODE)).contains(SUBTYPES)) { + String aType = matchForm[TYPE_KEY] + String bType = bNode[TYPE_KEY] + if ((baseTypeMappings[aType] + aType).contains(bType)) { + matchForm.remove(TYPE_KEY) + bNode.remove(TYPE_KEY) + } else { + return false + } + } + if (matchForm[TYPE_KEY] == ANY_TYPE) { + matchForm.remove(TYPE_KEY) + bNode.remove(TYPE_KEY) + } + def ids = nodeIdMappings[matchForm.remove(BNODE_ID)] + if (ids && !ids.contains(bNode[ID_KEY])) { + return false + } + matchForm.remove(HAS_ID) + if (matchForm.size() != bNode.size()) { + return false + } + return comparator.isEqual(matchForm, bNode, this::exactMatches) + } +} diff --git a/whelktool/src/test/groovy/whelk/datatool/form/MatchFormSpec.groovy b/whelktool/src/test/groovy/whelk/datatool/form/MatchFormSpec.groovy new file mode 100644 index 0000000000..7c4074bdc2 --- /dev/null +++ b/whelktool/src/test/groovy/whelk/datatool/form/MatchFormSpec.groovy @@ -0,0 +1,178 @@ +package whelk.datatool.form + +import spock.lang.Specification + +class MatchFormSpec extends Specification { + static Map context = [ + '@vocab': 'https://id.kb.se/vocab/', + 'marc' : 'https://id.kb.se/marc/', + 'p1' : ['@container': '@set'], + 'p2' : ['@container': '@set'], + 'p3' : ['@container': '@list'], + 'marc:p': ['@container': '@set'] + ] + + def "match data against form"() { + given: + def matchForm = new MatchForm() + matchForm.nodeIdMappings = ["#1": ["https://libris.kb.se/x#it", "https://libris.kb.se/y#it"] as Set] + matchForm.baseTypeMappings = ["T": ["Tx", "Ty"] as Set] + + expect: + matchForm.matches(form, node) == result + + where: + form | node | result + "a" | "a" | true + "a" | "b" | false + "a" | ["a", "b"] | true + ["x": "a"] | ["x": ["a", "b"]] | true + ["x": "a", "bulk:matchingMode": ["bulk:Exact"]] | ["x": ["a", "b"]] | false + ["x": ["a", "b"], "bulk:matchingMode": ["bulk:Exact"]] | ["x": ["a", "b"]] | true + ["@type": "T", "bulk:matchingMode": ["bulk:Subtypes"], "a": "b"] | ["@type": "Tx", "a": "b"] | true + ["@type": "T", "a": "b"] | ["@type": "Tx", "a": "b"] | false + ["@type": "T", "bulk:matchingMode": ["bulk:Subtypes", "bulk:Exact"], "a": "b"] | ["@type": "Ty", "a": "b"] | true + ["@type": "T", "bulk:matchingMode": ["bulk:Subtypes", "bulk:Exact"], "a": "b"] | ["@type": "Ty", "a": "b", "c": "d"] | false + ["@type": "bulk:Any", "a": "b"] | ["@type": "T", "a": "b", "c": "d"] | true + ["@type": "bulk:Any", "a": "b", "bulk:matchingMode": ["bulk:Exact"]] | ["@type": "T", "a": "b", "c": "d"] | false + ["x": ["bulk:formBlankNodeId": "#1"]] | ["x": ["@id": "https://libris.kb.se/y#it"]] | true + ["x": ["bulk:formBlankNodeId": "#1"]] | ["x": ["@id": "https://libris.kb.se/z#it"]] | false + } + + def "form to sparql pattern: literal value"() { + given: + def form = ['bulk:formBlankNodeId': '#1', 'p1': 'x'] + def expectedPattern = "?graph :mainEntity ?1 .\n\n?1 :p1 \"x\" ." + + expect: + new MatchForm(form).getSparqlPattern(context) == expectedPattern + } + + def "form to sparql pattern: iri value"() { + given: + def form = ['bulk:formBlankNodeId': '#1', 'p1': ['@id': 'https://libris.kb.se/x']] + def expectedPattern = "?graph :mainEntity ?1 .\n\n?1 :p1 ." + + expect: + new MatchForm(form).getSparqlPattern(context) == expectedPattern + } + + def "form to sparql pattern: marc property"() { + given: + def form = ['bulk:formBlankNodeId': '#1', 'marc:p': 'x'] + def expectedPattern = "?graph :mainEntity ?1 .\n\n?1 marc:p \"x\" ." + + expect: + new MatchForm(form).getSparqlPattern(context) == expectedPattern + } + + def "form to sparql pattern: null/empty value"() { + given: + def form = ['bulk:formBlankNodeId': '#1', 'p1': v] + def expectedPattern = "?graph :mainEntity ?1 .\n\n?1 :p1 [ ] ." + + expect: + new MatchForm(form).getSparqlPattern(context) == expectedPattern + + where: + v << [null, [:], []] + } + + def "form to sparql pattern: nested null/empty value"() { + given: + def form = ['bulk:formBlankNodeId': '#1', 'p1': ['bulk:formBlankNodeId': '#2', 'p2': v]] + def expectedPattern = "?graph :mainEntity ?1 .\n\n?1 :p1 [ :p2 [ ] ] ." + + expect: + new MatchForm(form).getSparqlPattern(context) == expectedPattern + + where: + v << [null, [:], []] + } + + def "form to sparql pattern: nested values"() { + given: + def form = ['bulk:formBlankNodeId': '#1', 'p1': ['bulk:formBlankNodeId': '#2', 'p2': ['@id': 'https://libris.kb.se/x'], 'marc:p': "x"]] + def expectedPattern = "?graph :mainEntity ?1 .\n\n?1 :p1 [ :p2 ;\n marc:p \"x\" ] ." + + expect: + new MatchForm(form).getSparqlPattern(context) == expectedPattern + } + + def "form to sparql pattern: ordered list value"() { + given: + def form = ['bulk:formBlankNodeId': '#1', 'p3': [['bulk:formBlankNodeId': '#2', 'p1': 'x'], ['bulk:formBlankNodeId': '#3', 'p2': 'y']]] + def expectedPattern = "?graph :mainEntity ?1 .\n\n?1 :p3 ( [ :p1 \"x\" ] [ :p2 \"y\" ] ) ." + + expect: + new MatchForm(form).getSparqlPattern(context) == expectedPattern + } + + def "form to sparql pattern: id mappings"() { + given: + def recordIds = ['@type': 'bulk:AnyOf', 'value': ['https://libris.kb.se/x', 'https://libris.kb.se/y', + 'https://libris.kb.se/z']] + def thingIds = ['@type': 'bulk:AnyOf', 'value': ['https://libris.kb.se/x#it', 'https://libris.kb.se/y#it', + 'https://libris.kb.se/z#it']] + def values = ['@type': 'bulk:AnyOf', 'value': ['https://id.kb.se/x', 'https://id.kb.se/y', + 'https://id.kb.se/z#it']] + + def form = [ + 'bulk:formBlankNodeId': '#1', + 'bulk:hasId' : thingIds, + 'meta' : ['bulk:formBlankNodeId': '#2', 'bulk:hasId': recordIds], + 'p1' : ['bulk:formBlankNodeId': '#3', 'bulk:hasId': values] + ] + + def expectedPattern = "VALUES ?1 { }\n" + + "VALUES ?graph { }\n" + + "VALUES ?3 { }\n" + + "?graph :mainEntity ?1 .\n" + + "\n" + + "?1 :p1 ?3 ." + + expect: + new MatchForm(form).getSparqlPattern(context) == expectedPattern + } + + def "form to sparql pattern: unspecified types"() { + given: + def form = [ + 'bulk:formBlankNodeId': '#1', + '@type' : 'bulk:Any', + 'p1' : ['bulk:formBlankNodeId': '#2', '@type': 'bulk:Any'], + 'p2' : ['bulk:formBlankNodeId': '#3', '@type': 'bulk:Any', 'p': 'v'], + 'marc:p' : ['bulk:formBlankNodeId': '#4', '@type': 'marc:T', 'p': 'v'] + ] + + def expectedPattern = "?graph :mainEntity ?1 .\n" + + "\n" + + "?1 :p1 [ ] ;\n" + + " :p2 [ :p \"v\" ] ;\n" + + " marc:p [ a marc:T ;\n" + + " :p \"v\" ] ." + + expect: + new MatchForm(form).getSparqlPattern(context) == expectedPattern + } + + def "form to sparql pattern: base types"() { + given: + def form = [ + 'bulk:formBlankNodeId': '#1', + '@type' : 'T1', + 'bulk:matchingMode' : ['bulk:Subtypes'] + ] + + def expectedPattern = "VALUES ?T1 { :T1 :T1x :T1y :T1z }\n" + + "?graph :mainEntity ?1 .\n" + + "\n" + + "?1 a ?T1 ." + + def transform = new MatchForm(form) + transform.baseTypeMappings['T1'] = ['T1x', 'T1y', 'T1z'] as Set + + expect: + transform.getSparqlPattern(context) == expectedPattern + } +} From e1c83dd420f1eaddb917341bdaf1ca38fabaafb2 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Wed, 20 Nov 2024 09:20:45 +0100 Subject: [PATCH 11/13] Complete refactoring --- .../whelk/datatool/form/ModifiedThing.groovy | 2 +- .../whelk/datatool/form/Transform.groovy | 324 +----------------- .../whelk/datatool/form/TransformSpec.groovy | 172 ---------- 3 files changed, 18 insertions(+), 480 deletions(-) diff --git a/whelktool/src/main/groovy/whelk/datatool/form/ModifiedThing.groovy b/whelktool/src/main/groovy/whelk/datatool/form/ModifiedThing.groovy index 6310baf690..06dbfa932e 100644 --- a/whelktool/src/main/groovy/whelk/datatool/form/ModifiedThing.groovy +++ b/whelktool/src/main/groovy/whelk/datatool/form/ModifiedThing.groovy @@ -30,7 +30,7 @@ class ModifiedThing { } private Map modify(Map thing) { - if (!transform.matches(thing)) { + if (!transform.matchForm.matches(thing)) { return thing } diff --git a/whelktool/src/main/groovy/whelk/datatool/form/Transform.groovy b/whelktool/src/main/groovy/whelk/datatool/form/Transform.groovy index e8f6e2d1af..87703f96ec 100644 --- a/whelktool/src/main/groovy/whelk/datatool/form/Transform.groovy +++ b/whelktool/src/main/groovy/whelk/datatool/form/Transform.groovy @@ -1,73 +1,46 @@ package whelk.datatool.form -import groovy.transform.Memoized -import whelk.Document -import whelk.JsonLd import whelk.Whelk import whelk.datatool.util.DocumentComparator -import whelk.datatool.util.IdLoader -import whelk.util.DocumentUtil -import static whelk.JsonLd.GRAPH_KEY import static whelk.JsonLd.ID_KEY -import static whelk.JsonLd.RECORD_KEY -import static whelk.JsonLd.RECORD_TYPE -import static whelk.JsonLd.THING_KEY import static whelk.JsonLd.TYPE_KEY import static whelk.JsonLd.asList -import static whelk.component.SparqlQueryClient.GRAPH_VAR -import static whelk.converter.JsonLDTurtleConverter.toTurtleData +import static whelk.datatool.form.MatchForm.ANY_TYPE +import static whelk.datatool.form.MatchForm.BNODE_ID +import static whelk.datatool.form.MatchForm.EXACT +import static whelk.datatool.form.MatchForm.HAS_ID +import static whelk.datatool.form.MatchForm.MATCHING_MODE +import static whelk.datatool.form.MatchForm.dropIndexes import static whelk.util.DocumentUtil.getAtPath -import static whelk.util.LegacyIntegrationTools.getMarcCollectionInHierarchy class Transform { private static final DocumentComparator comparator = new DocumentComparator() - private static final String BNODE_ID = 'bulk:formBlankNodeId' - private static final String MATCHING_MODE = 'bulk:matchingMode' - private static final String HAS_ID = 'bulk:hasId' - private static final String VALUE = 'value' - private static final String VALUE_FROM = 'bulk:valueFrom' - private static final String ANY_TYPE = "bulk:Any" - private static final String SUBTYPES = "bulk:Subtypes" - private static final String EXACT = 'bulk:Exact' - private static final String ANY_OF = 'bulk:AnyOf' - private static final String HAS_BASE_TYPE_TMP = '_hasBaseTypeTmp' - - Map matchForm + MatchForm matchForm Map targetForm List addedPaths List removedPaths - Map nodeIdToMatchFormPath - Map> nodeIdMappings - Map> baseTypeMappings - List changes Transform(Map matchForm, Map targetForm, Whelk whelk) { - this.matchForm = matchForm + this.matchForm = new MatchForm(matchForm, whelk) this.targetForm = targetForm this.removedPaths = collectRemovedPaths() this.addedPaths = collectAddedPaths() - this.nodeIdToMatchFormPath = collectNodeIdToPath(matchForm) - this.nodeIdMappings = collectNodeIdMappings(whelk) - this.baseTypeMappings = collectBaseTypeMappings(whelk?.jsonld) } Transform(Map matchForm, Map targetForm) { this(matchForm, targetForm, null) } - Transform() { - } - List getChangeSets() { return [ [ (TYPE_KEY) : 'ChangeSet', - 'version' : matchForm, + 'version' : matchForm.form, 'removedPaths': [], 'addedPaths' : [] ], @@ -92,7 +65,7 @@ class Transform { private List collectRemove() { return (List) removedPaths.collect { fullPath -> - asList(getAtPath(matchForm, fullPath)).collect { value -> + asList(getAtPath(matchForm.form, fullPath)).collect { value -> new Remove(fullPath, value) } }.flatten() @@ -107,11 +80,11 @@ class Transform { } private List collectAddedPaths() { - return collectChangedPaths(targetForm, matchForm, []) + return collectChangedPaths(targetForm, matchForm.form, []) } private List collectRemovedPaths() { - return collectChangedPaths(matchForm, targetForm, []) + return collectChangedPaths(matchForm.form, targetForm, []) } private static List collectChangedPaths(Object a, Object b, List path) { @@ -159,256 +132,6 @@ class Transform { return !path.isEmpty() && path.last() instanceof Integer ? path.dropRight(1) : path } - private static List dropIndexes(List path) { - return path.findAll { it instanceof String } as List - } - - static Map collectNodeIdToPath(Map form) { - Map nodeIdToPath = [:] - DocumentUtil.findKey(form, BNODE_ID) { nodeId, path -> - nodeIdToPath[(String) nodeId] = path.dropRight(1) - return new DocumentUtil.Nop() - } - return nodeIdToPath - } - - String getSparqlPattern(Map context) { - Map thing = getSparqlPreparedForm() - Map record = (Map) thing.remove(RECORD_KEY) ?: [:] - - record[ID_KEY] = getRecordTmpId() - thing[ID_KEY] = getThingTmpId() - record[THING_KEY] = [(ID_KEY): getThingTmpId()] - - Map graph = [(GRAPH_KEY): [record, thing]] - - String ttl = toTurtleData(graph, context) - - return insertTypeMappings(insertIdMappings(insertVars(ttl))) - } - - private Map getSparqlPreparedForm() { - Map matchFormCopy = (Map) Document.deepCopy(matchForm) - - DocumentUtil.traverse(matchFormCopy) { node, path -> - if (node instanceof Map) { - def bNodeId = node.remove(BNODE_ID) - if (!bNodeId) return - node.remove(HAS_ID) - if (node[TYPE_KEY] == ANY_TYPE) { - node.remove(TYPE_KEY) - } - if (asList(node.remove(MATCHING_MODE)).contains(SUBTYPES)) { - def baseType = node.remove(TYPE_KEY) - node[HAS_BASE_TYPE_TMP] = baseType - } - if (nodeIdMappings.containsKey(bNodeId)) { - node[ID_KEY] = bNodeId - } - return new DocumentUtil.Nop() - } - if (asList(node).isEmpty()) { - return new DocumentUtil.Replace([:]) - } - } - - return matchFormCopy - } - - private String insertVars(String ttl) { - def substitutions = [ - ("<" + getThingTmpId() + ">") : getVar(getThingTmpId()), - ("<" + getRecordTmpId() + ">"): getVar(getRecordTmpId()) - ] - - baseTypeMappings.keySet().each { baseType -> - substitutions.put(":$HAS_BASE_TYPE_TMP \"$baseType\"".toString(), "a ?" + baseType) - } - - nodeIdMappings.keySet().each { _id -> - substitutions.put("<" + _id + ">", getVar(_id)) - } - - return ttl.replace(substitutions) - } - - private String insertTypeMappings(String sparqlPattern) { - def valuesClause = baseTypeMappings.collect { baseType, subTypes -> - "VALUES ?$baseType { ${([baseType] + subTypes).collect { ":$it" }.join(" ")} }\n" - }.join() - return valuesClause + sparqlPattern - } - - private String insertIdMappings(String sparqlPattern) { - def valuesClauses = nodeIdMappings.collect { _id, ids -> - "VALUES ${getVar(_id)} { ${ids.collect { "<$it>" }.join(" ")} }\n" - }.join() - return valuesClauses + sparqlPattern - } - - String getVar(String bNodeId) { - return bNodeId == getRecordTmpId() - ? "?$GRAPH_VAR" - : "?${bNodeId.replace('#', '')}" - } - - Map> collectNodeIdMappings(Whelk whelk) { - return collectNodeIdMappings(matchForm, whelk) - } - - static Map> collectNodeIdMappings(Map form, Whelk whelk) { - Map> nodeIdMappings = [:] - - IdLoader idLoader = whelk ? new IdLoader(whelk.storage) : null - - DocumentUtil.traverse(form) { node, path -> - if (!(node instanceof Map)) { - return - } - def anyOf = asList(node[HAS_ID]).find { it[TYPE_KEY] == ANY_OF } - if (!anyOf) { - return - } - def ids = (anyOf[VALUE] ?: (anyOf[VALUE_FROM] ? IdLoader.fromFile((String) anyOf[VALUE_FROM][ID_KEY]) : [])) as Set - if (ids) { - String nodeId = node[BNODE_ID] - - def (iris, shortIds) = ids.split(JsonLd::looksLikeIri) - if (shortIds.isEmpty()) { - nodeIdMappings[nodeId] = iris - return - } - - if (!idLoader) { - nodeIdMappings[nodeId] = iris + shortIds.collect { Document.BASE_URI.toString() + it + Document.HASH_IT } - return - } - - def nodeType = node[TYPE_KEY] - def marcCollection = nodeType ? getMarcCollectionInHierarchy((String) nodeType, whelk.jsonld) : null - def xlShortIds = idLoader.collectXlShortIds(shortIds as List, marcCollection) - def parentProp = dropIndexes(path).reverse()[1] - def isInRange = { type -> whelk.jsonld.getInRange(type).contains(parentProp) } - // TODO: Fix hardcoding - def isRecord = whelk.jsonld.isInstanceOf(node, "AdminMetadata") - || isInRange(RECORD_TYPE) - || isInRange("AdminMetadata") - - nodeIdMappings[nodeId] = iris + xlShortIds.collect { - Document.BASE_URI.toString() + it + (isRecord ? "" : Document.HASH_IT) - } - - return new DocumentUtil.Nop() - } - } - - return nodeIdMappings - } - - Map> collectBaseTypeMappings(JsonLd jsonLd) { - Map> mappings = [:] - - if (jsonLd == null) { - return mappings - } - - DocumentUtil.traverse(matchForm) { node, path -> - if (node instanceof Map && node.containsKey(MATCHING_MODE) && ((List) node[MATCHING_MODE]).contains(SUBTYPES)) { - def baseType = (String) node[TYPE_KEY] - Set subTypes = getSubtypes(baseType, jsonLd) as Set - mappings[baseType] = subTypes - return new DocumentUtil.Nop() - } - } - - return mappings - } - - @Memoized - private static Set getSubtypes(String type, JsonLd jsonLd) { - return jsonLd.getSubClasses(type) - } - - private String getThingTmpId() { - return matchForm[BNODE_ID] - } - - private String getRecordTmpId() { - return getAtPath(matchForm, [RECORD_KEY, BNODE_ID], "TEMP_ID") - } - - boolean matches(Object node) { - return matches(matchForm, node) - } - - boolean matches(Object matchForm, Object node) { - return comparator.isSubset(["x": matchForm], ["x": node], this::mapMatches) - } - - boolean mapMatches(Map matchForm, Map bNode) { - if (matchForm == null || bNode == null) { - return false - } - matchForm = new LinkedHashMap(matchForm) - def match = asList(matchForm[MATCHING_MODE]) - if (match.contains(EXACT)) { - return exactMatches(matchForm, bNode) - } - if (match.contains(SUBTYPES)) { - String aType = matchForm[TYPE_KEY] - String bType = bNode[TYPE_KEY] - if (!(baseTypeMappings[aType] + aType).contains(bType)) { - return false - } else { - matchForm.remove(TYPE_KEY) - } - } - matchForm.remove(MATCHING_MODE) - if (matchForm[TYPE_KEY] == ANY_TYPE) { - matchForm.remove(TYPE_KEY) - } - def ids = nodeIdMappings[matchForm.remove(BNODE_ID)] - if (ids && !ids.contains(bNode[ID_KEY])) { - return false - } - matchForm.remove(HAS_ID) - if (matchForm.size() > bNode.size()) { - return false - } - return comparator.isSubset(matchForm, bNode, this::mapMatches) - } - - private boolean exactMatches(Map matchForm, Map bNode) { - if (matchForm == null || bNode == null) { - return false - } - matchForm = new HashMap(matchForm) - bNode = new HashMap(bNode) - if (asList(matchForm.remove(MATCHING_MODE)).contains(SUBTYPES)) { - String aType = matchForm[TYPE_KEY] - String bType = bNode[TYPE_KEY] - if ((baseTypeMappings[aType] + aType).contains(bType)) { - matchForm.remove(TYPE_KEY) - bNode.remove(TYPE_KEY) - } else { - return false - } - } - if (matchForm[TYPE_KEY] == ANY_TYPE) { - matchForm.remove(TYPE_KEY) - bNode.remove(TYPE_KEY) - } - def ids = nodeIdMappings[matchForm.remove(BNODE_ID)] - if (ids && !ids.contains(bNode[ID_KEY])) { - return false - } - matchForm.remove(HAS_ID) - if (matchForm.size() != bNode.size()) { - return false - } - return comparator.isEqual(matchForm, bNode, this::exactMatches) - } - Add newAddValue(Object value) { return new Add(null, value) } @@ -422,11 +145,11 @@ class Transform { ChangesForNode(String nodeId, List changeList) { this.nodeId = nodeId this.changeList = changeList - this.propertyPath = dropIndexes(nodeIdToMatchFormPath[nodeId]) + this.propertyPath = dropIndexes(matchForm.nodeIdToMatchFormPath[nodeId]) } boolean matches(Map node) { - return matches(form(), node) && removeMatches(node) + return matchForm.matches(form(), node) && removeMatches(node) } private removeMatches(Map node) { @@ -446,7 +169,7 @@ class Transform { } private Map form() { - return getAtPath(matchForm, nodeIdToMatchFormPath[nodeId]) as Map + return getAtPath(matchForm.form, matchForm.nodeIdToMatchFormPath[nodeId]) as Map } } @@ -490,11 +213,11 @@ class Transform { } boolean matches(Object o) { - return Transform.this.matches(value, o) || (property() == TYPE_KEY && value == ANY_TYPE) + return matchForm.matches(value, o) || (property() == TYPE_KEY && value == ANY_TYPE) } String parentId() { - getAtPath(matchForm, parentPath())[BNODE_ID] + getAtPath(matchForm.form, parentPath())[BNODE_ID] } boolean hasId() { @@ -517,17 +240,4 @@ class Transform { getAtPath(targetForm, parentPath())[BNODE_ID] } } - - static class MatchForm extends Transform { - MatchForm(Map matchForm, Whelk whelk) { - super() - this.matchForm = matchForm - this.nodeIdMappings = collectNodeIdMappings(whelk) - this.baseTypeMappings = collectBaseTypeMappings(whelk?.jsonld) - } - - MatchForm(Map matchForm) { - this(matchForm, null) - } - } } diff --git a/whelktool/src/test/groovy/whelk/datatool/form/TransformSpec.groovy b/whelktool/src/test/groovy/whelk/datatool/form/TransformSpec.groovy index 91be81b538..7bfbaf3824 100644 --- a/whelktool/src/test/groovy/whelk/datatool/form/TransformSpec.groovy +++ b/whelktool/src/test/groovy/whelk/datatool/form/TransformSpec.groovy @@ -8,14 +8,6 @@ class TransformSpec extends Specification { static List specs = TransformSpec.class.getClassLoader() .getResourceAsStream('whelk/datatool/form/specs.json') .with { mapper.readValue((InputStream) it, Map)['specs'] } - static Map context = [ - '@vocab': 'https://id.kb.se/vocab/', - 'marc' : 'https://id.kb.se/marc/', - 'p1' : ['@container': '@set'], - 'p2' : ['@container': '@set'], - 'p3' : ['@container': '@list'], - 'marc:p': ['@container': '@set'] - ] def "collect changed paths"() { given: @@ -30,168 +22,4 @@ class TransformSpec extends Specification { where: spec << specs.findAll { (it["addedPaths"] || it["removedPaths"]) && !it['shouldFailWithException'] } } - - def "form to sparql pattern: literal value"() { - given: - def form = ['bulk:formBlankNodeId': '#1', 'p1': 'x'] - def expectedPattern = "?graph :mainEntity ?1 .\n\n?1 :p1 \"x\" ." - - expect: - new Transform.MatchForm(form).getSparqlPattern(context) == expectedPattern - } - - def "form to sparql pattern: iri value"() { - given: - def form = ['bulk:formBlankNodeId': '#1', 'p1': ['@id': 'https://libris.kb.se/x']] - def expectedPattern = "?graph :mainEntity ?1 .\n\n?1 :p1 ." - - expect: - new Transform.MatchForm(form).getSparqlPattern(context) == expectedPattern - } - - def "form to sparql pattern: marc property"() { - given: - def form = ['bulk:formBlankNodeId': '#1', 'marc:p': 'x'] - def expectedPattern = "?graph :mainEntity ?1 .\n\n?1 marc:p \"x\" ." - - expect: - new Transform.MatchForm(form).getSparqlPattern(context) == expectedPattern - } - - def "form to sparql pattern: null/empty value"() { - given: - def form = ['bulk:formBlankNodeId': '#1', 'p1': v] - def expectedPattern = "?graph :mainEntity ?1 .\n\n?1 :p1 [ ] ." - - expect: - new Transform.MatchForm(form).getSparqlPattern(context) == expectedPattern - - where: - v << [null, [:], []] - } - - def "form to sparql pattern: nested null/empty value"() { - given: - def form = ['bulk:formBlankNodeId': '#1', 'p1': ['bulk:formBlankNodeId': '#2', 'p2': v]] - def expectedPattern = "?graph :mainEntity ?1 .\n\n?1 :p1 [ :p2 [ ] ] ." - - expect: - new Transform.MatchForm(form).getSparqlPattern(context) == expectedPattern - - where: - v << [null, [:], []] - } - - def "form to sparql pattern: nested values"() { - given: - def form = ['bulk:formBlankNodeId': '#1', 'p1': ['bulk:formBlankNodeId': '#2', 'p2': ['@id': 'https://libris.kb.se/x'], 'marc:p': "x"]] - def expectedPattern = "?graph :mainEntity ?1 .\n\n?1 :p1 [ :p2 ;\n marc:p \"x\" ] ." - - expect: - new Transform.MatchForm(form).getSparqlPattern(context) == expectedPattern - } - - def "form to sparql pattern: ordered list value"() { - given: - def form = ['bulk:formBlankNodeId': '#1', 'p3': [['bulk:formBlankNodeId': '#2', 'p1': 'x'], ['bulk:formBlankNodeId': '#3', 'p2': 'y']]] - def expectedPattern = "?graph :mainEntity ?1 .\n\n?1 :p3 ( [ :p1 \"x\" ] [ :p2 \"y\" ] ) ." - - expect: - new Transform.MatchForm(form).getSparqlPattern(context) == expectedPattern - } - - def "form to sparql pattern: id mappings"() { - given: - def recordIds = ['@type': 'bulk:AnyOf', 'value': ['https://libris.kb.se/x', 'https://libris.kb.se/y', - 'https://libris.kb.se/z']] - def thingIds = ['@type': 'bulk:AnyOf', 'value': ['https://libris.kb.se/x#it', 'https://libris.kb.se/y#it', - 'https://libris.kb.se/z#it']] - def values = ['@type': 'bulk:AnyOf', 'value': ['https://id.kb.se/x', 'https://id.kb.se/y', - 'https://id.kb.se/z#it']] - - def form = [ - 'bulk:formBlankNodeId': '#1', - 'bulk:hasId' : thingIds, - 'meta' : ['bulk:formBlankNodeId': '#2', 'bulk:hasId': recordIds], - 'p1' : ['bulk:formBlankNodeId': '#3', 'bulk:hasId': values] - ] - - def expectedPattern = "VALUES ?1 { }\n" + - "VALUES ?graph { }\n" + - "VALUES ?3 { }\n" + - "?graph :mainEntity ?1 .\n" + - "\n" + - "?1 :p1 ?3 ." - - expect: - new Transform.MatchForm(form).getSparqlPattern(context) == expectedPattern - } - - def "form to sparql pattern: unspecified types"() { - given: - def form = [ - 'bulk:formBlankNodeId': '#1', - '@type' : 'bulk:Any', - 'p1' : ['bulk:formBlankNodeId': '#2', '@type': 'bulk:Any'], - 'p2' : ['bulk:formBlankNodeId': '#3', '@type': 'bulk:Any', 'p': 'v'], - 'marc:p' : ['bulk:formBlankNodeId': '#4', '@type': 'marc:T', 'p': 'v'] - ] - - def expectedPattern = "?graph :mainEntity ?1 .\n" + - "\n" + - "?1 :p1 [ ] ;\n" + - " :p2 [ :p \"v\" ] ;\n" + - " marc:p [ a marc:T ;\n" + - " :p \"v\" ] ." - - expect: - new Transform.MatchForm(form).getSparqlPattern(context) == expectedPattern - } - - def "form to sparql pattern: base types"() { - given: - def form = [ - 'bulk:formBlankNodeId': '#1', - '@type' : 'T1', - 'bulk:matchingMode' : ['bulk:Subtypes'] - ] - - def expectedPattern = "VALUES ?T1 { :T1 :T1x :T1y :T1z }\n" + - "?graph :mainEntity ?1 .\n" + - "\n" + - "?1 a ?T1 ." - - def transform = new Transform.MatchForm(form) - transform.baseTypeMappings['T1'] = ['T1x', 'T1y', 'T1z'] as Set - - expect: - transform.getSparqlPattern(context) == expectedPattern - } - - def "match data against form"() { - given: - def transform = new Transform() - transform.nodeIdMappings = ["#1": ["https://libris.kb.se/x#it", "https://libris.kb.se/y#it"] as Set] - transform.baseTypeMappings = ["T": ["Tx", "Ty"] as Set] - - expect: - transform.matches(matchForm, node) == result - - where: - matchForm | node | result - "a" | "a" | true - "a" | "b" | false - "a" | ["a", "b"] | true - ["x": "a"] | ["x": ["a", "b"]] | true - ["x": "a", "bulk:matchingMode": ["bulk:Exact"]] | ["x": ["a", "b"]] | false - ["x": ["a", "b"], "bulk:matchingMode": ["bulk:Exact"]] | ["x": ["a", "b"]] | true - ["@type": "T", "bulk:matchingMode": ["bulk:Subtypes"], "a": "b"] | ["@type": "Tx", "a": "b"] | true - ["@type": "T", "a": "b"] | ["@type": "Tx", "a": "b"] | false - ["@type": "T", "bulk:matchingMode": ["bulk:Subtypes", "bulk:Exact"], "a": "b"] | ["@type": "Ty", "a": "b"] | true - ["@type": "T", "bulk:matchingMode": ["bulk:Subtypes", "bulk:Exact"], "a": "b"] | ["@type": "Ty", "a": "b", "c": "d"] | false - ["@type": "bulk:Any", "a": "b"] | ["@type": "T", "a": "b", "c": "d"] | true - ["@type": "bulk:Any", "a": "b", "bulk:matchingMode": ["bulk:Exact"]] | ["@type": "T", "a": "b", "c": "d"] | false - ["x": ["bulk:formBlankNodeId": "#1"]] | ["x": ["@id": "https://libris.kb.se/y#it"]] | true - ["x": ["bulk:formBlankNodeId": "#1"]] | ["x": ["@id": "https://libris.kb.se/z#it"]] | false - } } \ No newline at end of file From 1b39f63b20f49967984e3dec296fb69d9c85c825 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Wed, 20 Nov 2024 09:26:35 +0100 Subject: [PATCH 12/13] Make selectByForm take a MatchForm instead of Transform as argument --- .../src/main/groovy/whelk/datatool/WhelkTool.gdsl | 10 +++++----- .../src/main/groovy/whelk/datatool/WhelkTool.groovy | 5 +++-- .../main/resources/bulk-change-scripts/delete.groovy | 4 ++-- .../main/resources/bulk-change-scripts/update.groovy | 4 ++-- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/whelktool/src/main/groovy/whelk/datatool/WhelkTool.gdsl b/whelktool/src/main/groovy/whelk/datatool/WhelkTool.gdsl index ed13cde535..17ab707bce 100644 --- a/whelktool/src/main/groovy/whelk/datatool/WhelkTool.gdsl +++ b/whelktool/src/main/groovy/whelk/datatool/WhelkTool.gdsl @@ -8,7 +8,7 @@ package whelk.datatool -import whelk.datatool.form.Transform +import whelk.datatool.form.MatchForm String ITEM = 'whelk.datatool.DocumentItem' String DC = "Closure<$ITEM>" @@ -43,10 +43,10 @@ contributor(ctx) { method name:"selectByIds", params:['ids':Collection, 'process':DC, 'batchSize':int], type:void method name:"selectByIds", params:['ids':Collection, 'process':DC, 'silent':boolean], type:void method name:"selectByIds", params:['ids':Collection, 'process':DC], type:void - method name:"selectByForm", params:['form':Transform, 'process':DC, 'batchSize':int, 'silent':boolean], type:void - method name:"selectByForm", params:['form':Transform, 'process':DC, 'batchSize':int], type:void - method name:"selectByForm", params:['form':Transform, 'process':DC, 'silent':boolean], type:void - method name:"selectByForm", params:['form':Transform, 'process':DC], type:void + method name:"selectByForm", params:['form':MatchForm, 'process':DC, 'batchSize':int, 'silent':boolean], type:void + method name:"selectByForm", params:['form':MatchForm, 'process':DC, 'batchSize':int], type:void + method name:"selectByForm", params:['form':MatchForm, 'process':DC, 'silent':boolean], type:void + method name:"selectByForm", params:['form':MatchForm, 'process':DC], type:void method name:"selectBySqlWhere", params:['whereClause':String, 'batchSize':int, 'silent':boolean, 'process':DC], type:void method name:"selectBySqlWhere", params:['whereClause':String, 'batchSize':int, 'process':DC], type:void method name:"selectBySqlWhere", params:['whereClause':String, 'silent':boolean, 'process':DC], type:void diff --git a/whelktool/src/main/groovy/whelk/datatool/WhelkTool.groovy b/whelktool/src/main/groovy/whelk/datatool/WhelkTool.groovy index 4a972b1108..50f344b2cb 100644 --- a/whelktool/src/main/groovy/whelk/datatool/WhelkTool.groovy +++ b/whelktool/src/main/groovy/whelk/datatool/WhelkTool.groovy @@ -8,6 +8,7 @@ import whelk.IdGenerator import whelk.JsonLd import whelk.JsonLdValidator import whelk.Whelk +import whelk.datatool.form.MatchForm import whelk.datatool.form.Transform import whelk.datatool.util.IdLoader import whelk.exception.StaleUpdateException @@ -196,13 +197,13 @@ class WhelkTool { batchSize, [1: idItems, 2: collection]) } - void selectByForm(Transform transform, Closure process, + void selectByForm(MatchForm matchForm, Closure process, int batchSize = DEFAULT_BATCH_SIZE, boolean silent = false) { if (!silent) { log "Select by form" } - var sparqlPattern = transform.getSparqlPattern(whelk.jsonld.context) + var sparqlPattern = matchForm.getSparqlPattern(whelk.jsonld.context) var ids = whelk.sparqlQueryClient.queryIdsByPattern(sparqlPattern) selectByIds(ids, process, batchSize, silent) diff --git a/whelktool/src/main/resources/bulk-change-scripts/delete.groovy b/whelktool/src/main/resources/bulk-change-scripts/delete.groovy index 4fd40366d2..b49669db13 100644 --- a/whelktool/src/main/resources/bulk-change-scripts/delete.groovy +++ b/whelktool/src/main/resources/bulk-change-scripts/delete.groovy @@ -1,12 +1,12 @@ import whelk.Document -import whelk.datatool.form.Transform +import whelk.datatool.form.MatchForm import static whelk.JsonLd.RECORD_KEY import static whelk.datatool.bulkchange.BulkJobDocument.MATCH_FORM_KEY Map matchForm = parameters.get(MATCH_FORM_KEY) -Transform.MatchForm mf = new Transform.MatchForm(matchForm, getWhelk()) +MatchForm mf = new MatchForm(matchForm, getWhelk()) selectByForm(mf) { if(mf.matches(getFramedThing(it.doc))) { diff --git a/whelktool/src/main/resources/bulk-change-scripts/update.groovy b/whelktool/src/main/resources/bulk-change-scripts/update.groovy index f4b3adef78..87444482b0 100644 --- a/whelktool/src/main/resources/bulk-change-scripts/update.groovy +++ b/whelktool/src/main/resources/bulk-change-scripts/update.groovy @@ -13,13 +13,13 @@ Map targetForm = parameters.get(TARGET_FORM_KEY) Transform transform = new Transform(matchForm, targetForm, getWhelk()) -selectByForm(transform) { +selectByForm(transform.matchForm) { if(modify(transform, it.doc, it.whelk)) { it.scheduleSave(loud: isLoudAllowed) } } -static boolean modify(Transform tf, Document doc, Whelk whelk) { +private static boolean modify(Transform tf, Document doc, Whelk whelk) { Map thing = doc.getThing(); thing.put(RECORD_KEY, doc.getRecord()); From caa9c2854f8463af48f662a1c4fff04c2ad94ef9 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Wed, 20 Nov 2024 10:43:13 +0100 Subject: [PATCH 13/13] Don't intersect with empty --- .../bulk-change-scripts/removeSubdivision.groovy | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/whelktool/src/main/resources/bulk-change-scripts/removeSubdivision.groovy b/whelktool/src/main/resources/bulk-change-scripts/removeSubdivision.groovy index 5be2f57636..2624a3d208 100644 --- a/whelktool/src/main/resources/bulk-change-scripts/removeSubdivision.groovy +++ b/whelktool/src/main/resources/bulk-change-scripts/removeSubdivision.groovy @@ -72,7 +72,12 @@ Set ids = [] as Set def (linked, blank) = removeSubdivision.split { it[ID_KEY] } linked.each { l -> selectByIds(linked.collect { it[ID_KEY] }) { - ids = ids.intersect(it.getDependers()) as Set + def dependers = it.getDependers() as Set + if (ids.isEmpty()) { + ids.addAll(it.getDependers()) + } else { + ids = ids.intersect(dependers) + } } } if (!blank.isEmpty()) { @@ -84,7 +89,13 @@ if (!blank.isEmpty()) { */ blank.collect { whelk.sparqlQueryClient.queryIdsByPattern(toTurtleData((Map) it, whelk.jsonld.context)) } .min { it.size() } - .with { ids = ids.intersect(it) } + .with { + if (ids.isEmpty()) { + ids.addAll(it) + } else { + ids = ids.intersect(it) + } + } } selectByIds(ids) {