Refine querying combinations of blank subdivision

libris · Nov 20, 2024 · 2d8ec63 · 2d8ec63
1 parent b8e580a
commit 2d8ec63
Showing 1 changed file with 15 additions and 8 deletions.
diff --git a/whelktool/src/main/resources/bulk-change-scripts/removeSubdivision.groovy b/whelktool/src/main/resources/bulk-change-scripts/removeSubdivision.groovy
@@ -69,16 +69,23 @@ def process = { doc ->
 }
 
 Set<String> ids = [] as Set
-removeSubdivision.each { subdivision ->
-    if (subdivision[ID_KEY]) {
-        selectByIds([subdivision[ID_KEY]]) { obsoleteSubdivision ->
-            ids = ids.intersect(obsoleteSubdivision.getDependers()) as Set<String>
-        }
-    } else {
-        Whelk whelk = getWhelk()
-        ids = ids.intersect(whelk.sparqlQueryClient.queryIdsByPattern(toTurtleData((Map) subdivision, whelk.jsonld.context)))
+def (linked, blank) = removeSubdivision.split { it[ID_KEY] } // linked: entries with an ID_KEY value, blank: the rest
+linked.eachWithIndex { l, i -> // ids starts out empty, so the first result must seed it instead of being intersected
+    selectByIds([l[ID_KEY]]) {
+        ids = (i == 0 ? it.getDependers() : ids.intersect(it.getDependers())) as Set<String>
     }
 }
+if (!blank.isEmpty()) {
+    Whelk whelk = getWhelk()
+    /*
+    Querying for records that contain the whole combination of blank subdivisions is very slow, so we run one query
+    per subdivision instead. A Sparql query returns at most 100k results, so intersecting the (possibly truncated)
+    results could drop matching records. Instead, keep only the result with the fewest hits.
+     */
+    blank.collect { whelk.sparqlQueryClient.queryIdsByPattern(toTurtleData((Map) it, whelk.jsonld.context)) }
+            .min { it.size() } // smallest candidate set
+            .with { ids = (linked.isEmpty() ? it : ids.intersect(it)) as Set<String> } // no linked ids: seed, don't intersect
+}
 
 selectByIds(ids) {
     process(it)