From 2d8ec63170da164d9a94cdb114b6e9124aae19ee Mon Sep 17 00:00:00 2001
From: kwahlin
Date: Wed, 20 Nov 2024 08:09:44 +0100
Subject: [PATCH] Refine querying combinations of blank subdivision

Subdivisions are split into linked ones (carrying an id) and blank ones.
Linked subdivisions are looked up in one batch and contribute their
dependers. Blank subdivisions are queried one at a time over SPARQL and
only the smallest result is used.

The candidate set is seeded by the first result instead of being
intersected with the initially empty set, which always produced an empty
selection.
---
Review note: the hunk was edited by hand; the index hashes below are kept
from the original patch and do not describe the edited post-image.

 .../removeSubdivision.groovy                  | 35 +++++++++++++++----
 1 file changed, 27 insertions(+), 8 deletions(-)

diff --git a/whelktool/src/main/resources/bulk-change-scripts/removeSubdivision.groovy b/whelktool/src/main/resources/bulk-change-scripts/removeSubdivision.groovy
index 355270cd5f..5be2f57636 100644
--- a/whelktool/src/main/resources/bulk-change-scripts/removeSubdivision.groovy
+++ b/whelktool/src/main/resources/bulk-change-scripts/removeSubdivision.groovy
@@ -69,16 +69,35 @@ def process = { doc ->
 }
 
 Set ids = [] as Set
-removeSubdivision.each { subdivision ->
-    if (subdivision[ID_KEY]) {
-        selectByIds([subdivision[ID_KEY]]) { obsoleteSubdivision ->
-            ids = ids.intersect(obsoleteSubdivision.getDependers()) as Set
-        }
-    } else {
-        Whelk whelk = getWhelk()
-        ids = ids.intersect(whelk.sparqlQueryClient.queryIdsByPattern(toTurtleData((Map) subdivision, whelk.jsonld.context)))
+// ids starts out empty, so the first candidate set has to seed it; intersecting right away would
+// keep it empty forever and the final selectByIds(ids) would never process anything.
+boolean seeded = false
+// NOTE(review): if selectByIds runs its closure on several threads, this read-modify-write of
+// ids/seeded needs synchronization -- confirm against the whelktool implementation.
+def narrow = { hits ->
+    ids = (seeded ? ids.intersect(hits) : hits) as Set
+    seeded = true
+}
+def (linked, blank) = removeSubdivision.split { it[ID_KEY] }
+if (!linked.isEmpty()) {
+    // A single batch lookup covers every linked subdivision; each one that is loaded narrows the
+    // candidates to its own dependers.
+    selectByIds(linked.collect { it[ID_KEY] }) {
+        narrow(it.getDependers())
     }
 }
+if (!blank.isEmpty()) {
+    Whelk whelk = getWhelk()
+    /*
+    Querying for records that contain the whole combination of blank subdivisions is very slow, so a
+    separate query is run for each subdivision. A SPARQL query returns at most 100k results, so
+    intersecting the per-subdivision results could drop records; only the smallest result is kept.
+     */
+    def smallestHit = blank
+            .collect { whelk.sparqlQueryClient.queryIdsByPattern(toTurtleData((Map) it, whelk.jsonld.context)) }
+            .min { it.size() }
+    narrow(smallestHit)
+}
 
 selectByIds(ids) {
     process(it)