From 15c035651d83bb30e8278a8f3ca1ebd52ff83949 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kalle=20W=C3=A5hlin?= <72360110+kwahlin@users.noreply.github.com> Date: Tue, 10 Dec 2024 14:04:55 +0100 Subject: [PATCH] Bugfix/lxl 4600 unverified ids (#1534) * Filter out surely invalid ids when collecting short ids * Load only verified xl ids in collectFormBNodeIdToResourceIds * Don't add prefix to already prefixed types * Add missing type declaration --- .../whelk/datatool/form/MatchForm.groovy | 31 +++++++++---------- .../java/whelk/datatool/util/IdLoader.java | 24 ++++++++++---- 2 files changed, 33 insertions(+), 22 deletions(-) diff --git a/whelktool/src/main/groovy/whelk/datatool/form/MatchForm.groovy b/whelktool/src/main/groovy/whelk/datatool/form/MatchForm.groovy index f74c6bdfd7..a2c248ef4b 100644 --- a/whelktool/src/main/groovy/whelk/datatool/form/MatchForm.groovy +++ b/whelktool/src/main/groovy/whelk/datatool/form/MatchForm.groovy @@ -14,6 +14,7 @@ import static whelk.JsonLd.RECORD_TYPE import static whelk.JsonLd.THING_KEY import static whelk.JsonLd.TYPE_KEY import static whelk.JsonLd.asList +import static whelk.JsonLd.looksLikeIri import static whelk.component.SparqlQueryClient.GRAPH_VAR import static whelk.converter.JsonLDTurtleConverter.toTurtleNoPrelude import static whelk.util.DocumentUtil.getAtPath @@ -80,7 +81,7 @@ class MatchForm { return path.findAll { it instanceof String } as List } - private getSubtypes() { + private Set getSubtypes() { return getSubtypes(form) } @@ -220,8 +221,9 @@ class MatchForm { private String insertTypeMappings(String sparqlPattern) { if (shouldMatchSubtypes() && getSubtypes()) { - def baseType = form[TYPE_KEY] - String valuesClause = "VALUES ?$baseType { ${([baseType] + getSubtypes()).collect { ":$it" }.join(" ")} }\n" + String baseType = form[TYPE_KEY] + String values = ([baseType] + getSubtypes()).collect { it.contains(":") ? it : ":$it" }.join(" ") + String valuesClause = "VALUES ?$baseType { $values }\n" return valuesClause + sparqlPattern } return sparqlPattern @@ -265,34 +267,31 @@ class MatchForm { if (!anyOf) { return } - def ids = (anyOf[VALUE] ?: (anyOf[VALUE_FROM] ? IdLoader.fromFile((String) anyOf[VALUE_FROM][ID_KEY]) : [])) as Set + def ids = (anyOf[VALUE] ?: (anyOf[VALUE_FROM] ? IdLoader.fromFile((String) anyOf[VALUE_FROM][ID_KEY]) : [])) as List if (ids) { String nodeId = node[BNODE_ID] - def (iris, shortIds) = ids.split(JsonLd::looksLikeIri) - if (shortIds.isEmpty()) { - nodeIdMappings[nodeId] = iris - return - } - if (!idLoader) { - nodeIdMappings[nodeId] = iris + shortIds.collect { Document.BASE_URI.toString() + it + Document.HASH_IT } + nodeIdMappings[nodeId] = ids.findResults { + IdLoader.isXlShortId(it) + ? Document.BASE_URI.toString() + it + Document.HASH_IT + : (looksLikeIri(it) ? it : null) + } as Set return } def nodeType = node[TYPE_KEY] def marcCollection = nodeType ? getMarcCollectionInHierarchy((String) nodeType, whelk.jsonld) : null - def xlShortIds = idLoader.collectXlShortIds(shortIds as List, marcCollection) + def xlShortIds = idLoader.collectXlShortIds(ids, marcCollection) def parentProp = dropIndexes(path).reverse()[1] def isInRange = { type -> whelk.jsonld.getInRange(type).contains(parentProp) } // TODO: Fix hardcoding - def isRecord = whelk.jsonld.isInstanceOf(node, "AdminMetadata") + def isRecord = whelk.jsonld.isInstanceOf((Map) node, "AdminMetadata") || isInRange(RECORD_TYPE) || isInRange("AdminMetadata") - nodeIdMappings[nodeId] = iris + xlShortIds.collect { - Document.BASE_URI.toString() + it + (isRecord ? "" : Document.HASH_IT) - } + nodeIdMappings[nodeId] = idLoader.loadAllIds(xlShortIds) + .collect { isRecord ? it.recordIri() : it.thingIri() } as Set return new DocumentUtil.Nop() } diff --git a/whelktool/src/main/java/whelk/datatool/util/IdLoader.java b/whelktool/src/main/java/whelk/datatool/util/IdLoader.java index 1069adba37..d0b5ba5e6c 100644 --- a/whelktool/src/main/java/whelk/datatool/util/IdLoader.java +++ b/whelktool/src/main/java/whelk/datatool/util/IdLoader.java @@ -1,5 +1,6 @@ package whelk.datatool.util; +import whelk.JsonLd; import whelk.component.PostgreSQLComponent; import java.io.BufferedReader; @@ -18,6 +19,7 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.function.Function; import static whelk.datatool.WhelkTool.DEFAULT_FETCH_SIZE; @@ -65,20 +67,30 @@ public List loadAllIds(Collection shortIds) { } } - public List collectXlShortIds(Collection xlIds) { - Map iriToShortId = findShortIdsForIris(xlIds.stream().filter(id -> id.contains(":")).toList()); - return xlIds.stream().map(id -> iriToShortId.getOrDefault(id, id)).toList(); + public List collectXlShortIds(Collection ids) { + Map iriToShortId = findShortIdsForIris(ids.stream().filter(JsonLd::looksLikeIri).toList()); + return ids.stream() + .map(id -> iriToShortId.getOrDefault(id, isXlShortId(id) ? id : null)) + .filter(Objects::nonNull) + .toList(); } public List collectXlShortIds(Collection ids, String marcCollection) { if (!Arrays.asList("bib", "auth", "hold").contains(marcCollection)) { return collectXlShortIds(ids); } - Map iriToShortId = findShortIdsForIris(ids.stream().filter(id -> id.contains(":")).toList()); - Map voyagerIdToXlShortID = findXlShortIdsForVoyagerIds( + Map iriToShortId = findShortIdsForIris(ids.stream().filter(JsonLd::looksLikeIri).toList()); + Map voyagerIdToXlShortId = findXlShortIdsForVoyagerIds( ids.stream().filter(IdLoader::isVoyagerId).toList(), marcCollection); - return ids.stream().map(id -> iriToShortId.getOrDefault(id, voyagerIdToXlShortID.getOrDefault(id, id))).toList(); + return ids.stream() + .map(id -> iriToShortId.getOrDefault(id, voyagerIdToXlShortId.getOrDefault(id, isXlShortId(id) ? id : null))) + .filter(Objects::nonNull) + .toList(); + } + + public static boolean isXlShortId(String id) { + return id.matches("[0-9a-z]{15,17}"); } private Map findXlShortIdsForVoyagerIds(Collection voyagerIds, String marcCollection) {