diff --git a/whelk-core/src/main/groovy/whelk/converter/JsonLDTurtleConverter.groovy b/whelk-core/src/main/groovy/whelk/converter/JsonLDTurtleConverter.groovy index f315089c97..f49d3d18e0 100644 --- a/whelk-core/src/main/groovy/whelk/converter/JsonLDTurtleConverter.groovy +++ b/whelk-core/src/main/groovy/whelk/converter/JsonLDTurtleConverter.groovy @@ -21,7 +21,7 @@ class JsonLDTurtleConverter implements FormatConverter { return [(JsonLd.NON_JSON_CONTENT_KEY) : toTurtle(source, null, base)] } - static String toTurtleData(source, Map context) { + static String toTurtleNoPrelude(source, Map context) { // Add skip prelude flag in trld.trig.SerializerState.serialize? return withoutPrefixes(toTurtle(source, context, null)) } diff --git a/whelktool/src/main/groovy/whelk/datatool/form/MatchForm.groovy b/whelktool/src/main/groovy/whelk/datatool/form/MatchForm.groovy index 2939874562..5774662aa7 100644 --- a/whelktool/src/main/groovy/whelk/datatool/form/MatchForm.groovy +++ b/whelktool/src/main/groovy/whelk/datatool/form/MatchForm.groovy @@ -1,6 +1,7 @@ package whelk.datatool.form import groovy.transform.Memoized +import groovy.transform.PackageScope import whelk.Document import whelk.JsonLd import whelk.Whelk @@ -16,7 +17,7 @@ import static whelk.JsonLd.THING_KEY import static whelk.JsonLd.TYPE_KEY import static whelk.JsonLd.asList import static whelk.component.SparqlQueryClient.GRAPH_VAR -import static whelk.converter.JsonLDTurtleConverter.toTurtleData +import static whelk.converter.JsonLDTurtleConverter.toTurtleNoPrelude import static whelk.util.DocumentUtil.getAtPath import static whelk.util.LegacyIntegrationTools.getMarcCollectionInHierarchy @@ -37,121 +38,119 @@ class MatchForm { Map form - Map nodeIdToMatchFormPath - Map> nodeIdMappings - Map> baseTypeMappings + // For looking up where in the form a certain blank node is located + Map formBNodeIdToPath + // For looking up resource ids (if given in bulk:hasId) associated with a certain blank node in the form + Map> formBNodeIdToResourceIds + // For looking up subtypes of a type appearing in the form + Map> baseTypeToSubtypes MatchForm(Map form, Whelk whelk) { this.form = form - this.nodeIdToMatchFormPath = collectNodeIdToPath(form) - this.nodeIdMappings = collectNodeIdMappings(whelk) - this.baseTypeMappings = collectBaseTypeMappings(whelk?.jsonld) + this.formBNodeIdToPath = collectFormBNodeIdToPath() + this.formBNodeIdToResourceIds = collectFormBNodeIdToResourceIds(whelk) + this.baseTypeToSubtypes = collectBaseTypeToSubtypes(whelk?.jsonld) } + // For testing only + @PackageScope MatchForm(Map matchForm) { this(matchForm, null) } + // For testing only + @PackageScope MatchForm() {} - Map> collectNodeIdMappings(Whelk whelk) { - return collectNodeIdMappings(form, whelk) + boolean matches(Object node) { + return matches(form, node) } - static Map> collectNodeIdMappings(Map form, Whelk whelk) { - Map> nodeIdMappings = [:] + boolean matches(Object matchForm, Object node) { + return comparator.isSubset(["x": matchForm], ["x": node], this::mapMatches) + } - IdLoader idLoader = whelk ? new IdLoader(whelk.storage) : null + String getSparqlPattern(Map context) { + Map thing = getSparqlPreparedForm() + Map record = (Map) thing.remove(RECORD_KEY) ?: [:] - DocumentUtil.traverse(form) { node, path -> - if (!(node instanceof Map)) { - return - } - def anyOf = asList(node[HAS_ID]).find { it[TYPE_KEY] == ANY_OF } - if (!anyOf) { - return - } - def ids = (anyOf[VALUE] ?: (anyOf[VALUE_FROM] ? IdLoader.fromFile((String) anyOf[VALUE_FROM][ID_KEY]) : [])) as Set - if (ids) { - String nodeId = node[BNODE_ID] + record[ID_KEY] = getRecordTmpId() + thing[ID_KEY] = getThingTmpId() + record[THING_KEY] = [(ID_KEY): getThingTmpId()] - def (iris, shortIds) = ids.split(JsonLd::looksLikeIri) - if (shortIds.isEmpty()) { - nodeIdMappings[nodeId] = iris - return - } + Map graph = [(GRAPH_KEY): [record, thing]] - if (!idLoader) { - nodeIdMappings[nodeId] = iris + shortIds.collect { Document.BASE_URI.toString() + it + Document.HASH_IT } - return - } + String ttl = toTurtleNoPrelude(graph, context) - def nodeType = node[TYPE_KEY] - def marcCollection = nodeType ? getMarcCollectionInHierarchy((String) nodeType, whelk.jsonld) : null - def xlShortIds = idLoader.collectXlShortIds(shortIds as List, marcCollection) - def parentProp = dropIndexes(path).reverse()[1] - def isInRange = { type -> whelk.jsonld.getInRange(type).contains(parentProp) } - // TODO: Fix hardcoding - def isRecord = whelk.jsonld.isInstanceOf(node, "AdminMetadata") - || isInRange(RECORD_TYPE) - || isInRange("AdminMetadata") + return insertTypeMappings(insertIdMappings(insertVars(ttl))) + } - nodeIdMappings[nodeId] = iris + xlShortIds.collect { - Document.BASE_URI.toString() + it + (isRecord ? "" : Document.HASH_IT) - } + static List dropIndexes(List path) { + return path.findAll { it instanceof String } as List + } - return new DocumentUtil.Nop() + private boolean mapMatches(Map matchForm, Map bNode) { + if (matchForm == null || bNode == null) { + return false + } + matchForm = new LinkedHashMap(matchForm) + def match = asList(matchForm[MATCHING_MODE]) + if (match.contains(EXACT)) { + return exactMatches(matchForm, bNode) + } + if (match.contains(SUBTYPES)) { + String aType = matchForm[TYPE_KEY] + String bType = bNode[TYPE_KEY] + if (!(baseTypeToSubtypes[aType] + aType).contains(bType)) { + return false + } else { + matchForm.remove(TYPE_KEY) } } - - return nodeIdMappings - } - - static Map collectNodeIdToPath(Map form) { - Map nodeIdToPath = [:] - DocumentUtil.findKey(form, BNODE_ID) { nodeId, path -> - nodeIdToPath[(String) nodeId] = path.dropRight(1) - return new DocumentUtil.Nop() + matchForm.remove(MATCHING_MODE) + if (matchForm[TYPE_KEY] == ANY_TYPE) { + matchForm.remove(TYPE_KEY) } - return nodeIdToPath + def ids = formBNodeIdToResourceIds[matchForm.remove(BNODE_ID)] + if (ids && !ids.contains(bNode[ID_KEY])) { + return false + } + matchForm.remove(HAS_ID) + if (matchForm.size() > bNode.size()) { + return false + } + return comparator.isSubset(matchForm, bNode, this::mapMatches) } - Map> collectBaseTypeMappings(JsonLd jsonLd) { - Map> mappings = [:] - - if (jsonLd == null) { - return mappings + private boolean exactMatches(Map matchForm, Map bNode) { + if (matchForm == null || bNode == null) { + return false } - - DocumentUtil.traverse(form) { node, path -> - if (node instanceof Map && node.containsKey(MATCHING_MODE) && ((List) node[MATCHING_MODE]).contains(SUBTYPES)) { - def baseType = (String) node[TYPE_KEY] - Set subTypes = getSubtypes(baseType, jsonLd) as Set - mappings[baseType] = subTypes - return new DocumentUtil.Nop() + matchForm = new HashMap(matchForm) + bNode = new HashMap(bNode) + if (asList(matchForm.remove(MATCHING_MODE)).contains(SUBTYPES)) { + String aType = matchForm[TYPE_KEY] + String bType = bNode[TYPE_KEY] + if ((baseTypeToSubtypes[aType] + aType).contains(bType)) { + matchForm.remove(TYPE_KEY) + bNode.remove(TYPE_KEY) + } else { + return false } } - - return mappings - } - - static List dropIndexes(List path) { - return path.findAll { it instanceof String } as List - } - - String getSparqlPattern(Map context) { - Map thing = getSparqlPreparedForm() - Map record = (Map) thing.remove(RECORD_KEY) ?: [:] - - record[ID_KEY] = getRecordTmpId() - thing[ID_KEY] = getThingTmpId() - record[THING_KEY] = [(ID_KEY): getThingTmpId()] - - Map graph = [(GRAPH_KEY): [record, thing]] - - String ttl = toTurtleData(graph, context) - - return insertTypeMappings(insertIdMappings(insertVars(ttl))) + if (matchForm[TYPE_KEY] == ANY_TYPE) { + matchForm.remove(TYPE_KEY) + bNode.remove(TYPE_KEY) + } + def ids = formBNodeIdToResourceIds[matchForm.remove(BNODE_ID)] + if (ids && !ids.contains(bNode[ID_KEY])) { + return false + } + matchForm.remove(HAS_ID) + if (matchForm.size() != bNode.size()) { + return false + } + return comparator.isEqual(matchForm, bNode, this::exactMatches) } private Map getSparqlPreparedForm() { @@ -169,7 +168,7 @@ class MatchForm { def baseType = node.remove(TYPE_KEY) node[HAS_BASE_TYPE_TMP] = baseType } - if (nodeIdMappings.containsKey(bNodeId)) { + if (formBNodeIdToResourceIds.containsKey(bNodeId)) { node[ID_KEY] = bNodeId } return new DocumentUtil.Nop() @@ -188,11 +187,11 @@ class MatchForm { ("<" + getRecordTmpId() + ">"): getVar(getRecordTmpId()) ] - baseTypeMappings.keySet().each { baseType -> + baseTypeToSubtypes.keySet().each { baseType -> substitutions.put(":$HAS_BASE_TYPE_TMP \"$baseType\"".toString(), "a ?" + baseType) } - nodeIdMappings.keySet().each { _id -> + formBNodeIdToResourceIds.keySet().each { _id -> substitutions.put("<" + _id + ">", getVar(_id)) } @@ -201,30 +200,25 @@ class MatchForm { private String insertTypeMappings(String sparqlPattern) { - def valuesClause = baseTypeMappings.collect { baseType, subTypes -> + def valuesClause = baseTypeToSubtypes.collect { baseType, subTypes -> "VALUES ?$baseType { ${([baseType] + subTypes).collect { ":$it" }.join(" ")} }\n" }.join() return valuesClause + sparqlPattern } private String insertIdMappings(String sparqlPattern) { - def valuesClauses = nodeIdMappings.collect { _id, ids -> + def valuesClauses = formBNodeIdToResourceIds.collect { _id, ids -> "VALUES ${getVar(_id)} { ${ids.collect { "<$it>" }.join(" ")} }\n" }.join() return valuesClauses + sparqlPattern } - String getVar(String bNodeId) { + private String getVar(String bNodeId) { return bNodeId == getRecordTmpId() ? "?$GRAPH_VAR" : "?${bNodeId.replace('#', '')}" } - @Memoized - private static Set getSubtypes(String type, JsonLd jsonLd) { - return jsonLd.getSubClasses(type) - } - private String getThingTmpId() { return form[BNODE_ID] } @@ -233,75 +227,89 @@ class MatchForm { return getAtPath(form, [RECORD_KEY, BNODE_ID], "TEMP_ID") } - boolean matches(Object node) { - return matches(form, node) + private Map> collectFormBNodeIdToResourceIds(Whelk whelk) { + return collectFormBNodeIdToResourceIds(form, whelk) } - boolean matches(Object matchForm, Object node) { - return comparator.isSubset(["x": matchForm], ["x": node], this::mapMatches) - } + private static Map> collectFormBNodeIdToResourceIds(Map form, Whelk whelk) { + Map> nodeIdMappings = [:] - boolean mapMatches(Map matchForm, Map bNode) { - if (matchForm == null || bNode == null) { - return false - } - matchForm = new LinkedHashMap(matchForm) - def match = asList(matchForm[MATCHING_MODE]) - if (match.contains(EXACT)) { - return exactMatches(matchForm, bNode) - } - if (match.contains(SUBTYPES)) { - String aType = matchForm[TYPE_KEY] - String bType = bNode[TYPE_KEY] - if (!(baseTypeMappings[aType] + aType).contains(bType)) { - return false - } else { - matchForm.remove(TYPE_KEY) + IdLoader idLoader = whelk ? new IdLoader(whelk.storage) : null + + DocumentUtil.traverse(form) { node, path -> + if (!(node instanceof Map)) { + return + } + def anyOf = asList(node[HAS_ID]).find { it[TYPE_KEY] == ANY_OF } + if (!anyOf) { + return + } + def ids = (anyOf[VALUE] ?: (anyOf[VALUE_FROM] ? IdLoader.fromFile((String) anyOf[VALUE_FROM][ID_KEY]) : [])) as Set + if (ids) { + String nodeId = node[BNODE_ID] + + def (iris, shortIds) = ids.split(JsonLd::looksLikeIri) + if (shortIds.isEmpty()) { + nodeIdMappings[nodeId] = iris + return + } + + if (!idLoader) { + nodeIdMappings[nodeId] = iris + shortIds.collect { Document.BASE_URI.toString() + it + Document.HASH_IT } + return + } + + def nodeType = node[TYPE_KEY] + def marcCollection = nodeType ? getMarcCollectionInHierarchy((String) nodeType, whelk.jsonld) : null + def xlShortIds = idLoader.collectXlShortIds(shortIds as List, marcCollection) + def parentProp = dropIndexes(path).reverse()[1] + def isInRange = { type -> whelk.jsonld.getInRange(type).contains(parentProp) } + // TODO: Fix hardcoding + def isRecord = whelk.jsonld.isInstanceOf(node, "AdminMetadata") + || isInRange(RECORD_TYPE) + || isInRange("AdminMetadata") + + nodeIdMappings[nodeId] = iris + xlShortIds.collect { + Document.BASE_URI.toString() + it + (isRecord ? "" : Document.HASH_IT) + } + + return new DocumentUtil.Nop() } } - matchForm.remove(MATCHING_MODE) - if (matchForm[TYPE_KEY] == ANY_TYPE) { - matchForm.remove(TYPE_KEY) - } - def ids = nodeIdMappings[matchForm.remove(BNODE_ID)] - if (ids && !ids.contains(bNode[ID_KEY])) { - return false - } - matchForm.remove(HAS_ID) - if (matchForm.size() > bNode.size()) { - return false + + return nodeIdMappings + } + + private Map collectFormBNodeIdToPath() { + Map nodeIdToPath = [:] + DocumentUtil.findKey(form, BNODE_ID) { nodeId, path -> + nodeIdToPath[(String) nodeId] = path.dropRight(1) + return new DocumentUtil.Nop() } - return comparator.isSubset(matchForm, bNode, this::mapMatches) + return nodeIdToPath } - private boolean exactMatches(Map matchForm, Map bNode) { - if (matchForm == null || bNode == null) { - return false + private Map> collectBaseTypeToSubtypes(JsonLd jsonLd) { + Map> mappings = [:] + + if (jsonLd == null) { + return mappings } - matchForm = new HashMap(matchForm) - bNode = new HashMap(bNode) - if (asList(matchForm.remove(MATCHING_MODE)).contains(SUBTYPES)) { - String aType = matchForm[TYPE_KEY] - String bType = bNode[TYPE_KEY] - if ((baseTypeMappings[aType] + aType).contains(bType)) { - matchForm.remove(TYPE_KEY) - bNode.remove(TYPE_KEY) - } else { - return false + + DocumentUtil.traverse(form) { node, path -> + if (node instanceof Map && node.containsKey(MATCHING_MODE) && ((List) node[MATCHING_MODE]).contains(SUBTYPES)) { + def baseType = (String) node[TYPE_KEY] + Set subTypes = getSubtypes(baseType, jsonLd) as Set + mappings[baseType] = subTypes + return new DocumentUtil.Nop() } } - if (matchForm[TYPE_KEY] == ANY_TYPE) { - matchForm.remove(TYPE_KEY) - bNode.remove(TYPE_KEY) - } - def ids = nodeIdMappings[matchForm.remove(BNODE_ID)] - if (ids && !ids.contains(bNode[ID_KEY])) { - return false - } - matchForm.remove(HAS_ID) - if (matchForm.size() != bNode.size()) { - return false - } - return comparator.isEqual(matchForm, bNode, this::exactMatches) + + return mappings + } + + @Memoized + private static Set getSubtypes(String type, JsonLd jsonLd) { + return jsonLd.getSubClasses(type) } } diff --git a/whelktool/src/main/groovy/whelk/datatool/form/Transform.groovy b/whelktool/src/main/groovy/whelk/datatool/form/Transform.groovy index 87703f96ec..a65af22ac3 100644 --- a/whelktool/src/main/groovy/whelk/datatool/form/Transform.groovy +++ b/whelktool/src/main/groovy/whelk/datatool/form/Transform.groovy @@ -1,5 +1,6 @@ package whelk.datatool.form +import groovy.transform.PackageScope import whelk.Whelk import whelk.datatool.util.DocumentComparator @@ -32,6 +33,8 @@ class Transform { this.addedPaths = collectAddedPaths() } + // For testing only + @PackageScope Transform(Map matchForm, Map targetForm) { this(matchForm, targetForm, null) } @@ -145,7 +148,7 @@ class Transform { ChangesForNode(String nodeId, List changeList) { this.nodeId = nodeId this.changeList = changeList - this.propertyPath = dropIndexes(matchForm.nodeIdToMatchFormPath[nodeId]) + this.propertyPath = dropIndexes(matchForm.formBNodeIdToPath[nodeId]) } boolean matches(Map node) { @@ -169,7 +172,7 @@ class Transform { } private Map form() { - return getAtPath(matchForm.form, matchForm.nodeIdToMatchFormPath[nodeId]) as Map + return getAtPath(matchForm.form, matchForm.formBNodeIdToPath[nodeId]) as Map } } diff --git a/whelktool/src/main/resources/bulk-change-scripts/removeSubdivision.groovy b/whelktool/src/main/resources/bulk-change-scripts/removeSubdivision.groovy index 2adbf81cd5..276634d58f 100644 --- a/whelktool/src/main/resources/bulk-change-scripts/removeSubdivision.groovy +++ b/whelktool/src/main/resources/bulk-change-scripts/removeSubdivision.groovy @@ -14,7 +14,7 @@ import whelk.util.DocumentUtil import static whelk.JsonLd.ID_KEY import static whelk.JsonLd.asList -import static whelk.converter.JsonLDTurtleConverter.toTurtleData +import static whelk.converter.JsonLDTurtleConverter.toTurtleNoPrelude import static whelk.datatool.bulkchange.BulkJobDocument.ADD_SUBJECT_KEY import static whelk.datatool.bulkchange.BulkJobDocument.REMOVE_SUBDIVISION_KEY @@ -87,7 +87,7 @@ if (!blank.isEmpty()) { query for each subdivision. However the maximum number of results from a Sparql query is 100k so if we just take the intersection of each result we risk missing some records. Better to just save the result with least hits. */ - blank.collect { whelk.sparqlQueryClient.queryIdsByPattern(toTurtleData((Map) it, whelk.jsonld.context)) } + blank.collect { whelk.sparqlQueryClient.queryIdsByPattern(toTurtleNoPrelude((Map) it, whelk.jsonld.context)) } .min { it.size() } .with { if (ids.isEmpty()) { diff --git a/whelktool/src/test/groovy/whelk/datatool/form/MatchFormSpec.groovy b/whelktool/src/test/groovy/whelk/datatool/form/MatchFormSpec.groovy index 7c4074bdc2..7cfa53db6d 100644 --- a/whelktool/src/test/groovy/whelk/datatool/form/MatchFormSpec.groovy +++ b/whelktool/src/test/groovy/whelk/datatool/form/MatchFormSpec.groovy @@ -15,8 +15,8 @@ class MatchFormSpec extends Specification { def "match data against form"() { given: def matchForm = new MatchForm() - matchForm.nodeIdMappings = ["#1": ["https://libris.kb.se/x#it", "https://libris.kb.se/y#it"] as Set] - matchForm.baseTypeMappings = ["T": ["Tx", "Ty"] as Set] + matchForm.formBNodeIdToResourceIds = ["#1": ["https://libris.kb.se/x#it", "https://libris.kb.se/y#it"] as Set] + matchForm.baseTypeToSubtypes = ["T": ["Tx", "Ty"] as Set] expect: matchForm.matches(form, node) == result @@ -170,7 +170,7 @@ class MatchFormSpec extends Specification { "?1 a ?T1 ." def transform = new MatchForm(form) - transform.baseTypeMappings['T1'] = ['T1x', 'T1y', 'T1z'] as Set + transform.baseTypeToSubtypes['T1'] = ['T1x', 'T1y', 'T1z'] as Set expect: transform.getSparqlPattern(context) == expectedPattern