Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/adjust subvision script #1522

Merged
merged 13 commits into from
Nov 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,17 @@ class JsonLDTurtleConverter implements FormatConverter {
}

Map convert(Map source, String id) {
return [(JsonLd.NON_JSON_CONTENT_KEY) : _toTurtle(source, null, base, false)]
return [(JsonLd.NON_JSON_CONTENT_KEY) : toTurtle(source, null, base)]
}

static String toTurtle(Map source, Map context, boolean skipPrelude) {
return _toTurtle(source, context, null, skipPrelude)
static String toTurtleData(source, Map context) {
olovy marked this conversation as resolved.
Show resolved Hide resolved
// Add skip prelude flag in trld.trig.SerializerState.serialize?
return withoutPrefixes(toTurtle(source, context, null))
}

private static String _toTurtle(Map source, Map context, base, boolean skipPrelude) {
private static String toTurtle(source, Map context, base) {
def bytes = JsonLdToTrigSerializer.toTurtle(context, source, base).toByteArray()
def s = new String(bytes, UTF_8)
// Add skip prelude flag in trld.trig.SerializerState.serialize?
return skipPrelude ? withoutPrefixes(s) : s
return new String(bytes, UTF_8)
}

private static String withoutPrefixes(String ttl) {
Expand Down
6 changes: 6 additions & 0 deletions whelktool/src/main/groovy/whelk/datatool/WhelkTool.gdsl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

package whelk.datatool

import whelk.datatool.form.MatchForm

String ITEM = 'whelk.datatool.DocumentItem'
String DC = "Closure<$ITEM>"

Expand Down Expand Up @@ -41,6 +43,10 @@ contributor(ctx) {
method name:"selectByIds", params:['ids':Collection<String>, 'process':DC, 'batchSize':int], type:void
method name:"selectByIds", params:['ids':Collection<String>, 'process':DC, 'silent':boolean], type:void
method name:"selectByIds", params:['ids':Collection<String>, 'process':DC], type:void
method name:"selectByForm", params:['form':MatchForm, 'process':DC, 'batchSize':int, 'silent':boolean], type:void
method name:"selectByForm", params:['form':MatchForm, 'process':DC, 'batchSize':int], type:void
method name:"selectByForm", params:['form':MatchForm, 'process':DC, 'silent':boolean], type:void
method name:"selectByForm", params:['form':MatchForm, 'process':DC], type:void
method name:"selectBySqlWhere", params:['whereClause':String, 'batchSize':int, 'silent':boolean, 'process':DC], type:void
method name:"selectBySqlWhere", params:['whereClause':String, 'batchSize':int, 'process':DC], type:void
method name:"selectBySqlWhere", params:['whereClause':String, 'silent':boolean, 'process':DC], type:void
Expand Down
14 changes: 14 additions & 0 deletions whelktool/src/main/groovy/whelk/datatool/WhelkTool.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import whelk.IdGenerator
import whelk.JsonLd
import whelk.JsonLdValidator
import whelk.Whelk
import whelk.datatool.form.MatchForm
import whelk.datatool.form.Transform
import whelk.datatool.util.IdLoader
import whelk.exception.StaleUpdateException
Expand Down Expand Up @@ -196,6 +197,18 @@ class WhelkTool {
batchSize, [1: idItems, 2: collection])
}

void selectByForm(MatchForm matchForm, Closure process,
int batchSize = DEFAULT_BATCH_SIZE, boolean silent = false) {
if (!silent) {
log "Select by form"
}

var sparqlPattern = matchForm.getSparqlPattern(whelk.jsonld.context)
var ids = whelk.sparqlQueryClient.queryIdsByPattern(sparqlPattern)

selectByIds(ids, process, batchSize, silent)
}

DocumentItem create(Map data) {
Document doc = new Document(data)
doc.deepReplaceId(Document.BASE_URI.toString() + IdGenerator.generate())
Expand Down Expand Up @@ -677,6 +690,7 @@ class WhelkTool {
bindings.put("selectByIds", this.&selectByIds)
bindings.put("selectByIdsAndCollection", this.&selectByIdsAndCollection)
bindings.put("selectBySqlWhere", this.&selectBySqlWhere)
bindings.put("selectByForm", this.&selectByForm)
bindings.put("selectFromIterable", this.&selectFromIterable)
bindings.put("create", this.&create)
bindings.put("queryIds", this.&queryIds)
Expand Down
307 changes: 307 additions & 0 deletions whelktool/src/main/groovy/whelk/datatool/form/MatchForm.groovy
olovy marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,307 @@
package whelk.datatool.form

import groovy.transform.Memoized
import whelk.Document
import whelk.JsonLd
import whelk.Whelk
import whelk.datatool.util.DocumentComparator
import whelk.datatool.util.IdLoader
import whelk.util.DocumentUtil

import static whelk.JsonLd.GRAPH_KEY
import static whelk.JsonLd.ID_KEY
import static whelk.JsonLd.RECORD_KEY
import static whelk.JsonLd.RECORD_TYPE
import static whelk.JsonLd.THING_KEY
import static whelk.JsonLd.TYPE_KEY
import static whelk.JsonLd.asList
import static whelk.component.SparqlQueryClient.GRAPH_VAR
import static whelk.converter.JsonLDTurtleConverter.toTurtleData
import static whelk.util.DocumentUtil.getAtPath
import static whelk.util.LegacyIntegrationTools.getMarcCollectionInHierarchy

class MatchForm {
private static final DocumentComparator comparator = new DocumentComparator()

public static final String MATCHING_MODE = 'bulk:matchingMode'
public static final String HAS_ID = 'bulk:hasId'
public static final String BNODE_ID = 'bulk:formBlankNodeId'
public static final String ANY_TYPE = "bulk:Any"
public static final String SUBTYPES = "bulk:Subtypes"
public static final String EXACT = 'bulk:Exact'

private static final String VALUE = 'value'
private static final String VALUE_FROM = 'bulk:valueFrom'
private static final String ANY_OF = 'bulk:AnyOf'
private static final String HAS_BASE_TYPE_TMP = '_hasBaseTypeTmp'

Map form

Map<String, List> nodeIdToMatchFormPath
Map<String, Set<String>> nodeIdMappings
Map<String, Set<String>> baseTypeMappings
olovy marked this conversation as resolved.
Show resolved Hide resolved

MatchForm(Map form, Whelk whelk) {
this.form = form
this.nodeIdToMatchFormPath = collectNodeIdToPath(form)
this.nodeIdMappings = collectNodeIdMappings(whelk)
this.baseTypeMappings = collectBaseTypeMappings(whelk?.jsonld)
}

MatchForm(Map matchForm) {
this(matchForm, null)
}

MatchForm() {}
olovy marked this conversation as resolved.
Show resolved Hide resolved

Map<String, Set<String>> collectNodeIdMappings(Whelk whelk) {
return collectNodeIdMappings(form, whelk)
}
olovy marked this conversation as resolved.
Show resolved Hide resolved

static Map<String, Set<String>> collectNodeIdMappings(Map form, Whelk whelk) {
olovy marked this conversation as resolved.
Show resolved Hide resolved
Map<String, Set<String>> nodeIdMappings = [:]

IdLoader idLoader = whelk ? new IdLoader(whelk.storage) : null

DocumentUtil.traverse(form) { node, path ->
if (!(node instanceof Map)) {
return
}
def anyOf = asList(node[HAS_ID]).find { it[TYPE_KEY] == ANY_OF }
if (!anyOf) {
return
}
def ids = (anyOf[VALUE] ?: (anyOf[VALUE_FROM] ? IdLoader.fromFile((String) anyOf[VALUE_FROM][ID_KEY]) : [])) as Set<String>
if (ids) {
String nodeId = node[BNODE_ID]

def (iris, shortIds) = ids.split(JsonLd::looksLikeIri)
if (shortIds.isEmpty()) {
nodeIdMappings[nodeId] = iris
return
}

if (!idLoader) {
nodeIdMappings[nodeId] = iris + shortIds.collect { Document.BASE_URI.toString() + it + Document.HASH_IT }
return
}

def nodeType = node[TYPE_KEY]
def marcCollection = nodeType ? getMarcCollectionInHierarchy((String) nodeType, whelk.jsonld) : null
def xlShortIds = idLoader.collectXlShortIds(shortIds as List<String>, marcCollection)
def parentProp = dropIndexes(path).reverse()[1]
def isInRange = { type -> whelk.jsonld.getInRange(type).contains(parentProp) }
// TODO: Fix hardcoding
def isRecord = whelk.jsonld.isInstanceOf(node, "AdminMetadata")
|| isInRange(RECORD_TYPE)
|| isInRange("AdminMetadata")

nodeIdMappings[nodeId] = iris + xlShortIds.collect {
Document.BASE_URI.toString() + it + (isRecord ? "" : Document.HASH_IT)
}

return new DocumentUtil.Nop()
}
}

return nodeIdMappings
}

static Map<String, List> collectNodeIdToPath(Map form) {
olovy marked this conversation as resolved.
Show resolved Hide resolved
Map<String, List> nodeIdToPath = [:]
DocumentUtil.findKey(form, BNODE_ID) { nodeId, path ->
nodeIdToPath[(String) nodeId] = path.dropRight(1)
return new DocumentUtil.Nop()
}
return nodeIdToPath
}

Map<String, Set<String>> collectBaseTypeMappings(JsonLd jsonLd) {
olovy marked this conversation as resolved.
Show resolved Hide resolved
Map<String, Set<String>> mappings = [:]

if (jsonLd == null) {
return mappings
}

DocumentUtil.traverse(form) { node, path ->
if (node instanceof Map && node.containsKey(MATCHING_MODE) && ((List) node[MATCHING_MODE]).contains(SUBTYPES)) {
def baseType = (String) node[TYPE_KEY]
Set<String> subTypes = getSubtypes(baseType, jsonLd) as Set
mappings[baseType] = subTypes
return new DocumentUtil.Nop()
}
}

return mappings
}

static List<String> dropIndexes(List path) {
return path.findAll { it instanceof String } as List<String>
}

String getSparqlPattern(Map context) {
Map thing = getSparqlPreparedForm()
Map record = (Map) thing.remove(RECORD_KEY) ?: [:]

record[ID_KEY] = getRecordTmpId()
thing[ID_KEY] = getThingTmpId()
record[THING_KEY] = [(ID_KEY): getThingTmpId()]

Map graph = [(GRAPH_KEY): [record, thing]]

String ttl = toTurtleData(graph, context)

return insertTypeMappings(insertIdMappings(insertVars(ttl)))
}

private Map getSparqlPreparedForm() {
Map matchFormCopy = (Map) Document.deepCopy(form)

DocumentUtil.traverse(matchFormCopy) { node, path ->
if (node instanceof Map) {
def bNodeId = node.remove(BNODE_ID)
if (!bNodeId) return
node.remove(HAS_ID)
if (node[TYPE_KEY] == ANY_TYPE) {
node.remove(TYPE_KEY)
}
if (asList(node.remove(MATCHING_MODE)).contains(SUBTYPES)) {
def baseType = node.remove(TYPE_KEY)
node[HAS_BASE_TYPE_TMP] = baseType
}
if (nodeIdMappings.containsKey(bNodeId)) {
node[ID_KEY] = bNodeId
}
return new DocumentUtil.Nop()
}
if (asList(node).isEmpty()) {
return new DocumentUtil.Replace([:])
}
}

return matchFormCopy
}

private String insertVars(String ttl) {
def substitutions = [
("<" + getThingTmpId() + ">") : getVar(getThingTmpId()),
("<" + getRecordTmpId() + ">"): getVar(getRecordTmpId())
]

baseTypeMappings.keySet().each { baseType ->
substitutions.put(":$HAS_BASE_TYPE_TMP \"$baseType\"".toString(), "a ?" + baseType)
}

nodeIdMappings.keySet().each { _id ->
substitutions.put("<" + _id + ">", getVar(_id))
}

return ttl.replace(substitutions)
}


private String insertTypeMappings(String sparqlPattern) {
def valuesClause = baseTypeMappings.collect { baseType, subTypes ->
"VALUES ?$baseType { ${([baseType] + subTypes).collect { ":$it" }.join(" ")} }\n"
}.join()
return valuesClause + sparqlPattern
}

private String insertIdMappings(String sparqlPattern) {
def valuesClauses = nodeIdMappings.collect { _id, ids ->
"VALUES ${getVar(_id)} { ${ids.collect { "<$it>" }.join(" ")} }\n"
}.join()
return valuesClauses + sparqlPattern
}

String getVar(String bNodeId) {
return bNodeId == getRecordTmpId()
? "?$GRAPH_VAR"
: "?${bNodeId.replace('#', '')}"
}

@Memoized
olovy marked this conversation as resolved.
Show resolved Hide resolved
private static Set<String> getSubtypes(String type, JsonLd jsonLd) {
return jsonLd.getSubClasses(type)
}

private String getThingTmpId() {
return form[BNODE_ID]
}

private String getRecordTmpId() {
return getAtPath(form, [RECORD_KEY, BNODE_ID], "TEMP_ID")
}

boolean matches(Object node) {
return matches(form, node)
}

boolean matches(Object matchForm, Object node) {
return comparator.isSubset(["x": matchForm], ["x": node], this::mapMatches)
}

boolean mapMatches(Map matchForm, Map bNode) {
olovy marked this conversation as resolved.
Show resolved Hide resolved
if (matchForm == null || bNode == null) {
return false
}
matchForm = new LinkedHashMap(matchForm)
def match = asList(matchForm[MATCHING_MODE])
if (match.contains(EXACT)) {
return exactMatches(matchForm, bNode)
}
if (match.contains(SUBTYPES)) {
String aType = matchForm[TYPE_KEY]
String bType = bNode[TYPE_KEY]
if (!(baseTypeMappings[aType] + aType).contains(bType)) {
return false
} else {
matchForm.remove(TYPE_KEY)
}
}
matchForm.remove(MATCHING_MODE)
if (matchForm[TYPE_KEY] == ANY_TYPE) {
matchForm.remove(TYPE_KEY)
}
def ids = nodeIdMappings[matchForm.remove(BNODE_ID)]
if (ids && !ids.contains(bNode[ID_KEY])) {
return false
}
matchForm.remove(HAS_ID)
if (matchForm.size() > bNode.size()) {
return false
}
return comparator.isSubset(matchForm, bNode, this::mapMatches)
}

private boolean exactMatches(Map matchForm, Map bNode) {
if (matchForm == null || bNode == null) {
return false
}
matchForm = new HashMap(matchForm)
bNode = new HashMap(bNode)
if (asList(matchForm.remove(MATCHING_MODE)).contains(SUBTYPES)) {
String aType = matchForm[TYPE_KEY]
String bType = bNode[TYPE_KEY]
if ((baseTypeMappings[aType] + aType).contains(bType)) {
matchForm.remove(TYPE_KEY)
bNode.remove(TYPE_KEY)
} else {
return false
}
}
if (matchForm[TYPE_KEY] == ANY_TYPE) {
matchForm.remove(TYPE_KEY)
bNode.remove(TYPE_KEY)
}
def ids = nodeIdMappings[matchForm.remove(BNODE_ID)]
if (ids && !ids.contains(bNode[ID_KEY])) {
return false
}
matchForm.remove(HAS_ID)
if (matchForm.size() != bNode.size()) {
return false
}
return comparator.isEqual(matchForm, bNode, this::exactMatches)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ class ModifiedThing {
}

private Map modify(Map thing) {
if (!transform.matches(thing)) {
if (!transform.matchForm.matches(thing)) {
return thing
}

Expand Down
Loading
Loading