Skip to content

Commit

Permalink
More flexible subdivision job (#1520)
Browse files Browse the repository at this point in the history
- Enable bulk remove/replace blank topic subdivisions
- Allow removing any :Subdivision subtype
- Allow removing a combination of multiple :Subdivision within a ComplexSubject
- Allow removing both linked and/or blank :Subdivision
- Allow adding any :Subject subtype
- Allow adding either linked or blank :Subject
  • Loading branch information
kwahlin authored Nov 18, 2024
1 parent 4eaf8c8 commit 9177045
Showing 12 changed files with 159 additions and 137 deletions.
Original file line number Diff line number Diff line change
@@ -7,7 +7,9 @@ import org.apache.jena.query.QueryExecutionFactory
import org.apache.jena.query.ResultSet
import whelk.Document
import whelk.JsonLd
import whelk.converter.JsonLdToTrigSerializer

import static java.nio.charset.StandardCharsets.UTF_8
import static trld.trig.Serializer.collectPrefixes

@Log
Original file line number Diff line number Diff line change
@@ -3,10 +3,8 @@ package whelk.converter
import groovy.util.logging.Log4j2 as Log
import whelk.JsonLd
import whelk.Whelk
import whelk.component.PostgreSQLComponent
import whelk.util.PropertyLoader

import static whelk.util.Jackson.mapper
import static java.nio.charset.StandardCharsets.UTF_8

@Log
class JsonLDTurtleConverter implements FormatConverter {
@@ -20,7 +18,25 @@ class JsonLDTurtleConverter implements FormatConverter {
}

Map convert(Map source, String id) {
def bytes = JsonLdToTrigSerializer.toTurtle(null, source, base).toByteArray()
return [(JsonLd.NON_JSON_CONTENT_KEY) : (new String(bytes, "UTF-8"))]
return [(JsonLd.NON_JSON_CONTENT_KEY) : _toTurtle(source, null, base, false)]
}

/**
 * Serializes JSON-LD data to a Turtle string, passing no base (null) to the serializer.
 *
 * @param source the JSON-LD data to serialize
 * @param context the JSON-LD context handed to the serializer
 * @param skipPrelude when true, lines starting with 'prefix' are stripped from the result
 * @return the Turtle text
 */
static String toTurtle(Map source, Map context, boolean skipPrelude) {
    String turtle = _toTurtle(source, context, null, skipPrelude)
    return turtle
}

/**
 * Shared implementation: runs the serializer, decodes its bytes as UTF-8 and
 * optionally drops the prefix declarations.
 */
private static String _toTurtle(Map source, Map context, base, boolean skipPrelude) {
    String turtle = new String(JsonLdToTrigSerializer.toTurtle(context, source, base).toByteArray(), UTF_8)
    if (skipPrelude) {
        // Add skip prelude flag in trld.trig.SerializerState.serialize?
        return withoutPrefixes(turtle)
    }
    return turtle
}

/**
 * Drops every line that starts with 'prefix' and returns the remaining lines,
 * joined with newlines and trimmed of surrounding whitespace.
 */
private static String withoutPrefixes(String ttl) {
    List<String> body = ttl.readLines().findAll { String line -> !line.startsWith('prefix') }
    return body.join('\n').trim()
}
}
10 changes: 7 additions & 3 deletions whelk-core/src/main/groovy/whelk/filter/LinkFinder.groovy
Original file line number Diff line number Diff line change
@@ -19,6 +19,12 @@ class LinkFinder {

static String ENTITY_QUERY

/*
Non-primary ids appearing in these paths should be kept as is upon normalization, i.e. they should *not* be
replaced by their primary id.
*/
private static Set<String> RETAIN_NON_PRIMARY_IDS = ['bulk:changeSpec.bulk:deprecate'] as Set

LinkFinder(PostgreSQLComponent pgsql) {
postgres = pgsql
ENTITY_QUERY = """SELECT ids2.iri AS thingUri
@@ -115,8 +121,6 @@ class LinkFinder {
}

private void replaceSameAsLinksWithPrimaries(Map data, List path = []) {
def exceptedPaths = ['bulk:changeSpec.bulk:deprecate'] as Set

// If this is a link (an object containing _only_ an id)
String id = data.get("@id")
if (id != null && data.keySet().size() == 1) {
@@ -127,7 +131,7 @@ class LinkFinder {
)
.findAll { it instanceof String }
.join('.')
if (exceptedPaths.contains(normalizedPath)) {
if (RETAIN_NON_PRIMARY_IDS.contains(normalizedPath)) {
return
}
String primaryId = lookupPrimaryId(id, normalizedPath)
4 changes: 0 additions & 4 deletions whelktool/src/main/groovy/whelk/datatool/WhelkTool.gdsl
Original file line number Diff line number Diff line change
@@ -41,10 +41,6 @@ contributor(ctx) {
method name:"selectByIds", params:['ids':Collection<String>, 'process':DC, 'batchSize':int], type:void
method name:"selectByIds", params:['ids':Collection<String>, 'process':DC, 'silent':boolean], type:void
method name:"selectByIds", params:['ids':Collection<String>, 'process':DC], type:void
method name:"selectByForm", params:['form':Map, 'process':DC, 'batchSize':int, 'silent':boolean], type:void
method name:"selectByForm", params:['form':Map, 'process':DC, 'batchSize':int], type:void
method name:"selectByForm", params:['form':Map, 'process':DC, 'silent':boolean], type:void
method name:"selectByForm", params:['form':Map, 'process':DC], type:void
method name:"selectBySqlWhere", params:['whereClause':String, 'batchSize':int, 'silent':boolean, 'process':DC], type:void
method name:"selectBySqlWhere", params:['whereClause':String, 'batchSize':int, 'process':DC], type:void
method name:"selectBySqlWhere", params:['whereClause':String, 'silent':boolean, 'process':DC], type:void
13 changes: 0 additions & 13 deletions whelktool/src/main/groovy/whelk/datatool/WhelkTool.groovy
Original file line number Diff line number Diff line change
@@ -191,18 +191,6 @@ class WhelkTool {
batchSize, [1: idItems, 2: collection])
}

/**
 * Selects documents matching a form: the form is turned into a SPARQL pattern,
 * the matching ids are queried, and processing is delegated to selectByIds.
 *
 * @param form the match form
 * @param process closure passed on to selectByIds
 * @param batchSize batch size passed on to selectByIds
 * @param silent when true, the "Select by form" log line is suppressed
 */
void selectByForm(Map form, Closure process,
                  int batchSize = DEFAULT_BATCH_SIZE, boolean silent = false) {
    if (!silent) {
        log("Select by form")
    }

    var matchForm = new Transform.MatchForm(form, whelk)
    var pattern = matchForm.getSparqlPattern(whelk.jsonld.context)
    var matchingIds = whelk.sparqlQueryClient.queryIdsByPattern(pattern)

    selectByIds(matchingIds, process, batchSize, silent)
}

DocumentItem create(Map data) {
Document doc = new Document(data)
doc.deepReplaceId(Document.BASE_URI.toString() + IdGenerator.generate())
@@ -671,7 +659,6 @@ class WhelkTool {
bindings.put("selectByIds", this.&selectByIds)
bindings.put("selectByIdsAndCollection", this.&selectByIdsAndCollection)
bindings.put("selectBySqlWhere", this.&selectBySqlWhere)
bindings.put("selectByForm", this.&selectByForm)
bindings.put("selectFromIterable", this.&selectFromIterable)
bindings.put("create", this.&create)
bindings.put("queryIds", this.&queryIds)
20 changes: 5 additions & 15 deletions whelktool/src/main/groovy/whelk/datatool/form/Transform.groovy
Original file line number Diff line number Diff line change
@@ -4,19 +4,19 @@ import groovy.transform.Memoized
import whelk.Document
import whelk.JsonLd
import whelk.Whelk
import whelk.converter.JsonLdToTrigSerializer
import whelk.datatool.util.DocumentComparator
import whelk.datatool.util.IdLoader
import whelk.util.DocumentUtil

import static java.nio.charset.StandardCharsets.UTF_8
import static whelk.JsonLd.GRAPH_KEY
import static whelk.JsonLd.ID_KEY
import static whelk.JsonLd.RECORD_KEY
import static whelk.JsonLd.RECORD_TYPE
import static whelk.JsonLd.THING_KEY
import static whelk.JsonLd.TYPE_KEY
import static whelk.JsonLd.asList
import static whelk.component.SparqlQueryClient.GRAPH_VAR
import static whelk.converter.JsonLDTurtleConverter.toTurtle
import static whelk.util.DocumentUtil.getAtPath
import static whelk.util.LegacyIntegrationTools.getMarcCollectionInHierarchy

@@ -180,11 +180,9 @@ class Transform {
thing[ID_KEY] = getThingTmpId()
record[THING_KEY] = [(ID_KEY): getThingTmpId()]

def ttl = ((ByteArrayOutputStream) JsonLdToTrigSerializer.toTurtle(context, [record, thing]))
.toByteArray()
.with { new String(it, UTF_8) }
// Add skip prelude flag to JsonLdToTrigSerializer.toTurtle?
.with { withoutPrefixes(it) }
Map graph = [(GRAPH_KEY): [record, thing]]

String ttl = toTurtle(graph, context, true)

return insertTypeMappings(insertIdMappings(insertVars(ttl)))
}
@@ -254,14 +252,6 @@ class Transform {
: "?${bNodeId.replace('#', '')}"
}

/**
 * Drops every line that starts with 'prefix' and returns the remaining lines,
 * joined with newlines and trimmed of surrounding whitespace.
 */
private static String withoutPrefixes(String ttl) {
    List<String> remaining = ttl.readLines().findAll { String line -> !line.startsWith('prefix') }
    return remaining.join('\n').trim()
}

Map<String, Set<String>> collectNodeIdMappings(Whelk whelk) {
return collectNodeIdMappings(matchForm, whelk)
}
Original file line number Diff line number Diff line change
@@ -69,6 +69,8 @@ public String key() {
public static final String ADD_KEY = "bulk:add";
public static final String KEEP_KEY = "bulk:keep";
public static final String DEPRECATE_KEY = "bulk:deprecate";
public static final String REMOVE_SUBDIVISION_KEY = "bulk:removeSubdivision";
public static final String ADD_SUBJECT_KEY = "bulk:addSubject";
public static final String SCRIPT_KEY = "bulk:script";
public static final String EXECUTION_KEY = "bulk:execution";
public static final String EXECUTION_TYPE = "bulk:Execution";
Original file line number Diff line number Diff line change
@@ -11,17 +11,17 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import static whelk.JsonLd.GRAPH_KEY;
import static whelk.JsonLd.RECORD_KEY;
import static whelk.datatool.bulkchange.BulkJobDocument.ADD_KEY;
import static whelk.datatool.bulkchange.BulkJobDocument.ADD_SUBJECT_KEY;
import static whelk.datatool.bulkchange.BulkJobDocument.KEEP_KEY;
import static whelk.datatool.bulkchange.BulkJobDocument.MATCH_FORM_KEY;
import static whelk.datatool.bulkchange.BulkJobDocument.DEPRECATE_KEY;
import static whelk.datatool.bulkchange.BulkJobDocument.REMOVE_SUBDIVISION_KEY;
import static whelk.datatool.bulkchange.BulkJobDocument.TARGET_FORM_KEY;

public sealed interface Specification permits Specification.Create, Specification.Delete, Specification.Merge, Specification.Update, Specification.Other {
@@ -49,6 +49,10 @@ public Script getScript(String bulkJobId) {
return s;
}

/**
 * Finds the ids selected by this specification's transform.
 *
 * @param whelk the Whelk instance used to build the transform and run the query
 * @return the ids returned by the SPARQL pattern query
 */
public List<String> findIds(Whelk whelk) {
    var transform = getTransform(whelk);
    return queryIds(transform, whelk);
}

@SuppressWarnings("unchecked")
public boolean modify(Document doc, Whelk whelk) {
Map<String, Object> thing = doc.getThing();
@@ -95,7 +99,11 @@ public boolean matches(Document doc, Whelk whelk) {
return getMatchForm(whelk).matches(thing);
}

public Transform.MatchForm getMatchForm(Whelk whelk) {
/**
 * Finds the ids selected by this specification's match form.
 *
 * @param whelk the Whelk instance used to build the match form and run the query
 * @return the ids returned by the SPARQL pattern query
 */
public List<String> findIds(Whelk whelk) {
    var form = getMatchForm(whelk);
    return queryIds(form, whelk);
}

private Transform.MatchForm getMatchForm(Whelk whelk) {
if (matchFormObj == null) {
matchFormObj = new Transform.MatchForm(matchForm, whelk);
}
@@ -128,7 +136,7 @@ public Script getScript(String bulkJobId) {

record Other(String name, Map<String, ?> parameters) implements Specification {
private static final Map<String, List<String>> ALLOWED_SCRIPTS_PARAMS = Map.of(
"removeTopicSubdivision", List.of(DEPRECATE_KEY, ADD_KEY)
"removeSubdivision", List.of(REMOVE_SUBDIVISION_KEY, ADD_SUBJECT_KEY)
);

@Override
@@ -155,4 +163,9 @@ private static String loadClasspathScriptSource(String scriptName) {
throw new RuntimeException(e);
}
}

/**
 * Runs the transform's SPARQL pattern, built with the Whelk JSON-LD context,
 * through the Whelk SPARQL query client.
 *
 * @param transform the transform supplying the SPARQL pattern
 * @param whelk the Whelk instance providing the query client and the context
 * @return the ids returned by the query client
 */
private static List<String> queryIds(Transform transform, Whelk whelk) {
    var client = whelk.getSparqlQueryClient();
    var pattern = transform.getSparqlPattern(whelk.getJsonld().context);
    return client.queryIdsByPattern(pattern);
}
}
Original file line number Diff line number Diff line change
@@ -5,9 +5,11 @@ import static whelk.datatool.bulkchange.BulkJobDocument.MATCH_FORM_KEY
Map matchForm = parameters.get(MATCH_FORM_KEY)

Specification.Delete delete = new Specification.Delete(matchForm)
List<String> ids = delete.findIds(getWhelk())

selectByForm(matchForm) {
selectByIds(ids) {
if(delete.matches(it.doc, it.whelk)) {
it.scheduleDelete(loud: isLoudAllowed)
}
}
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
/**
* Remove all uses of the given Subdivision(s) within ComplexSubject.
* The Subdivisions themselves are not removed, only their usages.
*
* Parameters:
* bulk:removeSubdivision - The subdivision(s) to be removed
* bulk:addSubject - If specified, add this regular Subject to :subject instead
*/

import whelk.JsonLd
import whelk.Whelk
import whelk.util.DocumentUtil

import static whelk.JsonLd.GRAPH_KEY
import static whelk.JsonLd.ID_KEY
import static whelk.JsonLd.asList
import static whelk.converter.JsonLDTurtleConverter.toTurtle
import static whelk.datatool.bulkchange.BulkJobDocument.ADD_SUBJECT_KEY
import static whelk.datatool.bulkchange.BulkJobDocument.REMOVE_SUBDIVISION_KEY

// The subdivision(s) to remove, read from the bulk:removeSubdivision parameter.
// NOTE(review): asList presumably wraps a single value in a list — confirm against whelk.JsonLd.asList.
List<Map> removeSubdivision = asList(parameters.get(REMOVE_SUBDIVISION_KEY))
// Optional replacement subject, read from the bulk:addSubject parameter; may be null when not specified.
Map addSubject = parameters.get(ADD_SUBJECT_KEY)

// Rewrites one document: every ComplexSubject whose termComponentList contains all
// of removeSubdivision is removed/replaced via mapSubject, addSubject (if given) is
// appended to the enclosing list, and the document is saved if anything changed.
def process = { doc ->
    Map mainEntity = doc.graph[1] as Map

    // Documents whose main entity is itself a ComplexSubject are left untouched
    if (mainEntity[JsonLd.TYPE_KEY] == 'ComplexSubject') {
        return
    }

    Set<List> touchedListPaths = [] as Set
    def changed = DocumentUtil.traverse(mainEntity) { node, nodePath ->
        if (!(node instanceof Map) || node[JsonLd.TYPE_KEY] != 'ComplexSubject') {
            return DocumentUtil.NOP
        }

        var components = asList(node.get('termComponentList'))
        if (!components.containsAll(removeSubdivision)) {
            return DocumentUtil.NOP
        }

        // Only nodes nested at least two path steps deep have a container to inspect
        if (nodePath.size() > 1) {
            var containerPath = nodePath.dropRight(1)
            var container = DocumentUtil.getAtPath(mainEntity, containerPath)
            if (container instanceof List) {
                touchedListPaths.add(containerPath)
                if (addSubject) {
                    container.add(addSubject)
                }
            }
        }

        return mapSubject(node, components, removeSubdivision)
    }

    // Remove duplicates
    for (List listPath : touchedListPaths) {
        var candidate = DocumentUtil.getAtPath(mainEntity, listPath)
        if (candidate instanceof List) {
            candidate.unique(true)
        }
    }

    if (changed) {
        doc.scheduleSave(loud: isLoudAllowed)
    }
}

// Collect the ids of all documents to process. Linked subdivisions (those with an @id)
// contribute their dependers; blank ones are looked up with a SPARQL pattern query.
Set<String> affectedIds = Collections.synchronizedSet([] as Set<String>)
for (subdivision in removeSubdivision) {
    if (subdivision[ID_KEY]) {
        selectByIds([subdivision[ID_KEY]]) { obsoleteSubdivision ->
            affectedIds.addAll(obsoleteSubdivision.getDependers())
        }
    } else {
        Whelk whelk = getWhelk()
        String pattern = asTurtle((Map) subdivision, whelk.jsonld.context)
        affectedIds.addAll(whelk.sparqlQueryClient.queryIdsByPattern(pattern))
    }
}

selectByIds(affectedIds) { docItem ->
    process(docItem)
}

/**
 * Decides what to do with a ComplexSubject once the given subdivisions are taken out
 * of its term components:
 * - nothing left: remove the subject
 * - one component left: replace the subject with that component
 * - otherwise: replace the subject with a copy carrying the reduced termComponentList
 */
static DocumentUtil.Operation mapSubject(Map subject, termComponentList, removeSubdivision) {
    var remaining = termComponentList.findAll { component -> !removeSubdivision.contains(component) }

    switch (remaining.size()) {
        case 0:
            return new DocumentUtil.Remove()
        case 1:
            return new DocumentUtil.Replace(remaining.first())
        default:
            Map reduced = new HashMap(subject)
            reduced.termComponentList = remaining
            return new DocumentUtil.Replace(reduced)
    }
}

/**
 * Serializes a thing to Turtle with the prelude skipped, wrapping it in a graph
 * whose first entry is an empty map.
 */
static String asTurtle(Map thing, Map context) {
    return toTurtle([(GRAPH_KEY): [[:], thing]], context, true)
}
Loading

0 comments on commit 9177045

Please sign in to comment.