Skip to content

Commit

Permalink
Feature/lxl 4279 (#1325)
Browse files Browse the repository at this point in the history
Finalize work extraction/merging logic

* Add more generic titles

* Remove commented-out code used for testing stuff

* Don't try to correct illustrator/translator

* Add cleanup scripts to globalchanges-1.33.sh

* Make sure PrimaryContribution comes first in contribution

* Reorganize scripts

Put the "cluster bound" script in librisworks/run.sh and the non-cluster-bound script in whelktool/globalchanges-1.33.sh

* Make 9pu follow illustrator to instance

* Add missing lifeSpan to local agent if found in cluster

* Make all subtitles non-distinguishing

* Fix work title shape (exclude subtitle)

* Remove redundant method

* Raise too-large-result limit

* Remove irrelevant title components

* Remove unnecessary condition since checked elsewhere

* Include work titles when clustering

* If the work title of one record matches the instance title of another record, these two records now end up in the same cluster.
* Use existing methods in find-work-clusters.groovy instead of declaring new ones.

* Exit bash script immediately if Whelktool script fails

* Handle unexpected datatype in instanceOf

* Add missing exclamation mark

* Change clustering order

* Cluster by titles before merging overlapping clusters
* Side effect: It is now necessary to make Elastic queries for each record individually in the first clustering step

* Add missing -D parameter

* Save only modified

* Avoid IndexOutOfBoundsException

* Avoid null in tryAddLifeSpanToLocalAgent

* Update path to relators

* Fix broken linkedAgentToLifeSpan

* Avoid repeated period in work mainTitle

* Make subtitle/titleRemainder distinguishing again

* Restore structure

* Save before reporting to make sure no data is changed before saving

* Extend ignored-subtitles.txt

* Require that PrimaryContribution has an agent in SVSK selection

* Add explanation + example on when to drop only part of subtitle
  • Loading branch information
kwahlin authored Nov 16, 2023
1 parent ebfe5b2 commit 0e24793
Show file tree
Hide file tree
Showing 17 changed files with 438 additions and 247 deletions.
42 changes: 22 additions & 20 deletions librisworks/run.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#!/bin/bash
set -eu

count_lines() {
if [ -f $1 ]; then
Expand Down Expand Up @@ -30,20 +31,20 @@ NORMALIZATIONS_DIR=$REPORT_DIR/normalizations
MERGED_WORKS_DIR=$REPORT_DIR/merged-works

ALL=$CLUSTERS_DIR/1-all
MERGED=$CLUSTERS_DIR/2-merged
TITLES=$CLUSTERS_DIR/3-titles
TITLES=$CLUSTERS_DIR/2-titles
MERGED=$CLUSTERS_DIR/3-merged
SWEDISH_FICTION=$CLUSTERS_DIR/4-swedish-fiction
NO_ANONYMOUS_TRANSLATIONS=$CLUSTERS_DIR/5-no-anonymous-translations

mkdir -p $CLUSTERS_DIR $NORMALIZATIONS_DIR $MERGED_WORKS_DIR $ALL $MERGED $TITLES $SWEDISH_FICTION $NO_ANONYMOUS_TRANSLATIONS

LANGUAGE_IN_TITLE=$NORMALIZATIONS_DIR/1-titles-with-language
ELIB_DESIGNERS=$NORMALIZATIONS_DIR/2-elib-cover-designer
ILL_CONTENT=$NORMALIZATIONS_DIR/2-illustrative-content
DEDUPLICATE_CONTRIBUTIONS=$NORMALIZATIONS_DIR/3-deduplicate-contributions
ADD_MISSING_CONTRIBUTION_DATA=$NORMALIZATIONS_DIR/4-add-missing-contribution-data
ROLES_TO_INSTANCE=$NORMALIZATIONS_DIR/5-roles-to-instance

# Clustering step 1 TODO: run only on recently updated records after first run
# Clustering TODO: run only on recently updated records after first run
echo "Finding new clusters..."
time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -jar $JAR_FILE \
$ARGS --report $ALL/$WHELKTOOL_REPORT $SCRIPTS_DIR/find-work-clusters.groovy >$ALL/$CLUSTER_TSV 2>/dev/null
Expand All @@ -53,32 +54,33 @@ if [ $NUM_CLUSTERS == 0 ]; then
exit 0
fi

# Clustering step 2
# Filter out duplicates
sort -uo $ALL/$CLUSTER_TSV $ALL/$CLUSTER_TSV

echo
echo "Merging clusters..."
echo "Finding title clusters..."
time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$ALL/$CLUSTER_TSV -jar $JAR_FILE \
$ARGS --report $MERGED/$WHELKTOOL_REPORT $SCRIPTS_DIR/merge-clusters.groovy >$MERGED/$CLUSTER_TSV 2>/dev/null
NUM_CLUSTERS=$(count_lines $MERGED/$CLUSTER_TSV)
echo "Merged into $NUM_CLUSTERS clusters"
$ARGS --report $TITLES/$WHELKTOOL_REPORT $SCRIPTS_DIR/title-clusters.groovy >$TITLES/$CLUSTER_TSV 2>/dev/null
NUM_CLUSTERS=$(count_lines $TITLES/$CLUSTER_TSV)
echo "$NUM_CLUSTERS title clusters found"
if [ $NUM_CLUSTERS == 0 ]; then
exit 0
fi

# Clustering step 3
echo
echo "Finding title clusters..."
time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$MERGED/$CLUSTER_TSV -jar $JAR_FILE \
$ARGS --report $TITLES/$WHELKTOOL_REPORT $SCRIPTS_DIR/title-clusters.groovy >$TITLES/$CLUSTER_TSV 2>/dev/null
NUM_CLUSTERS=$(count_lines $TITLES/$CLUSTER_TSV)
echo "$NUM_CLUSTERS title clusters found"
echo "Merging clusters..."
time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$TITLES/$CLUSTER_TSV -jar $JAR_FILE \
$ARGS --report $MERGED/$WHELKTOOL_REPORT $SCRIPTS_DIR/merge-clusters.groovy >$MERGED/$CLUSTER_TSV 2>/dev/null
NUM_CLUSTERS=$(count_lines $MERGED/$CLUSTER_TSV)
echo "Merged into $NUM_CLUSTERS clusters"
if [ $NUM_CLUSTERS == 0 ]; then
exit 0
fi

# Filter: Swedish fiction
echo
echo "Filtering on Swedish fiction..."
time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$TITLES/$CLUSTER_TSV -jar $JAR_FILE \
time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$MERGED/$CLUSTER_TSV -jar $JAR_FILE \
$ARGS --report $SWEDISH_FICTION/$WHELKTOOL_REPORT $SCRIPTS_DIR/swedish-fiction.groovy >$SWEDISH_FICTION/$CLUSTER_TSV 2>/dev/null
NUM_CLUSTERS=$(count_lines $SWEDISH_FICTION/$CLUSTER_TSV)
echo "Found $NUM_CLUSTERS title clusters with Swedish fiction"
Expand All @@ -94,10 +96,10 @@ time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$SWEDIS
echo "$(count_lines $LANGUAGE_IN_TITLE/MODIFIED.txt) records affected, report in $LANGUAGE_IN_TITLE"

echo
echo "Specifying designer roles in Elib records..." # NOTE: Not dependent on clustering, can be run anytime after ContributionByRoleStep has been deployed.
time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -jar $JAR_FILE \
$ARGS --report $ELIB_DESIGNERS $SCRIPTS_DIR/elib-unspecified-contributor.groovy 2>/dev/null
echo "$(count_lines $ELIB_DESIGNERS/MODIFIED.txt) records affected, report in $ELIB_DESIGNERS"
echo "Moving illustrativeContent to instance..."
time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$SWEDISH_FICTION/$CLUSTER_TSV -jar $JAR_FILE \
$ARGS --report $ILL_CONTENT $SCRIPTS_DIR/lxl-4221-move-illustrativecontent-to-instance.groovy 2>/dev/null
echo "$(count_lines $ILL_CONTENT/MODIFIED.txt) records affected, report in $ILL_CONTENT"

echo
echo "Merging contribution objects with same agent..."
Expand Down
71 changes: 37 additions & 34 deletions librisworks/scripts/add-missing-contribution-data.groovy
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import groovy.transform.Memoized
import org.apache.commons.lang3.StringUtils

import whelk.Document

import java.util.concurrent.ConcurrentHashMap
Expand All @@ -20,6 +21,9 @@ linkedFoundInCluster.println(['id', 'matched agent', 'agent occurs in (examples)
roleAddedFromRespStatement = getReportWriter("role-added-from-respStatement.tsv")
roleAddedFromRespStatement.println(['id', 'agent name', 'added roles', 'resp statement'].join('\t'))

lifeSpanFoundInCluster = getReportWriter("life-span-found-in-cluster.tsv")
lifeSpanFoundInCluster.println(['id', 'agent name', 'lifeSpan', 'agent occurs with lifeSpan in (examples)'].join('\t'))

respStatementLinkedAgentFoundInCluster = getReportWriter("respStatement-linked-agent-found-in-cluster.tsv")
respStatementLinkedAgentFoundInCluster.println(['id', 'agent name', 'matched agent', 'resp statement roles', 'agent occurs in (examples)', 'resp statement'].join('\t'))

Expand All @@ -37,15 +41,13 @@ titleMovedToTranslationOf = getReportWriter("title-moved-to-translationOf.tsv")
originalWorkFoundInCluster = getReportWriter("original-work-found-in-cluster.tsv")
originalWorkFoundInCluster.println(['id', 'added translationOf', 'translationOf occurs in (examples)'].join('\t'))

illVsTrl = getReportWriter("ill-vs-trl.tsv")
illVsTrl.println(['id', 'removed/replaced role', 'agent name', 'resp statement'].join('\t'))

def clusters = new File(System.getProperty('clusters')).collect { it.split('\t').collect { it.trim() } }

idToCluster = initIdToCluster(clusters)
nameToAgents = new ConcurrentHashMap<String, ConcurrentHashMap>()
agentToRolesToIds = new ConcurrentHashMap<String, ConcurrentHashMap<Map, ConcurrentHashMap>>()
agentToLifeSpan = new ConcurrentHashMap<String, String>()
linkedAgentToLifeSpan = new ConcurrentHashMap<String, String>()
localAgentToLifeSpansToIds = new ConcurrentHashMap<String, ConcurrentHashMap<String, ConcurrentHashMap>>()
idToTranslationOf = new ConcurrentHashMap<String, Object>()

// Populate maps
Expand All @@ -59,8 +61,13 @@ selectByIds(clusters.flatten()) { bib ->
asList(c.agent).each { Map agent ->
def agentStr = toString(agent)
def loadedAgent = loadIfLink(agent)
if (agent.containsKey(ID_KEY)) {
agentToLifeSpan.computeIfAbsent(agentStr, f -> lifeSpan(loadedAgent))
if (loadedAgent.lifeSpan) {
if (agent.containsKey(ID_KEY)) {
linkedAgentToLifeSpan.computeIfAbsent(agentStr, f -> lifeSpan(loadedAgent))
} else {
def lifeSpansToIds = localAgentToLifeSpansToIds.computeIfAbsent(agentStr, f -> new ConcurrentHashMap())
lifeSpansToIds.computeIfAbsent(agent.lifeSpan, f -> new ConcurrentHashMap().newKeySet()).add(id)
}
}
([loadedAgent] + asList(loadedAgent.hasVariant)).each { a ->
String agentName = name(a)
Expand Down Expand Up @@ -125,6 +132,7 @@ selectByIds(clusters.flatten()) { bib ->
modified |= tryLinkAgent(c, id)
// if there are more roles stated in responsibilityStatement other than the existing ones in this contribution, add those
modified |= tryAddRolesFromRespStatement(c, contributionsInRespStatement, respStatement, id)
modified |= tryAddLifeSpanToLocalAgent(c, id)
}

// drop "implicit authors", e.g. Astrid Lindgren in "Astrid Lindgren ; illustrerad av Ilon Wikland" (likely to already exist)
Expand Down Expand Up @@ -202,7 +210,7 @@ boolean tryLinkAgent(Map contribution, String id) {
if (!names) return
// get linked agents with matching name
def matchingLinkedAgents = nameToAgents.subMap(names).values().flatten().toSet().findAll { a ->
looksLikeIri(a) && !yearMismatch(lifeSpan(agent), agentToLifeSpan[a])
looksLikeIri(a) && !yearMismatch(lifeSpan(agent), linkedAgentToLifeSpan[a])
}
for (agentIri in matchingLinkedAgents) {
// roles that the linked agent appears as and in which records respectively
Expand Down Expand Up @@ -260,15 +268,6 @@ boolean tryAddRolesFromRespStatement(Map contribution, Map contributionsInRespSt
}

def modified = false

def incorrectIllOrTrl = findIncorrectIllVsTrl(currentRoles, rolesOfInterest)
if (incorrectIllOrTrl) {
currentRoles.remove(toIdMap(incorrectIllOrTrl))
contribution['role'] = currentRoles
roleToIds[toIdMap(incorrectIllOrTrl)].remove(id)
illVsTrl.println([id, roleShort(incorrectIllOrTrl), name, respStatement].join('\t'))
modified = true
}
def newRoles = rolesOfInterest - currentRoles
if (newRoles) {
// add new roles (replace existing unspecifiedContributor)
Expand All @@ -287,6 +286,28 @@ boolean tryAddRolesFromRespStatement(Map contribution, Map contributionsInRespSt
return modified
}

/**
 * Tries to add a missing lifeSpan to the local (unlinked) agent of a contribution.
 *
 * Only the first agent of the contribution is considered, and only if it is a Map
 * that has neither an ID_KEY entry nor a lifeSpan. Candidate life spans are taken from
 * other local agents sharing a name with this agent (via agentToNames / nameToAgents)
 * that have an entry in localAgentToLifeSpansToIds. A life span is accepted only if it
 * was recorded for at least one id that is in the same cluster as the given id.
 *
 * On success the agent map is mutated in place and a row is written to the
 * lifeSpanFoundInCluster report, with up to three example ids.
 *
 * @param contribution the contribution whose agent may be completed
 * @param id           the id of the record the contribution belongs to
 * @return true if a lifeSpan was added to the agent, otherwise false
 */
boolean tryAddLifeSpanToLocalAgent(Map contribution, String id) {
    def agent = asList(contribution.agent).find()
    // Nothing to do unless this is a local agent that still lacks a lifeSpan
    if (!(agent instanceof Map) || agent[ID_KEY] || agent.lifeSpan) {
        return false
    }

    def names = agentToNames[toString(agent)]
    if (!names) {
        // Explicit false instead of a bare return (which would yield null in a boolean method)
        return false
    }

    // Invariant for all candidates below, so look it up once
    def cluster = idToCluster[id]

    def matchingLocalAgentsWithLifeSpan = nameToAgents.subMap(names).values().flatten().toSet().findAll { a ->
        !looksLikeIri(a) && localAgentToLifeSpansToIds[a]
    }

    for (localAgent in matchingLocalAgentsWithLifeSpan) {
        for (lifeSpanAndIds in localAgentToLifeSpansToIds[localAgent]) {
            // Ids in this cluster where the candidate agent occurs with this life span
            def idsInCluster = cluster.intersect(lifeSpanAndIds.value)
            if (idsInCluster) {
                def lifeSpanInCluster = lifeSpanAndIds.key
                agent['lifeSpan'] = lifeSpanInCluster
                def examples = idsInCluster.take(3)
                lifeSpanFoundInCluster.println([id, name(agent), lifeSpanInCluster, examples].join('\t'))
                return true
            }
        }
    }

    return false
}

boolean tryAddLinkedAgentContributionsFromRespStatement(List<Map> contribution, Map contributionsInRespStatement, String respStatement, String id) {
if (contributionsInRespStatement.isEmpty()) return false

Expand Down Expand Up @@ -431,13 +452,6 @@ boolean tryAddRole(Map contribution, String id) {
|| r == toIdMap(Relator.PRIMARY_RIGHTS_HOLDER.iri)
|| (r in adapterEditor && currentRoles.intersect(adapterEditor)))
}.collect { it.key }

def illAndTrl = [toIdMap(Relator.TRANSLATOR.iri), toIdMap(Relator.ILLUSTRATOR.iri)]

if ((currentRoles + rolesInCluster).containsAll(illAndTrl)) {
rolesInCluster -= illAndTrl
}

def newRoles = rolesInCluster - currentRoles
if (newRoles) {
contribution['role'] = noRole(currentRoles) ? newRoles : currentRoles + newRoles
Expand Down Expand Up @@ -647,17 +661,6 @@ static List<String> nameParts(String s) {
s.split(' ').findAll()
}

/**
 * Finds which of the illustrator/translator roles is not stated in the responsibility statement.
 *
 * Only applies when the current roles and the responsibility statement roles together
 * contain both the illustrator and the translator IRI. In that case the IRI missing from
 * the responsibility statement roles is returned, with illustrator checked first.
 *
 * @param currentRoles         roles currently on the contribution (maps with an ID_KEY entry)
 * @param rolesInRespStatement roles found in the responsibility statement (maps with an ID_KEY entry)
 * @return the illustrator or translator IRI, or null if neither applies
 */
static String findIncorrectIllVsTrl(List currentRoles, List rolesInRespStatement) {
    def illustratorIri = Relator.ILLUSTRATOR.iri
    def translatorIri = Relator.TRANSLATOR.iri

    def combinedIris = (currentRoles + rolesInRespStatement)[ID_KEY]
    if (!combinedIris.containsAll([illustratorIri, translatorIri])) {
        return null
    }

    def statedIris = rolesInRespStatement[ID_KEY]
    if (!statedIris.contains(illustratorIri)) {
        return illustratorIri
    }
    if (!statedIris.contains(translatorIri)) {
        return translatorIri
    }
    return null
}

/**
 * Wraps an IRI in a single-entry map keyed by ID_KEY.
 *
 * @param iri the IRI to wrap
 * @return a new map with ID_KEY mapped to the given IRI
 */
def toIdMap(String iri) {
    def idMap = [:]
    idMap[ID_KEY] = iri
    return idMap
}
15 changes: 6 additions & 9 deletions librisworks/scripts/contributions-to-instance.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,13 @@ report = getReportWriter('report.tsv')
def clusters = new File(System.getProperty('clusters')).collect { it.split('\t').collect { it.trim() } }

def whelk = getWhelk()
def instanceRolesByDomain = whelk.resourceCache.relators.findResults {
def instanceRolesByDomain = whelk.resourceCache.relatorResources.relators.findResults {
if (it.domain) {
def domain = whelk.jsonld.toTermKey(it.domain[ID_KEY])
if (whelk.jsonld.isSubClassOf(domain, 'Embodiment')) it.subMap([ID_KEY])
}
}

def instanceRoles = instanceRolesByDomain + [Relator.ILLUSTRATOR, Relator.AUTHOR_OF_INTRO, Relator.AUTHOR_OF_AFTERWORD].collect { [(ID_KEY): it.iri] }
def ill = [(ID_KEY): Relator.ILLUSTRATOR.iri]

Expand Down Expand Up @@ -77,6 +78,10 @@ selectByIds(clusters.flatten()) { bib ->
if (id in keepIllustratorOnWorkForIds[illustrator]) {
toInstance.remove(ill)
}
def pu = asList(contribution.role).find { it == [(ID_KEY): Relator.PRIMARY_RIGHTS_HOLDER.iri] }
if (pu) {
toInstance.add(pu)
}
}
if (toInstance) {
instance['contribution'] = asList(instance['contribution']) + c.clone().tap { it['role'] = toInstance }
Expand All @@ -103,14 +108,6 @@ boolean isPrimaryContribution(Map contribution) {
contribution[TYPE_KEY] == 'PrimaryContribution'
}

//boolean has9pu(Map contribution) {
// asList(contribution.role).contains([(ID_KEY): Relator.PRIMARY_RIGHTS_HOLDER.iri])
//}
//
//boolean isStillImage(Map work) {
// asList(work.contentType).contains([(ID_KEY): 'https://id.kb.se/term/rda/StillImage'])
//}

boolean isPictureBook(Map work) {
def picBookTerms = [
'https://id.kb.se/term/barngf/Bilderb%C3%B6cker',
Expand Down
Loading

0 comments on commit 0e24793

Please sign in to comment.