Feature/lxl 4279 (#1325)

Finalize work extraction/merging logic * Add more generic titles * Remove commented-out code used for testing stuff * Don't try to correct illustrator/translator * Add cleanup scripts to globalchanges-1.33.sh * Make sure PrimaryContribution comes first in contribution * Reorganize scripts Put "cluster bound" script in librisworks/run.sh and not cluster bound script in whelktool/globalchanges-1.33.sh * Make 9pu follow illustrator to instance * Add missing lifeSpan to local agent if found in cluster * Make all subtitles non-distinguishing * Fix work title shape (exclude subtitle) * Remove redundant method * Raise too-large-result limit * Remove irrelevant title components * Remove unnecessary condition since checked elsewhere * Include work titles when clustering * If the work title of one record matches the instance title of another record, these two records now end up in the same cluster. * Use existing methods in find-work-clusters.groovy instead of declaring new ones. * Exit bash script immediately if Whelktool script fails * Handle unexpected datatype in instanceOf * Add missing exclamation mark * Change clustering order * Cluster by titles before merging overlapping clusters * Side effect: Necessary to make Elastic queries from each record individually in first clustering step * Add missing -D parameter * Save only modified * Avoid IndexOutOfBoundsException * Avoid null in tryAddLifeSpanToLocalAgent * Update path to relators * Fix broken linkedAgentToLifeSpan * Avoid repeated period in work mainTitle * Make subtitle/titleRemainder distinguishing again * Restore structure * Save before reporting to make sure no data is changed before saving * Extend ignored-subtitles.txt * Require that PrimaryContribution has an agent in SVSK selection * Add explanation + example on when to drop only part of subtitle
libris · Nov 16, 2023 · 0e24793 · 0e24793
1 parent ebfe5b2
commit 0e24793
Show file tree

Hide file tree

Showing 17 changed files with 438 additions and 247 deletions.
diff --git a/librisworks/run.sh b/librisworks/run.sh
@@ -1,4 +1,5 @@
 #!/bin/bash
+set -eu
 
 count_lines() {
   if [ -f $1 ]; then
@@ -30,20 +31,20 @@ NORMALIZATIONS_DIR=$REPORT_DIR/normalizations
 MERGED_WORKS_DIR=$REPORT_DIR/merged-works
 
 ALL=$CLUSTERS_DIR/1-all
-MERGED=$CLUSTERS_DIR/2-merged
-TITLES=$CLUSTERS_DIR/3-titles
+TITLES=$CLUSTERS_DIR/2-titles
+MERGED=$CLUSTERS_DIR/3-merged
 SWEDISH_FICTION=$CLUSTERS_DIR/4-swedish-fiction
 NO_ANONYMOUS_TRANSLATIONS=$CLUSTERS_DIR/5-no-anonymous-translations
 
 mkdir -p $CLUSTERS_DIR $NORMALIZATIONS_DIR $MERGED_WORKS_DIR $ALL $MERGED $TITLES $SWEDISH_FICTION $NO_ANONYMOUS_TRANSLATIONS
 
 LANGUAGE_IN_TITLE=$NORMALIZATIONS_DIR/1-titles-with-language
-ELIB_DESIGNERS=$NORMALIZATIONS_DIR/2-elib-cover-designer
+ILL_CONTENT=$NORMALIZATIONS_DIR/2-illustrative-content
 DEDUPLICATE_CONTRIBUTIONS=$NORMALIZATIONS_DIR/3-deduplicate-contributions
 ADD_MISSING_CONTRIBUTION_DATA=$NORMALIZATIONS_DIR/4-add-missing-contribution-data
 ROLES_TO_INSTANCE=$NORMALIZATIONS_DIR/5-roles-to-instance
 
-# Clustering step 1 TODO: run only on recently updated records after first run
+# Clustering TODO: run only on recently updated records after first run
 echo "Finding new clusters..."
 time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -jar $JAR_FILE \
   $ARGS --report $ALL/$WHELKTOOL_REPORT $SCRIPTS_DIR/find-work-clusters.groovy >$ALL/$CLUSTER_TSV 2>/dev/null
@@ -53,32 +54,33 @@ if [ $NUM_CLUSTERS == 0 ]; then
   exit 0
 fi
 
-# Clustering step 2
+# Filter out duplicates
+sort -uo $ALL/$CLUSTER_TSV $ALL/$CLUSTER_TSV
+
 echo
-echo "Merging clusters..."
+echo "Finding title clusters..."
 time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$ALL/$CLUSTER_TSV -jar $JAR_FILE \
-  $ARGS --report $MERGED/$WHELKTOOL_REPORT $SCRIPTS_DIR/merge-clusters.groovy >$MERGED/$CLUSTER_TSV 2>/dev/null
-NUM_CLUSTERS=$(count_lines $MERGED/$CLUSTER_TSV)
-echo "Merged into $NUM_CLUSTERS clusters"
+  $ARGS --report $TITLES/$WHELKTOOL_REPORT $SCRIPTS_DIR/title-clusters.groovy >$TITLES/$CLUSTER_TSV 2>/dev/null
+NUM_CLUSTERS=$(count_lines $TITLES/$CLUSTER_TSV)
+echo "$NUM_CLUSTERS title clusters found"
 if [ $NUM_CLUSTERS == 0 ]; then
   exit 0
 fi
 
-# Clustering step 3
 echo
-echo "Finding title clusters..."
-time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$MERGED/$CLUSTER_TSV -jar $JAR_FILE \
-  $ARGS --report $TITLES/$WHELKTOOL_REPORT $SCRIPTS_DIR/title-clusters.groovy >$TITLES/$CLUSTER_TSV 2>/dev/null
-NUM_CLUSTERS=$(count_lines $TITLES/$CLUSTER_TSV)
-echo "$NUM_CLUSTERS title clusters found"
+echo "Merging clusters..."
+time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$TITLES/$CLUSTER_TSV -jar $JAR_FILE \
+  $ARGS --report $MERGED/$WHELKTOOL_REPORT $SCRIPTS_DIR/merge-clusters.groovy >$MERGED/$CLUSTER_TSV 2>/dev/null
+NUM_CLUSTERS=$(count_lines $MERGED/$CLUSTER_TSV)
+echo "Merged into $NUM_CLUSTERS clusters"
 if [ $NUM_CLUSTERS == 0 ]; then
   exit 0
 fi
 
 # Filter: Swedish fiction
 echo
 echo "Filtering on Swedish fiction..."
-time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$TITLES/$CLUSTER_TSV -jar $JAR_FILE \
+time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$MERGED/$CLUSTER_TSV -jar $JAR_FILE \
   $ARGS --report $SWEDISH_FICTION/$WHELKTOOL_REPORT $SCRIPTS_DIR/swedish-fiction.groovy >$SWEDISH_FICTION/$CLUSTER_TSV 2>/dev/null
 NUM_CLUSTERS=$(count_lines $SWEDISH_FICTION/$CLUSTER_TSV)
 echo "Found $NUM_CLUSTERS title clusters with Swedish fiction"
@@ -94,10 +96,10 @@ time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$SWEDIS
 echo "$(count_lines $LANGUAGE_IN_TITLE/MODIFIED.txt) records affected, report in $LANGUAGE_IN_TITLE"
 
 echo
-echo "Specifying designer roles in Elib records..." # NOTE: Not dependent on clustering, can be run anytime after ContributionByRoleStep has been deployed.
-time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -jar $JAR_FILE \
-  $ARGS --report $ELIB_DESIGNERS $SCRIPTS_DIR/elib-unspecified-contributor.groovy 2>/dev/null
-echo "$(count_lines $ELIB_DESIGNERS/MODIFIED.txt) records affected, report in $ELIB_DESIGNERS"
+echo "Moving illustrativeContent to instance..."
+time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$SWEDISH_FICTION/$CLUSTER_TSV -jar $JAR_FILE \
+  $ARGS --report $ILL_CONTENT $SCRIPTS_DIR/lxl-4221-move-illustrativecontent-to-instance.groovy 2>/dev/null
+echo "$(count_lines $ILL_CONTENT/MODIFIED.txt) records affected, report in $ILL_CONTENT"
 
 echo
 echo "Merging contribution objects with same agent..."

diff --git a/librisworks/scripts/add-missing-contribution-data.groovy b/librisworks/scripts/add-missing-contribution-data.groovy
@@ -1,5 +1,6 @@
 import groovy.transform.Memoized
 import org.apache.commons.lang3.StringUtils
+
 import whelk.Document
 
 import java.util.concurrent.ConcurrentHashMap
@@ -20,6 +21,9 @@ linkedFoundInCluster.println(['id', 'matched agent', 'agent occurs in (examples)
 roleAddedFromRespStatement = getReportWriter("role-added-from-respStatement.tsv")
 roleAddedFromRespStatement.println(['id', 'agent name', 'added roles', 'resp statement'].join('\t'))
 
+lifeSpanFoundInCluster = getReportWriter("life-span-found-in-cluster.tsv")
+lifeSpanFoundInCluster.println(['id', 'agent name', 'lifeSpan', 'agent occurs with lifeSpan in (examples)'].join('\t'))
+
 respStatementLinkedAgentFoundInCluster = getReportWriter("respStatement-linked-agent-found-in-cluster.tsv")
 respStatementLinkedAgentFoundInCluster.println(['id', 'agent name', 'matched agent', 'resp statement roles', 'agent occurs in (examples)', 'resp statement'].join('\t'))
 
@@ -37,15 +41,13 @@ titleMovedToTranslationOf = getReportWriter("title-moved-to-translationOf.tsv")
 originalWorkFoundInCluster = getReportWriter("original-work-found-in-cluster.tsv")
 originalWorkFoundInCluster.println(['id', 'added translationOf', 'translationOf occurs in (examples)'].join('\t'))
 
-illVsTrl = getReportWriter("ill-vs-trl.tsv")
-illVsTrl.println(['id', 'removed/replaced role', 'agent name', 'resp statement'].join('\t'))
-
 def clusters = new File(System.getProperty('clusters')).collect { it.split('\t').collect { it.trim() } }
 
 idToCluster = initIdToCluster(clusters)
 nameToAgents = new ConcurrentHashMap<String, ConcurrentHashMap>()
 agentToRolesToIds = new ConcurrentHashMap<String, ConcurrentHashMap<Map, ConcurrentHashMap>>()
-agentToLifeSpan = new ConcurrentHashMap<String, String>()
+linkedAgentToLifeSpan = new ConcurrentHashMap<String, String>()
+localAgentToLifeSpansToIds = new ConcurrentHashMap<String, ConcurrentHashMap<String, ConcurrentHashMap>>()
 idToTranslationOf = new ConcurrentHashMap<String, Object>()
 
 // Populate maps
@@ -59,8 +61,13 @@ selectByIds(clusters.flatten()) { bib ->
         asList(c.agent).each { Map agent ->
             def agentStr = toString(agent)
             def loadedAgent = loadIfLink(agent)
-            if (agent.containsKey(ID_KEY)) {
-                agentToLifeSpan.computeIfAbsent(agentStr, f -> lifeSpan(loadedAgent))
+            if (loadedAgent.lifeSpan) {
+                if (agent.containsKey(ID_KEY)) {
+                    linkedAgentToLifeSpan.computeIfAbsent(agentStr, f -> lifeSpan(loadedAgent))
+                } else {
+                    def lifeSpansToIds = localAgentToLifeSpansToIds.computeIfAbsent(agentStr, f -> new ConcurrentHashMap())
+                    lifeSpansToIds.computeIfAbsent(agent.lifeSpan, f -> new ConcurrentHashMap().newKeySet()).add(id)
+                }
             }
             ([loadedAgent] + asList(loadedAgent.hasVariant)).each { a ->
                 String agentName = name(a)
@@ -125,6 +132,7 @@ selectByIds(clusters.flatten()) { bib ->
         modified |= tryLinkAgent(c, id)
         // if there are more roles stated in responsibilityStatement other than the existing ones in this contribution, add those
         modified |= tryAddRolesFromRespStatement(c, contributionsInRespStatement, respStatement, id)
+        modified |= tryAddLifeSpanToLocalAgent(c, id)
     }
 
     // drop "implicit authors", e.g. Astrid Lindgren in "Astrid Lindgren ; illustrerad av Ilon Wikland" (likely to already exist)
@@ -202,7 +210,7 @@ boolean tryLinkAgent(Map contribution, String id) {
             if (!names) return
             // get linked agents with matching name
             def matchingLinkedAgents = nameToAgents.subMap(names).values().flatten().toSet().findAll { a ->
-                looksLikeIri(a) && !yearMismatch(lifeSpan(agent), agentToLifeSpan[a])
+                looksLikeIri(a) && !yearMismatch(lifeSpan(agent), linkedAgentToLifeSpan[a])
             }
             for (agentIri in matchingLinkedAgents) {
                 // roles that the linked agent appears as and in which records respectively
@@ -260,15 +268,6 @@ boolean tryAddRolesFromRespStatement(Map contribution, Map contributionsInRespSt
     }
 
     def modified = false
-
-    def incorrectIllOrTrl = findIncorrectIllVsTrl(currentRoles, rolesOfInterest)
-    if (incorrectIllOrTrl) {
-        currentRoles.remove(toIdMap(incorrectIllOrTrl))
-        contribution['role'] = currentRoles
-        roleToIds[toIdMap(incorrectIllOrTrl)].remove(id)
-        illVsTrl.println([id, roleShort(incorrectIllOrTrl), name, respStatement].join('\t'))
-        modified = true
-    }
     def newRoles = rolesOfInterest - currentRoles
     if (newRoles) {
         // add new roles (replace existing unspecifiedContributor)
@@ -287,6 +286,28 @@ boolean tryAddRolesFromRespStatement(Map contribution, Map contributionsInRespSt
     return modified
 }
 
+// Copies a lifeSpan onto the contribution's first agent when that agent is local (no @id) and lacks one, taking it from a local agent sharing a name within the same cluster; returns true if modified.
+boolean tryAddLifeSpanToLocalAgent(Map contribution, String id) {
+    def agent = asList(contribution.agent).find()
+    if (!(agent instanceof Map) || agent[ID_KEY] || agent.lifeSpan) return false // nothing to do for linked agents or agents that already have a lifeSpan
+    def names = agentToNames[toString(agent)]
+    if (!names) return false // explicit boolean instead of a bare `return`, since the caller does `modified |= ...`
+    def matchingLocalAgentsWithLifeSpan = nameToAgents.subMap(names).values().flatten().toSet().findAll { a ->
+        !looksLikeIri(a) && localAgentToLifeSpansToIds[a]
+    }
+    for (localAgent in matchingLocalAgentsWithLifeSpan) {
+        def lifeSpanToIds = localAgentToLifeSpansToIds[localAgent]
+        def lifeSpanInCluster = lifeSpanToIds.find { _, ids -> idToCluster[id].intersect(ids) }?.key // first lifeSpan seen in a record of this record's cluster
+        if (lifeSpanInCluster) {
+            agent['lifeSpan'] = lifeSpanInCluster
+            def examples = idToCluster[id].intersect(lifeSpanToIds[lifeSpanInCluster]).take(3) // at most three record ids for the report
+            lifeSpanFoundInCluster.println([id, name(agent), lifeSpanInCluster, examples].join('\t'))
+            return true
+        }
+    }
+    return false
+}
+
 boolean tryAddLinkedAgentContributionsFromRespStatement(List<Map> contribution, Map contributionsInRespStatement, String respStatement, String id) {
     if (contributionsInRespStatement.isEmpty()) return false
 
@@ -431,13 +452,6 @@ boolean tryAddRole(Map contribution, String id) {
                 || r == toIdMap(Relator.PRIMARY_RIGHTS_HOLDER.iri)
                 || (r in adapterEditor && currentRoles.intersect(adapterEditor)))
     }.collect { it.key }
-
-    def illAndTrl = [toIdMap(Relator.TRANSLATOR.iri), toIdMap(Relator.ILLUSTRATOR.iri)]
-
-    if ((currentRoles + rolesInCluster).containsAll(illAndTrl)) {
-        rolesInCluster -= illAndTrl
-    }
-
     def newRoles = rolesInCluster - currentRoles
     if (newRoles) {
         contribution['role'] = noRole(currentRoles) ? newRoles : currentRoles + newRoles
@@ -647,17 +661,6 @@ static List<String> nameParts(String s) {
     s.split(' ').findAll()
 }
 
-static String findIncorrectIllVsTrl(List currentRoles, List rolesInRespStatement) {
-    if ((currentRoles + rolesInRespStatement)[ID_KEY].containsAll([Relator.ILLUSTRATOR.iri, Relator.TRANSLATOR.iri])) {
-        if (!rolesInRespStatement[ID_KEY].contains(Relator.ILLUSTRATOR.iri)) {
-            return Relator.ILLUSTRATOR.iri
-        }
-        if (!rolesInRespStatement[ID_KEY].contains(Relator.TRANSLATOR.iri)) {
-            return Relator.TRANSLATOR.iri
-        }
-    }
-}
-
 def toIdMap(String iri) {
     [(ID_KEY): iri]
 }
diff --git a/librisworks/scripts/contributions-to-instance.groovy b/librisworks/scripts/contributions-to-instance.groovy
@@ -12,12 +12,13 @@ report = getReportWriter('report.tsv')
 def clusters = new File(System.getProperty('clusters')).collect { it.split('\t').collect { it.trim() } }
 
 def whelk = getWhelk()
-def instanceRolesByDomain = whelk.resourceCache.relators.findResults {
+def instanceRolesByDomain = whelk.resourceCache.relatorResources.relators.findResults {
     if (it.domain) {
         def domain = whelk.jsonld.toTermKey(it.domain[ID_KEY])
         if (whelk.jsonld.isSubClassOf(domain, 'Embodiment')) it.subMap([ID_KEY])
     }
 }
+
 def instanceRoles = instanceRolesByDomain + [Relator.ILLUSTRATOR, Relator.AUTHOR_OF_INTRO, Relator.AUTHOR_OF_AFTERWORD].collect { [(ID_KEY): it.iri] }
 def ill = [(ID_KEY): Relator.ILLUSTRATOR.iri]
 
@@ -77,6 +78,10 @@ selectByIds(clusters.flatten()) { bib ->
             if (id in keepIllustratorOnWorkForIds[illustrator]) {
                 toInstance.remove(ill)
             }
+            def pu = asList(contribution.role).find { it == [(ID_KEY): Relator.PRIMARY_RIGHTS_HOLDER.iri] }
+            if (pu) {
+                toInstance.add(pu)
+            }
         }
         if (toInstance) {
             instance['contribution'] = asList(instance['contribution']) + c.clone().tap { it['role'] = toInstance }
@@ -103,14 +108,6 @@ boolean isPrimaryContribution(Map contribution) {
     contribution[TYPE_KEY] == 'PrimaryContribution'
 }
 
-//boolean has9pu(Map contribution) {
-//    asList(contribution.role).contains([(ID_KEY): Relator.PRIMARY_RIGHTS_HOLDER.iri])
-//}
-//
-//boolean isStillImage(Map work) {
-//    asList(work.contentType).contains([(ID_KEY): 'https://id.kb.se/term/rda/StillImage'])
-//}
-
 boolean isPictureBook(Map work) {
     def picBookTerms = [
             'https://id.kb.se/term/barngf/Bilderb%C3%B6cker',