Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/lxl 4279 #1325

Merged
merged 32 commits into from
Nov 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
7ff888c
Add more generic titles
kwahlin Oct 26, 2023
6442599
Remove commented-out code used for testing stuff
kwahlin Oct 26, 2023
8945881
Don't try to correct illustrator/translator
kwahlin Oct 26, 2023
a1be6da
Add cleanup scripts to globalchanges-1.33.sh
kwahlin Oct 26, 2023
d8d4dab
Make sure PrimaryContribution comes first in contribution
kwahlin Oct 27, 2023
223edd9
Reorganize scripts
kwahlin Oct 27, 2023
ce3620c
Make 9pu follow illustrator to instance
kwahlin Oct 27, 2023
a882345
Add missing lifeSpan to local agent if found in cluster
kwahlin Oct 30, 2023
b9e319d
Make all subtitles non-distinguishing
kwahlin Nov 1, 2023
387f11a
Fix work title shape (exclude subtitle)
kwahlin Nov 1, 2023
31748bd
Remove redundant method
kwahlin Nov 1, 2023
1a262c4
Raise too-large-result limit
kwahlin Nov 1, 2023
7006456
Remove irrelevant title components
kwahlin Nov 7, 2023
8369cb9
Remove unnecessary condition since checked elsewhere
kwahlin Nov 7, 2023
b9d5c1a
Include work titles when clustering
kwahlin Nov 7, 2023
468f705
Exit bash script immediately if Whelktool script fails
kwahlin Nov 7, 2023
0b5fc5c
Handle unexpected datatype in instanceOf
kwahlin Nov 7, 2023
514b011
Add missing exclamation mark
kwahlin Nov 7, 2023
655a1e7
Change clustering order
kwahlin Nov 9, 2023
2430113
Add missing -D parameter
kwahlin Nov 9, 2023
317f75a
Save only modified
kwahlin Nov 9, 2023
c66b4db
Avoid IndexOutOfBoundsException
kwahlin Nov 9, 2023
29073f2
Avoid null in tryAddLifeSpanToLocalAgent
kwahlin Nov 9, 2023
c0476ff
Update path to relators
kwahlin Nov 9, 2023
42267aa
Fix broken linkedAgentToLifeSpan
kwahlin Nov 10, 2023
cff2043
Avoid repeated period in work mainTitle
kwahlin Nov 13, 2023
1eeaff5
Make subtitle/titleRemainder distinguishing again
kwahlin Nov 13, 2023
3756a0e
Restore structure
kwahlin Nov 13, 2023
7f90c3c
Save before reporting to make sure no data is changed before saving
kwahlin Nov 15, 2023
4fce9f4
Extend ignored-subtitles.txt
kwahlin Nov 15, 2023
c3b7a8e
Require that PrimaryContribution has an agent in SVSK selection
kwahlin Nov 15, 2023
c86d9ee
Add explanation + example on when to drop only part of subtitle
kwahlin Nov 16, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 22 additions & 20 deletions librisworks/run.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#!/bin/bash
set -eu

count_lines() {
if [ -f $1 ]; then
Expand Down Expand Up @@ -30,20 +31,20 @@ NORMALIZATIONS_DIR=$REPORT_DIR/normalizations
MERGED_WORKS_DIR=$REPORT_DIR/merged-works

ALL=$CLUSTERS_DIR/1-all
MERGED=$CLUSTERS_DIR/2-merged
TITLES=$CLUSTERS_DIR/3-titles
TITLES=$CLUSTERS_DIR/2-titles
MERGED=$CLUSTERS_DIR/3-merged
SWEDISH_FICTION=$CLUSTERS_DIR/4-swedish-fiction
NO_ANONYMOUS_TRANSLATIONS=$CLUSTERS_DIR/5-no-anonymous-translations

mkdir -p $CLUSTERS_DIR $NORMALIZATIONS_DIR $MERGED_WORKS_DIR $ALL $MERGED $TITLES $SWEDISH_FICTION $NO_ANONYMOUS_TRANSLATIONS

LANGUAGE_IN_TITLE=$NORMALIZATIONS_DIR/1-titles-with-language
ELIB_DESIGNERS=$NORMALIZATIONS_DIR/2-elib-cover-designer
ILL_CONTENT=$NORMALIZATIONS_DIR/2-illustrative-content
DEDUPLICATE_CONTRIBUTIONS=$NORMALIZATIONS_DIR/3-deduplicate-contributions
ADD_MISSING_CONTRIBUTION_DATA=$NORMALIZATIONS_DIR/4-add-missing-contribution-data
ROLES_TO_INSTANCE=$NORMALIZATIONS_DIR/5-roles-to-instance

# Clustering step 1 TODO: run only on recently updated records after first run
# Clustering TODO: run only on recently updated records after first run
echo "Finding new clusters..."
time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -jar $JAR_FILE \
$ARGS --report $ALL/$WHELKTOOL_REPORT $SCRIPTS_DIR/find-work-clusters.groovy >$ALL/$CLUSTER_TSV 2>/dev/null
Expand All @@ -53,32 +54,33 @@ if [ $NUM_CLUSTERS == 0 ]; then
exit 0
fi

# Clustering step 2
# Filter out duplicates
sort -uo $ALL/$CLUSTER_TSV $ALL/$CLUSTER_TSV

echo
echo "Merging clusters..."
echo "Finding title clusters..."
time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$ALL/$CLUSTER_TSV -jar $JAR_FILE \
$ARGS --report $MERGED/$WHELKTOOL_REPORT $SCRIPTS_DIR/merge-clusters.groovy >$MERGED/$CLUSTER_TSV 2>/dev/null
NUM_CLUSTERS=$(count_lines $MERGED/$CLUSTER_TSV)
echo "Merged into $NUM_CLUSTERS clusters"
$ARGS --report $TITLES/$WHELKTOOL_REPORT $SCRIPTS_DIR/title-clusters.groovy >$TITLES/$CLUSTER_TSV 2>/dev/null
NUM_CLUSTERS=$(count_lines $TITLES/$CLUSTER_TSV)
echo "$NUM_CLUSTERS title clusters found"
if [ $NUM_CLUSTERS == 0 ]; then
exit 0
fi

# Clustering step 3
echo
echo "Finding title clusters..."
time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$MERGED/$CLUSTER_TSV -jar $JAR_FILE \
$ARGS --report $TITLES/$WHELKTOOL_REPORT $SCRIPTS_DIR/title-clusters.groovy >$TITLES/$CLUSTER_TSV 2>/dev/null
NUM_CLUSTERS=$(count_lines $TITLES/$CLUSTER_TSV)
echo "$NUM_CLUSTERS title clusters found"
echo "Merging clusters..."
time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$TITLES/$CLUSTER_TSV -jar $JAR_FILE \
$ARGS --report $MERGED/$WHELKTOOL_REPORT $SCRIPTS_DIR/merge-clusters.groovy >$MERGED/$CLUSTER_TSV 2>/dev/null
NUM_CLUSTERS=$(count_lines $MERGED/$CLUSTER_TSV)
echo "Merged into $NUM_CLUSTERS clusters"
if [ $NUM_CLUSTERS == 0 ]; then
exit 0
fi

# Filter: Swedish fiction
echo
echo "Filtering on Swedish fiction..."
time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$TITLES/$CLUSTER_TSV -jar $JAR_FILE \
time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$MERGED/$CLUSTER_TSV -jar $JAR_FILE \
$ARGS --report $SWEDISH_FICTION/$WHELKTOOL_REPORT $SCRIPTS_DIR/swedish-fiction.groovy >$SWEDISH_FICTION/$CLUSTER_TSV 2>/dev/null
NUM_CLUSTERS=$(count_lines $SWEDISH_FICTION/$CLUSTER_TSV)
echo "Found $NUM_CLUSTERS title clusters with Swedish fiction"
Expand All @@ -94,10 +96,10 @@ time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$SWEDIS
echo "$(count_lines $LANGUAGE_IN_TITLE/MODIFIED.txt) records affected, report in $LANGUAGE_IN_TITLE"

echo
echo "Specifying designer roles in Elib records..." # NOTE: Not dependent on clustering, can be run anytime after ContributionByRoleStep has been deployed.
time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -jar $JAR_FILE \
$ARGS --report $ELIB_DESIGNERS $SCRIPTS_DIR/elib-unspecified-contributor.groovy 2>/dev/null
echo "$(count_lines $ELIB_DESIGNERS/MODIFIED.txt) records affected, report in $ELIB_DESIGNERS"
echo "Moving illustrativeContent to instance..."
time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$SWEDISH_FICTION/$CLUSTER_TSV -jar $JAR_FILE \
$ARGS --report $ILL_CONTENT $SCRIPTS_DIR/lxl-4221-move-illustrativecontent-to-instance.groovy 2>/dev/null
echo "$(count_lines $ILL_CONTENT/MODIFIED.txt) records affected, report in $ILL_CONTENT"

echo
echo "Merging contribution objects with same agent..."
Expand Down
71 changes: 37 additions & 34 deletions librisworks/scripts/add-missing-contribution-data.groovy
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import groovy.transform.Memoized
import org.apache.commons.lang3.StringUtils

import whelk.Document

import java.util.concurrent.ConcurrentHashMap
Expand All @@ -20,6 +21,9 @@ linkedFoundInCluster.println(['id', 'matched agent', 'agent occurs in (examples)
roleAddedFromRespStatement = getReportWriter("role-added-from-respStatement.tsv")
roleAddedFromRespStatement.println(['id', 'agent name', 'added roles', 'resp statement'].join('\t'))

lifeSpanFoundInCluster = getReportWriter("life-span-found-in-cluster.tsv")
lifeSpanFoundInCluster.println(['id', 'agent name', 'lifeSpan', 'agent occurs with lifeSpan in (examples)'].join('\t'))

respStatementLinkedAgentFoundInCluster = getReportWriter("respStatement-linked-agent-found-in-cluster.tsv")
respStatementLinkedAgentFoundInCluster.println(['id', 'agent name', 'matched agent', 'resp statement roles', 'agent occurs in (examples)', 'resp statement'].join('\t'))

Expand All @@ -37,15 +41,13 @@ titleMovedToTranslationOf = getReportWriter("title-moved-to-translationOf.tsv")
originalWorkFoundInCluster = getReportWriter("original-work-found-in-cluster.tsv")
originalWorkFoundInCluster.println(['id', 'added translationOf', 'translationOf occurs in (examples)'].join('\t'))

illVsTrl = getReportWriter("ill-vs-trl.tsv")
illVsTrl.println(['id', 'removed/replaced role', 'agent name', 'resp statement'].join('\t'))

def clusters = new File(System.getProperty('clusters')).collect { it.split('\t').collect { it.trim() } }

idToCluster = initIdToCluster(clusters)
nameToAgents = new ConcurrentHashMap<String, ConcurrentHashMap>()
agentToRolesToIds = new ConcurrentHashMap<String, ConcurrentHashMap<Map, ConcurrentHashMap>>()
agentToLifeSpan = new ConcurrentHashMap<String, String>()
linkedAgentToLifeSpan = new ConcurrentHashMap<String, String>()
localAgentToLifeSpansToIds = new ConcurrentHashMap<String, ConcurrentHashMap<String, ConcurrentHashMap>>()
idToTranslationOf = new ConcurrentHashMap<String, Object>()

// Populate maps
Expand All @@ -59,8 +61,13 @@ selectByIds(clusters.flatten()) { bib ->
asList(c.agent).each { Map agent ->
def agentStr = toString(agent)
def loadedAgent = loadIfLink(agent)
if (agent.containsKey(ID_KEY)) {
agentToLifeSpan.computeIfAbsent(agentStr, f -> lifeSpan(loadedAgent))
if (loadedAgent.lifeSpan) {
if (agent.containsKey(ID_KEY)) {
linkedAgentToLifeSpan.computeIfAbsent(agentStr, f -> lifeSpan(loadedAgent))
} else {
def lifeSpansToIds = localAgentToLifeSpansToIds.computeIfAbsent(agentStr, f -> new ConcurrentHashMap())
lifeSpansToIds.computeIfAbsent(agent.lifeSpan, f -> new ConcurrentHashMap().newKeySet()).add(id)
}
}
([loadedAgent] + asList(loadedAgent.hasVariant)).each { a ->
String agentName = name(a)
Expand Down Expand Up @@ -125,6 +132,7 @@ selectByIds(clusters.flatten()) { bib ->
modified |= tryLinkAgent(c, id)
// if there are more roles stated in responsibilityStatement other than the existing ones in this contribution, add those
modified |= tryAddRolesFromRespStatement(c, contributionsInRespStatement, respStatement, id)
modified |= tryAddLifeSpanToLocalAgent(c, id)
}

// drop "implicit authors", e.g. Astrid Lindgren in "Astrid Lindgren ; illustrerad av Ilon Wikland" (likely to already exist)
Expand Down Expand Up @@ -202,7 +210,7 @@ boolean tryLinkAgent(Map contribution, String id) {
if (!names) return
// get linked agents with matching name
def matchingLinkedAgents = nameToAgents.subMap(names).values().flatten().toSet().findAll { a ->
looksLikeIri(a) && !yearMismatch(lifeSpan(agent), agentToLifeSpan[a])
looksLikeIri(a) && !yearMismatch(lifeSpan(agent), linkedAgentToLifeSpan[a])
}
for (agentIri in matchingLinkedAgents) {
// roles that the linked agent appears as and in which records respectively
Expand Down Expand Up @@ -260,15 +268,6 @@ boolean tryAddRolesFromRespStatement(Map contribution, Map contributionsInRespSt
}

def modified = false

def incorrectIllOrTrl = findIncorrectIllVsTrl(currentRoles, rolesOfInterest)
if (incorrectIllOrTrl) {
currentRoles.remove(toIdMap(incorrectIllOrTrl))
contribution['role'] = currentRoles
roleToIds[toIdMap(incorrectIllOrTrl)].remove(id)
illVsTrl.println([id, roleShort(incorrectIllOrTrl), name, respStatement].join('\t'))
modified = true
}
def newRoles = rolesOfInterest - currentRoles
if (newRoles) {
// add new roles (replace existing unspecifiedContributor)
Expand All @@ -287,6 +286,28 @@ boolean tryAddRolesFromRespStatement(Map contribution, Map contributionsInRespSt
return modified
}

/**
 * Tries to fill in a missing lifeSpan on the contribution's first agent.
 *
 * Only applies when that agent is a Map without ID_KEY (i.e. not a link) and without a lifeSpan.
 * Among same-named agents (looked up through agentToNames / nameToAgents) that are not IRIs and
 * have an entry in localAgentToLifeSpansToIds, the first lifeSpan recorded for at least one id
 * in the same cluster as the given id is copied onto the agent, and a line is written to the
 * lifeSpanFoundInCluster report.
 *
 * @param contribution the contribution whose agent may be completed (the agent is mutated in place)
 * @param id           id of the record the contribution belongs to; used to look up its cluster
 * @return true if a lifeSpan was added, otherwise false (always an explicit boolean, never null)
 */
boolean tryAddLifeSpanToLocalAgent(Map contribution, String id) {
    def agent = asList(contribution.agent).find()
    // Only unlinked agents lacking a lifeSpan are candidates
    if (!(agent instanceof Map) || agent[ID_KEY] || agent.lifeSpan) {
        return false
    }

    def names = agentToNames[toString(agent)]
    if (!names) {
        // Explicit false instead of a value-less return: the caller combines the result with |=
        return false
    }

    def cluster = idToCluster[id]
    def matchingLocalAgentsWithLifeSpan = nameToAgents.subMap(names).values().flatten().toSet().findAll { a ->
        !looksLikeIri(a) && localAgentToLifeSpansToIds[a]
    }
    for (localAgent in matchingLocalAgentsWithLifeSpan) {
        def lifeSpanToIds = localAgentToLifeSpansToIds[localAgent]
        // A lifeSpan only counts if it occurs in a record within the same cluster
        def lifeSpanInCluster = lifeSpanToIds.find { _, ids -> cluster.intersect(ids) }?.key
        if (lifeSpanInCluster) {
            agent['lifeSpan'] = lifeSpanInCluster
            def examples = cluster.intersect(lifeSpanToIds[lifeSpanInCluster]).take(3)
            lifeSpanFoundInCluster.println([id, name(agent), lifeSpanInCluster, examples].join('\t'))
            return true
        }
    }
    return false
}

boolean tryAddLinkedAgentContributionsFromRespStatement(List<Map> contribution, Map contributionsInRespStatement, String respStatement, String id) {
if (contributionsInRespStatement.isEmpty()) return false

Expand Down Expand Up @@ -431,13 +452,6 @@ boolean tryAddRole(Map contribution, String id) {
|| r == toIdMap(Relator.PRIMARY_RIGHTS_HOLDER.iri)
|| (r in adapterEditor && currentRoles.intersect(adapterEditor)))
}.collect { it.key }

def illAndTrl = [toIdMap(Relator.TRANSLATOR.iri), toIdMap(Relator.ILLUSTRATOR.iri)]

if ((currentRoles + rolesInCluster).containsAll(illAndTrl)) {
rolesInCluster -= illAndTrl
}

def newRoles = rolesInCluster - currentRoles
if (newRoles) {
contribution['role'] = noRole(currentRoles) ? newRoles : currentRoles + newRoles
Expand Down Expand Up @@ -647,17 +661,6 @@ static List<String> nameParts(String s) {
s.split(' ').findAll()
}

/**
 * When the current roles and the roles from the responsibility statement together contain both
 * illustrator and translator, returns the IRI of whichever of the two is absent from the
 * responsibility statement roles (illustrator is checked first). Returns null otherwise.
 */
static String findIncorrectIllVsTrl(List currentRoles, List rolesInRespStatement) {
    def illustrator = Relator.ILLUSTRATOR.iri
    def translator = Relator.TRANSLATOR.iri

    def combinedIris = (currentRoles + rolesInRespStatement)[ID_KEY]
    if (!combinedIris.containsAll([illustrator, translator])) {
        return null
    }

    def statedIris = rolesInRespStatement[ID_KEY]
    if (!statedIris.contains(illustrator)) {
        return illustrator
    }
    if (!statedIris.contains(translator)) {
        return translator
    }
    return null
}

/** Builds a single-entry map that holds the given IRI under ID_KEY. */
def toIdMap(String iri) {
    def idMap = [:]
    idMap[ID_KEY] = iri
    return idMap
}
15 changes: 6 additions & 9 deletions librisworks/scripts/contributions-to-instance.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,13 @@ report = getReportWriter('report.tsv')
def clusters = new File(System.getProperty('clusters')).collect { it.split('\t').collect { it.trim() } }

def whelk = getWhelk()
def instanceRolesByDomain = whelk.resourceCache.relators.findResults {
def instanceRolesByDomain = whelk.resourceCache.relatorResources.relators.findResults {
if (it.domain) {
def domain = whelk.jsonld.toTermKey(it.domain[ID_KEY])
if (whelk.jsonld.isSubClassOf(domain, 'Embodiment')) it.subMap([ID_KEY])
}
}

def instanceRoles = instanceRolesByDomain + [Relator.ILLUSTRATOR, Relator.AUTHOR_OF_INTRO, Relator.AUTHOR_OF_AFTERWORD].collect { [(ID_KEY): it.iri] }
def ill = [(ID_KEY): Relator.ILLUSTRATOR.iri]

Expand Down Expand Up @@ -77,6 +78,10 @@ selectByIds(clusters.flatten()) { bib ->
if (id in keepIllustratorOnWorkForIds[illustrator]) {
toInstance.remove(ill)
}
def pu = asList(contribution.role).find { it == [(ID_KEY): Relator.PRIMARY_RIGHTS_HOLDER.iri] }
if (pu) {
toInstance.add(pu)
}
}
if (toInstance) {
instance['contribution'] = asList(instance['contribution']) + c.clone().tap { it['role'] = toInstance }
Expand All @@ -103,14 +108,6 @@ boolean isPrimaryContribution(Map contribution) {
contribution[TYPE_KEY] == 'PrimaryContribution'
}

//boolean has9pu(Map contribution) {
// asList(contribution.role).contains([(ID_KEY): Relator.PRIMARY_RIGHTS_HOLDER.iri])
//}
//
//boolean isStillImage(Map work) {
// asList(work.contentType).contains([(ID_KEY): 'https://id.kb.se/term/rda/StillImage'])
//}

boolean isPictureBook(Map work) {
def picBookTerms = [
'https://id.kb.se/term/barngf/Bilderb%C3%B6cker',
Expand Down
Loading