From 7ff888c3a4e5a64ddf7542d7e1700298c99a8ae9 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Thu, 26 Oct 2023 08:56:33 +0200 Subject: [PATCH 01/32] Add more generic titles --- .../resources/merge-works/generic-titles.txt | 31 +++++++++++++++++++ .../merge-works/ignored-subtitles.txt | 1 + 2 files changed, 32 insertions(+) diff --git a/librisworks/src/main/resources/merge-works/generic-titles.txt b/librisworks/src/main/resources/merge-works/generic-titles.txt index afc73c6ba8..ccd0d3440e 100644 --- a/librisworks/src/main/resources/merge-works/generic-titles.txt +++ b/librisworks/src/main/resources/merge-works/generic-titles.txt @@ -1,15 +1,31 @@ artiklar +C. A. Ehrensvärds skrifter collected plays dagböcker dikter +Dikter i urval dramatik +E. J. Stagnelii samlade skrifter +Elsa Beskows sagor +Erik Axel Karlfeldt +Erik Gustaf Geijers samlade skrifter +Esaias Tegnérs samlade skrifter essäer folksagor folkvisor fragment +Fredmans epistlar +Fredmans sånger +Fria fantasier, hvilka betraktade såsom ett helt, af herr Hugo Löwenstjerna stundom kallades Törnrosens bok stundom En irrande hind +Gluntarne +H. C. Andersens bästa sagor +Johan Ludvig Runebergs efterlemnade skrifter +Johan Ludvig Runebergs samlade arbeten +Johan Ludvig Runebergs samlade skrifter korrespondens krönikor lyrik +Läsning för barn memoarer noveller pjäser @@ -23,12 +39,27 @@ rapport report romaner sagor +Samlade arbeten +Samlade berättelser samlade dikter +Samlade noveller och berättelser samlade pjäser samlade skrifter samlade verk +Samlade vitterhetsarbeten +Samlade vitterhets-arbeten +Shakespeares dramatiska arbeten skrifter skådespel sonetter +Svenska ordspråk +Svenska ordstäv tecknade serier +Tusen och en natt +Valda berättelser +Valda dikter +Valda skrifter +Valda stycken +Valda verk +Visor urval \ No newline at end of file diff --git a/librisworks/src/main/resources/merge-works/ignored-subtitles.txt b/librisworks/src/main/resources/merge-works/ignored-subtitles.txt index 4dea8de2e6..50cc55a282 100644 --- a/librisworks/src/main/resources/merge-works/ignored-subtitles.txt +++ b/librisworks/src/main/resources/merge-works/ignored-subtitles.txt @@ -25,6 +25,7 @@ efterlämnade dikter ein coq-rouge-thriller ein roman eine erzählung +en Harry Bosch-deckare erzählung erzählungen essays From 644259910cb28c3f4b809e7c9b0ee649e9adf8c1 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Thu, 26 Oct 2023 11:02:47 +0200 Subject: [PATCH 02/32] Remove commented-out code used for testing stuff --- librisworks/scripts/find-work-clusters.groovy | 31 +------------------ 1 file changed, 1 insertion(+), 30 deletions(-) diff --git a/librisworks/scripts/find-work-clusters.groovy b/librisworks/scripts/find-work-clusters.groovy index 90bcfaa446..5895d16616 100644 --- a/librisworks/scripts/find-work-clusters.groovy +++ b/librisworks/scripts/find-work-clusters.groovy @@ -7,30 +7,7 @@ import java.util.concurrent.ConcurrentHashMap PrintWriter failedQueries = getReportWriter("failed-queries") PrintWriter tooLargeResult = getReportWriter("too-large-result") -//def yesterday = new SimpleDateFormat('yyyy-MM-dd').with { sdf -> -// Calendar.getInstance().with { c -> -// c.add(Calendar.DATE, -1) -// sdf.format(c.getTime()) -// } -//} - -//def where = """ -// collection = '%s' -// AND (modified::date = '$yesterday' -// OR (data#>>'{@graph,0,generationDate}')::date = '$yesterday') -//""" - -visited = Collections.newSetFromMap(new ConcurrentHashMap()) // TODO: remove? -//instancesOfUpdatedLinkedWorks = Collections.synchronizedSet([] as Set) -// -//selectBySqlWhere(String.format(where, 'auth')) { -// def thing = it.graph[1] -// if (Normalizers.isInstanceOf(it.whelk.jsonld, thing, 'Work')) { -// selectBySqlWhere("collection = 'bib' and data#>>'{@graph,1,instanceOf,@id}' = '${thing['@id']}'") { -// instancesOfUpdatedLinkedWorks.add(it.doc.shortId) -// } -// } -//} +visited = Collections.newSetFromMap(new ConcurrentHashMap()) def process = { bib -> if (!visited.add(bib.doc.shortId)) @@ -59,12 +36,6 @@ def process = { bib -> } } -//selectByIds(instancesOfUpdatedLinkedWorks) { -// process(it) -//} - -// TODO: Change when starting to run regularly -//selectBySqlWhere(String.format(where, 'bib')) { selectByCollection('bib') { process(it) } From 89458815675ca4be034ab0290faf677a94e4d118 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Thu, 26 Oct 2023 11:40:20 +0200 Subject: [PATCH 03/32] Don't try to correct illustrator/translator --- .../add-missing-contribution-data.groovy | 30 ------------------- 1 file changed, 30 deletions(-) diff --git a/librisworks/scripts/add-missing-contribution-data.groovy b/librisworks/scripts/add-missing-contribution-data.groovy index c8ec1384b7..53ce6ba0d4 100644 --- a/librisworks/scripts/add-missing-contribution-data.groovy +++ b/librisworks/scripts/add-missing-contribution-data.groovy @@ -37,9 +37,6 @@ titleMovedToTranslationOf = getReportWriter("title-moved-to-translationOf.tsv") originalWorkFoundInCluster = getReportWriter("original-work-found-in-cluster.tsv") originalWorkFoundInCluster.println(['id', 'added translationOf', 'translationOf occurs in (examples)'].join('\t')) -illVsTrl = getReportWriter("ill-vs-trl.tsv") -illVsTrl.println(['id', 'removed/replaced role', 'agent name', 'resp statement'].join('\t')) - def clusters = new File(System.getProperty('clusters')).collect { it.split('\t').collect { it.trim() } } idToCluster = initIdToCluster(clusters) @@ -260,15 +257,6 @@ boolean tryAddRolesFromRespStatement(Map contribution, Map contributionsInRespSt } def modified = false - - def incorrectIllOrTrl = findIncorrectIllVsTrl(currentRoles, rolesOfInterest) - if (incorrectIllOrTrl) { - currentRoles.remove(toIdMap(incorrectIllOrTrl)) - contribution['role'] = currentRoles - roleToIds[toIdMap(incorrectIllOrTrl)].remove(id) - illVsTrl.println([id, roleShort(incorrectIllOrTrl), name, respStatement].join('\t')) - modified = true - } def newRoles = rolesOfInterest - currentRoles if (newRoles) { // add new roles (replace existing unspecifiedContributor) @@ -431,13 +419,6 @@ boolean tryAddRole(Map contribution, String id) { || r == toIdMap(Relator.PRIMARY_RIGHTS_HOLDER.iri) || (r in adapterEditor && currentRoles.intersect(adapterEditor))) }.collect { it.key } - - def illAndTrl = [toIdMap(Relator.TRANSLATOR.iri), toIdMap(Relator.ILLUSTRATOR.iri)] - - if ((currentRoles + rolesInCluster).containsAll(illAndTrl)) { - rolesInCluster -= illAndTrl - } - def newRoles = rolesInCluster - currentRoles if (newRoles) { contribution['role'] = noRole(currentRoles) ? newRoles : currentRoles + newRoles @@ -647,17 +628,6 @@ static List nameParts(String s) { s.split(' ').findAll() } -static String findIncorrectIllVsTrl(List currentRoles, List rolesInRespStatement) { - if ((currentRoles + rolesInRespStatement)[ID_KEY].containsAll([Relator.ILLUSTRATOR.iri, Relator.TRANSLATOR.iri])) { - if (!rolesInRespStatement[ID_KEY].contains(Relator.ILLUSTRATOR.iri)) { - return Relator.ILLUSTRATOR.iri - } - if (!rolesInRespStatement[ID_KEY].contains(Relator.TRANSLATOR.iri)) { - return Relator.TRANSLATOR.iri - } - } -} - def toIdMap(String iri) { [(ID_KEY): iri] } \ No newline at end of file From a1be6da57c1192de67aec2d283528803fa184c7f Mon Sep 17 00:00:00 2001 From: kwahlin Date: Thu, 26 Oct 2023 11:50:09 +0200 Subject: [PATCH 04/32] Add cleanup scripts to globalchanges-1.33.sh --- whelktool/globalchanges-1.33.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/whelktool/globalchanges-1.33.sh b/whelktool/globalchanges-1.33.sh index c9f0d8025b..e3584baf37 100644 --- a/whelktool/globalchanges-1.33.sh +++ b/whelktool/globalchanges-1.33.sh @@ -1,4 +1,6 @@ #!/bin/bash set -euxo pipefail time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -jar build/libs/whelktool.jar --report reports/$ENV-$(date +%Y%m%d-%H%M%S) --skip-index scripts/2023/08/lxl-4243-move-out-solitary-contentType-from-hasPart.groovy -time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -jar build/libs/whelktool.jar --report reports/$ENV-$(date +%Y%m%d-%H%M%S) --skip-index scripts/cleanups/2023/07/lxl-4221-move-illustrativecontent-to-instance.groovy \ No newline at end of file +time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -jar build/libs/whelktool.jar --report reports/$ENV-$(date +%Y%m%d-%H%M%S) --skip-index scripts/cleanups/2023/07/lxl-4221-move-illustrativecontent-to-instance.groovy +time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -jar build/libs/whelktool.jar --report reports/$ENV-$(date +%Y%m%d-%H%M%S) --skip-index scripts/cleanups/2023/05/gf-cleanup.groovy +time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -jar build/libs/whelktool.jar --report reports/$ENV-$(date +%Y%m%d-%H%M%S) --skip-index scripts/cleanups/2020/08/lxl-3294-move-bearer-like-gfs-from-work-to-instance.groovy From d8d4dab81be0a437746a7169e37ed5bc26008e15 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Fri, 27 Oct 2023 11:17:21 +0200 Subject: [PATCH 05/32] Make sure PrimaryContribution comes first in contribution --- .../src/main/groovy/se/kb/libris/mergeworks/Doc.groovy | 5 +++++ .../src/main/groovy/se/kb/libris/mergeworks/Util.groovy | 9 ++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy index 8118d34a87..f83cf0791e 100644 --- a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy +++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy @@ -318,4 +318,9 @@ class Doc { workData.remove('_editionStatement') workData.remove('_numPages') } + + void sortContribution() { + // PrimaryContribution first + contribution()?.sort {it['@type'] != 'PrimaryContribution' } + } } \ No newline at end of file diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy index 1f9149af5e..a86cfa81a4 100644 --- a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy +++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy @@ -311,7 +311,14 @@ class Util { }.with { preferredComparisonOrder(it) } def workClusters = partition(docs, { Doc a, Doc b -> c.sameWork(a, b) }) - .each { work -> work.each { doc -> doc.removeComparisonProps() } } + .each { work -> + work.each { doc -> + doc.removeComparisonProps() + // List order may be shuffled when comparing works. + // Make sure PrimaryContribution always comes first in contribution. + doc.sortContribution() + } + } return workClusters } From 223edd9f3a6c41d0eb557badfc388788d178a8e2 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Fri, 27 Oct 2023 11:56:37 +0200 Subject: [PATCH 06/32] Reorganize scripts Put "cluster bound" script in librisworks/run.sh and not cluster bound script in whelktool/globalchanges-1.33.sh --- librisworks/run.sh | 8 ++++---- ...1-move-illustrativecontent-to-instance.groovy | 16 ++++++++-------- whelktool/globalchanges-1.33.sh | 2 +- .../2023/10}/elib-unspecified-contributor.groovy | 0 4 files changed, 13 insertions(+), 13 deletions(-) rename {whelktool/scripts/cleanups/2023/07 => librisworks/scripts}/lxl-4221-move-illustrativecontent-to-instance.groovy (50%) rename {librisworks/scripts => whelktool/scripts/2023/10}/elib-unspecified-contributor.groovy (100%) diff --git a/librisworks/run.sh b/librisworks/run.sh index 18648843b5..54f6af2b05 100755 --- a/librisworks/run.sh +++ b/librisworks/run.sh @@ -38,7 +38,7 @@ NO_ANONYMOUS_TRANSLATIONS=$CLUSTERS_DIR/5-no-anonymous-translations mkdir -p $CLUSTERS_DIR $NORMALIZATIONS_DIR $MERGED_WORKS_DIR $ALL $MERGED $TITLES $SWEDISH_FICTION $NO_ANONYMOUS_TRANSLATIONS LANGUAGE_IN_TITLE=$NORMALIZATIONS_DIR/1-titles-with-language -ELIB_DESIGNERS=$NORMALIZATIONS_DIR/2-elib-cover-designer +ILL_CONTENT=$NORMALIZATIONS_DIR/2-illustrative-content DEDUPLICATE_CONTRIBUTIONS=$NORMALIZATIONS_DIR/3-deduplicate-contributions ADD_MISSING_CONTRIBUTION_DATA=$NORMALIZATIONS_DIR/4-add-missing-contribution-data ROLES_TO_INSTANCE=$NORMALIZATIONS_DIR/5-roles-to-instance @@ -94,10 +94,10 @@ time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$SWEDIS echo "$(count_lines $LANGUAGE_IN_TITLE/MODIFIED.txt) records affected, report in $LANGUAGE_IN_TITLE" echo -echo "Specifying designer roles in Elib records..." # NOTE: Not dependent on clustering, can be run anytime after ContributionByRoleStep has been deployed. +echo "Moving illustrativeContent to instance..." time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -jar $JAR_FILE \ - $ARGS --report $ELIB_DESIGNERS $SCRIPTS_DIR/elib-unspecified-contributor.groovy 2>/dev/null -echo "$(count_lines $ELIB_DESIGNERS/MODIFIED.txt) records affected, report in $ELIB_DESIGNERS" + $ARGS --report $ILL_CONTENT $SCRIPTS_DIR/lxl-4221-move-illustrativecontent-to-instance.groovy 2>/dev/null +echo "$(count_lines $ILL_CONTENT/MODIFIED.txt) records affected, report in $ILL_CONTENT" echo echo "Merging contribution objects with same agent..." diff --git a/whelktool/scripts/cleanups/2023/07/lxl-4221-move-illustrativecontent-to-instance.groovy b/librisworks/scripts/lxl-4221-move-illustrativecontent-to-instance.groovy similarity index 50% rename from whelktool/scripts/cleanups/2023/07/lxl-4221-move-illustrativecontent-to-instance.groovy rename to librisworks/scripts/lxl-4221-move-illustrativecontent-to-instance.groovy index ebca8f7133..dd24049814 100644 --- a/whelktool/scripts/cleanups/2023/07/lxl-4221-move-illustrativecontent-to-instance.groovy +++ b/librisworks/scripts/lxl-4221-move-illustrativecontent-to-instance.groovy @@ -1,14 +1,14 @@ -def where = """ - collection = 'bib' - and deleted = false - and data#>>'{@graph,1,instanceOf,@type}' = 'Text' - and data#>'{@graph,1,instanceOf, illustrativeContent}' is not null -""" - -selectBySqlWhere(where) { +def ids = new File(System.getProperty('clusters')) + .readLines() + .collect { it.split('\t').collect { it.trim()} } + .flatten() + +selectByIds(ids) { def instance = it.graph[1] def work = instance.instanceOf + if (!work || work['@id']) return + instance['illustrativeContent'] = (asList(instance['illustrativeContent']) + asList(work.remove('illustrativeContent'))).unique() it.scheduleSave() diff --git a/whelktool/globalchanges-1.33.sh b/whelktool/globalchanges-1.33.sh index e3584baf37..5d1011e1d2 100644 --- a/whelktool/globalchanges-1.33.sh +++ b/whelktool/globalchanges-1.33.sh @@ -1,6 +1,6 @@ #!/bin/bash set -euxo pipefail time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -jar build/libs/whelktool.jar --report reports/$ENV-$(date +%Y%m%d-%H%M%S) --skip-index scripts/2023/08/lxl-4243-move-out-solitary-contentType-from-hasPart.groovy -time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -jar build/libs/whelktool.jar --report reports/$ENV-$(date +%Y%m%d-%H%M%S) --skip-index scripts/cleanups/2023/07/lxl-4221-move-illustrativecontent-to-instance.groovy time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -jar build/libs/whelktool.jar --report reports/$ENV-$(date +%Y%m%d-%H%M%S) --skip-index scripts/cleanups/2023/05/gf-cleanup.groovy time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -jar build/libs/whelktool.jar --report reports/$ENV-$(date +%Y%m%d-%H%M%S) --skip-index scripts/cleanups/2020/08/lxl-3294-move-bearer-like-gfs-from-work-to-instance.groovy +time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -jar build/libs/whelktool.jar --report reports/$ENV-$(date +%Y%m%d-%H%M%S) --skip-index scripts/2023/10/elib-unspecified-contributor.groovy diff --git a/librisworks/scripts/elib-unspecified-contributor.groovy b/whelktool/scripts/2023/10/elib-unspecified-contributor.groovy similarity index 100% rename from librisworks/scripts/elib-unspecified-contributor.groovy rename to whelktool/scripts/2023/10/elib-unspecified-contributor.groovy From ce3620c5dffaba972cf0a1aff92411a5994c5474 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Fri, 27 Oct 2023 12:15:08 +0200 Subject: [PATCH 07/32] Make 9pu follow illustrator to instance --- librisworks/scripts/contributions-to-instance.groovy | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/librisworks/scripts/contributions-to-instance.groovy b/librisworks/scripts/contributions-to-instance.groovy index b114ee1036..2e67db814a 100644 --- a/librisworks/scripts/contributions-to-instance.groovy +++ b/librisworks/scripts/contributions-to-instance.groovy @@ -77,6 +77,10 @@ selectByIds(clusters.flatten()) { bib -> if (id in keepIllustratorOnWorkForIds[illustrator]) { toInstance.remove(ill) } + def pu = asList(contribution.role).find { it == [(ID_KEY): Relator.PRIMARY_RIGHTS_HOLDER.iri] } + if (pu) { + toInstance.add(pu) + } } if (toInstance) { instance['contribution'] = asList(instance['contribution']) + c.clone().tap { it['role'] = toInstance } @@ -103,14 +107,6 @@ boolean isPrimaryContribution(Map contribution) { contribution[TYPE_KEY] == 'PrimaryContribution' } -//boolean has9pu(Map contribution) { -// asList(contribution.role).contains([(ID_KEY): Relator.PRIMARY_RIGHTS_HOLDER.iri]) -//} -// -//boolean isStillImage(Map work) { -// asList(work.contentType).contains([(ID_KEY): 'https://id.kb.se/term/rda/StillImage']) -//} - boolean isPictureBook(Map work) { def picBookTerms = [ 'https://id.kb.se/term/barngf/Bilderb%C3%B6cker', From a8823459d6d4166b1a774b21609c60e571e3a99c Mon Sep 17 00:00:00 2001 From: kwahlin Date: Mon, 30 Oct 2023 13:40:51 +0100 Subject: [PATCH 08/32] Add missing lifeSpan to local agent if found in cluster --- .../add-missing-contribution-data.groovy | 40 +++++++++++++++++-- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/librisworks/scripts/add-missing-contribution-data.groovy b/librisworks/scripts/add-missing-contribution-data.groovy index 53ce6ba0d4..600f0253b4 100644 --- a/librisworks/scripts/add-missing-contribution-data.groovy +++ b/librisworks/scripts/add-missing-contribution-data.groovy @@ -1,5 +1,6 @@ import groovy.transform.Memoized import org.apache.commons.lang3.StringUtils + import whelk.Document import java.util.concurrent.ConcurrentHashMap @@ -20,6 +21,9 @@ linkedFoundInCluster.println(['id', 'matched agent', 'agent occurs in (examples) roleAddedFromRespStatement = getReportWriter("role-added-from-respStatement.tsv") roleAddedFromRespStatement.println(['id', 'agent name', 'added roles', 'resp statement'].join('\t')) +lifeSpanFoundInCluster = getReportWriter("life-span-found-in-cluster.tsv") +lifeSpanFoundInCluster.println(['id', 'agent name', 'lifeSpan', 'agent occurs with lifeSpan in (examples)'].join('\t')) + respStatementLinkedAgentFoundInCluster = getReportWriter("respStatement-linked-agent-found-in-cluster.tsv") respStatementLinkedAgentFoundInCluster.println(['id', 'agent name', 'matched agent', 'resp statement roles', 'agent occurs in (examples)', 'resp statement'].join('\t')) @@ -42,7 +46,8 @@ def clusters = new File(System.getProperty('clusters')).collect { it.split('\t') idToCluster = initIdToCluster(clusters) nameToAgents = new ConcurrentHashMap() agentToRolesToIds = new ConcurrentHashMap>() -agentToLifeSpan = new ConcurrentHashMap() +linkedAgentToLifeSpan = new ConcurrentHashMap() +localAgentToLifeSpansToIds = new ConcurrentHashMap>() idToTranslationOf = new ConcurrentHashMap() // Populate maps @@ -56,8 +61,13 @@ selectByIds(clusters.flatten()) { bib -> asList(c.agent).each { Map agent -> def agentStr = toString(agent) def loadedAgent = loadIfLink(agent) - if (agent.containsKey(ID_KEY)) { - agentToLifeSpan.computeIfAbsent(agentStr, f -> lifeSpan(loadedAgent)) + if (agent.lifeSpan) { + if (agent.containsKey(ID_KEY)) { + linkedAgentToLifeSpan.computeIfAbsent(agentStr, f -> lifeSpan(loadedAgent)) + } else { + def lifeSpansToIds = localAgentToLifeSpansToIds.computeIfAbsent(agentStr, f -> new ConcurrentHashMap()) + lifeSpansToIds.computeIfAbsent(agent.lifeSpan, f -> new ConcurrentHashMap().newKeySet()).add(id) + } } ([loadedAgent] + asList(loadedAgent.hasVariant)).each { a -> String agentName = name(a) @@ -122,6 +132,7 @@ selectByIds(clusters.flatten()) { bib -> modified |= tryLinkAgent(c, id) // if there are more roles stated in responsibilityStatement other than the existing ones in this contribution, add those modified |= tryAddRolesFromRespStatement(c, contributionsInRespStatement, respStatement, id) + modified |= tryAddLifeSpanToLocalAgent(c, id) } // drop "implicit authors", e.g. Astrid Lindgren in "Astrid Lindgren ; illustrerad av Ilon Wikland" (likely to already exist) @@ -199,7 +210,7 @@ boolean tryLinkAgent(Map contribution, String id) { if (!names) return // get linked agents with matching name def matchingLinkedAgents = nameToAgents.subMap(names).values().flatten().toSet().findAll { a -> - looksLikeIri(a) && !yearMismatch(lifeSpan(agent), agentToLifeSpan[a]) + looksLikeIri(a) && !yearMismatch(lifeSpan(agent), linkedAgentToLifeSpan[a]) } for (agentIri in matchingLinkedAgents) { // roles that the linked agent appears as and in which records respectively @@ -275,6 +286,27 @@ boolean tryAddRolesFromRespStatement(Map contribution, Map contributionsInRespSt return modified } +boolean tryAddLifeSpanToLocalAgent(Map contribution, String id) { + def agent = asList(contribution.agent).find() + if (agent instanceof Map && !agent[ID_KEY] && !agent.lifeSpan) { + def names = agentToNames[toString(agent)] + def matchingLocalAgentsWithLifeSpan = nameToAgents.subMap(names).values().flatten().toSet().findAll { a -> + !looksLikeIri(a) && localAgentToLifeSpansToIds[a] + } + for (localAgent in matchingLocalAgentsWithLifeSpan) { + def lifeSpanToIds = localAgentToLifeSpansToIds[localAgent] + def lifeSpanInCluster = lifeSpanToIds.find { _, ids -> idToCluster[id].intersect(ids) }?.key + if (lifeSpanInCluster) { + agent['lifeSpan'] = lifeSpanInCluster + def examples = idToCluster[id].intersect(lifeSpanToIds[lifeSpanInCluster]).take(3) + lifeSpanFoundInCluster.println([id, name(agent), lifeSpanInCluster, examples].join('\t')) + return true + } + } + } + return false +} + boolean tryAddLinkedAgentContributionsFromRespStatement(List contribution, Map contributionsInRespStatement, String respStatement, String id) { if (contributionsInRespStatement.isEmpty()) return false From b9e319d067eab7e22c4c16297ecbb392d14e6ad3 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Wed, 1 Nov 2023 13:04:54 +0100 Subject: [PATCH 09/32] Make all subtitles non-distinguishing --- .../se/kb/libris/mergeworks/Util.groovy | 38 ++------- .../merge-works/ignored-subtitles.txt | 78 ------------------- 2 files changed, 8 insertions(+), 108 deletions(-) delete mode 100644 librisworks/src/main/resources/merge-works/ignored-subtitles.txt diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy index a86cfa81a4..5233b033cf 100644 --- a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy +++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy @@ -36,10 +36,6 @@ class Util { } } - private static Set IGNORED_SUBTITLES = Util.class.getClassLoader() - .getResourceAsStream('merge-works/ignored-subtitles.txt') - .readLines().grep().collect(Util.&normalize) as Set - private static Set GENERIC_TITLES = Util.class.getClassLoader() .getResourceAsStream('merge-works/generic-titles.txt') .readLines().grep().collect(Util.&normalize) as Set @@ -85,30 +81,20 @@ class Util { hasTitle.any { it['mainTitle'] && normalize((String) it['mainTitle']) in GENERIC_TITLES } } - static List dropGenericSubTitles(List hasTitle) { + static List dropSubtitles(List hasTitle) { hasTitle.collect { def copy = new TreeMap(it) - if (copy['subtitle'] || copy['titleRemainder']) { - DocumentUtil.traverse(copy) { value, path -> - if (('subtitle' in path || 'titleRemainder' in path) && value instanceof String) { - if (genericSubtitle(value)) { - new DocumentUtil.Remove() - } else { - ((List) value.split(':')).with { - if (it.size() > 1 && genericSubtitle(it.last().trim())) { - new DocumentUtil.Replace(value.replaceFirst(~/\s*:.+$/, '')) - } - } - } - } + DocumentUtil.traverse(copy) { value, path -> + if (('subtitle' in path || 'titleRemainder' in path) && value instanceof String) { + new DocumentUtil.Remove() } } - copy + return copy } } static List flatTitles(List hasTitle) { - dropGenericSubTitles(hasTitle).collect { + dropSubtitles(hasTitle).collect { def title = new TreeMap<>() title['flatTitle'] = normalize(DisplayDoc.flatten(it, titleComponents)) if (it['@type']) { @@ -119,14 +105,6 @@ class Util { } } - private static boolean genericSubtitle(String s) { - s = Util.normalize(s) - if (s.startsWith("en ")) { - s = s.substring("en ".length()) - } - return s in IGNORED_SUBTITLES - } - static String normalize(String s) { return Unicode.removeDiacritics(Unicode.normalizeForSearch(StringUtils.normalizeSpace(" $s ".toLowerCase().replace(noise)))) } @@ -261,7 +239,7 @@ class Util { static def mostCommonWorkTitle(Collection docs, Closure getTitle = { it.workTitle().findAll(isTitle) }) { def workTitles = docs.collect(getTitle) .grep() - .collect { dropGenericSubTitles(it) } + .collect { dropSubtitles(it) } if (workTitles) { return mostCommon(workTitles) @@ -276,7 +254,7 @@ class Util { } def instanceTitles = docs.collect { it.instanceTitle().findAll(isTitle) } - .collect { dropGenericSubTitles(it) } + .collect { dropSubtitles(it) } if (instanceTitles.grep()) { def instanceTitleToDoc = [instanceTitles, docs].transpose().collectEntries() diff --git a/librisworks/src/main/resources/merge-works/ignored-subtitles.txt b/librisworks/src/main/resources/merge-works/ignored-subtitles.txt deleted file mode 100644 index 50cc55a282..0000000000 --- a/librisworks/src/main/resources/merge-works/ignored-subtitles.txt +++ /dev/null @@ -1,78 +0,0 @@ -a comedy -a history -a novel -a play -a romance -a tale -aforismer -berättelse -berättelse för barn -berättelse för flickor -berättelse för pojkar -berättelse för unga flickor -berättelser -berättelser för barn -bilderbok -comédie -contos -deckare -deckarroman -detektivroman -dikt -dikter -drama -efterlämnade dikter -ein coq-rouge-thriller -ein roman -eine erzählung -en Harry Bosch-deckare -erzählung -erzählungen -essays -essäer -ett fall för kay scarpetta -fortælling -historisk roman -homandeckare -jack reacher-thriller -komedi -komedi i fyra akter -krimi -kriminalroman -kärlekshistoria -kärleksroman -kåserier -lustspel i en akt -nouvelles -novela -novell -novelle -noveller -pjäs -polisroman -povesti -powieść -poėma -reseguide -resehandbok -rikosromaani -romaani -romaani rikoksesta -roman -roman om ett brott -roman om skivvärlden -romanas -romance -romanzo -rövarroman -runoja -saga -sagor -sann historia -skildringar -skáldsaga -spänningsroman -stories -thriller -ungdomsroman -(Efterlämnade dikter.) From 387f11a99e30f768e5be42a5c9ecd407324daefc Mon Sep 17 00:00:00 2001 From: kwahlin Date: Wed, 1 Nov 2023 13:22:22 +0100 Subject: [PATCH 10/32] Fix work title shape (exclude subtitle) --- .../groovy/se/kb/libris/mergeworks/Util.groovy | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy index 5233b033cf..15a35fd08e 100644 --- a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy +++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy @@ -164,7 +164,7 @@ class Util { null ] - static Map appendTitlePartsToMainTitle(Map title, String partNumber, String partName = null) { + static void appendTitlePartsToMainTitle(Map title, String partNumber, String partName = null) { if (partNumber && partName) { title['mainTitle'] += ". $partNumber, $partName" } else if (partNumber) { @@ -172,12 +172,6 @@ class Util { } else if (partName) { title['mainTitle'] += ". $partName" } - - title.remove('partNumber') - title.remove('partName') - title.remove('hasPart') - - return title } static String findTitlePart(List title, String prop) { @@ -188,7 +182,6 @@ class Util { // Return the most common title for the best encodingLevel static def bestTitle(Collection docs) { - // TODO: which title to pick when matched with already existing linked work? def linkedWorkTitle = docs.findResult { it.workIri() ? it.workData['hasTitle'] : null } if (linkedWorkTitle) { return linkedWorkTitle @@ -200,11 +193,15 @@ class Util { def partNumber = findTitlePart(bestInstanceTitle, 'partNumber') def partName = findTitlePart(bestInstanceTitle, 'partName') + def workTitleShape = { it.subMap(['@type', 'mainTitle', 'source']) } + if (bestWorkTitle) { - return bestWorkTitle.collect { appendTitlePartsToMainTitle(it, partNumber) } + return bestWorkTitle.each { appendTitlePartsToMainTitle(it, partNumber) } + .collect(workTitleShape) } - return bestInstanceTitle.collect { appendTitlePartsToMainTitle(it, partNumber, partName) } + return bestInstanceTitle.each { appendTitlePartsToMainTitle(it, partNumber, partName) } + .collect(workTitleShape) } static def mostCommonHighestEncodingLevel(Collection docs, Closure> findMostCommon) { From 31748bd7b2f40fb03d24354b2809ffc81e8d4fdf Mon Sep 17 00:00:00 2001 From: kwahlin Date: Wed, 1 Nov 2023 13:25:40 +0100 Subject: [PATCH 11/32] Remove redundant method --- .../groovy/se/kb/libris/mergeworks/DisplayDoc.groovy | 5 +++-- .../main/groovy/se/kb/libris/mergeworks/Doc.groovy | 3 ++- .../main/groovy/se/kb/libris/mergeworks/Util.groovy | 11 ----------- 3 files changed, 5 insertions(+), 14 deletions(-) diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/DisplayDoc.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/DisplayDoc.groovy index 5de250c66b..816760a9b6 100644 --- a/librisworks/src/main/groovy/se/kb/libris/mergeworks/DisplayDoc.groovy +++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/DisplayDoc.groovy @@ -2,6 +2,7 @@ package se.kb.libris.mergeworks import whelk.Document import whelk.JsonLd +import whelk.util.DocumentUtil class DisplayDoc { Doc doc @@ -75,7 +76,7 @@ class DisplayDoc { private List contributorStrings() { List path = doc.instanceData ? ['instanceOf', 'contribution'] : ['contribution'] - List contribution = Util.getPathSafe(getFramed(), path, []) + List contribution = DocumentUtil.getAtPath(getFramed(), path, []) return contribution.collect { Map c -> contributionStr(c) @@ -101,7 +102,7 @@ class DisplayDoc { List classificationStrings() { List path = doc.instanceData ? ['instanceOf', 'classification'] : ['classification'] - List classification = Util.getPathSafe(getFramed(), path, []) + List classification = DocumentUtil.getAtPath(getFramed(), path, []) classification.collect { c -> StringBuilder s = new StringBuilder() diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy index f83cf0791e..46e89ea5e1 100644 --- a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy +++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy @@ -4,6 +4,7 @@ import whelk.Document import whelk.JsonLd import whelk.Whelk import whelk.datatool.DocumentItem +import whelk.util.DocumentUtil import static Util.asList import static Util.Relator @@ -181,7 +182,7 @@ class Doc { } int numPages() { - String extent = Util.getPathSafe(extent(), [0, 'label', 0]) ?: Util.getPathSafe(extent(), [0, 'label'], '') + String extent = DocumentUtil.getAtPath(extent(), [0, 'label', 0]) ?: DocumentUtil.getAtPath(extent(), [0, 'label'], '') return numPages(extent) } diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy index 15a35fd08e..422e4c439d 100644 --- a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy +++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy @@ -109,17 +109,6 @@ class Util { return Unicode.removeDiacritics(Unicode.normalizeForSearch(StringUtils.normalizeSpace(" $s ".toLowerCase().replace(noise)))) } - static Object getPathSafe(item, path, defaultTo = null) { - for (p in path) { - if ((item instanceof Collection || item instanceof Map) && item[p] != null) { - item = item[p] - } else { - return defaultTo - } - } - return item - } - static List getFlatTitle(List hasTitle) { flatTitles(hasTitle) .grep(isTitle) From 1a262c46d03c4ec5887622eef2120580c9f35f5e Mon Sep 17 00:00:00 2001 From: kwahlin Date: Wed, 1 Nov 2023 16:29:28 +0100 Subject: [PATCH 12/32] Raise too-large-result limit --- librisworks/scripts/find-work-clusters.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/librisworks/scripts/find-work-clusters.groovy b/librisworks/scripts/find-work-clusters.groovy index 5895d16616..5c91fa25cf 100644 --- a/librisworks/scripts/find-work-clusters.groovy +++ b/librisworks/scripts/find-work-clusters.groovy @@ -21,7 +21,7 @@ def process = { bib -> List ids = queryIds(q).collect() - if (ids.size() > 200) { + if (ids.size() > 1000) { tooLargeResult.println("Results: ${ids.size()} Query: ${q}") } else if (ids.size() > 1) { From 7006456660ccb5539a65dfced3a2523f66b23c81 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Tue, 7 Nov 2023 09:08:14 +0100 Subject: [PATCH 13/32] Remove irrelevant title components --- .../src/main/groovy/se/kb/libris/mergeworks/Util.groovy | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy index 422e4c439d..aabfd0cc0e 100644 --- a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy +++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy @@ -8,10 +8,7 @@ import whelk.util.Unicode import static se.kb.libris.mergeworks.compare.IntendedAudience.preferredComparisonOrder class Util { - static def titleComponents = ['mainTitle', 'titleRemainder', 'subtitle', 'hasPart', 'partNumber', 'partName', 'marc:parallelTitle', 'marc:equalTitle'] - - static def titleVariant = ['Title', 'ParallelTitle'] - // removed 'VariantTitle', 'CoverTitle' since they sometimes contain random generic stuff like "Alibis filmroman", "Kompisböcker för de yngsta" + static def titleComponents = ['mainTitle', 'hasPart', 'partNumber', 'partName'] static enum Relator { TRANSLATOR('https://id.kb.se/relator/translator'), @@ -94,7 +91,7 @@ class Util { } static List flatTitles(List hasTitle) { - dropSubtitles(hasTitle).collect { + hasTitle.collect { def title = new TreeMap<>() title['flatTitle'] = normalize(DisplayDoc.flatten(it, titleComponents)) if (it['@type']) { From 8369cb9b2d2d2e2dcfafe1b147a6faa0a2b762ad Mon Sep 17 00:00:00 2001 From: kwahlin Date: Tue, 7 Nov 2023 09:10:16 +0100 Subject: [PATCH 14/32] Remove unnecessary condition since checked elsewhere --- .../groovy/se/kb/libris/mergeworks/compare/TranslationOf.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/TranslationOf.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/TranslationOf.groovy index 717b528e85..dd0dc578d7 100644 --- a/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/TranslationOf.groovy +++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/TranslationOf.groovy @@ -14,7 +14,7 @@ class TranslationOf implements ValuePicker { // We assume that there are never more than one object in translationOf a = Util.asList(a)[0] b = Util.asList(b)[0] - (!a && !b) || (a && b && c.isEqual(noTypeNoTitle(a), noTypeNoTitle(b)) && noTitleOrSameTitle(a, b)) + a && b && c.isEqual(noTypeNoTitle(a), noTypeNoTitle(b)) && noTitleOrSameTitle(a, b) } @Override From b9d5c1aec6cb1ce2dce183559d1cd86bd15191a8 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Tue, 7 Nov 2023 09:16:28 +0100 Subject: [PATCH 15/32] Include work titles when clustering * If the work title of one record matches the instance title of another record, these two records now ends up in the same cluster. * Use existing methods in find-work-clusters.groovy instead of declaring new ones. --- librisworks/scripts/find-work-clusters.groovy | 80 ++++++++----------- librisworks/scripts/title-clusters.groovy | 4 +- .../groovy/se/kb/libris/mergeworks/Doc.groovy | 9 +++ 3 files changed, 47 insertions(+), 46 deletions(-) diff --git a/librisworks/scripts/find-work-clusters.groovy b/librisworks/scripts/find-work-clusters.groovy index 5c91fa25cf..d92b236179 100644 --- a/librisworks/scripts/find-work-clusters.groovy +++ b/librisworks/scripts/find-work-clusters.groovy @@ -14,17 +14,26 @@ def process = { bib -> return try { - def q = buildQuery(bib) - if (!q) { - return - } + def instance = bib.graph[1] + def work = loadIfLink(instance.instanceOf) - List ids = queryIds(q).collect() + if (!work) return - if (ids.size() > 1000) { - tooLargeResult.println("Results: ${ids.size()} Query: ${q}") + def titles = [instance, work].grep().collect { title(it) }.grep().unique() + + Set ids = [] + + titles.each { + def q = buildQuery(work, it) + if (!q) { + return + } + ids.addAll(queryIds(q)) } - else if (ids.size() > 1) { + + if (ids.size() > 1000) { + tooLargeResult.println("Results: ${ids.size()} Id: ${bib.doc.shortId} Titles: ${titles}") + } else if (ids.size() > 1) { visited.addAll(ids) println(ids.join('\t')) } @@ -40,39 +49,35 @@ selectByCollection('bib') { process(it) } -Map> buildQuery(bib) { - def title = title(bib) - - if (!title) - return null - +Map> buildQuery(Map work, String title) { Map> query = [ "q" : ["*"], "@type" : ["Instance"], "hasTitle.mainTitle": [esSafe(title)], ] - insertLinkedAgents(bib) - def card = bib.asCard(true) + insertLinkedAgents(work) + def card = getWhelk().jsonld.toCard(work, false, true) - def author = primaryContributor(card).collect{ esSafe(it) } + def author = primaryContributor(card).collect { esSafe(it) } if (author) { query["or-instanceOf.contribution._str"] = author query["or-instanceOf.contribution.agent._str"] = author return query } - def allContributors = contributors(card).collect{ esSafe(it) } + def allContributors = contributors(card).collect { esSafe(it) } if (allContributors) { query["or-instanceOf.contribution._str"] = allContributors query["or-instanceOf.contribution.agent._str"] = allContributors return query } + return null } -private void insertLinkedAgents(bib) { - getPathSafe(bib.doc.data, ['@graph', 1, 'instanceOf', 'contribution']).each { +private void insertLinkedAgents(work) { + asList(work['contribution']).each { def agent = asList(it.agent).find() if (agent && agent['@id']) { it.agent = loadThing(agent['@id']) @@ -80,21 +85,21 @@ private void insertLinkedAgents(bib) { } } -private String title(bib) { - return getPathSafe(bib.doc.data, ['@graph', 1, 'hasTitle', 0, 'mainTitle']) +private String title(Map thing) { + return getAtPath(thing, ['hasTitle', 0, 'mainTitle']) } -private List primaryContributor(bib) { - contributorStrings(getPathSafe(bib, ['@graph', 1, 'instanceOf', 'contribution'], []).find { it['@type'] == "PrimaryContribution" }) +private List primaryContributor(work) { + contributorStrings(asList(work['contribution']).find { it['@type'] == "PrimaryContribution" }) } -private List contributors(bib) { - getPathSafe(bib, ['@graph', 1, 'instanceOf', 'contribution'], []).collect { contributorStrings(it) }.grep().flatten() +private List contributors(work) { + asList(work['contribution']).collect { contributorStrings(it) }.grep().flatten() } -//getPathSafe(contribution, ['_str'])?.with { String s -> s.replaceAll(/[^ \p{IsAlphabetic}]/, '') } +//getAtPath(contribution, ['_str'])?.with { String s -> s.replaceAll(/[^ \p{IsAlphabetic}]/, '') } private List contributorStrings(contribution) { - List variants = asList(contribution?.agent) + asList(getPathSafe(contribution, ['agent', 'hasVariant'])) + List variants = asList(contribution?.agent) + asList(getAtPath(contribution, ['agent', 'hasVariant'])) variants.grep().collect { name(it) }.grep() } @@ -110,19 +115,8 @@ private String esSafe(String s) { s.replaceAll('[+|"\\-*~]', " ") } -private Object getPathSafe(item, path, defaultTo = null) { - if (!item) { - return defaultTo - } - - for (p in path) { - if (item[p] != null) { - item = item[p] - } else { - return defaultTo - } - } - return item +private loadIfLink(Map work) { + work?['@id'] ? loadThing(work['@id']) : work } private Map loadThing(def id) { @@ -131,8 +125,4 @@ private Map loadThing(def id) { thing = t.graph[1] } return thing -} - -private static List asList(Object o) { - (o ?: []).with { it instanceof List ? it : [it] } } \ No newline at end of file diff --git a/librisworks/scripts/title-clusters.groovy b/librisworks/scripts/title-clusters.groovy index 38bf8ab226..5a07b027df 100644 --- a/librisworks/scripts/title-clusters.groovy +++ b/librisworks/scripts/title-clusters.groovy @@ -22,7 +22,9 @@ Collection> titleClusters(Collection docs) { static Collection> partitionByTitle(Collection docs) { return partition(docs) { Doc a, Doc b -> - !a.flatInstanceTitle().intersect(b.flatInstanceTitle()).isEmpty() + def aTitles = a.flatInstanceTitle() + a.flatWorkTitle() + def bTitles = b.flatInstanceTitle() + b.flatWorkTitle() + aTitles.intersect(bTitles).isEmpty() } } diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy index 46e89ea5e1..3030740104 100644 --- a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy +++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy @@ -41,6 +41,7 @@ class Doc { Map workData List flatInstanceTitle + List flatWorkTitle DisplayDoc display @@ -113,6 +114,14 @@ class Doc { asList(workData['hasTitle']) } + List flatWorkTitle() { + if (!flatWorkTitle) { + flatWorkTitle = Util.getFlatTitle(workTitle()) + } + + return flatWorkTitle + } + List instanceTitle() { asList(instanceData?.hasTitle) } From 468f705b15b9fef279d9b5d57d8c13e15c125642 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Tue, 7 Nov 2023 09:25:55 +0100 Subject: [PATCH 16/32] Exit bash script immediately if Whelktool script fails --- librisworks/run.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/librisworks/run.sh b/librisworks/run.sh index 54f6af2b05..011da8b2b6 100755 --- a/librisworks/run.sh +++ b/librisworks/run.sh @@ -1,4 +1,5 @@ #!/bin/bash +set -eu count_lines() { if [ -f $1 ]; then From 0b5fc5c6492f3ef50124093d6c0c1b8936b8841f Mon Sep 17 00:00:00 2001 From: kwahlin Date: Tue, 7 Nov 2023 09:30:25 +0100 Subject: [PATCH 17/32] Handle unexpected datatype in instanceOf --- librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy index 3030740104..9f2bc32b62 100644 --- a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy +++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy @@ -68,7 +68,7 @@ class Doc { void setData() { if (mainEntity()['instanceOf']) { instanceData = mainEntity() - workData = instanceData['instanceOf'] + workData = asList(instanceData['instanceOf']).find() } else { workData = mainEntity() } From 514b0112914df658ae811cf01a30ee03582c12c5 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Tue, 7 Nov 2023 15:43:50 +0100 Subject: [PATCH 18/32] Add missing exclamation mark --- librisworks/scripts/title-clusters.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/librisworks/scripts/title-clusters.groovy b/librisworks/scripts/title-clusters.groovy index 5a07b027df..28e6a0eab8 100644 --- a/librisworks/scripts/title-clusters.groovy +++ b/librisworks/scripts/title-clusters.groovy @@ -24,7 +24,7 @@ static Collection> partitionByTitle(Collection docs) { return partition(docs) { Doc a, Doc b -> def aTitles = a.flatInstanceTitle() + a.flatWorkTitle() def bTitles = b.flatInstanceTitle() + b.flatWorkTitle() - aTitles.intersect(bTitles).isEmpty() + !aTitles.intersect(bTitles).isEmpty() } } From 655a1e77eaf0630b08b2b11f370f6d6d3022638d Mon Sep 17 00:00:00 2001 From: kwahlin Date: Thu, 9 Nov 2023 14:21:48 +0100 Subject: [PATCH 19/32] Change clustering order * Cluster by titles before merging overlapping clusters * Side effect: Necessary to make Elastic queries from each record individually in first clustering step --- librisworks/run.sh | 31 ++++++++++--------- librisworks/scripts/find-work-clusters.groovy | 10 +----- 2 files changed, 17 insertions(+), 24 deletions(-) diff --git a/librisworks/run.sh b/librisworks/run.sh index 011da8b2b6..2deee46008 100755 --- a/librisworks/run.sh +++ b/librisworks/run.sh @@ -31,8 +31,8 @@ NORMALIZATIONS_DIR=$REPORT_DIR/normalizations MERGED_WORKS_DIR=$REPORT_DIR/merged-works ALL=$CLUSTERS_DIR/1-all -MERGED=$CLUSTERS_DIR/2-merged -TITLES=$CLUSTERS_DIR/3-titles +TITLES=$CLUSTERS_DIR/2-titles +MERGED=$CLUSTERS_DIR/3-merged SWEDISH_FICTION=$CLUSTERS_DIR/4-swedish-fiction NO_ANONYMOUS_TRANSLATIONS=$CLUSTERS_DIR/5-no-anonymous-translations @@ -44,7 +44,7 @@ DEDUPLICATE_CONTRIBUTIONS=$NORMALIZATIONS_DIR/3-deduplicate-contributions ADD_MISSING_CONTRIBUTION_DATA=$NORMALIZATIONS_DIR/4-add-missing-contribution-data ROLES_TO_INSTANCE=$NORMALIZATIONS_DIR/5-roles-to-instance -# Clustering step 1 TODO: run only on recently updated records after first run +# Clustering TODO: run only on recently updated records after first run echo "Finding new clusters..." time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -jar $JAR_FILE \ $ARGS --report $ALL/$WHELKTOOL_REPORT $SCRIPTS_DIR/find-work-clusters.groovy >$ALL/$CLUSTER_TSV 2>/dev/null @@ -54,24 +54,25 @@ if [ $NUM_CLUSTERS == 0 ]; then exit 0 fi -# Clustering step 2 +# Filter out duplicates +sort -uo $ALL/$CLUSTER_TSV $ALL/$CLUSTER_TSV + echo -echo "Merging clusters..." +echo "Finding title clusters..." time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$ALL/$CLUSTER_TSV -jar $JAR_FILE \ - $ARGS --report $MERGED/$WHELKTOOL_REPORT $SCRIPTS_DIR/merge-clusters.groovy >$MERGED/$CLUSTER_TSV 2>/dev/null -NUM_CLUSTERS=$(count_lines $MERGED/$CLUSTER_TSV) -echo "Merged into $NUM_CLUSTERS clusters" + $ARGS --report $TITLES/$WHELKTOOL_REPORT $SCRIPTS_DIR/title-clusters.groovy >$TITLES/$CLUSTER_TSV 2>/dev/null +NUM_CLUSTERS=$(count_lines $TITLES/$CLUSTER_TSV) +echo "$NUM_CLUSTERS title clusters found" if [ $NUM_CLUSTERS == 0 ]; then exit 0 fi -# Clustering step 3 echo -echo "Finding title clusters..." -time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$MERGED/$CLUSTER_TSV -jar $JAR_FILE \ - $ARGS --report $TITLES/$WHELKTOOL_REPORT $SCRIPTS_DIR/title-clusters.groovy >$TITLES/$CLUSTER_TSV 2>/dev/null -NUM_CLUSTERS=$(count_lines $TITLES/$CLUSTER_TSV) -echo "$NUM_CLUSTERS title clusters found" +echo "Merging clusters..." +time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$TITLES/$CLUSTER_TSV -jar $JAR_FILE \ + $ARGS --report $MERGED/$WHELKTOOL_REPORT $SCRIPTS_DIR/merge-clusters.groovy >$MERGED/$CLUSTER_TSV 2>/dev/null +NUM_CLUSTERS=$(count_lines $MERGED/$CLUSTER_TSV) +echo "Merged into $NUM_CLUSTERS clusters" if [ $NUM_CLUSTERS == 0 ]; then exit 0 fi @@ -79,7 +80,7 @@ fi # Filter: Swedish fiction echo echo "Filtering on Swedish fiction..." -time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$TITLES/$CLUSTER_TSV -jar $JAR_FILE \ +time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$MERGED/$CLUSTER_TSV -jar $JAR_FILE \ $ARGS --report $SWEDISH_FICTION/$WHELKTOOL_REPORT $SCRIPTS_DIR/swedish-fiction.groovy >$SWEDISH_FICTION/$CLUSTER_TSV 2>/dev/null NUM_CLUSTERS=$(count_lines $SWEDISH_FICTION/$CLUSTER_TSV) echo "Found $NUM_CLUSTERS title clusters with Swedish fiction" diff --git a/librisworks/scripts/find-work-clusters.groovy b/librisworks/scripts/find-work-clusters.groovy index d92b236179..b69a4b0ffb 100644 --- a/librisworks/scripts/find-work-clusters.groovy +++ b/librisworks/scripts/find-work-clusters.groovy @@ -2,17 +2,10 @@ * (When running, redirect STDERR to avoid annoying prints from whelktool) */ -import java.util.concurrent.ConcurrentHashMap - PrintWriter failedQueries = getReportWriter("failed-queries") PrintWriter tooLargeResult = getReportWriter("too-large-result") -visited = Collections.newSetFromMap(new ConcurrentHashMap()) - def process = { bib -> - if (!visited.add(bib.doc.shortId)) - return - try { def instance = bib.graph[1] def work = loadIfLink(instance.instanceOf) @@ -34,8 +27,7 @@ def process = { bib -> if (ids.size() > 1000) { tooLargeResult.println("Results: ${ids.size()} Id: ${bib.doc.shortId} Titles: ${titles}") } else if (ids.size() > 1) { - visited.addAll(ids) - println(ids.join('\t')) + println(ids.sort().join('\t')) } } catch (Exception e) { From 2430113985964156f56ef19136104bc346be5202 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Thu, 9 Nov 2023 16:30:39 +0100 Subject: [PATCH 20/32] Add missing -D parameter --- librisworks/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/librisworks/run.sh b/librisworks/run.sh index 2deee46008..d258e1f4f8 100755 --- a/librisworks/run.sh +++ b/librisworks/run.sh @@ -97,7 +97,7 @@ echo "$(count_lines $LANGUAGE_IN_TITLE/MODIFIED.txt) records affected, report in echo echo "Moving illustrativeContent to instance..." -time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -jar $JAR_FILE \ +time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$SWEDISH_FICTION/$CLUSTER_TSV -jar $JAR_FILE \ $ARGS --report $ILL_CONTENT $SCRIPTS_DIR/lxl-4221-move-illustrativecontent-to-instance.groovy 2>/dev/null echo "$(count_lines $ILL_CONTENT/MODIFIED.txt) records affected, report in $ILL_CONTENT" From 317f75a00581ee4974ee83dee08b733e1a96ad82 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Thu, 9 Nov 2023 16:37:56 +0100 Subject: [PATCH 21/32] Save only modified --- .../lxl-4221-move-illustrativecontent-to-instance.groovy | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/librisworks/scripts/lxl-4221-move-illustrativecontent-to-instance.groovy b/librisworks/scripts/lxl-4221-move-illustrativecontent-to-instance.groovy index dd24049814..a44ea82fea 100644 --- a/librisworks/scripts/lxl-4221-move-illustrativecontent-to-instance.groovy +++ b/librisworks/scripts/lxl-4221-move-illustrativecontent-to-instance.groovy @@ -9,7 +9,11 @@ selectByIds(ids) { if (!work || work['@id']) return - instance['illustrativeContent'] = (asList(instance['illustrativeContent']) + asList(work.remove('illustrativeContent'))).unique() + def illContent = work.remove('illustrativeContent') - it.scheduleSave() + if (illContent) { + instance['illustrativeContent'] = (asList(instance['illustrativeContent']) + asList(illContent)).unique() + + it.scheduleSave() + } } \ No newline at end of file From c66b4dba5e6228e69755e06f33a859b3d5fe5b06 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Thu, 9 Nov 2023 17:06:10 +0100 Subject: [PATCH 22/32] Avoid IndexOutOfBoundsException --- .../lxl-4150-deduplicate-contribution.groovy | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/librisworks/scripts/lxl-4150-deduplicate-contribution.groovy b/librisworks/scripts/lxl-4150-deduplicate-contribution.groovy index daff28bdee..8174b3d5d5 100644 --- a/librisworks/scripts/lxl-4150-deduplicate-contribution.groovy +++ b/librisworks/scripts/lxl-4150-deduplicate-contribution.groovy @@ -14,19 +14,16 @@ selectByIds(ids) { bib -> ? primaryContributionIdx : contribution.findIndexOf { asList(it.agent) == d } def mergeInto = contribution[mergeIntoIdx] - def roles = [] + def roles = contribution.findResults { asList(it.agent) == d ? asList(it.role) : null }.flatten().unique() + if (roles) mergeInto['role'] = roles + + def idx = 0 contribution.removeAll { - if (asList(it.agent) == d) { - roles += asList(it.role) - return true - } - return false + def removeIf = asList(it.agent) == d && idx != mergeIntoIdx + idx += 1 + return removeIf } - - if (roles) mergeInto['role'] = roles.unique() - - contribution.add(mergeIntoIdx, mergeInto) } if (duplicates) { From 29073f221a7cb5cb3d1883ebe8951f3b24665ac3 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Thu, 9 Nov 2023 17:39:07 +0100 Subject: [PATCH 23/32] Avoid null in tryAddLifeSpanToLocalAgent --- librisworks/scripts/add-missing-contribution-data.groovy | 1 + 1 file changed, 1 insertion(+) diff --git a/librisworks/scripts/add-missing-contribution-data.groovy b/librisworks/scripts/add-missing-contribution-data.groovy index 600f0253b4..1d02ad1846 100644 --- a/librisworks/scripts/add-missing-contribution-data.groovy +++ b/librisworks/scripts/add-missing-contribution-data.groovy @@ -290,6 +290,7 @@ boolean tryAddLifeSpanToLocalAgent(Map contribution, String id) { def agent = asList(contribution.agent).find() if (agent instanceof Map && !agent[ID_KEY] && !agent.lifeSpan) { def names = agentToNames[toString(agent)] + if (!names) return def matchingLocalAgentsWithLifeSpan = nameToAgents.subMap(names).values().flatten().toSet().findAll { a -> !looksLikeIri(a) && localAgentToLifeSpansToIds[a] } From c0476ffefc6c1a72c45e1971d03976d210bb25c5 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Thu, 9 Nov 2023 17:55:35 +0100 Subject: [PATCH 24/32] Update path to relators --- librisworks/scripts/contributions-to-instance.groovy | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/librisworks/scripts/contributions-to-instance.groovy b/librisworks/scripts/contributions-to-instance.groovy index 2e67db814a..02c458640f 100644 --- a/librisworks/scripts/contributions-to-instance.groovy +++ b/librisworks/scripts/contributions-to-instance.groovy @@ -12,12 +12,13 @@ report = getReportWriter('report.tsv') def clusters = new File(System.getProperty('clusters')).collect { it.split('\t').collect { it.trim() } } def whelk = getWhelk() -def instanceRolesByDomain = whelk.resourceCache.relators.findResults { +def instanceRolesByDomain = whelk.resourceCache.relatorResources.relators.findResults { if (it.domain) { def domain = whelk.jsonld.toTermKey(it.domain[ID_KEY]) if (whelk.jsonld.isSubClassOf(domain, 'Embodiment')) it.subMap([ID_KEY]) } } + def instanceRoles = instanceRolesByDomain + [Relator.ILLUSTRATOR, Relator.AUTHOR_OF_INTRO, Relator.AUTHOR_OF_AFTERWORD].collect { [(ID_KEY): it.iri] } def ill = [(ID_KEY): Relator.ILLUSTRATOR.iri] From 42267aa2b0457fc380c0200123e0a6341e44d823 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Fri, 10 Nov 2023 11:05:33 +0100 Subject: [PATCH 25/32] Fix broken linkedAgentToLifeSpan --- librisworks/scripts/add-missing-contribution-data.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/librisworks/scripts/add-missing-contribution-data.groovy b/librisworks/scripts/add-missing-contribution-data.groovy index 1d02ad1846..e9f87032cd 100644 --- a/librisworks/scripts/add-missing-contribution-data.groovy +++ b/librisworks/scripts/add-missing-contribution-data.groovy @@ -61,7 +61,7 @@ selectByIds(clusters.flatten()) { bib -> asList(c.agent).each { Map agent -> def agentStr = toString(agent) def loadedAgent = loadIfLink(agent) - if (agent.lifeSpan) { + if (loadedAgent.lifeSpan) { if (agent.containsKey(ID_KEY)) { linkedAgentToLifeSpan.computeIfAbsent(agentStr, f -> lifeSpan(loadedAgent)) } else { From cff2043cb5c134f8a3b78c24bd758f9d30eb14cd Mon Sep 17 00:00:00 2001 From: kwahlin Date: Mon, 13 Nov 2023 12:43:09 +0100 Subject: [PATCH 26/32] Avoid repeated period in work mainTitle --- .../src/main/groovy/se/kb/libris/mergeworks/Util.groovy | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy index aabfd0cc0e..c79605cb3b 100644 --- a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy +++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy @@ -151,12 +151,15 @@ class Util { ] static void appendTitlePartsToMainTitle(Map title, String partNumber, String partName = null) { + if (title['mainTitle'][-1] != '.') { + title['mainTitle'] += '.' + } if (partNumber && partName) { - title['mainTitle'] += ". $partNumber, $partName" + title['mainTitle'] += " $partNumber, $partName" } else if (partNumber) { - title['mainTitle'] += ". $partNumber" + title['mainTitle'] += " $partNumber" } else if (partName) { - title['mainTitle'] += ". $partName" + title['mainTitle'] += " $partName" } } From 1eeaff5199c66f4a6d0b04b83af5a7f285b372ea Mon Sep 17 00:00:00 2001 From: kwahlin Date: Mon, 13 Nov 2023 16:07:28 +0100 Subject: [PATCH 27/32] Make subtitle/titleRemainder distinguishing again --- .../se/kb/libris/mergeworks/Util.groovy | 42 +++++++--- .../merge-works/ignored-subtitles.txt | 77 +++++++++++++++++++ 2 files changed, 109 insertions(+), 10 deletions(-) create mode 100644 librisworks/src/main/resources/merge-works/ignored-subtitles.txt diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy index c79605cb3b..12658d6db4 100644 --- a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy +++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy @@ -8,7 +8,7 @@ import whelk.util.Unicode import static se.kb.libris.mergeworks.compare.IntendedAudience.preferredComparisonOrder class Util { - static def titleComponents = ['mainTitle', 'hasPart', 'partNumber', 'partName'] + static def titleComponents = ['mainTitle', 'titleRemainder', 'subtitle', 'hasPart', 'partNumber', 'partName'] static enum Relator { TRANSLATOR('https://id.kb.se/relator/translator'), @@ -33,6 +33,10 @@ class Util { } } + private static Set IGNORED_SUBTITLES = Util.class.getClassLoader() + .getResourceAsStream('merge-works/ignored-subtitles.txt') + .readLines().grep().collect(Util.&normalize) as Set + private static Set GENERIC_TITLES = Util.class.getClassLoader() .getResourceAsStream('merge-works/generic-titles.txt') .readLines().grep().collect(Util.&normalize) as Set @@ -78,20 +82,30 @@ class Util { hasTitle.any { it['mainTitle'] && normalize((String) it['mainTitle']) in GENERIC_TITLES } } - static List dropSubtitles(List hasTitle) { + static List dropGenericSubTitles(List hasTitle) { hasTitle.collect { def copy = new TreeMap(it) - DocumentUtil.traverse(copy) { value, path -> - if (('subtitle' in path || 'titleRemainder' in path) && value instanceof String) { - new DocumentUtil.Remove() + if (copy['subtitle'] || copy['titleRemainder']) { + DocumentUtil.traverse(copy) { value, path -> + if (('subtitle' in path || 'titleRemainder' in path) && value instanceof String) { + if (genericSubtitle(value)) { + new DocumentUtil.Remove() + } else { + ((List) value.split(':')).with { + if (it.size() > 1 && genericSubtitle(it.last().trim())) { + new DocumentUtil.Replace(value.replaceFirst(~/\s*:.+$/, '')) + } + } + } + } } } - return copy + copy } } static List flatTitles(List hasTitle) { - hasTitle.collect { + dropGenericSubTitles(hasTitle).collect { def title = new TreeMap<>() title['flatTitle'] = normalize(DisplayDoc.flatten(it, titleComponents)) if (it['@type']) { @@ -112,6 +126,14 @@ class Util { .collect { it['flatTitle'] } } + private static boolean genericSubtitle(String s) { + s = Util.normalize(s) + if (s.startsWith("en ")) { + s = s.substring("en ".length()) + } + return s in IGNORED_SUBTITLES + } + static String chipString(def thing, Whelk whelk) { if (thing instanceof Integer) { return thing @@ -182,7 +204,7 @@ class Util { def partNumber = findTitlePart(bestInstanceTitle, 'partNumber') def partName = findTitlePart(bestInstanceTitle, 'partName') - def workTitleShape = { it.subMap(['@type', 'mainTitle', 'source']) } + def workTitleShape = { it.subMap(['@type', 'mainTitle', 'subtitle', 'titleRemainder', 'source']) } if (bestWorkTitle) { return bestWorkTitle.each { appendTitlePartsToMainTitle(it, partNumber) } @@ -225,7 +247,7 @@ class Util { static def mostCommonWorkTitle(Collection docs, Closure getTitle = { it.workTitle().findAll(isTitle) }) { def workTitles = docs.collect(getTitle) .grep() - .collect { dropSubtitles(it) } + .collect { dropGenericSubTitles(it) } if (workTitles) { return mostCommon(workTitles) @@ -240,7 +262,7 @@ class Util { } def instanceTitles = docs.collect { it.instanceTitle().findAll(isTitle) } - .collect { dropSubtitles(it) } + .collect { dropGenericSubTitles(it) } if (instanceTitles.grep()) { def instanceTitleToDoc = [instanceTitles, docs].transpose().collectEntries() diff --git a/librisworks/src/main/resources/merge-works/ignored-subtitles.txt b/librisworks/src/main/resources/merge-works/ignored-subtitles.txt new file mode 100644 index 0000000000..4dea8de2e6 --- /dev/null +++ b/librisworks/src/main/resources/merge-works/ignored-subtitles.txt @@ -0,0 +1,77 @@ +a comedy +a history +a novel +a play +a romance +a tale +aforismer +berättelse +berättelse för barn +berättelse för flickor +berättelse för pojkar +berättelse för unga flickor +berättelser +berättelser för barn +bilderbok +comédie +contos +deckare +deckarroman +detektivroman +dikt +dikter +drama +efterlämnade dikter +ein coq-rouge-thriller +ein roman +eine erzählung +erzählung +erzählungen +essays +essäer +ett fall för kay scarpetta +fortælling +historisk roman +homandeckare +jack reacher-thriller +komedi +komedi i fyra akter +krimi +kriminalroman +kärlekshistoria +kärleksroman +kåserier +lustspel i en akt +nouvelles +novela +novell +novelle +noveller +pjäs +polisroman +povesti +powieść +poėma +reseguide +resehandbok +rikosromaani +romaani +romaani rikoksesta +roman +roman om ett brott +roman om skivvärlden +romanas +romance +romanzo +rövarroman +runoja +saga +sagor +sann historia +skildringar +skáldsaga +spänningsroman +stories +thriller +ungdomsroman +(Efterlämnade dikter.) From 3756a0e7e7f1c4af6c96cb0fdf11affbd3fa5306 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Mon, 13 Nov 2023 16:12:04 +0100 Subject: [PATCH 28/32] Restore structure --- .../groovy/se/kb/libris/mergeworks/Util.groovy | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy index 12658d6db4..57240972a7 100644 --- a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy +++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy @@ -116,6 +116,14 @@ class Util { } } + private static boolean genericSubtitle(String s) { + s = Util.normalize(s) + if (s.startsWith("en ")) { + s = s.substring("en ".length()) + } + return s in IGNORED_SUBTITLES + } + static String normalize(String s) { return Unicode.removeDiacritics(Unicode.normalizeForSearch(StringUtils.normalizeSpace(" $s ".toLowerCase().replace(noise)))) } @@ -126,14 +134,6 @@ class Util { .collect { it['flatTitle'] } } - private static boolean genericSubtitle(String s) { - s = Util.normalize(s) - if (s.startsWith("en ")) { - s = s.substring("en ".length()) - } - return s in IGNORED_SUBTITLES - } - static String chipString(def thing, Whelk whelk) { if (thing instanceof Integer) { return thing From 7f90c3c5f2463c67e37bbb387143056d43e0cf65 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Wed, 15 Nov 2023 08:38:57 +0100 Subject: [PATCH 29/32] Save before reporting to make sure no data is changed before saving --- librisworks/scripts/merge-works.groovy | 41 +++++++++++-------- .../se/kb/libris/mergeworks/DisplayDoc.groovy | 10 ++--- 2 files changed, 26 insertions(+), 25 deletions(-) diff --git a/librisworks/scripts/merge-works.groovy b/librisworks/scripts/merge-works.groovy index 5dc5842083..e9e4a8b5cd 100644 --- a/librisworks/scripts/merge-works.groovy +++ b/librisworks/scripts/merge-works.groovy @@ -53,23 +53,26 @@ new File(System.getProperty('clusters')).splitEachLine(~/[\t ]+/) { cluster -> List linkableWorkIris = uniqueWorksAndTheirInstances.findResults { it.getV1().workIri() } uniqueWorksAndTheirInstances.each { Doc workDoc, List instanceDocs -> - if (!workDoc.instanceData) { - if (workDoc.existsInStorage) { - if (instanceDocs) { - replaceWorkData(workDoc, c.merge([workDoc] + instanceDocs)) - // TODO: Update adminMetadata? To say that additional instances may have contributed to the linked work. - writeWorkReport(docs, workDoc, instanceDocs, WorkStatus.UPDATED) - } - } else { - addAdminMetadata(workDoc, instanceDocs.collect { ['@id': it.recordIri()] }) - writeWorkReport(docs, workDoc, instanceDocs, WorkStatus.NEW) - } + // Link more instances to existing linked work + if (workDoc.existsInStorage && !workDoc.instanceData && instanceDocs) { + replaceWorkData(workDoc, c.merge([workDoc] + instanceDocs)) + // TODO: Update adminMetadata? To say that additional instances may have contributed to the linked work. addCloseMatch(workDoc, linkableWorkIris) saveAndLink(workDoc, instanceDocs, workDoc.existsInStorage) - } else { - if (addCloseMatch(workDoc, linkableWorkIris)) { - saveAndLink(workDoc) - } + writeWorkReport(docs, workDoc, instanceDocs, WorkStatus.UPDATED) + return + } + // New merged work + if (!workDoc.existsInStorage && !workDoc.instanceData) { + addAdminMetadata(workDoc, instanceDocs.collect { ['@id': it.recordIri()] }) + addCloseMatch(workDoc, linkableWorkIris) + saveAndLink(workDoc, instanceDocs, workDoc.existsInStorage) + writeWorkReport(docs, workDoc, instanceDocs, WorkStatus.NEW) + return + } + // Local work, save if new closeMatch links created + if (workDoc.instanceData && addCloseMatch(workDoc, linkableWorkIris)) { + saveAndLink(workDoc) } } @@ -96,9 +99,11 @@ void saveAndLink(Doc workDoc, Collection instanceDocs = [], boolean existsI } } - selectByIds(instanceDocs.collect { it.shortId() }) { - it.graph[1]['instanceOf'] = ['@id': workDoc.thingIri()] - it.scheduleSave(changedBy: changedBy, generationProcess: generationProcess) + if (!instanceDocs.isEmpty()) { + selectByIds(instanceDocs.collect { it.shortId() }) { + it.graph[1]['instanceOf'] = ['@id': workDoc.thingIri()] + it.scheduleSave(changedBy: changedBy, generationProcess: generationProcess) + } } } diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/DisplayDoc.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/DisplayDoc.groovy index 816760a9b6..3fed6e1987 100644 --- a/librisworks/src/main/groovy/se/kb/libris/mergeworks/DisplayDoc.groovy +++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/DisplayDoc.groovy @@ -147,13 +147,9 @@ class DisplayDoc { Map getFramed() { if (!framed) { - if (doc.existsInStorage) { - framed = JsonLd.frame(doc.thingIri(), doc.whelk.loadEmbellished(doc.shortId()).data) - } else { - Document copy = doc.document.clone() - doc.whelk.embellish(copy) - framed = JsonLd.frame(doc.thingIri(), copy.data) - } + Document copy = doc.document.clone() + doc.whelk.embellish(copy) + framed = JsonLd.frame(doc.thingIri(), copy.data) } return framed From 4fce9f44bcc9adcf655de3aee7a44cb7fabab27d Mon Sep 17 00:00:00 2001 From: kwahlin Date: Wed, 15 Nov 2023 17:07:50 +0100 Subject: [PATCH 30/32] Extend ignored-subtitles.txt --- .../se/kb/libris/mergeworks/Util.groovy | 6 +- .../merge-works/ignored-subtitles.txt | 210 +++++++++++++++++- 2 files changed, 202 insertions(+), 14 deletions(-) diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy index 57240972a7..f5a8b0ed3d 100644 --- a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy +++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy @@ -33,6 +33,9 @@ class Util { } } + static def noise = + [",", '"', "'", "ʹ", "ʼ", '[', ']', ',', '.', '.', ':', ';', '-', '(', ')', ' the ', '-', '–', '+', '!', '?',].collectEntries { [it, ' '] } + private static Set IGNORED_SUBTITLES = Util.class.getClassLoader() .getResourceAsStream('merge-works/ignored-subtitles.txt') .readLines().grep().collect(Util.&normalize) as Set @@ -41,9 +44,6 @@ class Util { .getResourceAsStream('merge-works/generic-titles.txt') .readLines().grep().collect(Util.&normalize) as Set - static def noise = - [",", '"', "'", "ʹ", "ʼ", '[', ']', ',', '.', '.', ':', ';', '-', '(', ')', ' the ', '-', '–', '+', '!', '?',].collectEntries { [it, ' '] } - static List asList(Object o) { (o ?: []).with { it instanceof List ? it : [it] } diff --git a/librisworks/src/main/resources/merge-works/ignored-subtitles.txt b/librisworks/src/main/resources/merge-works/ignored-subtitles.txt index 4dea8de2e6..74bfc3f693 100644 --- a/librisworks/src/main/resources/merge-works/ignored-subtitles.txt +++ b/librisworks/src/main/resources/merge-works/ignored-subtitles.txt @@ -1,77 +1,265 @@ a comedy +aforismer a history +äktenskapshistoria +alex king-thriller a novel a play a romance a tale -aforismer +äventyrsberättelse +äventyrsberättelse för ungdom +äventyrsroman +barnberättelse för små och stora berättelse berättelse för barn +berättelse för barn och ungdom berättelse för flickor berättelse för pojkar berättelse för unga flickor +berättelse för ungdom +berättelse från 1700-talets senare del +berättelse från kristi tid +berättelse från nittonde seklet +berättelse från öarna +berättelse från okristen tid +berättelse från sista finska kriget +berättelse från skärgården +berättelse från sörmland +berättelse från västra skärgården +berättelsen om en amerikansk familj berättelser berättelser för barn +berättelser för unga och gamla +berättelser från alla tidehvarf +berättelser och skisser +berättelse ur folklifvet bilderbok +bonderoman +bröderna paine på nya äventyr +bröderna perry +bygdeberättelser +chesapeake shores-roman comédie contos +coq rouge +dalgliesh-deckare +dalziel och pascoe-roman deckare +deckare med ester karlsson med k deckarroman +den tyske tonsättaren adrian leverkühns liv skildrat av en vän +detektivhistoria från antikens rom detektivroman dikt dikter +dikt i tre sånger +diktsamling +dirk pitt-roman +dokumentärroman drama efterlämnade dikter ein coq-rouge-thriller -ein roman eine erzählung +ein roman +elak skolgosses minnen +elva brev från marcus mezentius manilianus om våren år trettio efter kristus erzählung erzählungen -essays +espen arnakkes kommentarer till jantelagen essäer +essayer +essays +ett äktenskaps roman +ett fall för anastasia kamenskaja vid moskvapolisen +ett fall för dr siri ett fall för kay scarpetta +ett fall för kommissarie brunetti +ett fall för kommissarie çetin ikmen +ett fall för kommissarie santos +ett fall med dunder brak +ett nytt fall för maria wern spänningsroman +familjeroman +familjs förfall +fantasyroman +femton böcker ur den egyptiske läkaren sinuhes liv omkr 1390-1335 f kr +folklivsberättelse fortælling +framtidsroman +göran persson-deckare +hägringar från reformationstiden +hans ungdoms öden och äventyr i många länder intill år 1527 sanningsenligt framställda av honom själv i tio böcker +harry bosch-deckare +harry hole-thriller +heroisk berättelse +historia från idyllens och revolutionernas tidehvarf +historia om rätt och orätt +historier +historisk äventyrsroman +historisk berättelse +historisk homan-deckare +historisk kriminalroman historisk roman +historisk romantisk skildring +historiskt-romantiska skildringar från snapphanefejden +homan-bok homandeckare +homan-deckare i 1700-talsmiljö +humoresker +humoristisk berättelse +indisk berättelse jack reacher-thriller +jakt- fiske- och bygdehistorier +kärlekshistoria +kärleksroman +kåserier +kent mortland-thriller komedi komedi i fyra akter +kommissarie gamache-deckare +konrad sejer-deckare krimi kriminalroman -kärlekshistoria -kärleksroman -kåserier +kriminalroman från sandhamn +kustroman +lagerlöfs homeros erland lagerlöfs klassiska översättning +läkarroman lustspel i en akt +människa ur det förgångna rannsakad och hörd om sina levnadsomständigheter roman +morden i sandhamn +nathalie svensson- och johan axberg-deckare nouvelles novela novell novelle noveller +novelletter +novellsamling +novell ur spelreglerna +nutidsroman +ny kriminalroman om hammarbypolisen +öfvers +originalberättelse +originalroman +pennritning pjäs +poem +poėma +poesi polisroman +politisk thriller povesti powieść -poėma +prosa +prosadikter +psykologisk thriller +rammakaren theodor marklunds egen redogörelse +religiöst poem reseguide resehandbok +revy om människan i tid och rum rikosromaani +robert krüger-deckare +roland hassel-thriller romaani romaani rikoksesta roman -roman om ett brott -roman om skivvärlden romanas romance +romanen om Elling +roman från 2000-talet +roman från värend 1650 +roman från värend på 1790-talet +roman i femtiotre tablåer +roman i två delar +roman om baltutlämningen +roman om det närvarande +roman om ett brott +roman om frihet +roman om skivvärlden +romans +romantisk berättelse romanzo +romaunt i tolv böcker rövarroman runoja saga +saga på vers med rim som barnen får hitta på alldeles själva +saga på vers med rim som barnen får hitta på alldeles själva bilderbok +saga på vers med rim som barnen får hitta på själva sagor +sällsam historia +samling berättelser +sång +sånger sann historia -skildringar +sannsaga +science fiction +serieroman +själavårdsbok +själs utvecklingshistoria +självbiografisk berättelse +sjöroman +skaldestycke skáldsaga +skälmroman +skärgårdsberättelse +sketch +skildring +skildringar +skildringar från attentatens och jubelfesternas tidehvarf +skildringar från attentatens och jubelfesternas tidevarv +skildringar ur artist- och författarelivet +skildringar ur artist- och författarlifvet +skildringar ur artist- och författarlivet +skildringar ur stockholmslifvet +skildring från franska revolutionen +skisser +skizz +skolflickshistoria +släkten lackland +släktsaga +småstadskrönika +soldatfamiljs historia +sommarbok för stora och små spänningsroman +spänningsroman med anna pigeon +spänningsroman med erik winter +spionroman +spökhistoria +sprakfåle-bok +standardupplaga +sten wall-deckare +stockholmsroman stories +svensk originalberättelse +svensk originalroman +svensk sjömans äventyr +svenskt original +svenskt original för stockholms-tidningen +sv orig +tavla ur livet +teckningar +teckningar med text +teckningar ur hvardagslifvet +teckningar ur vardagslivet +teckning ur det verkliga lifvet +teckning ur det verkliga livet thriller +tio böcker om hans jordiska liv omkring 520-450 +tom grip-thriller +tommy flisen-deckare +tvillingdetektiverna +tvillingdetektiverna på nya äventyr +ungdomsbok ungdomsroman -(Efterlämnade dikter.) +ur Dan Henrys minnen +ur en stockholms-detektivs minnen +ur Kalle Svenssons dagbok +ur numas arkiv +värmlandsberättelse +växelringning på ett gammalt tema i två korta anslag och två hela långringningar +vildmarksroman +virgin river-roman +wexforddeckare +will trent-deckare From c3b7a8ee72a59417ac7acf8ca2877d7723fd4240 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Wed, 15 Nov 2023 17:23:14 +0100 Subject: [PATCH 31/32] Require that PrimaryContribution has an agent in SVSK selection --- librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy index 9f2bc32b62..e6e45c8896 100644 --- a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy +++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy @@ -225,7 +225,7 @@ class Doc { boolean isMaybeAggregate() { hasPart() || classification().any { it.inScheme?.code =~ /[Kk]ssb/ && it.code?.contains('(s)') } - || !contribution().any { it['@type'] == 'PrimaryContribution' } + || !contribution().any { it['@type'] == 'PrimaryContribution' && it['agent'] } || hasRelationshipWithContribution() } From c86d9ee16d438f059bcf4449ad18bfe9ca8b5b50 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Thu, 16 Nov 2023 07:47:37 +0100 Subject: [PATCH 32/32] Add explanation + example on when to drop only part of subtitle --- librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy | 2 ++ 1 file changed, 2 insertions(+) diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy index f5a8b0ed3d..e67ba0dd66 100644 --- a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy +++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Util.groovy @@ -91,6 +91,8 @@ class Util { if (genericSubtitle(value)) { new DocumentUtil.Remove() } else { + // Remove substring after colon if identified as generic + // Example: "klanen Kennedy : roman" -> "klanen Kennedy" ((List) value.split(':')).with { if (it.size() > 1 && genericSubtitle(it.last().trim())) { new DocumentUtil.Replace(value.replaceFirst(~/\s*:.+$/, ''))