From 5a364c4a7bb309aebc29fab5d5b6b236811febe5 Mon Sep 17 00:00:00 2001 From: kwahlin <72360110+kwahlin@users.noreply.github.com> Date: Thu, 9 Nov 2023 15:01:33 +0100 Subject: [PATCH] Add more fine-grained merge rules for SAB classification (#1322) * Add more fine-grained merge rules for SAB classification * Add rules file * Fix incomplete code * Hua --> Hua* * Add test * Clarify * Correct comment * Remove dependency added by mistake --- librisworks/build.gradle | 8 ++ .../groovy/se/kb/libris/mergeworks/Doc.groovy | 2 +- .../mergeworks/compare/Classification.groovy | 135 +++++++++++++++--- .../merge-works/sab-precedence-rules.tsv | 107 ++++++++++++++ .../compare/ClassificationSpec.groovy | 22 +++ 5 files changed, 256 insertions(+), 18 deletions(-) create mode 100644 librisworks/src/main/resources/merge-works/sab-precedence-rules.tsv create mode 100644 librisworks/src/test/groovy/se/kb/libris/mergeworks/compare/ClassificationSpec.groovy diff --git a/librisworks/build.gradle b/librisworks/build.gradle index 19e68e2b54..b01041315d 100644 --- a/librisworks/build.gradle +++ b/librisworks/build.gradle @@ -5,6 +5,9 @@ sourceSets { scripts { groovy { srcDir 'scripts' } } + test { + groovy { srcDir 'src/test/groovy/' } + } } repositories { @@ -17,6 +20,11 @@ dependencies { compileOnly project(':whelk-core') scriptsCompileOnly sourceSets.main.output scriptsCompileOnly project(':whelk-core') + testImplementation "org.spockframework:spock-core:${spockVersion}" +} + +test { + useJUnitPlatform() } jar { diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy index 44bafeebc5..8118d34a87 100644 --- a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy +++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy @@ -260,7 +260,7 @@ class Doc { } boolean isSabFiction() { - classification().any { it.inScheme?.code =~ /[Kk]ssb/ && it.code =~ /^(H|uH|ufH|ugH)/ } + 
classification().any { it.inScheme?.code =~ /[Kk]ssb/ && it.code =~ /^(H|h|uH|ufH|ugH)/ } } boolean isNotFiction() { diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/Classification.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/Classification.groovy index 0e8d2e0050..12496be8ba 100644 --- a/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/Classification.groovy +++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/Classification.groovy @@ -1,13 +1,7 @@ package se.kb.libris.mergeworks.compare class Classification extends StuffSet { - // Terms that will be merged (values precede keys) - private static def norm = [ - 'uHc' : ['Hc,u'], - 'uHce' : ['Hce,u'], - 'Hc' : ['Hc.01', 'Hc.02', 'Hc.03'], - 'Hc,u' : ['Hcf', 'Hcg'] - ] + private static def sabPrecedenceRules = loadSabPrecedenceRules() @Override Object merge(Object a, Object b) { @@ -17,17 +11,17 @@ class Classification extends StuffSet { if (!code1 || !code2) { return } - code1 = code1.replaceAll(/\s+/, "") - code2 = code2.replaceAll(/\s+/, "") if (isSab(c1) && isSab(c2)) { - def code = code1 == code2 || n(code2, code1) - ? code1 - : (n(code1, code2) ? code2 : null) - if (code) { + code1 = normalizeSabCode(code1) + code2 = normalizeSabCode(code2) + + def mergedCode = tryMergeSabCodes(code1, code2) + + if (mergedCode) { def result = [ '@type' : 'Classification', - 'code' : code1, + 'code' : mergedCode, inScheme: [ '@type': 'ConceptScheme', 'code' : 'kssb' @@ -56,7 +50,7 @@ class Classification extends StuffSet { } boolean isSab(Map c) { - c['inScheme'] && c['inScheme']['code'] == 'kssb' + c['inScheme'] && c['inScheme']['code'] =~ 'kssb' } String maxSabVersion(c1, c2) { @@ -79,7 +73,114 @@ class Classification extends StuffSet { Integer.parseInt((edition ?: "0").replaceAll("[^0-9]", "")) } - boolean n(a, b) { - norm[a]?.any { it == b || n(it, b) } + static String normalizeSabCode(String sab) { + sab.replaceFirst(~/^h/, 'H').with { + it =~ /bf:|z/ ? 
it : it.replaceAll(~/\s+/, '') + } + } + + static String tryMergeSabCodes(String a, String b) { + if (a == b) { + return a + } + if (sabPrecedes(a, b)) { + return a + } + if (sabPrecedes(b, a)) { + return b + } + return null + } + + static sabPrecedes(String a, String b) { + def (equal, startsWith) = sabPrecedenceRules + // Codes starting with Hcb or Hdab should never overwrite another code + def overwriteExceptions = ~/^Hcb|^Hdab/ + def preferred = equal[b] ?: startsWith.find { b.startsWith(it.key) }?.value + if (preferred && !(a =~ overwriteExceptions)) { + if (preferred['equals'] && a in preferred['equals']) { + return true + } + if (preferred['startsWith'] && preferred['startsWith'].any { a.startsWith(it) }) { + return true + } + } + return false + } + + /** + * Loads rules for how to merge SAB codes from file. + * The code in the first column is preferred over the other codes in the same row. + * The codes can contain wildcard characters '?' (anywhere in the string) or '*' (at the end) + * The asterisk represents any sequence of characters (zero or more) + * The question mark represents zero or one of the characters '6', '7' and '8'. + * Examples: + * Hcd* | Hcbd* + * --> Any code starting with Hcd is picked over any code starting with Hcbd + * Hda.01?=c | Hda.01? | Hda=c + * --> Hda.01=c, Hda.016=c, Hda.017=c and Hda.018=c are all picked over Hda.01, Hda.016, Hda.017, Hda.018 and Hda=c + * Hcee.03 | Hce.03 | Hcee + * --> Hcee.03 is picked over Hce.03 and Hcee + * + * The rules are loaded into two different maps, 'equal' and 'startsWith'. + * The top-level keys of these maps are the codes that can possibly be overwritten. + * + * In the 'equal' map we can directly look up a code (key) to see if there are preferred codes that should overwrite it, + * while in the 'startsWith' map we check if the code starts with any of the keys. 
For example, if the code is 'Hce' + * and we have startsWith = ['He': [:], 'Hm': [:], 'Hc': [:]] we iterate over the entries until 'Hc' is found. + * + * The value is in turn also a Map containing the codes that are preferred over the code matching the key. + * The map at this second level can have two keys, 'equals' and 'startsWith', and the values are sets of preferred codes. + * + * Example: + * [ + * 'Hc.01': ['equals': ['Hc.01', 'Hc.016', 'Hc.017', 'Hc.018', 'Hcd.01', 'Hcd.016', 'Hcd.017', 'Hcd.018']], + * 'Hce': ['startsWith': ['Hce']] + * ] + * + * This means that any code starting with 'Hce' is preferred over just 'Hce' and any of 'Hc.01', 'Hc.016', 'Hc.017'... + * is preferred over just 'Hc.01'. + */ + static Tuple2, Map> loadSabPrecedenceRules() { + Map equal = [:] + Map startsWith = [:] + + def questionMarkSubstitutes = ['6', '7', '8', ''] + + Classification.class.getClassLoader() + .getResourceAsStream('merge-works/sab-precedence-rules.tsv') + .splitEachLine('\t') { + def preferred = it.first() + def preferredStartsWith = preferred.endsWith('*') ? preferred[0..<-1] : null + def preferredEquals = preferred.contains('?') + ? questionMarkSubstitutes.collect { preferred.replace('?', it) } + : (preferredStartsWith ? 
null : [preferred]) + + def addPreferred = { Map pref -> + if (preferredStartsWith) { + pref.computeIfAbsent('startsWith', f -> [] as Set).add(preferredStartsWith) + } + if (preferredEquals) { + pref.computeIfAbsent('equals', f -> [] as Set).addAll(preferredEquals) + } + } + + def overwrite = it.drop(1) + overwrite.each { s -> + if (s.endsWith('*')) { + def leading = s[0..<-1] + startsWith.computeIfAbsent(leading, f -> [:]).with(addPreferred) + } else if (s.contains('?')) { + questionMarkSubstitutes.each { + def substituted = s.replace('?', it) + equal.computeIfAbsent(substituted, f -> [:]).with(addPreferred) + } + } else { + equal.computeIfAbsent(s, f -> [:]).with(addPreferred) + } + } + } + + return new Tuple2(equal, startsWith) } } \ No newline at end of file diff --git a/librisworks/src/main/resources/merge-works/sab-precedence-rules.tsv b/librisworks/src/main/resources/merge-works/sab-precedence-rules.tsv new file mode 100644 index 0000000000..1b24ddc46a --- /dev/null +++ b/librisworks/src/main/resources/merge-works/sab-precedence-rules.tsv @@ -0,0 +1,107 @@ +H* H +Hc* Hc +Hcd* Hcbd* +Hcq* Hcbq* +Hc,u H,u uHc uH +Hce* Hce +Hce,u H,u Hc,u uHce +Hcf* Hc,u uHc Hcf +Hcg* Hc,u uHc Hcg +Hci,u Hci +Hd* Hd +Hda* Hda +Hdb* Hdb +He* He +Hf* Hf +Hg* Hg +Hi* Hi +Hj* Hj +Hk* Hk +Hl* Hl +Hm* Hm +Hma* Hma +Hmb* Hmb +Hmc* Hmc +Hmd* Hmd +Hsg* Hsg +Hub* Hua* Hub +Hva* Hva +Hxj* Hxj +Hc.01? Hc.01 +Hcd.01? Hcd.01 Hc.01 Hcd +Hcd.03 Hc.03 Hcd +Hce.01? Hce.01 +Hceda.01? Hceda.01 Hce.01 Hced Hceda +Hceda.03 Hce.03 Hced Hceda +Hcedb.01? Hcedb.01 Hce.01? Hced Hcedb +Hcedb.03 Hce.03 Hced Hcedb +Hcee.01? Hcee.01 Hce.01? Hcee +Hcee.03 Hce.03 Hcee +Hceeq.01? Hceeq.01 Hcee.01? Hce.01? Hcee Hceeq +Hceeq.03 Hcee.03 Hce.03 Hcee Hceeq +Hcef.01? Hcef.01 Hce.01? Hcef +Hcef.03 Hce.03 Hcef +Hceg.01? Hceg.01 Hce.01? Hceg +Hceg.03 Hce.03 Hceg +Hcei.01? Hcei.01 Hce.01? Hcei +Hcei.03 Hce.03 Hcei +Hcej.01? Hcej.01 Hce.01? Hcej +Hcej.03 Hce.03 Hcej +Hcek.01? Hcek.01 Hce.01? 
Hcek +Hcek.03 Hce.03 Hcek +Hcekq.01? Hcekq.01 Hcek.01? Hce.01? Hcek Hcekq +Hcekq.03 Hcek.03 Hce.03 Hcek Hcekq +Hcel.01? Hcel.01 Hce.01? Hcel +Hcel.03 Hce.03 Hcel +Hcema.01? Hcema.01 Hce.01? Hcem Hcema +Hcema.03 Hce.03 Hcem Hcema +Hcemb.01? Hcemb.01 Hce.01? Hcem Hcemb +Hcemb.03 Hce.03 Hcem Hcemb +Hcemc.01? Hcemc.01 Hce.01? Hcem Hcemc +Hcemc.03 Hce.03 Hcem Hcemc +Hcemd.01? Hcemd.01 Hce.01? Hcem Hcemd +Hcemd.03 Hce.03 Hcem Hcemd +Hcesg.01? Hcesg.01 Hce.01? Hcesg +Hcesg.03 Hce.03 Hces Hcesg +Hceub.01? Hceub.01 Hce.01? Hceub +Hceub.03 Hce.03 Hceu Hceub +Hceva.01? Hceva.01 Hce.01? Hceva +Hceva.03 Hce.03 Hcev Hceva +Hcexj.01? Hcexj.01 Hce.01? Hcexj +Hcexj.03 Hce.03 Hcex Hcexj +Hda.01?=c Hda.01? Hda=c +Hda.03=c Hda.03 Hda=c +Hdb.01?=c Hdb.01? Hdb=c +Hdb.03=c Hdb.03 Hdb=c +He.01?=c He.01? He=c +He.03=c He.03 He=c +Heq.01?=c Heq.01=c Heq.01? Heq=c He.01? He=c He.01?=c +Heq.03=c Heq.03 Heq=c He.03* He=c +Hf.01?=c Hf.01? Hf=c +Hf.03=c Hf.03 Hf=c +Hi.01?=c Hi.01? Hi=c +Hi.03=c Hi.03 Hi=c +Hj.01?=c Hj.01? Hj=c +Hj.03=c Hj.03 Hj=c +Hk.01?=c Hk.01? Hk=c +Hk.03=c Hk.03 Hk=c +Hkq.01?=c Hkq.01=c Hkq.01? Hkq=c Hk.01? Hk=c Hk.01?=c +Hkq.03=c Hkq.03 Hkq=c Hk.03* Hk=c +Hl.01?=c Hl.01? Hl=c +Hl.03=c Hl.03 Hl=c +Hma.01?=c Hma.01? Hma=c +Hma.03=c Hma.03 Hma=c +Hmb.01?=c Hmb.01? Hmb=c +Hmb.03=c Hmb.03 Hmb=c +Hmc.01?=c Hmc.01? Hmc=c +Hmc.03=c Hmc.03 Hmc=c +Hmd.01?=c Hmd.01? Hmd=c +Hmd.03=c Hmd.03 Hmd=c +Hsg.01?=c Hsg.01? Hsg=c +Hsg.03=c Hsg.03 Hsg=c +Hub.01?=c Hub.01? Hub=c +Hub.03=c Hub.03 Hub=c +Hva.01?=c Hva.01? Hva=c +Hva.03=c Hva.03 Hva=c +Hxj.01?=c Hxj.01? 
Hxj=c +Hxj.03=c Hxj.03 Hxj=c diff --git a/librisworks/src/test/groovy/se/kb/libris/mergeworks/compare/ClassificationSpec.groovy b/librisworks/src/test/groovy/se/kb/libris/mergeworks/compare/ClassificationSpec.groovy new file mode 100644 index 0000000000..8bf21f3cfe --- /dev/null +++ b/librisworks/src/test/groovy/se/kb/libris/mergeworks/compare/ClassificationSpec.groovy @@ -0,0 +1,22 @@ +package se.kb.libris.mergeworks.compare + +import spock.lang.Specification + +class ClassificationSpec extends Specification { + def "merge SAB codes"() { + expect: + Classification.tryMergeSabCodes(a, b) == result + + where: + a || b || result + 'H' || 'H' || 'H' + 'Haaa' || 'H' || 'Haaa' + 'Hcqaa' || 'Hcbqbbb' || 'Hcqaa' + 'Hcb' || 'Hc' || null + 'Hci' || 'Hci,u' || 'Hci,u' + 'Hcd.016' || 'Hcd.01' || 'Hcd.016' + 'Hc.01' || 'Hcd.01' || 'Hcd.01' + 'Hda.017=c' || 'Hda.018' || 'Hda.017=c' + 'He' || 'Hc' || null + } +}