From 5a364c4a7bb309aebc29fab5d5b6b236811febe5 Mon Sep 17 00:00:00 2001
From: kwahlin <72360110+kwahlin@users.noreply.github.com>
Date: Thu, 9 Nov 2023 15:01:33 +0100
Subject: [PATCH] Add more fine-grained merge rules for SAB classification
 (#1322)

* Add more fine-grained merge rules for SAB classification

* Add rules file

* Fix incomplete code

* Hua --> Hua*

* Add test

* Clarify

* Correct comment

* Remove dependency added by mistake
---
 librisworks/build.gradle                      |   8 ++
 .../groovy/se/kb/libris/mergeworks/Doc.groovy |   2 +-
 .../mergeworks/compare/Classification.groovy  | 135 +++++++++++++++---
 .../merge-works/sab-precedence-rules.tsv      | 107 ++++++++++++++
 .../compare/ClassificationSpec.groovy         |  22 +++
 5 files changed, 256 insertions(+), 18 deletions(-)
 create mode 100644 librisworks/src/main/resources/merge-works/sab-precedence-rules.tsv
 create mode 100644 librisworks/src/test/groovy/se/kb/libris/mergeworks/compare/ClassificationSpec.groovy

diff --git a/librisworks/build.gradle b/librisworks/build.gradle
index 19e68e2b54..b01041315d 100644
--- a/librisworks/build.gradle
+++ b/librisworks/build.gradle
@@ -5,6 +5,9 @@ sourceSets {
     scripts {
         groovy { srcDir 'scripts' }
     }
+    test {
+        groovy { srcDir 'src/test/groovy/' }
+    }
 }
 
 repositories {
@@ -17,6 +20,11 @@ dependencies {
     compileOnly project(':whelk-core')
     scriptsCompileOnly sourceSets.main.output
     scriptsCompileOnly project(':whelk-core')
+    testImplementation "org.spockframework:spock-core:${spockVersion}"
+}
+
+test {
+    useJUnitPlatform()
 }
 
 jar {
diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy
index 44bafeebc5..8118d34a87 100644
--- a/librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy
+++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/Doc.groovy
@@ -260,7 +260,7 @@ class Doc {
     }
 
     boolean isSabFiction() {
-        classification().any { it.inScheme?.code =~ /[Kk]ssb/ && it.code =~ /^(H|uH|ufH|ugH)/ }
+        classification().any { it.inScheme?.code =~ /[Kk]ssb/ && it.code =~ /^(H|h|uH|ufH|ugH)/ }
     }
 
     boolean isNotFiction() {
diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/Classification.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/Classification.groovy
index 0e8d2e0050..12496be8ba 100644
--- a/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/Classification.groovy
+++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/Classification.groovy
@@ -1,13 +1,7 @@
 package se.kb.libris.mergeworks.compare
 
 class Classification extends StuffSet {
-    // Terms that will be merged (values precede keys)
-    private static def norm = [
-            'uHc'                                                        : ['Hc,u'],
-            'uHce'                                                       : ['Hce,u'],
-            'Hc'                                                         : ['Hc.01', 'Hc.02', 'Hc.03'],
-            'Hc,u'                                                       : ['Hcf', 'Hcg']
-    ]
+    private static def sabPrecedenceRules = loadSabPrecedenceRules()
 
     @Override
     Object merge(Object a, Object b) {
@@ -17,17 +11,17 @@ class Classification extends StuffSet {
             if (!code1 || !code2) {
                 return
             }
-            code1 = code1.replaceAll(/\s+/, "")
-            code2 = code2.replaceAll(/\s+/, "")
 
             if (isSab(c1) && isSab(c2)) {
-                def code = code1 == code2 || n(code2, code1)
-                        ? code1
-                        : (n(code1, code2) ? code2 : null)
-                if (code) {
+                code1 = normalizeSabCode(code1)
+                code2 = normalizeSabCode(code2)
+
+                def mergedCode = tryMergeSabCodes(code1, code2)
+
+                if (mergedCode) {
                     def result = [
                             '@type' : 'Classification',
-                            'code'  : code1,
+                            'code'  : mergedCode,
                             inScheme: [
                                     '@type': 'ConceptScheme',
                                     'code' : 'kssb'
@@ -56,7 +50,7 @@ class Classification extends StuffSet {
     }
 
     boolean isSab(Map c) {
-        c['inScheme'] && c['inScheme']['code'] == 'kssb'
+        c['inScheme'] && c['inScheme']['code'] =~ 'kssb'
     }
 
     String maxSabVersion(c1, c2) {
@@ -79,7 +73,114 @@ class Classification extends StuffSet {
         Integer.parseInt((edition ?: "0").replaceAll("[^0-9]", ""))
     }
 
-    boolean n(a, b) {
-        norm[a]?.any { it == b || n(it, b) }
+    static String normalizeSabCode(String sab) {
+        sab.replaceFirst(~/^h/, 'H').with {
+            it =~ /bf:|z/ ? it : it.replaceAll(~/\s+/, '')
+        }
+    }
+
+    static String tryMergeSabCodes(String a, String b) {
+        if (a == b) {
+            return a
+        }
+        if (sabPrecedes(a, b)) {
+            return a
+        }
+        if (sabPrecedes(b, a)) {
+            return b
+        }
+        return null
+    }
+
+    static sabPrecedes(String a, String b) {
+        def (equal, startsWith) = sabPrecedenceRules
+        // Codes starting with Hcb or Hdab should never overwrite another code
+        def overwriteExceptions = ~/^Hcb|^Hdab/
+        def preferred = equal[b] ?: startsWith.find { b.startsWith(it.key) }?.value
+        if (preferred && !(a =~ overwriteExceptions)) {
+            if (preferred['equals'] && a in preferred['equals']) {
+                return true
+            }
+            if (preferred['startsWith'] && preferred['startsWith'].any { a.startsWith(it) }) {
+                return true
+            }
+        }
+        return false
+    }
+
+    /**
+     * Loads rules for how to merge SAB codes from file.
+     * The code in the first column is preferred over the other codes in the same row.
+     * The codes can contain wildcard characters '?' (anywhere in the string) or '*' (at the end)
+     * The asterisk represents any sequence of characters (zero or more)
+     * The question mark represents zero or one of the characters '6', '7' and '8'.
+     * Examples:
+     * Hcd* | Hcbd*
+     *  --> Any code starting with Hcd is picked over any code starting with Hcbd
+     * Hda.01?=c | Hda.01? | Hda=c
+     *  --> Hda.01=c, Hda.016=c, Hda.017=c and Hda.018=c are all picked over Hda.01, Hda.016, Hda.017, Hda.018 and Hda=c
+     * Hcee.03 | Hce.03 | Hcee
+     *  --> Hcee.03 is picked over Hce.03 and Hcee
+     *
+     * The rules are loaded into two different maps, 'equal' and 'startsWith'.
+     * The top-level keys of these maps are the codes that can possibly be overwritten.
+     *
+     * In the 'equal' map we can directly look up a code (key) to see if there are preferred codes that should overwrite it,
+     * while in the 'startsWith' map we check if the code starts with any of the keys. For example if the code is 'Hce'
+     * and we have startsWith = ['He': [:], 'Hm': [:], 'Hc': [:]] we iterate over the entries until 'Hc' is found.
+     *
+     * The value is in turn also a Map containing the codes that are preferred over the code matching the key.
+     * The map at this second level can have two keys, 'equals' and 'startsWith', and the values are sets of preferred codes.
+     *
+     * Example:
+     * [
+     *  'Hc.01': ['equals': ['Hc.01', 'Hc.016', 'Hc.017', 'Hc.018', 'Hcd.01', 'Hcd.016', 'Hcd.017', 'Hcd.018']],
+     *  'Hce': ['startsWith': ['Hce']]
+     * ]
+     *
+     * This means that any code starting with 'Hce' is preferred over just 'Hce' and any of 'Hc.01', 'Hc.016', 'Hc.017'...
+     * is preferred over just 'Hc.01'.
+     */
+    static Tuple2<Map<String, Map>, Map<String, Map>> loadSabPrecedenceRules() {
+        Map equal = [:]
+        Map startsWith = [:]
+
+        def questionMarkSubstitutes = ['6', '7', '8', '']
+
+        Classification.class.getClassLoader()
+                .getResourceAsStream('merge-works/sab-precedence-rules.tsv')
+                .splitEachLine('\t') {
+                    def preferred = it.first()
+                    def preferredStartsWith = preferred.endsWith('*') ? preferred[0..<-1] : null
+                    def preferredEquals = preferred.contains('?')
+                            ? questionMarkSubstitutes.collect { preferred.replace('?', it) }
+                            : (preferredStartsWith ? null : [preferred])
+
+                    def addPreferred = { Map pref ->
+                        if (preferredStartsWith) {
+                            pref.computeIfAbsent('startsWith', f -> [] as Set).add(preferredStartsWith)
+                        }
+                        if (preferredEquals) {
+                            pref.computeIfAbsent('equals', f -> [] as Set).addAll(preferredEquals)
+                        }
+                    }
+
+                    def overwrite = it.drop(1)
+                    overwrite.each { s ->
+                        if (s.endsWith('*')) {
+                            def leading = s[0..<-1]
+                            startsWith.computeIfAbsent(leading, f -> [:]).with(addPreferred)
+                        } else if (s.contains('?')) {
+                            questionMarkSubstitutes.each {
+                                def substituted = s.replace('?', it)
+                                equal.computeIfAbsent(substituted, f -> [:]).with(addPreferred)
+                            }
+                        } else {
+                            equal.computeIfAbsent(s, f -> [:]).with(addPreferred)
+                        }
+                    }
+                }
+
+        return new Tuple2(equal, startsWith)
     }
 }
\ No newline at end of file
diff --git a/librisworks/src/main/resources/merge-works/sab-precedence-rules.tsv b/librisworks/src/main/resources/merge-works/sab-precedence-rules.tsv
new file mode 100644
index 0000000000..1b24ddc46a
--- /dev/null
+++ b/librisworks/src/main/resources/merge-works/sab-precedence-rules.tsv
@@ -0,0 +1,107 @@
+H*	H				
+Hc*	Hc				
+Hcd*	Hcbd*				
+Hcq*	Hcbq*				
+Hc,u	H,u	uHc	uH		
+Hce*	Hce				
+Hce,u	H,u	Hc,u	uHce		
+Hcf*	Hc,u	uHc	Hcf		
+Hcg*	Hc,u	uHc	Hcg		
+Hci,u	Hci				
+Hd*	Hd				
+Hda*	Hda				
+Hdb*	Hdb				
+He*	He				
+Hf*	Hf				
+Hg*	Hg				
+Hi*	Hi				
+Hj*	Hj				
+Hk*	Hk				
+Hl*	Hl				
+Hm*	Hm				
+Hma*	Hma				
+Hmb*	Hmb				
+Hmc*	Hmc				
+Hmd*	Hmd				
+Hsg*	Hsg				
+Hub*	Hua*	Hub
+Hva*	Hva				
+Hxj*	Hxj				
+Hc.01?	Hc.01				
+Hcd.01?	Hcd.01	Hc.01	Hcd		
+Hcd.03	Hc.03	Hcd			
+Hce.01?	Hce.01				
+Hceda.01?	Hceda.01	Hce.01	Hced	Hceda	
+Hceda.03	Hce.03	Hced	Hceda		
+Hcedb.01?	Hcedb.01	Hce.01?	Hced	Hcedb	
+Hcedb.03	Hce.03	Hced	Hcedb		
+Hcee.01?	Hcee.01	Hce.01?	Hcee		
+Hcee.03	Hce.03	Hcee			
+Hceeq.01?	Hceeq.01	Hcee.01?	Hce.01?	Hcee	Hceeq
+Hceeq.03	Hcee.03	Hce.03	Hcee	Hceeq	
+Hcef.01?	Hcef.01	Hce.01?	Hcef		
+Hcef.03	Hce.03	Hcef			
+Hceg.01?	Hceg.01	Hce.01?	Hceg		
+Hceg.03	Hce.03	Hceg			
+Hcei.01?	Hcei.01	Hce.01?	Hcei		
+Hcei.03	Hce.03	Hcei			
+Hcej.01?	Hcej.01	Hce.01?	Hcej		
+Hcej.03	Hce.03	Hcej			
+Hcek.01?	Hcek.01	Hce.01?	Hcek		
+Hcek.03	Hce.03	Hcek			
+Hcekq.01?	Hcekq.01	Hcek.01?	Hce.01?	Hcek	Hcekq
+Hcekq.03	Hcek.03	Hce.03	Hcek	Hcekq	
+Hcel.01?	Hcel.01	Hce.01?	Hcel		
+Hcel.03	Hce.03	Hcel			
+Hcema.01?	Hcema.01	Hce.01?	Hcem	Hcema	
+Hcema.03	Hce.03	Hcem	Hcema		
+Hcemb.01?	Hcemb.01	Hce.01?	Hcem	Hcemb	
+Hcemb.03	Hce.03	Hcem	Hcemb		
+Hcemc.01?	Hcemc.01	Hce.01?	Hcem	Hcemc	
+Hcemc.03	Hce.03	Hcem	Hcemc
+Hcemd.01?	Hcemd.01	Hce.01?	Hcem	Hcemd	
+Hcemd.03	Hce.03	Hcem	Hcemd		
+Hcesg.01?	Hcesg.01	Hce.01?	Hcesg		
+Hcesg.03	Hce.03	Hces	Hcesg		
+Hceub.01?	Hceub.01	Hce.01?	Hceub		
+Hceub.03	Hce.03	Hceu	Hceub		
+Hceva.01?	Hceva.01	Hce.01?	Hceva		
+Hceva.03	Hce.03	Hcev	Hceva		
+Hcexj.01?	Hcexj.01	Hce.01?	Hcexj		
+Hcexj.03	Hce.03	Hcex	Hcexj		
+Hda.01?=c	Hda.01?	Hda=c			
+Hda.03=c	Hda.03	Hda=c			
+Hdb.01?=c	Hdb.01?	Hdb=c			
+Hdb.03=c	Hdb.03	Hdb=c			
+He.01?=c	He.01?	He=c
+He.03=c	He.03	He=c			
+Heq.01?=c	Heq.01=c	Heq.01?	Heq=c	He.01?	He=c	He.01?=c
+Heq.03=c	Heq.03	Heq=c	He.03*	He=c	
+Hf.01?=c	Hf.01?	Hf=c			
+Hf.03=c	Hf.03	Hf=c			
+Hi.01?=c	Hi.01?	Hi=c			
+Hi.03=c	Hi.03	Hi=c			
+Hj.01?=c	Hj.01?	Hj=c			
+Hj.03=c	Hj.03	Hj=c			
+Hk.01?=c	Hk.01?	Hk=c			
+Hk.03=c	Hk.03	Hk=c			
+Hkq.01?=c	Hkq.01=c	Hkq.01?	Hkq=c	Hk.01?	Hk=c	Hk.01?=c
+Hkq.03=c	Hkq.03	Hkq=c	Hk.03*	Hk=c	
+Hl.01?=c	Hl.01?	Hl=c
+Hl.03=c	Hl.03	Hl=c			
+Hma.01?=c	Hma.01?	Hma=c			
+Hma.03=c	Hma.03	Hma=c			
+Hmb.01?=c	Hmb.01?	Hmb=c			
+Hmb.03=c	Hmb.03	Hmb=c			
+Hmc.01?=c	Hmc.01?	Hmc=c			
+Hmc.03=c	Hmc.03	Hmc=c			
+Hmd.01?=c	Hmd.01?	Hmd=c			
+Hmd.03=c	Hmd.03	Hmd=c			
+Hsg.01?=c	Hsg.01?	Hsg=c			
+Hsg.03=c	Hsg.03	Hsg=c			
+Hub.01?=c	Hub.01?	Hub=c			
+Hub.03=c	Hub.03	Hub=c			
+Hva.01?=c	Hva.01?	Hva=c			
+Hva.03=c	Hva.03	Hva=c			
+Hxj.01?=c	Hxj.01?	Hxj=c			
+Hxj.03=c	Hxj.03	Hxj=c			
diff --git a/librisworks/src/test/groovy/se/kb/libris/mergeworks/compare/ClassificationSpec.groovy b/librisworks/src/test/groovy/se/kb/libris/mergeworks/compare/ClassificationSpec.groovy
new file mode 100644
index 0000000000..8bf21f3cfe
--- /dev/null
+++ b/librisworks/src/test/groovy/se/kb/libris/mergeworks/compare/ClassificationSpec.groovy
@@ -0,0 +1,22 @@
+package se.kb.libris.mergeworks.compare
+
+import spock.lang.Specification
+
+class ClassificationSpec extends Specification {
+    def "merge SAB codes"() {
+        expect:
+        Classification.tryMergeSabCodes(a, b) == result
+
+        where:
+        a           || b         || result
+        'H'         || 'H'       || 'H'
+        'Haaa'      || 'H'       || 'Haaa'
+        'Hcqaa'     || 'Hcbqbbb' || 'Hcqaa'
+        'Hcb'       || 'Hc'      || null
+        'Hci'       || 'Hci,u'   || 'Hci,u'
+        'Hcd.016'   || 'Hcd.01'  || 'Hcd.016'
+        'Hc.01'     || 'Hcd.01'  || 'Hcd.01'
+        'Hda.017=c' || 'Hda.018' || 'Hda.017=c'
+        'He'        || 'Hc'      || null
+    }
+}