From 30737eb0f2c7da359bacddc54e1a91668b5f4014 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Tue, 14 Nov 2023 14:40:32 +0100 Subject: [PATCH 1/2] Keep only one Dewey code in classification, move rest to additionalClassificationDdc --- .../libris/mergeworks/WorkComparator.groovy | 2 + .../mergeworks/compare/Classification.groovy | 73 +++++++++++++++---- .../compare/ClassificationSpec.groovy | 22 ++++++ 3 files changed, 84 insertions(+), 13 deletions(-) diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/WorkComparator.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/WorkComparator.groovy index 0e6e064fed..121b83cb1b 100644 --- a/librisworks/src/main/groovy/se/kb/libris/mergeworks/WorkComparator.groovy +++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/WorkComparator.groovy @@ -68,6 +68,8 @@ class WorkComparator { } } + Classification.moveAdditionalDewey(result, docs) + return result } diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/Classification.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/Classification.groovy index 12496be8ba..7a5c26e795 100644 --- a/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/Classification.groovy +++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/Classification.groovy @@ -1,5 +1,9 @@ package se.kb.libris.mergeworks.compare +import se.kb.libris.mergeworks.Doc + +import static se.kb.libris.mergeworks.Util.asList + class Classification extends StuffSet { private static def sabPrecedenceRules = loadSabPrecedenceRules() @@ -34,9 +38,9 @@ class Classification extends StuffSet { return result } } else if (isDewey(c1) && isDewey(c2)) { - def code = code1.startsWith(code2.replace("/", "")) + def code = deweyPrecedes(code1, code2) ? code1 - : (code2.startsWith(code1.replace("/", "")) ? code2 : null) + : (deweyPrecedes(code2, code1) ? 
code2 : null) if (code) { Map result = [:] result.putAll(c1) @@ -49,18 +53,12 @@ class Classification extends StuffSet { } } - boolean isSab(Map c) { - c['inScheme'] && c['inScheme']['code'] =~ 'kssb' - } - - String maxSabVersion(c1, c2) { - def v1 = c1['inScheme']['version'] ?: "-1" - def v2 = c2['inScheme']['version'] ?: "-1" - Integer.parseInt(v1) > Integer.parseInt(v2) ? v1 : v2 + static boolean isDewey(Map c) { + c['@type'] == 'ClassificationDdc' } - boolean isDewey(Map c) { - c['@type'] == 'ClassificationDdc' + static boolean deweyPrecedes(String a, String b) { + a.startsWith(b.replace("/", "")) } String maxDeweyEdition(c1, c2) { @@ -69,10 +67,59 @@ class Classification extends StuffSet { deweyEdition(v1) > deweyEdition(v2) ? v1 : v2 } - int deweyEdition(String edition) { + static int deweyEdition(String edition) { Integer.parseInt((edition ?: "0").replaceAll("[^0-9]", "")) } + static void moveAdditionalDewey(Map mergedWork, Collection instanceDocs) { + def deweyOnMerged = asList(mergedWork['classification']).findAll { Map c -> isDewey(c) } + if (deweyOnMerged.size() > 1) { + def allDewey = instanceDocs.collect { it.classification() } + .flatten() + .findAll { Map c -> isDewey(c) } + + def preferredDewey = findPreferredDewey(deweyOnMerged, allDewey) + + deweyOnMerged.remove(preferredDewey) + + mergedWork['classification'] = asList(mergedWork['classification']) - deweyOnMerged + mergedWork['additionalClassificationDdc'] = (asList(mergedWork['additionalClassificationDdc']) + deweyOnMerged).unique() + } + } + + static Map findPreferredDewey(List deweyOnMerged, List allDewey) { + def occurrenceCount = deweyOnMerged.collectEntries { Map dom -> + def numOccurrences = allDewey.count { Map d -> + def code1 = d['code'] + def code2 = dom['code'] + code1 && code2 && (deweyPrecedes(code1, code2) || deweyPrecedes(code2, code1)) + } + [dom, numOccurrences] + } + + def maxOccurrences = occurrenceCount.max { it.value }.value + + def preferred = occurrenceCount.findResults { 
k, v -> v == maxOccurrences ? k : null } + .sort { a, b -> + def aEdition = a['editionEnumeration'] + def bEdition = b['editionEnumeration'] + deweyEdition(bEdition) <=> deweyEdition(aEdition) + ?: bEdition?.contains('swe') <=> aEdition?.contains('swe') + }.first() + + return preferred + } + + boolean isSab(Map c) { + c['inScheme'] && c['inScheme']['code'] =~ 'kssb' + } + + String maxSabVersion(c1, c2) { + def v1 = c1['inScheme']['version'] ?: "-1" + def v2 = c2['inScheme']['version'] ?: "-1" + Integer.parseInt(v1) > Integer.parseInt(v2) ? v1 : v2 + } + static String normalizeSabCode(String sab) { sab.replaceFirst(~/^h/, 'H').with { it =~ /bf:|z/ ? it : it.replaceAll(~/\s+/, '') diff --git a/librisworks/src/test/groovy/se/kb/libris/mergeworks/compare/ClassificationSpec.groovy b/librisworks/src/test/groovy/se/kb/libris/mergeworks/compare/ClassificationSpec.groovy index 8bf21f3cfe..9f5157f004 100644 --- a/librisworks/src/test/groovy/se/kb/libris/mergeworks/compare/ClassificationSpec.groovy +++ b/librisworks/src/test/groovy/se/kb/libris/mergeworks/compare/ClassificationSpec.groovy @@ -19,4 +19,26 @@ class ClassificationSpec extends Specification { 'Hda.017=c' || 'Hda.018' || 'Hda.017=c' 'He' || 'Hc' || null } + + def "find which of multiple Dewey codes to keep in classification"() { + given: + def onMerged = (0..<editionsOnMerged.size()).collect { i ->
+ [ + 'code' : "x" + i, + 'editionEnumeration': editionsOnMerged[i] + ] + } + def all = allCodes.collect { ['code': it] } + + expect: + Classification.findPreferredDewey(onMerged, all) == result + + where: + editionsOnMerged || allCodes || result + ['23/swe', '23/swe'] || ['x0', 'x0', 'x1', 'x1', 'x1'] || ['code': 'x1', 'editionEnumeration': '23/swe'] + [null, '23/swe'] || ['x0', 'x0', 'x1', 'x1', null] || ['code': 'x1', 'editionEnumeration': '23/swe'] + ['23', '22/swe'] || ['x0', 'x0', 'x1', 'x1'] || ['code': 'x0', 'editionEnumeration': '23'] + ['23/swe', '23'] || ['x0', 'x0', 'x1', 'x1'] || ['code': 'x0', 'editionEnumeration': '23/swe'] + ['22', '23/swe'] || ['x0', 'x0', 'x0', 'x1', 'x1'] || ['code': 'x0', 'editionEnumeration': '22'] + } } From ea7368e0fa7613e3c2e7b4acf555e0d1e317df35 Mon Sep 17 00:00:00 2001 From: kwahlin Date: Mon, 20 Nov 2023 14:04:31 +0100 Subject: [PATCH 2/2] Improve naming --- .../se/kb/libris/mergeworks/compare/Classification.groovy | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/Classification.groovy b/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/Classification.groovy index 7a5c26e795..b1765aff58 100644 --- a/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/Classification.groovy +++ b/librisworks/src/main/groovy/se/kb/libris/mergeworks/compare/Classification.groovy @@ -80,10 +80,10 @@ class Classification extends StuffSet { def preferredDewey = findPreferredDewey(deweyOnMerged, allDewey) - deweyOnMerged.remove(preferredDewey) + def additionalDewey = deweyOnMerged - preferredDewey - mergedWork['classification'] = asList(mergedWork['classification']) - deweyOnMerged - mergedWork['additionalClassificationDdc'] = (asList(mergedWork['additionalClassificationDdc']) + deweyOnMerged).unique() + mergedWork['classification'] = asList(mergedWork['classification']) - additionalDewey + mergedWork['additionalClassificationDdc'] = 
(asList(mergedWork['additionalClassificationDdc']) + additionalDewey).unique() } }