importer: Update series importer to clean series/ser from title

CERNDocumentServer · Apr 4, 2024 · a7fd72e
1 parent 0a646f9
commit a7fd72e
Show file tree

Hide file tree

Showing 3 changed files with 22 additions and 3 deletions.
diff --git a/cds_ils/importer/series/importer.py b/cds_ils/importer/series/importer.py
@@ -7,6 +7,7 @@
 
 """CDS-ILS Series Importer."""
 
+import re
 import uuid
 from copy import deepcopy
 
@@ -57,6 +58,8 @@
     # all language fields already validated in the rules with pycountry
 }
 
+IGNORE_SUFFIXES = [" series", " ser", " Series", " Ser"]
+
 
 class SeriesImporter(object):
     """Series importer class."""
@@ -80,6 +83,10 @@ def _set_record_import_source(self, record_dict):
     def _before_create(self, json_series):
         """Perform before create metadata modification."""
         series = deepcopy(json_series)
+        # Strip a trailing `Series`/`Ser`; the rest of the title keeps its casing
+        for substring in IGNORE_SUFFIXES:
+            if re.search(substring + "$", series["title"]):
+                series["title"] = rreplace(series["title"], substring, "")
 
         if "volume" in series:
             del series["volume"]
@@ -107,11 +114,13 @@ def _update_field_identifiers(self, matched_series, json_series):
     def _normalize_title(title):
         """Return a normalized title."""
         t = " ".join(title.lower().split())
-        # remove `series` only at the end of the title
+        # remove `series` or `ser` at the end of the title
         # `International Series of Numerical Mathematics series`
+        # or `International   series of Numerical mathematics   ser`
         # will become
         # `international series of numerical mathematics`
-        t = rreplace(t, " series", "")
+        for substring in IGNORE_SUFFIXES:
+            t = rreplace(t, substring, "")
         return t.strip()
 
     def update_series(self, matched_series, json_series):
@@ -222,7 +231,7 @@ def _validate_matches(self, json_series, matches):
         def filter_non_serials(match):
             """Drops periodicals and multipart monographs."""
             _series = all_series[match]
-            # drop multipart monographs
+            # Drop multipart monographs and periodicals
             is_serial = _series["mode_of_issuance"] == "SERIAL"
             is_type_serial = _series.get("series_type") == "SERIAL"
             return is_serial and is_type_serial

diff --git a/tests/importer/data/match_testing_series.json b/tests/importer/data/match_testing_series.json
@@ -61,5 +61,9 @@
   {
     "title": "   the gulf:         The making of An American    Sea Series   ",
     "note": "Will match serid-imp-1 by normalized title"
+  },
+  {
+    "title": "the  gulf:      the making   of an american  sea   ser",
+    "note": "Will match serid-imp-1 by normalized title"
   }
 ]
diff --git a/tests/importer/test_series_matching.py b/tests/importer/test_series_matching.py
@@ -88,3 +88,9 @@ def match(json_series):
     validated_matches = match(json_series)
     assert len(validated_matches) == 1
     assert validated_matches[0] == "serid-imp-1"
+
+    # test `ser` suffix and different capitalization in title
+    json_series = series_to_import[12]
+    validated_matches = match(json_series)
+    assert len(validated_matches) == 1
+    assert validated_matches[0] == "serid-imp-1"