Skip to content

Commit

Permalink
importer: Update series importer to clean series/ser from title
Browse files Browse the repository at this point in the history
  • Loading branch information
sakshamarora1 authored and kpsherva committed Apr 4, 2024
1 parent 0a646f9 commit a7fd72e
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 3 deletions.
15 changes: 12 additions & 3 deletions cds_ils/importer/series/importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

"""CDS-ILS Series Importer."""

import re
import uuid
from copy import deepcopy

Expand Down Expand Up @@ -57,6 +58,8 @@
# all language fields already validated in the rules with pycountry
}

# Trailing markers that the importer strips from the end of series titles.
# Both lowercase and capitalized spellings are listed because
# `_before_create` matches them case-sensitively against the raw title.
IGNORE_SUFFIXES = [" series", " ser", " Series", " Ser"]


class SeriesImporter(object):
"""Series importer class."""
Expand All @@ -80,6 +83,10 @@ def _set_record_import_source(self, record_dict):
def _before_create(self, json_series):
"""Perform before create metadata modification."""
series = deepcopy(json_series)
# Remove `Series` or `Ser` from the end while preserving the capitalization
for substring in IGNORE_SUFFIXES:
if re.search(substring + "$", series["title"]):
series["title"] = rreplace(series["title"], substring, "")

if "volume" in series:
del series["volume"]
Expand Down Expand Up @@ -107,11 +114,13 @@ def _update_field_identifiers(self, matched_series, json_series):
def _normalize_title(title):
    """Return a normalized title used for matching series.

    The title is lowercased, runs of whitespace are collapsed to a single
    space, and at most one trailing ``series``/``ser`` marker (as listed in
    ``IGNORE_SUFFIXES``) is removed.

    `International Series of Numerical Mathematics series`
    or `International series of Numerical mathematics ser`
    both become
    `international series of numerical mathematics`

    :param title: the raw series title.
    :returns: the normalized title, without surrounding whitespace.
    """
    normalized = " ".join(title.lower().split())

    # The title is already lowercased, so compare against lowercased,
    # de-duplicated suffixes. Longest first keeps the result deterministic
    # should one configured suffix ever be the tail of another.
    suffixes = sorted(
        {suffix.lower() for suffix in IGNORE_SUFFIXES if suffix},
        key=len,
        reverse=True,
    )
    for suffix in suffixes:
        # Anchor the match at the very end of the title: a `series` word
        # in the middle of the title is part of the name and must be kept.
        if normalized.endswith(suffix):
            normalized = normalized[: -len(suffix)]
            break

    return normalized.strip()

def update_series(self, matched_series, json_series):
Expand Down Expand Up @@ -222,7 +231,7 @@ def _validate_matches(self, json_series, matches):
def filter_non_serials(match):
    """Tell whether the matched series is a serial.

    The match is looked up in ``all_series`` from the enclosing scope.
    Periodicals and multipart monographs are dropped: the record is kept
    only when both ``mode_of_issuance`` and ``series_type`` are ``SERIAL``.
    """
    candidate = all_series[match]
    # `series_type` is optional (read with `.get`), whereas
    # `mode_of_issuance` is accessed directly and is expected to be present.
    series_type = candidate.get("series_type")
    if candidate["mode_of_issuance"] != "SERIAL":
        return False
    return series_type == "SERIAL"
Expand Down
4 changes: 4 additions & 0 deletions tests/importer/data/match_testing_series.json
Original file line number Diff line number Diff line change
Expand Up @@ -61,5 +61,9 @@
{
"title": " the gulf: The making of An American Sea Series ",
"note": "Will match serid-imp-1 by normalized title"
},
{
"title": "the gulf: the making of an american sea ser",
"note": "Will match serid-imp-1 by normalized title"
}
]
6 changes: 6 additions & 0 deletions tests/importer/test_series_matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,3 +88,9 @@ def match(json_series):
validated_matches = match(json_series)
assert len(validated_matches) == 1
assert validated_matches[0] == "serid-imp-1"

# test `ser` suffix and different capitalization in title
json_series = series_to_import[12]
validated_matches = match(json_series)
assert len(validated_matches) == 1
assert validated_matches[0] == "serid-imp-1"

0 comments on commit a7fd72e

Please sign in to comment.