Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

importer: Update series importer to clean series/ser from title #860

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions cds_ils/importer/series/importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

"""CDS-ILS Series Importer."""

import re
import uuid
from copy import deepcopy

Expand Down Expand Up @@ -57,6 +58,8 @@
# all language fields already validated in the rules with pycountry
}

# Trailing title fragments that the series importer strips from a title:
# `_before_create` removes one when the title ends with it, and
# `_normalize_title` removes them from the already lowercased title.
# Each entry starts with a space; only the all-lowercase and capitalized
# spellings are listed.
IGNORE_SUFFIXES = [" series", " ser", " Series", " Ser"]


class SeriesImporter(object):
"""Series importer class."""
Expand All @@ -80,6 +83,10 @@ def _set_record_import_source(self, record_dict):
def _before_create(self, json_series):
"""Perform before create metadata modification."""
series = deepcopy(json_series)
# Remove `Series` or `Ser` from the end while preserving the capitalization
for substring in IGNORE_SUFFIXES:
if re.search(substring + "$", series["title"]):
series["title"] = rreplace(series["title"], substring, "")

if "volume" in series:
del series["volume"]
Expand Down Expand Up @@ -107,11 +114,13 @@ def _update_field_identifiers(self, matched_series, json_series):
def _normalize_title(title):
    """Return a normalized title.

    The title is lowercased, runs of whitespace are collapsed to a single
    space, and a trailing `series` or `ser` word is dropped, so that
    `International Series of Numerical Mathematics series`
    or `International series of Numerical mathematics ser`
    both become
    `international series of numerical mathematics`.

    :param title: the raw series title.
    :returns: the normalized title.
    """
    t = " ".join(title.lower().split())
    # Only strip a suffix that really sits at the end of the title, so an
    # interior ` series` / ` ser` (as in the example above) is left intact.
    # `t` is lowercase here, so only the lowercase entries can match.
    for suffix in IGNORE_SUFFIXES:
        if t.endswith(suffix):
            t = t[: -len(suffix)]
    return t.strip()

def update_series(self, matched_series, json_series):
Expand Down Expand Up @@ -222,7 +231,7 @@ def _validate_matches(self, json_series, matches):
def filter_non_serials(match):
    """Tell whether a match is a serial.

    Used to drop multipart monographs and periodicals: a match is kept
    only when both its `mode_of_issuance` and its `series_type` are
    `SERIAL`.
    """
    candidate = all_series[match]
    issued_as_serial = candidate["mode_of_issuance"] == "SERIAL"
    typed_as_serial = candidate.get("series_type") == "SERIAL"
    if not issued_as_serial:
        return False
    return typed_as_serial
Expand Down
4 changes: 4 additions & 0 deletions tests/importer/data/match_testing_series.json
Original file line number Diff line number Diff line change
Expand Up @@ -61,5 +61,9 @@
{
"title": " the gulf: The making of An American Sea Series ",
"note": "Will match serid-imp-1 by normalized title"
},
{
"title": "the gulf: the making of an american sea ser",
"note": "Will match serid-imp-1 by normalized title"
}
]
6 changes: 6 additions & 0 deletions tests/importer/test_series_matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,3 +88,9 @@ def match(json_series):
validated_matches = match(json_series)
assert len(validated_matches) == 1
assert validated_matches[0] == "serid-imp-1"

# test `ser` suffix and different capitalization in title
json_series = series_to_import[12]
validated_matches = match(json_series)
assert len(validated_matches) == 1
assert validated_matches[0] == "serid-imp-1"
Loading