"""
Module for normalizing OCW FM export course keywords.
"""

import re

# Every character that can separate two keywords in the export: comma,
# semicolon, an already-present pipe, and any line-break character. Runs of
# separators (e.g. a blank line) simply produce empty pieces, which are
# dropped later.
_SEPARATORS = re.compile(r"[;,|\r\n]")

# str.title() treats an apostrophe as a word boundary and therefore
# capitalizes the letter that follows it ("Children'S", "Don'T"). This matches
# only the common contraction/possessive suffixes at the end of a word, so
# names such as "O'Brien" keep their capital letter.
_TITLED_SUFFIX = re.compile(r"(?<=[^\W\d_])(['\u2019])(S|T|D|M|Re|Ll|Ve)\b")


def _title_case(keyword):
    """Title-case a single keyword, keeping suffixes like 's and 't lowercase."""
    titled = keyword.title()
    return _TITLED_SUFFIX.sub(
        lambda match: match.group(1) + match.group(2).lower(), titled
    )


def normalize_keywords(keywords):
    """
    Normalize keywords from the OCW FM export file to a pipe-separated string.

    The OCW FM export file may contain keywords in various formats, including
    comma-separated, semicolon-separated, newline-separated, double-newline
    separated, or combinations of these.

    This function converts these formats into the standardized format used in
    OER: a pipe-separated string in which every word of every keyword is
    title-cased. Surrounding whitespace is trimmed and empty keywords are
    dropped.

    Note that title-casing lowercases the remaining letters of each word, so
    an all-caps acronym such as "MIT" becomes "Mit".

    Args:
        keywords: Raw keyword string from the export. ``None`` or an empty
            string is treated as "no keywords".

    Returns:
        The normalized, pipe-separated keyword string, or ``""`` when there
        are no keywords.

    Example:
        Input: "novel, short story; the city in literature\\nnarrative voice"
        Output: "Novel|Short Story|The City In Literature|Narrative Voice"
    """
    if not keywords:
        return ""

    pieces = (piece.strip() for piece in _SEPARATORS.split(keywords))
    return "|".join(_title_case(piece) for piece in pieces if piece)