From c2b99356a6b933e7a51f6a5f291a492d4fa0855d Mon Sep 17 00:00:00 2001
From: ibrahimjaved12 <ibrahim.javed@arbisoft.com>
Date: Mon, 22 Jan 2024 13:09:31 +0500
Subject: [PATCH] Handle edge cases of keywords

---
 .../utilities/normalize_keywords.py           | 31 +++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 ocw_oer_export/utilities/normalize_keywords.py

diff --git a/ocw_oer_export/utilities/normalize_keywords.py b/ocw_oer_export/utilities/normalize_keywords.py
new file mode 100644
index 0000000..164ba9d
--- /dev/null
+++ b/ocw_oer_export/utilities/normalize_keywords.py
@@ -0,0 +1,31 @@
+"""
+Module for normalizing OCW FM export course keywords.
+"""
+
+import re
+
+
+def normalize_keywords(keywords):
+    """
+    Normalizes keywords from the OCW FM export file to a standardized, pipe-separated format.
+
+    The OCW FM export file may contain keywords in various formats, including comma-separated,
+    semicolon-separated, newline-separated, double newlines separated, or combinations of these.
+
+    This function converts these formats into a standardized format used in OER: a pipe-separated
+    string where each keyword's first letter is capitalized.
+
+    Example:
+    Input:  "novel, short story; the city in literature\nnarrative voice"
+    Output: "Novel|Short Story|The City In Literature|Narrative Voice"
+    """
+    normalized_keywords = re.sub(r"[;,]|\n\n|\n", "|", keywords).strip()
+    keywords_list = [
+        keyword.title()
+        for keyword in (
+            normalized_keyword.strip()
+            for normalized_keyword in normalized_keywords.split("|")
+        )
+        if keyword
+    ]
+    return "|".join(keywords_list)