Skip to content

Commit

Permalink
Handle edge cases of keywords
Browse files Browse the repository at this point in the history
  • Loading branch information
ibrahimjaved12 committed Jan 22, 2024
1 parent 059b141 commit c2b9935
Showing 1 changed file with 31 additions and 0 deletions.
31 changes: 31 additions & 0 deletions ocw_oer_export/utilities/normalize_keywords.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""
Module for normalizing OCW FM export course keywords.
"""

import re


def normalize_keywords(keywords):
    """
    Normalize keywords from the OCW FM export file to a pipe-separated string.

    The OCW FM export file may contain keywords in various formats, including
    comma-separated, semicolon-separated, newline-separated, double-newline
    separated, already pipe-separated, or combinations of these. This function
    converts them into the standardized format used in OER: a pipe-separated
    string in which every word of every keyword is title-cased.

    Surrounding whitespace is trimmed from each keyword and empty entries
    (e.g. produced by consecutive separators) are dropped.

    Args:
        keywords: Raw keyword string from the export. ``None`` or an empty
            string is treated as "no keywords".

    Returns:
        The normalized, pipe-separated keyword string; ``""`` when there are
        no keywords.

    Example:
        Input: "novel, short story; the city in literature\\nnarrative voice"
        Output: "Novel|Short Story|The City In Literature|Narrative Voice"

        Input: "children's literature"
        Output: "Children's Literature"
    """
    if not keywords:
        return ""

    def title_case(keyword):
        # str.title() treats an apostrophe as a word boundary, turning
        # "children's" into "Children'S". Undo that for short possessive /
        # contraction suffixes ('s, 't, 'll, 're, ...) while leaving longer
        # name parts such as "O'Neil" capitalized as title() produced them.
        return re.sub(
            r"(?<=[^\W\d_])(['\u2019])([^\W\d_]{1,2})\b",
            lambda match: match.group(1) + match.group(2).lower(),
            keyword.title(),
        )

    # Every supported separator is a single character; runs of separators
    # just yield empty fragments, which are filtered out below.
    stripped_keywords = (
        fragment.strip() for fragment in re.split(r"[;,|\n]", keywords)
    )
    return "|".join(
        title_case(keyword) for keyword in stripped_keywords if keyword
    )

0 comments on commit c2b9935

Please sign in to comment.