Skip to content

Commit

Permalink
Update preprocess_commonvoice.py
Browse files Browse the repository at this point in the history
  • Loading branch information
JinZr committed Mar 15, 2024
1 parent d9a0ab5 commit 6993183
Showing 1 changed file with 10 additions and 42 deletions.
52 changes: 10 additions & 42 deletions egs/commonvoice/ASR/local/preprocess_commonvoice.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,48 +56,16 @@ def normalize_text(utt: str, language: str) -> str:
# Mozilla Common Voice uses both "yue" and "zh-HK" for Cantonese
# Not sure why they decided to do this...
# Non-en/zh-yue tokens are manually removed here
return (
utt.replace(",", "")
.replace("。", " ")
.replace("?", "")
.replace("!", "")
.replace("?", "")
.replace("!", "")
.replace("‘", "")
.replace("、", "")
.replace(",", "")
.replace(".", "")
.replace(":", "")
.replace(";", "")
.replace("「", "")
.replace("」", "")
.replace("“", "")
.replace("”", "")
.replace("\\", "")
.replace("~", "")
.replace("—", "")
.replace("ㄧ", "")
.replace("《", "")
.replace("》", "")
.replace("…", "")
.replace("⋯", "")
.replace("·", "")
.replace("﹒", "")
.replace(".", "")
.replace(":", "")
.replace("︰", "")
.replace("﹖", "")
.replace("(", "")
.replace(")", "")
.replace("-", "")
.replace("~", "")
.replace(";", "")
.replace("", "")
.replace("﹔", "")
.replace("/", "")
.replace("A", "")
.replace("B", "")
.upper()

# fmt: off
tokens_to_remove = [",", "。", "?", "!", "?", "!", "‘", "、", ",", "\.", ":", ";", "「", "」", "“", "”", "~", "—", "ㄧ", "《", "》", "…", "⋯", "·", "﹒", ".", ":", "︰", "﹖", "(", ")", "-", "~", ";", "", "⠀", "﹔", "/", "A", "B", "–", "‧"]

# fmt: on
utt = utt.upper().replace("\\", "")
return re.sub(
pattern="|".join([f"[{token}]" for token in tokens_to_remove]),
repl="",
string=utt,
)
else:
raise NotImplementedError(
Expand Down

0 comments on commit 6993183

Please sign in to comment.