Skip to content

Commit

Permalink
add text norm script for pl (#1532)
Browse files (browse the repository at this point in the history)
  • Loading branch information
JinZr authored Mar 7, 2024
1 parent 335a996 commit cdb3fb5
Showing 1 changed file with 11 additions and 1 deletion.
12 changes: 11 additions & 1 deletion egs/commonvoice/ASR/local/preprocess_commonvoice.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -48,8 +48,18 @@ def normalize_text(utt: str, language: str) -> str:
utt = re.sub("’", "'", utt)
if language == "en":
return re.sub(r"[^a-zA-Z\s]", "", utt).upper()
if language == "fr":
elif language == "fr":
return re.sub(r"[^A-ZÀÂÆÇÉÈÊËÎÏÔŒÙÛÜ' ]", "", utt).upper()
elif language == "pl":
return re.sub(r"[^a-ząćęłńóśźżA-ZĄĆĘŁŃÓŚŹŻ' ]", "", utt).upper()
else:
raise NotImplementedError(
f"""
Text normalization not implemented for language: {language},
please consider implementing it in the local/preprocess_commonvoice.py
or raise an issue on GitHub to request it.
"""
)


def preprocess_commonvoice(
Expand Down

0 comments on commit cdb3fb5

Please sign in to comment.