Skip to content

Commit

Permalink
Modify Zh to IPA pipeline to center around json
Browse files Browse the repository at this point in the history
These modifications allow for:
1. HF_TOKEN saving outside of bash script
2. JSON-centered text-file creation

Building around the json file will allow us to do many n choose k
datasets for exploration.
  • Loading branch information
gkielian committed Jan 20, 2025
1 parent ce2cda1 commit fa63ac7
Show file tree
Hide file tree
Showing 2 changed files with 118 additions and 35 deletions.
37 changes: 26 additions & 11 deletions data/commonvoice_zh/get_dataset.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash

# Set strict error handling (-e: exit on error, -u: unset vars are errors,
# pipefail: propagate failures through pipes) and -x to trace commands.
set -xeuo pipefail

# Install python dependencies for Hugging face
pip install -U "huggingface_hub[cli]"
Expand All @@ -13,7 +13,13 @@ pip install jieba
# Replace with your hugging face tokens
##### You can find and create your own tokens here: https://huggingface.co/settings/tokens ######
##### "Token Type" of "Read" is recommended. ########
HF_TOKEN=""
# Reuse a previously saved Hugging Face token if the cache file exists and is
# non-empty — presumably written by `huggingface-cli login` or
# utils/save_hf_token.py (TODO confirm); otherwise prompt interactively.
if [[ -f ~/.cache/huggingface/token && -s ~/.cache/huggingface/token ]]; then
export HF_TOKEN=$(cat ~/.cache/huggingface/token)
else
echo "Consider running 'python3 ./utils/save_hf_token.py' to automate finding HF_TOKEN"
# -s keeps the token from echoing; HF_TOKEN stays in this shell for the
# authenticated wget calls below.
read -s -p "To continue, please enter your Hugging Face token: " HF_TOKEN
echo "" # Add a newline for better readability
fi

# Authenticate with hugging face
echo "Authenticating with Hugging Face..."
Expand All @@ -31,11 +37,11 @@ fi

# Download transcription files under "transcription" directory.
pushd "${out_dir}"

# Fetch each transcript split once. -nc skips files that already exist but
# then makes wget return non-zero, so `|| true` keeps `set -e` from aborting
# the script on a re-run.
for split in dev other test train validated; do
    wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "${split}.tsv" \
        "${url}/resolve/main/transcript/zh-CN/${split}.tsv?download=true" || true
done

echo "transcripts downloaded and saved to transcription."
popd
Expand All @@ -55,11 +61,20 @@ done
echo "All .tsv files have been processed."

# Run program to convert sentences into IPA format, centered on a JSON file
# so downstream tooling can derive many dataset variants from it.
output_json_with_ipa="zh_ipa.json"
echo "Converting sentences to IPA..."
python3 utils/zh_to_ipa.py -j "$output_file" "$output_json_with_ipa"
echo "IPA conversion finished."

# Extract just the IPA field into a flat text file for char-level tokenization.
output_ipa_txt="zh_ipa.txt"
python3 utils/extract_json_values.py "$output_json_with_ipa" "sentence_ipa" "$output_ipa_txt"
echo "IPA extraction finished."

#TODO(gkielian): see if we can fix the parsing of rows instead of deleting
# Remove lines which were not correctly processed (these start with a numeric
# hash). The wc calls before/after show how many lines were dropped.
# Note: the previous `sed -i "/^[0-9].*/g"` was an invalid sed program (`g` is
# not a command here) and would abort the script under `set -e`; the intended
# operation is deleting matching lines with `d`.
wc -l "$output_ipa_txt"
sed -i "/^[0-9]/d" "$output_ipa_txt"
wc -l "$output_ipa_txt"

# Tokenization step to create train.bin and val.bin files.
python3 prepare.py -t "$output_ipa_txt" --method char
116 changes: 92 additions & 24 deletions data/template/utils/zh_to_ipa.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import json
import re
import sys


def transcribe_chinese(sentence):
"""Transcribe a Chinese sentence into its phonemes using dragonmapper."""
try:
Expand All @@ -13,6 +14,7 @@ def transcribe_chinese(sentence):
except Exception as e:
return f"Error in transcribing Chinese: {str(e)}"


def handle_mixed_language(word):
"""Handle a word with potential Chinese, Language, or number content."""
if word.isdigit(): # Detect numbers but just pass through for now (different in each language)
Expand All @@ -22,39 +24,105 @@ def handle_mixed_language(word):
else: # Non-Chinese Word
return "[[[[[" + word + "]]]]]"

def transcribe_multilingual(lists, output_file):
"""Transcribe multilingual sentences (English and Chinese, with numbers) and save to a file."""
with open(output_file, 'w', encoding='utf-8') as f:
for item in lists:
result = []
sentence = item['sentence']
# Split sentence using jieba
seg_list = jieba.cut(sentence, cut_all=False)
seg_sentence = " ".join(seg_list)
# Split sentence but keep punctuation (preserve spaces, commas, etc.)
words = re.findall(r'\w+|[^\w\s]', seg_sentence, re.UNICODE)
for word in words:
if re.match(r'\w+', word): # Only process words (skip punctuation)
result.append(handle_mixed_language(word))

def transcribe_multilingual(data, output_file, json_inplace_update=False, json_input_field="sentence",
                            json_output_field="sentence_ipa"):
    """
    Transcribe multilingual sentences (English and Chinese, with numbers) and save to a file.

    Args:
        data: The input data (list of dictionaries if JSON, list of strings if plain text).
        output_file: Path to the output file.
        json_inplace_update: If True, write the input JSON back out with the IPA
            transcription added to each entry under ``json_output_field``.
        json_input_field: The field in the JSON data to transcribe (default: "sentence").
        json_output_field: The field to write the IPA transcription to (default: "sentence_ipa").
    """
    if json_inplace_update:
        # In-place update: enrich each JSON entry, then dump the whole structure.
        for item in data:
            if json_input_field in item:
                item[json_output_field] = _sentence_to_ipa(item[json_input_field])

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=4)
        print(f"In-place JSON transcription saved to {output_file}")

    else:
        # Standard transcription (either JSON or plain text to plain text output):
        # one transcribed line per input sentence.
        with open(output_file, 'w', encoding='utf-8') as f:
            for item in data:
                if isinstance(item, dict):
                    sentence = item.get(json_input_field, "")
                else:
                    sentence = item

                transcription_result = _sentence_to_ipa(sentence)
                f.write(transcription_result + "\n")
                print(transcription_result)  # Print to console for reference


def _sentence_to_ipa(sentence):
    """Segment a sentence with jieba and transcribe each token to IPA.

    Word tokens are routed through handle_mixed_language() (Chinese /
    non-Chinese / numeric handling); punctuation is preserved as-is. Tokens
    are re-joined with single spaces.
    """
    # NOTE(review): "".join re-glues jieba's segments, so the regex below sees
    # the unsegmented sentence — confirm this is intended (the pre-JSON version
    # used " ".join to keep jieba's word boundaries).
    seg_sentence = "".join(jieba.cut(sentence, cut_all=False))

    # Split sentence but keep punctuation.
    result = []
    for word in re.findall(r'\w+|[^\w\s]', seg_sentence, re.UNICODE):
        if re.match(r'\w+', word):  # Only process words
            result.append(handle_mixed_language(word))
        else:
            result.append(word)  # Preserve punctuation

    return " ".join(result)


def main():
    """Parse CLI arguments, load the input file, and run the transcription.

    Exits with status 1 on any failure so calling shell scripts running under
    ``set -e`` stop instead of proceeding with a missing/empty output file.
    """
    parser = argparse.ArgumentParser(description='Transcribe multilingual sentences into IPA phonemes.')
    parser.add_argument('input_file', type=str,
                        help='Path to the input file containing sentences in json format.')
    parser.add_argument('output_file', type=str, help='Path to the output file for IPA transcription.')
    parser.add_argument('--input_type', type=str, choices=['json', 'text'], default='json',
                        help='Type of input file: "json" or "text" (default: json)')
    parser.add_argument("-j", "--json_inplace_update", action="store_true",
                        help="Process JSON input and add IPA to the same JSON entries")
    parser.add_argument("--json_input_field", default="sentence",
                        help="JSON field to read from (default: sentence)")
    parser.add_argument("--json_output_field", default="sentence_ipa",
                        help="JSON field to write IPA to (default: sentence_ipa)")

    args = parser.parse_args()

    # Read the input; keep the try body minimal so only read/parse errors are
    # caught here.
    try:
        with open(args.input_file, 'r', encoding='utf-8') as f:
            if args.input_type == 'json':
                data = json.load(f)
            else:
                data = f.readlines()
    except FileNotFoundError:
        print(f"Error: Input file '{args.input_file}' not found.", file=sys.stderr)
        sys.exit(1)
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in '{args.input_file}'.", file=sys.stderr)
        sys.exit(1)

    try:
        transcribe_multilingual(data, args.output_file, args.json_inplace_update,
                                args.json_input_field, args.json_output_field)
    except Exception as e:
        print(f"An unexpected error occurred: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()

0 comments on commit fa63ac7

Please sign in to comment.