From 9a83e2a840c03a818f5da540757d5e6abd6dd56d Mon Sep 17 00:00:00 2001
From: Gregory Kielian
Date: Sun, 19 Jan 2025 12:32:45 -0800
Subject: [PATCH 1/4] Remove extra prepare.py file in the data folder

---
 data/prepare.py | 1 -
 1 file changed, 1 deletion(-)
 delete mode 120000 data/prepare.py

diff --git a/data/prepare.py b/data/prepare.py
deleted file mode 120000
index 713f6b0012..0000000000
--- a/data/prepare.py
+++ /dev/null
@@ -1 +0,0 @@
-../template/prepare.py
\ No newline at end of file

From 85aab37040b9a7d20de039acdf5161afcee78e38 Mon Sep 17 00:00:00 2001
From: Gregory Kielian
Date: Sun, 19 Jan 2025 01:25:11 -0800
Subject: [PATCH 2/4] Streamline flow for language to IPA conversion

1. HF_TOKEN handling

Added a script that removes the need to manually enter the HF_TOKEN
into the .sh file. This prevents us from accidentally adding the
HF_TOKEN to git tracking, and provides a uniform location for the
team to place this token.

2. Change csv to pandas for TSV-to-JSON conversion

The pandas library was recommended, and appears to overcome some
challenges when converting to JSON (still not perfect, however).

3. Modify ja2ipa.py and ko_en_to_ipa.py to do in-place JSON modification

Added argparse, and an option for in-place JSON modification. This
allows us to keep adding fields to the JSON file, and later mix and
match fields to create different types of datasets (e.g. text to IPA,
spaced text to spaced IPA, non-spaced IPA to spaced IPA, etc.).
---
 data/commonvoice_ja/get_dataset.sh           | 45 ++++++----
 data/commonvoice_ko/get_dataset.sh           |  7 +-
 data/template/utils/extract_json_values.py   | 50 ++++++++++++
 data/template/utils/ja2ipa.py                | 71 ++++++++++++++--
 data/template/utils/ko_en_to_ipa.py          | 86 ++++++++++++------
 data/template/utils/save_hf_token.py         |  7 ++
 data/template/utils/tsv_to_json_cv_pandas.py | 43 ++++++++++
 7 files changed, 259 insertions(+), 50 deletions(-)
 create mode 100644 data/template/utils/extract_json_values.py
 create mode 100644 data/template/utils/save_hf_token.py
 create mode 100644 data/template/utils/tsv_to_json_cv_pandas.py

diff --git a/data/commonvoice_ja/get_dataset.sh b/data/commonvoice_ja/get_dataset.sh
index d52adbfa1d..e116ffb415 100644
--- a/data/commonvoice_ja/get_dataset.sh
+++ b/data/commonvoice_ja/get_dataset.sh
@@ -1,7 +1,6 @@
 # !/bin/bash
 
-# Set strict error handling
-set -euo pipefail
+set -xe
 
 # Install python dependencies for Hugging face
 pip install -U "huggingface_hub[cli]"
@@ -10,7 +9,13 @@ pip install -U "huggingface_hub[cli]"
 # Replace with your hugging face tokens
 ##### You can find and create your own tokens here: https://huggingface.co/settings/tokens ######
 ##### "Token Type" of "Read" is recommended. ########
-HF_TOKEN=""
+if [[ -f ~/.cache/huggingface/token && -s ~/.cache/huggingface/token ]]; then
+  export HF_TOKEN=$(cat ~/.cache/huggingface/token)
+else
+  echo "Consider running 'python3 ./utils/save_hf_token.py' to automate finding HF_TOKEN"
+  read -s -p "To continue, please enter your Hugging Face token: " HF_TOKEN
+  echo "" # Add a newline for better readability
+fi
 
 # Authenticate with hugging face
 echo "Authenticating with Hugging Face..."
@@ -28,12 +33,12 @@ fi
 # Download transcription files under "transcription" directory.
 pushd "${out_dir}"
-wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "dev.tsv" "${url}/resolve/main/transcript/ja/dev.tsv?download=true"
-wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "invalidated.tsv" "${url}/resolve/main/transcript/ja/validated.tsv?download=true"
-wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "other.tsv" "${url}/resolve/main/transcript/ja/other.tsv?download=true"
-wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "test.tsv" "${url}/resolve/main/transcript/ja/test.tsv?download=true"
-wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "train.tsv" "${url}/resolve/main/transcript/ja/train.tsv?download=true"
-wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "validated.tsv" "${url}/resolve/main/transcript/ja/validated.tsv?download=true"
+wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "dev.tsv" "${url}/resolve/main/transcript/ja/dev.tsv?download=true" || true
+wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "invalidated.tsv" "${url}/resolve/main/transcript/ja/invalidated.tsv?download=true" || true
+wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "other.tsv" "${url}/resolve/main/transcript/ja/other.tsv?download=true" || true
+wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "test.tsv" "${url}/resolve/main/transcript/ja/test.tsv?download=true" || true
+wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "train.tsv" "${url}/resolve/main/transcript/ja/train.tsv?download=true" || true
+wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "validated.tsv" "${url}/resolve/main/transcript/ja/validated.tsv?download=true" || true
 echo "transcripts downloaded and saved to transcription."
 popd
 
@@ -46,18 +51,28 @@ for tsvfile in "$out_dir"/*.tsv; do
     echo "Processing $tsvfile..."
     # Get the filename without the extension for output filename
     filename=$(basename "${tsvfile%.tsv}")
-    python3 utils/tsv_to_json_cv.py "$tsvfile" "$output_file"
+    python3 utils/tsv_to_json_cv_pandas.py "$tsvfile" "$output_file"
   fi
 done
 
 echo "All .tsv files have been processed."
 
-# Run program to convert sentences into IPA format.
-output_ipa="ja_ipa.txt"
+# Run program to convert sentences into IPA format.
+output_json_with_ipa="ja_ipa.json"
 echo "Converting sentences to IPA..."
-python3 utils/ja2ipa.py "$output_file" "$output_ipa"
-
+python3 utils/ja2ipa.py -j "$output_file" "$output_json_with_ipa"
 echo "IPA conversion finished."
+output_ipa_txt="ja_ipa.txt"
+python3 utils/extract_json_values.py "$output_json_with_ipa" "sentence_ipa" "$output_ipa_txt"
+echo "IPA extraction finished."
+
+#TODO(gkielian): see if we can fix the parsing of rows instead of deleting
+# Remove lines which were not correctly processed (and start with a numeric hash)
+wc -l "$output_ipa_txt"
+sed -i "/^[0-9]/d" "$output_ipa_txt"
+wc -l "$output_ipa_txt"
+
+
 # Tokenization step to create train.bin and val.bin files.
-python3 prepare.py -t "$output_ipa" --method char
+python3 prepare.py -t "$output_ipa_txt" --method char
diff --git a/data/commonvoice_ko/get_dataset.sh b/data/commonvoice_ko/get_dataset.sh
index 3bc1e649ff..6e02eb8709 100644
--- a/data/commonvoice_ko/get_dataset.sh
+++ b/data/commonvoice_ko/get_dataset.sh
@@ -53,9 +53,12 @@ done
 echo "All .tsv files have been processed."
 
 # Run program to convert sentences into IPA format.
-output_ipa="ko_ipa.txt"
 echo "Converting sentences to IPA..."
-python3 utils/ko_en_to_ipa.py "$output_file" "$output_ipa"
+python3 ./utils/ko_en_to_ipa.py "$output_file" --input_json_key "sentence" --output_json_key "phonetic"
+
+output_ipa="ko_ipa.txt"
+echo "Exporting IPA to txt file..."
+python3 ./utils/extract_json_values.py "$output_file" "phonetic" "$output_ipa"
 
 echo "IPA conversion finished."
diff --git a/data/template/utils/extract_json_values.py b/data/template/utils/extract_json_values.py
new file mode 100644
index 0000000000..1c5fb31963
--- /dev/null
+++ b/data/template/utils/extract_json_values.py
@@ -0,0 +1,50 @@
+import json
+import argparse
+
+def extract_values_by_key(json_file, key, output_file):
+    """
+    Extracts all values associated with a specific key from a JSON file
+    and writes them to an output text file, each value on a new line.
+
+    Args:
+        json_file: Path to the input JSON file.
+        key: The key to search for in the JSON data.
+        output_file: Path to the output text file.
+    """
+    try:
+        with open(json_file, 'r', encoding='utf-8') as f_in, open(output_file, 'w', encoding='utf-8') as f_out:
+            data = json.load(f_in)
+
+            def extract_values(data, key, f_out):
+                if isinstance(data, dict):
+                    for k, v in data.items():
+                        if k == key:
+                            f_out.write(str(v) + '\n')
+                        else:
+                            extract_values(v, key, f_out)
+                elif isinstance(data, list):
+                    for item in data:
+                        extract_values(item, key, f_out)
+
+            extract_values(data, key, f_out)
+
+    except FileNotFoundError:
+        print(f"Error: Input file '{json_file}' not found.")
+    except json.JSONDecodeError:
+        print(f"Error: Invalid JSON format in '{json_file}'.")
+    except Exception as e:
+        print(f"An error occurred: {e}")
+
+def main():
+    parser = argparse.ArgumentParser(description="Extract values of a specific key from a JSON file to a text file.")
+    parser.add_argument("json_file", help="Path to the input JSON file.")
+    parser.add_argument("key", help="The key whose values you want to extract.")
+    parser.add_argument("output_file", help="Path to the output text file.")
+
+    args = parser.parse_args()
+
+    extract_values_by_key(args.json_file, args.key, args.output_file)
+    print(f"Values for key '{args.key}' extracted to '{args.output_file}'")
+
+if __name__ == "__main__":
+    main()
diff --git a/data/template/utils/ja2ipa.py b/data/template/utils/ja2ipa.py
index bef4de0d91..046fbcfcc8 100644
--- a/data/template/utils/ja2ipa.py
+++ b/data/template/utils/ja2ipa.py
@@ -1,5 +1,7 @@
 from collections import OrderedDict
 import pykakasi.kakasi as kakasi
+import argparse
+import json
 from tqdm import tqdm
 import sys
 
@@ -348,11 +350,64 @@ def hiragana2IPA(text):
     return text
 
 
-#open original japanese raw text 'input.txt', and save output ipa data with file 'input_ipa.jp'
-with open('./input.txt', mode="r", encoding="utf-8") as f:
-    lines = f.readlines()
-    for line in tqdm(lines):
-        kana = getRomeNameByHira(line)
-        ipa = hiragana2IPA(kana)
-        with open("./txt_output/input_ipa.jp","a") as mon:
-            mon.write(ipa)
\ No newline at end of file
+def process_japanese_text(input_file, output_file, json_inplace_update=False, json_input_field="sentence", json_output_field="sentence_ipa"):
+    """
+    Processes Japanese text, converting it to IPA. Handles both plain text and JSON input.
+
+    Args:
+        input_file (str): Path to the input file (text or JSON).
+        output_file (str): Path to the output file.
+        json_inplace_update (bool): If True, process JSON input and add IPA to the same JSON.
+        json_input_field (str): JSON field to read from (default: "sentence").
+        json_output_field (str): JSON field to write IPA to (default: "sentence_ipa").
+ """ + + if json_inplace_update: + try: + with open(input_file, "r", encoding="utf-8") as f: + data = json.load(f) + + for entry in tqdm(data, desc="Processing JSON entries"): + if json_input_field in entry: + kana = getRomeNameByHira(entry[json_input_field]) + ipa = hiragana2IPA(kana) + entry[json_output_field] = ipa # Add IPA to the same JSON entry + + with open(output_file, "w", encoding="utf-8") as f: + json.dump(data, f, ensure_ascii=False, indent=4) + + except FileNotFoundError: + print(f"Error: Input file '{input_file}' not found.") + except json.JSONDecodeError: + print(f"Error: Invalid JSON format in '{input_file}'.") + except Exception as e: + print(f"An error occurred: {e}") + + else: + try: + with open(input_file, mode="r", encoding="utf-8") as f: + lines = f.readlines() + + with open(output_file, "w", encoding="utf-8") as outfile: + for line in tqdm(lines, desc="Processing lines"): + kana = getRomeNameByHira(line.strip()) + ipa = hiragana2IPA(kana) + outfile.write(ipa + "\n") + + except FileNotFoundError: + print(f"Error: Input file '{input_file}' not found.") + except Exception as e: + print(f"An error occurred: {e}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Convert Japanese text to IPA.") + parser.add_argument("input_file", help="Path to the input Japanese text file (default: input.txt)", nargs="?", default="input.txt") + parser.add_argument("output_file", help="Path to the output IPA file (default: input_ipa.txt)", nargs="?", default="input_ipa.txt") + parser.add_argument("-j", "--json_inplace_update", action="store_true", help="Process JSON input and add IPA to the same JSON entries") + parser.add_argument("--json_input_field", default="sentence", help="JSON field to read from (default: sentence)") + parser.add_argument("--json_output_field", default="sentence_ipa", help="JSON field to write IPA to (default: sentence_ipa)") + + args = parser.parse_args() + + process_japanese_text(args.input_file, args.output_file, args.json_inplace_update, args.json_input_field, args.json_output_field) diff --git a/data/template/utils/ko_en_to_ipa.py b/data/template/utils/ko_en_to_ipa.py index b70e1371d4..ae067f1eec 100644 --- a/data/template/utils/ko_en_to_ipa.py +++ b/data/template/utils/ko_en_to_ipa.py @@ -2,6 +2,7 @@ from konlpy.tag import Okt import argparse import re +import json def transcribe_korean(sentence): """Transcribe a Korean sentence into its phonemes using KoNLPy (Okt).""" @@ -34,37 +35,72 @@ def handle_mixed_language(word): else: # Non-Korean Word return "[[[[[" + word + "]]]]]" -def transcribe_multilingual(sentences, output_file): - """Transcribe multilingual sentences (English and Korean, with numbers) and save to a file.""" - with open(output_file, 'w', encoding='utf-8') as f: - for sentence in sentences: - result = [] - # Split sentence but keep punctuation (preserve spaces, commas, etc.) - words = re.findall(r'\w+|[^\w\s]', sentence, re.UNICODE) - for word in words: - if re.match(r'\w+', word): # Only process words (skip punctuation) - result.append(handle_mixed_language(word)) - else: - result.append(word) # Preserve punctuation as is - transcription_result = " ".join(result) - f.write(transcription_result + "\n") - print(transcription_result) # Print to console for reference +def transcribe_multilingual(sentences, input_json_key=None, output_json_key='ipa'): + """ + Transcribe multilingual sentences and update JSON data directly. + + Args: + sentences: JSON string or a loaded JSON object. 
+ input_json_key: Key to extract sentences from in a JSON. + output_json_key: Key to store IPA transcription in the JSON (default: 'ipa'). + + Returns: + The modified JSON string with IPA transcriptions added. + """ + try: + data = json.loads(sentences) if isinstance(sentences, str) else sentences + if not isinstance(data, list): + raise ValueError("JSON data should be a list of objects.") + + for item in data: + if input_json_key in item: + sentence = item[input_json_key] + result = [] + words = re.findall(r'\w+|[^\w\s]', sentence, re.UNICODE) + for word in words: + if re.match(r'\w+', word): + result.append(handle_mixed_language(word)) + else: + result.append(word) + transcription_result = " ".join(result) + item[output_json_key] = transcription_result # Update directly + print(transcription_result) + else: + print(f"Warning: Key '{input_json_key}' not found in item: {item}") + + except (json.JSONDecodeError, ValueError) as e: + print(f"Error: {e}") + return None + + return json.dumps(data, ensure_ascii=False, indent=4) def main(): - parser = argparse.ArgumentParser(description='Transcribe multilingual sentences into IPA phonemes.') - parser.add_argument('input_file', type=str, help='Path to the input file containing sentences.') - parser.add_argument('output_file', type=str, help='Path to the output file for IPA transcription.') + parser = argparse.ArgumentParser(description='Transcribe multilingual sentences into IPA phonemes and update JSON data.') + parser.add_argument('input_file', type=str, help='Path to the input JSON file.') + parser.add_argument('--input_json_key', type=str, required=True, help='The key of the Korean text to convert to IPA in the JSON file.') + parser.add_argument('--output_json_key', type=str, default='ipa', help='The key to store the IPA transcription in the JSON file (default: "ipa").') args = parser.parse_args() - # Read input sentences - with open(args.input_file, 'r', encoding='utf-8') as f: - sentences = f.readlines() + try: + with open(args.input_file, 'r', encoding='utf-8') as f: + input_content = f.read() - # Transcribe and save to the output file - transcribe_multilingual(sentences, args.output_file) + # Transcribe and get the updated JSON data + updated_json_data = transcribe_multilingual( + input_content, + args.input_json_key, + args.output_json_key + ) -if __name__ == '__main__': - main() + # Overwrite the original file with the updated JSON + if updated_json_data: + with open(args.input_file, 'w', encoding='utf-8') as f: + f.write(updated_json_data) + print(f"Successfully updated JSON data in '{args.input_file}'") + except FileNotFoundError: + print(f"Error: Input file '{args.input_file}' not found.") +if __name__ == '__main__': + main() diff --git a/data/template/utils/save_hf_token.py b/data/template/utils/save_hf_token.py new file mode 100644 index 0000000000..c2a0d3a7fb --- /dev/null +++ b/data/template/utils/save_hf_token.py @@ -0,0 +1,7 @@ +from getpass import getpass +from huggingface_hub import HfFolder + +HF_TOKEN = getpass("Enter your Hugging Face token: ") + +HfFolder.save_token(HF_TOKEN) +print("Token saved successfully!") diff --git a/data/template/utils/tsv_to_json_cv_pandas.py b/data/template/utils/tsv_to_json_cv_pandas.py new file mode 100644 index 0000000000..c0c4fabe90 --- /dev/null +++ b/data/template/utils/tsv_to_json_cv_pandas.py @@ -0,0 +1,43 @@ +import pandas as pd +import argparse +import json + +def tsv_to_json_auto_columns(input_file, output_file, delimiter='\t'): + """ + Converts a TSV file to a JSON file, automatically 
detecting column names. + + Args: + input_file (str): Path to the input TSV file. + output_file (str): Path to the output JSON file. + delimiter (str, optional): Delimiter used in the TSV file. Defaults to '\t' (tab). + """ + try: + # Read the TSV file using pandas, automatically detecting header + df = pd.read_csv(input_file, sep=delimiter, header='infer') + + # Convert DataFrame to a list of dictionaries (JSON format) + data = df.to_dict(orient='records') + + # Write to JSON file with pretty printing + with open(output_file, 'w') as f: + json.dump(data, f, indent=4) + + print(f"Successfully converted '{input_file}' to '{output_file}'") + + except FileNotFoundError: + print(f"Error: Input file '{input_file}' not found.") + except pd.errors.ParserError: + print(f"Error: Invalid TSV format in '{input_file}'. Check the delimiter and file structure.") + except Exception as e: + print(f"An unexpected error occurred: {e}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Convert a TSV file to JSON with automatic column name detection.") + parser.add_argument("input_file", help="Path to the input TSV file.") + parser.add_argument("output_file", help="Path to the output JSON file.") + parser.add_argument("--delimiter", default='\t', help="Delimiter used in the TSV (default: tab)") + + args = parser.parse_args() + + tsv_to_json_auto_columns(args.input_file, args.output_file, args.delimiter) From ce2cda105e9be5ddbea608b4c023d66cecaac968 Mon Sep 17 00:00:00 2001 From: Gregory Kielian Date: Sun, 19 Jan 2025 13:00:48 -0800 Subject: [PATCH 3/4] Add prototype for tokenization inspection --- .../template/utils/tokenization_inspection.py | 96 +++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 data/template/utils/tokenization_inspection.py diff --git a/data/template/utils/tokenization_inspection.py b/data/template/utils/tokenization_inspection.py new file mode 100644 index 0000000000..5adea58984 --- /dev/null +++ b/data/template/utils/tokenization_inspection.py @@ -0,0 +1,96 @@ +import argparse +import collections + +def analyze_text(char_file, text_file, num_lines, save_lines): + """ + Analyzes a text file based on a given set of characters. + + Args: + char_file (str): Path to the file containing the list of characters. + text_file (str): Path to the text file to be analyzed. + num_lines (int): Number of lines to display (for option 2). + save_lines (str): Path to save the lines with characters (optional). + """ + + try: + with open(char_file, 'r', encoding='utf-8') as f: + chars = set(f.read().split()) + except FileNotFoundError: + print(f"Error: Character file not found at {char_file}") + return + + try: + with open(text_file, 'r', encoding='utf-8') as f: + text_content = f.read() + except FileNotFoundError: + print(f"Error: Text file not found at {text_file}") + return + + # 1. CLI Histogram of character occurrences + char_counts = collections.Counter(c for c in text_content if c in chars) + + if char_counts: + print("\nCharacter Frequency (CLI Histogram):") + max_count = max(char_counts.values()) + for char, count in sorted(char_counts.items()): + bar_length = int(40 * count / max_count) # Scale bar to 40 characters + print(f"{char}: {'█' * bar_length} ({count})") + else: + print("No characters from the character file were found in the text file.") + + # 2. 
List of lines with characters
+    lines_with_chars = []
+    with open(text_file, 'r', encoding='utf-8') as f:
+        for line_num, line in enumerate(f, 1):
+            if any(c in line for c in chars):
+                lines_with_chars.append(f"Line {line_num}: {line.strip()}")
+
+    if lines_with_chars:
+        print("\nLines containing specified characters:")
+        for line in lines_with_chars[:num_lines]:
+            print(line)
+
+        if save_lines:
+            try:
+                with open(save_lines, 'w', encoding='utf-8') as outfile:
+                    for line in lines_with_chars:
+                        outfile.write(line + '\n')
+                print(f"Lines saved to {save_lines}")
+            except Exception as e:
+                print(f"Error saving lines to file: {e}")
+    else:
+        print("No lines in the text file contain the specified characters.")
+
+def main():
+    parser = argparse.ArgumentParser(description="Analyze a text file based on a set of characters.")
+    parser.add_argument("char_file", help="Path to the file containing the list of characters.")
+    parser.add_argument("text_file", help="Path to the text file to analyze.")
+    args = parser.parse_args()
+
+    while True:
+        print("\nChoose an option:")
+        print("1. Show CLI histogram of character occurrences")
+        print("2. List lines containing characters")
+        print("3. Exit")
+
+        choice = input("Enter your choice (1, 2, or 3): ")
+
+        if choice == '1':
+            analyze_text(args.char_file, args.text_file, 0, None)
+        elif choice == '2':
+            num_lines = input("Enter the number of lines to display (default 10): ")
+            num_lines = int(num_lines) if num_lines.isdigit() else 10
+
+            save_option = input("Save lines to a file? (y/n): ").lower()
+            save_path = None
+            if save_option == 'y':
+                save_path = input("Enter the path to save the file: ")
+
+            analyze_text(args.char_file, args.text_file, num_lines, save_path)
+        elif choice == '3':
+            break
+        else:
+            print("Invalid choice. Please enter 1, 2, or 3.")
+
+if __name__ == "__main__":
+    main()

From fa63ac7298ce86a13414d1dbb887820a126fe176 Mon Sep 17 00:00:00 2001
From: Gregory Kielian
Date: Sun, 19 Jan 2025 16:45:13 -0800
Subject: [PATCH 4/4] Modify Zh to IPA pipeline to center around json

These modifications allow for:
1. HF_TOKEN saving outside of the bash script
2. json-centered text file creation

Building around the json file will allow us to build many n-choose-k
datasets for exploration.
---
 data/commonvoice_zh/get_dataset.sh |  37 ++++++---
 data/template/utils/zh_to_ipa.py   | 116 +++++++++++++++++++++------
 2 files changed, 118 insertions(+), 35 deletions(-)

diff --git a/data/commonvoice_zh/get_dataset.sh b/data/commonvoice_zh/get_dataset.sh
index 62a54e6af0..56e3f08ae2 100644
--- a/data/commonvoice_zh/get_dataset.sh
+++ b/data/commonvoice_zh/get_dataset.sh
@@ -1,7 +1,7 @@
 # !/bin/bash
 
 # Set strict error handling
-set -euo pipefail
+set -xe
 
 # Install python dependencies for Hugging face
 pip install -U "huggingface_hub[cli]"
@@ -13,7 +13,13 @@ pip install jieba
 # Replace with your hugging face tokens
 ##### You can find and create your own tokens here: https://huggingface.co/settings/tokens ######
 ##### "Token Type" of "Read" is recommended. ########
-HF_TOKEN=""
+if [[ -f ~/.cache/huggingface/token && -s ~/.cache/huggingface/token ]]; then
+  export HF_TOKEN=$(cat ~/.cache/huggingface/token)
+else
+  echo "Consider running 'python3 ./utils/save_hf_token.py' to automate finding HF_TOKEN"
+  read -s -p "To continue, please enter your Hugging Face token: " HF_TOKEN
+  echo "" # Add a newline for better readability
+fi
 
 # Authenticate with hugging face
 echo "Authenticating with Hugging Face..."
@@ -31,11 +37,11 @@ fi
 # Download transcription files under "transcription" directory.
 pushd "${out_dir}"
-wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "dev.tsv" "${url}/resolve/main/transcript/zh-CN/dev.tsv?download=true"
-wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "other.tsv" "${url}/resolve/main/transcript/zh-CN/other.tsv?download=true"
-wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "test.tsv" "${url}/resolve/main/transcript/zh-CN/test.tsv?download=true"
-wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "train.tsv" "${url}/resolve/main/transcript/zh-CN/train.tsv?download=true"
-wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "validated.tsv" "${url}/resolve/main/transcript/zh-CN/validated.tsv?download=true"
+wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "dev.tsv" "${url}/resolve/main/transcript/zh-CN/dev.tsv?download=true" || true
+wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "other.tsv" "${url}/resolve/main/transcript/zh-CN/other.tsv?download=true" || true
+wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "test.tsv" "${url}/resolve/main/transcript/zh-CN/test.tsv?download=true" || true
+wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "train.tsv" "${url}/resolve/main/transcript/zh-CN/train.tsv?download=true" || true
+wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "validated.tsv" "${url}/resolve/main/transcript/zh-CN/validated.tsv?download=true" || true
 echo "transcripts downloaded and saved to transcription."
 popd
 
@@ -55,11 +61,20 @@ done
 echo "All .tsv files have been processed."
 
 # Run program to convert sentences into IPA format.
-output_ipa="zh_ipa.txt"
+output_json_with_ipa="zh_ipa.json"
 echo "Converting sentences to IPA..."
-python3 utils/zh_to_ipa.py "$output_file" "$output_ipa"
-
+python3 utils/zh_to_ipa.py -j "$output_file" "$output_json_with_ipa"
 echo "IPA conversion finished."
+output_ipa_txt="zh_ipa.txt"
+python3 utils/extract_json_values.py "$output_json_with_ipa" "sentence_ipa" "$output_ipa_txt"
+echo "IPA extraction finished."
+
+#TODO(gkielian): see if we can fix the parsing of rows instead of deleting
+# Remove lines which were not correctly processed (and start with a numeric hash)
+wc -l "$output_ipa_txt"
+sed -i "/^[0-9]/d" "$output_ipa_txt"
+wc -l "$output_ipa_txt"
+
 # Tokenization step to create train.bin and val.bin files.
-python3 prepare.py -t "$output_ipa" --method char +python3 prepare.py -t "$output_ipa_txt" --method char diff --git a/data/template/utils/zh_to_ipa.py b/data/template/utils/zh_to_ipa.py index d4d868117d..d82982029f 100644 --- a/data/template/utils/zh_to_ipa.py +++ b/data/template/utils/zh_to_ipa.py @@ -5,6 +5,7 @@ import re import json + def transcribe_chinese(sentence): """Transcribe a Chinese sentence into its phonemes using dragonmapper.""" try: @@ -13,6 +14,7 @@ def transcribe_chinese(sentence): except Exception as e: return f"Error in transcribing Chinese: {str(e)}" + def handle_mixed_language(word): """Handle a word with potential Chinese, Language, or number content.""" if word.isdigit(): # Detect numbers but just pass through for now (different in each language) @@ -22,39 +24,105 @@ def handle_mixed_language(word): else: # Non-Chinese Word return "[[[[[" + word + "]]]]]" -def transcribe_multilingual(lists, output_file): - """Transcribe multilingual sentences (English and Chinese, with numbers) and save to a file.""" - with open(output_file, 'w', encoding='utf-8') as f: - for item in lists: - result = [] - sentence = item['sentence'] - # Split sentence using jieba - seg_list = jieba.cut(sentence, cut_all=False) - seg_sentence = " ".join(seg_list) - # Split sentence but keep punctuation (preserve spaces, commas, etc.) - words = re.findall(r'\w+|[^\w\s]', seg_sentence, re.UNICODE) - for word in words: - if re.match(r'\w+', word): # Only process words (skip punctuation) - result.append(handle_mixed_language(word)) + +def transcribe_multilingual(data, output_file, json_inplace_update=False, json_input_field="sentence", + json_output_field="sentence_ipa"): + """ + Transcribe multilingual sentences (English and Chinese, with numbers) and save to a file. + + Args: + data: The input data (list of dictionaries if JSON, list of strings if plain text). + output_file: Path to the output file. + json_inplace_update: If True, process JSON input and add IPA to the same JSON. + json_input_field: The field in the JSON data to transcribe (default: "sentence"). + json_output_field: The field to write the IPA transcription to (default: "sentence_ipa"). 
+ """ + if json_inplace_update: + # In-place update for JSON data + for item in data: + if json_input_field in item: + sentence = item[json_input_field] + result = [] + + # Split sentence using jieba + seg_list = jieba.cut(sentence, cut_all=False) + seg_sentence = "".join(seg_list) + + # Split sentence but keep punctuation + words = re.findall(r'\w+|[^\w\s]', seg_sentence, re.UNICODE) + for word in words: + if re.match(r'\w+', word): # Only process words + result.append(handle_mixed_language(word)) + else: + result.append(word) # Preserve punctuation + + transcription_result = " ".join(result) + item[json_output_field] = transcription_result + + with open(output_file, 'w', encoding='utf-8') as f: + json.dump(data, f, ensure_ascii=False, indent=4) + print(f"In-place JSON transcription saved to {output_file}") + + else: + # Standard transcription (either JSON or plain text to plain text output) + with open(output_file, 'w', encoding='utf-8') as f: + for item in data: + result = [] + if isinstance(item, dict): + sentence = item.get(json_input_field, "") else: - result.append(word) # Preserve punctuation as is - transcription_result = " ".join(result) - f.write(transcription_result + "\n") - print(transcription_result) # Print to console for reference + sentence = item + + # Split sentence using jieba + seg_list = jieba.cut(sentence, cut_all=False) + seg_sentence = "".join(seg_list) + + # Split sentence but keep punctuation + words = re.findall(r'\w+|[^\w\s]', seg_sentence, re.UNICODE) + for word in words: + if re.match(r'\w+', word): # Only process words + result.append(handle_mixed_language(word)) + else: + result.append(word) # Preserve punctuation + + transcription_result = " ".join(result) + f.write(transcription_result + "\n") + print(transcription_result) # Print to console + def main(): parser = argparse.ArgumentParser(description='Transcribe multilingual sentences into IPA phonemes.') - parser.add_argument('input_file', type=str, help='Path to the input file containing sentences in json format.') + parser.add_argument('input_file', type=str, + help='Path to the input file containing sentences in json format.') parser.add_argument('output_file', type=str, help='Path to the output file for IPA transcription.') + parser.add_argument('--input_type', type=str, choices=['json', 'text'], default='json', + help='Type of input file: "json" or "text" (default: json)') + parser.add_argument("-j", "--json_inplace_update", action="store_true", + help="Process JSON input and add IPA to the same JSON entries") + parser.add_argument("--json_input_field", default="sentence", + help="JSON field to read from (default: sentence)") + parser.add_argument("--json_output_field", default="sentence_ipa", + help="JSON field to write IPA to (default: sentence_ipa)") args = parser.parse_args() - # Read input sentences - with open(args.input_file, 'r', encoding='utf-8') as f: - lists = json.load(f) + try: + with open(args.input_file, 'r', encoding='utf-8') as f: + if args.input_type == 'json': + data = json.load(f) + else: + data = f.readlines() + + transcribe_multilingual(data, args.output_file, args.json_inplace_update, + args.json_input_field, args.json_output_field) + + except FileNotFoundError: + print(f"Error: Input file '{args.input_file}' not found.") + except json.JSONDecodeError: + print(f"Error: Invalid JSON format in '{args.input_file}'.") + except Exception as e: + print(f"An unexpected error occurred: {e}") - # Transcribe and save to the output file - transcribe_multilingual(lists, args.output_file) 
if __name__ == '__main__': main()
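
As a sketch of the "mix and match" dataset creation these patches build toward: once the enriched JSON holds several fields per entry (e.g. "sentence" and "sentence_ipa", the default field names used by the scripts above), any two fields can be paired into a parallel text file. The helper below is a hypothetical illustration, assuming that JSON layout; pair_json_fields.py is not a file added by these commits.

# pair_json_fields.py (hypothetical helper, minimal sketch)
import argparse
import json

def pair_fields(json_file, src_key, tgt_key, output_file, separator="\t"):
    """Write '<src><separator><tgt>' for every entry that has both keys."""
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    with open(output_file, "w", encoding="utf-8") as out:
        for entry in data:
            # Skip entries missing either field (e.g. rows that failed IPA
            # conversion and never received the output field).
            if src_key in entry and tgt_key in entry:
                out.write(f"{entry[src_key]}{separator}{entry[tgt_key]}\n")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Pair two JSON fields into a parallel text dataset.")
    parser.add_argument("json_file", help="Enriched JSON file, e.g. ja_ipa.json")
    parser.add_argument("src_key", help="Source field, e.g. sentence")
    parser.add_argument("tgt_key", help="Target field, e.g. sentence_ipa")
    parser.add_argument("output_file", help="Path to the output text file")
    args = parser.parse_args()
    pair_fields(args.json_file, args.src_key, args.tgt_key, args.output_file)

Usage would then look like: python3 pair_json_fields.py ja_ipa.json sentence sentence_ipa ja_text_to_ipa.txt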