Skip to content

Commit

Permalink
Modify Zh to IPA pipeline to center around json
Browse files Browse the repository at this point in the history
These modifications allow for:
1. HF_TOKEN saving outside of bash script
2. JSON-centered text-file creation

Building around the json file will allow us to do many n choose k
datasets for exploration.
  • Loading branch information
gkielian committed Jan 20, 2025
1 parent ce2cda1 commit fa63ac7
Show file tree
Hide file tree
Showing 2 changed files with 118 additions and 35 deletions.
37 changes: 26 additions & 11 deletions data/commonvoice_zh/get_dataset.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash

# Set strict error handling (-e: exit on error, -u: unset vars are errors,
# pipefail: propagate failures through pipes) and -x to trace commands.
set -xeuo pipefail

# Install python dependencies for Hugging face
pip install -U "huggingface_hub[cli]"
Expand All @@ -13,7 +13,13 @@ pip install jieba
# Replace with your hugging face tokens
##### You can find and create your own tokens here: https://huggingface.co/settings/tokens ######
##### "Token Type" of "Read" is recommended. ########
HF_TOKEN=""
# Reuse a previously saved Hugging Face token if the cache file exists and is
# non-empty — presumably written by `huggingface-cli login` or
# utils/save_hf_token.py (TODO confirm); otherwise prompt interactively.
if [[ -f ~/.cache/huggingface/token && -s ~/.cache/huggingface/token ]]; then
export HF_TOKEN=$(cat ~/.cache/huggingface/token)
else
echo "Consider running 'python3 ./utils/save_hf_token.py' to automate finding HF_TOKEN"
# -s keeps the token from echoing; HF_TOKEN stays in this shell for the
# authenticated wget calls below.
read -s -p "To continue, please enter your Hugging Face token: " HF_TOKEN
echo "" # Add a newline for better readability
fi

# Authenticate with hugging face
echo "Authenticating with Hugging Face..."
Expand All @@ -31,11 +37,11 @@ fi

# Download transcription files under "transcription" directory.
pushd "${out_dir}"

# Fetch each transcript split once. -nc skips files that already exist but
# then makes wget return non-zero, so `|| true` keeps `set -e` from aborting
# the script on a re-run.
for split in dev other test train validated; do
    wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "${split}.tsv" \
        "${url}/resolve/main/transcript/zh-CN/${split}.tsv?download=true" || true
done

echo "transcripts downloaded and saved to transcription."
popd
Expand All @@ -55,11 +61,20 @@ done
echo "All .tsv files have been processed."

# Run program to convert sentences into IPA format, centered on a JSON file
# so downstream tooling can derive many dataset variants from it.
output_json_with_ipa="zh_ipa.json"
echo "Converting sentences to IPA..."
python3 utils/zh_to_ipa.py -j "$output_file" "$output_json_with_ipa"
echo "IPA conversion finished."

# Extract just the IPA field into a flat text file for char-level tokenization.
output_ipa_txt="zh_ipa.txt"
python3 utils/extract_json_values.py "$output_json_with_ipa" "sentence_ipa" "$output_ipa_txt"
echo "IPA extraction finished."

#TODO(gkielian): see if we can fix the parsing of rows instead of deleting
# Remove lines which were not correctly processed (these start with a numeric
# hash). The wc calls before/after show how many lines were dropped.
# Note: the previous `sed -i "/^[0-9].*/g"` was an invalid sed program (`g` is
# not a command here) and would abort the script under `set -e`; the intended
# operation is deleting matching lines with `d`.
wc -l "$output_ipa_txt"
sed -i "/^[0-9]/d" "$output_ipa_txt"
wc -l "$output_ipa_txt"

# Tokenization step to create train.bin and val.bin files.
python3 prepare.py -t "$output_ipa_txt" --method char
116 changes: 92 additions & 24 deletions data/template/utils/zh_to_ipa.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import json
import re
import sys


def transcribe_chinese(sentence):
"""Transcribe a Chinese sentence into its phonemes using dragonmapper."""
try:
Expand All @@ -13,6 +14,7 @@ def transcribe_chinese(sentence):
except Exception as e:
return f"Error in transcribing Chinese: {str(e)}"


def handle_mixed_language(word):
"""Handle a word with potential Chinese, Language, or number content."""
if word.isdigit(): # Detect numbers but just pass through for now (different in each language)
Expand All @@ -22,39 +24,105 @@ def handle_mixed_language(word):
else: # Non-Chinese Word
return "[[[[[" + word + "]]]]]"

def transcribe_multilingual(lists, output_file):
"""Transcribe multilingual sentences (English and Chinese, with numbers) and save to a file."""
with open(output_file, 'w', encoding='utf-8') as f:
for item in lists:
result = []
sentence = item['sentence']
# Split sentence using jieba
seg_list = jieba.cut(sentence, cut_all=False)
seg_sentence = " ".join(seg_list)
# Split sentence but keep punctuation (preserve spaces, commas, etc.)
words = re.findall(r'\w+|[^\w\s]', seg_sentence, re.UNICODE)
for word in words:
if re.match(r'\w+', word): # Only process words (skip punctuation)
result.append(handle_mixed_language(word))

def transcribe_multilingual(data, output_file, json_inplace_update=False, json_input_field="sentence",
                            json_output_field="sentence_ipa"):
    """
    Transcribe multilingual sentences (English and Chinese, with numbers) and save to a file.

    Args:
        data: The input data (list of dictionaries if JSON, list of strings if plain text).
        output_file: Path to the output file.
        json_inplace_update: If True, write the input JSON back out with the IPA
            transcription added to each entry under ``json_output_field``.
        json_input_field: The field in the JSON data to transcribe (default: "sentence").
        json_output_field: The field to write the IPA transcription to (default: "sentence_ipa").
    """
    if json_inplace_update:
        # In-place update: enrich each JSON entry, then dump the whole structure.
        for item in data:
            if json_input_field in item:
                item[json_output_field] = _sentence_to_ipa(item[json_input_field])

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=4)
        print(f"In-place JSON transcription saved to {output_file}")

    else:
        # Standard transcription (either JSON or plain text to plain text output):
        # one transcribed line per input sentence.
        with open(output_file, 'w', encoding='utf-8') as f:
            for item in data:
                if isinstance(item, dict):
                    sentence = item.get(json_input_field, "")
                else:
                    sentence = item

                transcription_result = _sentence_to_ipa(sentence)
                f.write(transcription_result + "\n")
                print(transcription_result)  # Print to console for reference


def _sentence_to_ipa(sentence):
    """Segment a sentence with jieba and transcribe each token to IPA.

    Word tokens are routed through handle_mixed_language() (Chinese /
    non-Chinese / numeric handling); punctuation is preserved as-is. Tokens
    are re-joined with single spaces.
    """
    # NOTE(review): "".join re-glues jieba's segments, so the regex below sees
    # the unsegmented sentence — confirm this is intended (the pre-JSON version
    # used " ".join to keep jieba's word boundaries).
    seg_sentence = "".join(jieba.cut(sentence, cut_all=False))

    # Split sentence but keep punctuation.
    result = []
    for word in re.findall(r'\w+|[^\w\s]', seg_sentence, re.UNICODE):
        if re.match(r'\w+', word):  # Only process words
            result.append(handle_mixed_language(word))
        else:
            result.append(word)  # Preserve punctuation

    return " ".join(result)


def main():
    """Parse CLI arguments, load the input file, and run the transcription.

    Exits with status 1 on any failure so calling shell scripts running under
    ``set -e`` stop instead of proceeding with a missing/empty output file.
    """
    parser = argparse.ArgumentParser(description='Transcribe multilingual sentences into IPA phonemes.')
    parser.add_argument('input_file', type=str,
                        help='Path to the input file containing sentences in json format.')
    parser.add_argument('output_file', type=str, help='Path to the output file for IPA transcription.')
    parser.add_argument('--input_type', type=str, choices=['json', 'text'], default='json',
                        help='Type of input file: "json" or "text" (default: json)')
    parser.add_argument("-j", "--json_inplace_update", action="store_true",
                        help="Process JSON input and add IPA to the same JSON entries")
    parser.add_argument("--json_input_field", default="sentence",
                        help="JSON field to read from (default: sentence)")
    parser.add_argument("--json_output_field", default="sentence_ipa",
                        help="JSON field to write IPA to (default: sentence_ipa)")

    args = parser.parse_args()

    # Read the input; keep the try body minimal so only read/parse errors are
    # caught here.
    try:
        with open(args.input_file, 'r', encoding='utf-8') as f:
            if args.input_type == 'json':
                data = json.load(f)
            else:
                data = f.readlines()
    except FileNotFoundError:
        print(f"Error: Input file '{args.input_file}' not found.", file=sys.stderr)
        sys.exit(1)
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in '{args.input_file}'.", file=sys.stderr)
        sys.exit(1)

    try:
        transcribe_multilingual(data, args.output_file, args.json_inplace_update,
                                args.json_input_field, args.json_output_field)
    except Exception as e:
        print(f"An unexpected error occurred: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()

0 comments on commit fa63ac7

Please sign in to comment.