Commit
Merge branch 'master' into modify_ja2ipa_to_json_centered_flow
klei22 authored Feb 1, 2025
2 parents fa63ac7 + f28a296 commit ddbc21b
Showing 14 changed files with 935 additions and 139 deletions.
56 changes: 56 additions & 0 deletions benchmarks/bleu.py
@@ -0,0 +1,56 @@
"""
Implements the evaluation metrics based on BLEU score
example:
import sacrebleu
translated_sentences = ['The dog had bit the man.', "It wasn't surprising.", 'The man had bitten the dog.']
target_sentences = ['The dog bit the man.', "It wasn't surprising.", 'The man had just bitten him.']
bleu_score = sacrebleu.corpus_bleu(translated_sentences, [target_sentences]).score
print(f'Test BLEU: {bleu_score}')
"""

import numpy as np
from typing import List, Optional

import sacrebleu

def corpus_bleu(sys_sents: List[str],
                refs_sents: List[List[str]],
                smooth_method: str = 'exp',
                smooth_value: Optional[float] = None,
                force: bool = True,
                lowercase: bool = False,
                tokenizer: str = '13a',
                use_effective_order: bool = False):

    return sacrebleu.corpus_bleu(sys_sents, refs_sents, smooth_method, smooth_value, force,
                                 lowercase=lowercase, tokenize=tokenizer,
                                 use_effective_order=use_effective_order).score


def sentence_bleu(sys_sent: str,
                  ref_sents: List[str],
                  smooth_method: str = 'floor',
                  smooth_value: Optional[float] = None,
                  lowercase: bool = False,
                  tokenizer: str = '13a',
                  use_effective_order: bool = True):

    return corpus_bleu([sys_sent], [[ref] for ref in ref_sents], smooth_method, smooth_value, force=True,
                       lowercase=lowercase, tokenizer=tokenizer, use_effective_order=use_effective_order)


def corpus_averaged_sentence_bleu(sys_sents: List[str],
                                  refs_sents: List[List[str]],
                                  smooth_method: str = 'floor',
                                  smooth_value: Optional[float] = None,
                                  lowercase: bool = False,
                                  tokenizer: str = '13a',
                                  use_effective_order: bool = True):

    scores = []
    for sys_sent, *ref_sents in zip(sys_sents, *refs_sents):
        scores.append(sentence_bleu(sys_sent, ref_sents, smooth_method, smooth_value,
                                    lowercase=lowercase, tokenizer=tokenizer,
                                    use_effective_order=use_effective_order))
    return np.mean(scores)
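
For reference, a minimal usage sketch of these helpers (assuming sacrebleu is installed and benchmarks/ is importable as a package; the sentences are illustrative):

from benchmarks import bleu

hyps = ['The dog had bit the man.', "It wasn't surprising."]
refs = [['The dog bit the man.', "It wasn't surprising."]]  # one reference stream, aligned with hyps

print(bleu.corpus_bleu(hyps, refs))                    # corpus-level BLEU
print(bleu.sentence_bleu(hyps[0], [refs[0][0]]))       # BLEU for a single sentence
print(bleu.corpus_averaged_sentence_bleu(hyps, refs))  # mean of per-sentence BLEU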
28 changes: 17 additions & 11 deletions data/commonvoice_ko/get_dataset.sh
@@ -1,7 +1,7 @@
#!/bin/bash

# Set strict error handling
set -euo pipefail
# Show lines before execution and exit on errors
set -xe

# Install python dependencies for Hugging face
pip install -U "huggingface_hub[cli]"
@@ -10,7 +10,13 @@ pip install -U "huggingface_hub[cli]"
# Replace with your hugging face tokens
##### You can find and create your own tokens here: https://huggingface.co/settings/tokens ######
##### "Token Type" of "Read" is recommended. ########
HF_TOKEN=""
if [[ -f ~/.cache/huggingface/token && -s ~/.cache/huggingface/token ]]; then
    export HF_TOKEN=$(cat ~/.cache/huggingface/token)
else
    echo "Consider running 'python3 ./utils/save_hf_token.py' to automate finding HF_TOKEN"
    read -s -p "To continue, please enter your Hugging Face token: " HF_TOKEN
    echo "" # Add a newline for better readability
fi

# Authenticate with hugging face
echo "Authenticating with Hugging Face..."
@@ -28,12 +34,12 @@ fi

# Download transcription files under "transcription" directory.
pushd "${out_dir}"
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "dev.tsv" "${url}/resolve/main/transcript/ko/dev.tsv?download=true"
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "invalidated.tsv" "${url}/resolve/main/transcript/ko/invalidated.tsv?download=true"
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "other.tsv" "${url}/resolve/main/transcript/ko/other.tsv?download=true"
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "test.tsv" "${url}/resolve/main/transcript/ko/test.tsv?download=true"
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "train.tsv" "${url}/resolve/main/transcript/ko/train.tsv?download=true"
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "validated.tsv" "${url}/resolve/main/transcript/ko/validated.tsv?download=true"
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "dev.tsv" "${url}/resolve/main/transcript/ko/dev.tsv?download=true" || true
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "invalidated.tsv" "${url}/resolve/main/transcript/ko/invalidated.tsv?download=true" || true
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "other.tsv" "${url}/resolve/main/transcript/ko/other.tsv?download=true" || true
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "test.tsv" "${url}/resolve/main/transcript/ko/test.tsv?download=true" || true
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "train.tsv" "${url}/resolve/main/transcript/ko/train.tsv?download=true" || true
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "validated.tsv" "${url}/resolve/main/transcript/ko/validated.tsv?download=true" || true

echo "transcripts downloaded and saved to transcription."
popd
@@ -54,11 +60,11 @@ echo "All .tsv files have been processed."

# Run program to convert sentences into IPA format.
echo "Converting sentences to IPA..."
python3 ./utils/ko_en_to_ipa.py "$output_file" --input_json_key "sentence" --output_json_key "phonetic"
python3 ./utils/ko_en_to_ipa.py "$output_file" --input_json_key "sentence" --output_json_key "sentence_ipa"

output_ipa="ko_ipa.txt"
echo "export IPA to txt file"
python3 ./utils/extract_json_values.py "$output_file" "phonetic" "$output_ipa"
python3 ./utils/extract_json_values.py "$output_file" "sentence_ipa" "$output_ipa"

echo "IPA conversion finished."

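For context, the IPA conversion step above adds a new key to each JSON record: the text read from the "sentence" field is converted and written back under "sentence_ipa". A plausible record after processing (values are illustrative; the exact layout depends on ko_en_to_ipa.py) looks like:

{"sentence": "안녕하세요", "sentence_ipa": "annjʌŋhasejo"}
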
8 changes: 8 additions & 0 deletions data/template/parallel_embeddings/part_of_speech_zh.py
@@ -0,0 +1,8 @@
import jieba.posseg as pseg

# Sample sentence: "He's reading at the Peking University library today, studying very diligently.
# This book is very interesting; it covers history, philosophy, and science."
text = "他今天在北京大学的图书馆里看书,学习非常认真。这本书很有意思,内容包括历史、哲学和科学。"

words = pseg.cut(text)

for word, flag in words:
    print(f"{word}: {flag}")
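
For context, pseg.cut yields (word, flag) pairs, where flag is a jieba part-of-speech code (e.g. r = pronoun, t = time word, p = preposition, nt = organization name). The first few printed lines for this sentence would look roughly like the following; exact segmentation and tags depend on the jieba dictionary in use:

他: r
今天: t
在: p
北京大学: nt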
165 changes: 88 additions & 77 deletions data/template/phoneme_list.txt
@@ -1,87 +1,98 @@
i:
I
iI
eI

\n
\t
.
[
]
_
a
A:
Q
0
'
O:
U
u:
V
@
eI
aI
OI
aU
oU
p
ä
æ
b
t
c
ç
d
k
g
e
f
v
T
D
s
z
S
Z
g
h
i
j
k
l
m
n
N
l
r
w
j
iu
i
e
o
u
W
A
y
E
ME
O
oo
ou
ye

\n
\r
:
,
F
C
Y
?
.
B
c
R
M
L
c
;
!
H
P
ø
p
q

G
-
r
s
t
u
v
w
x
$
&
3
J
K
X
_
y
z
ð
ħ
ŋ
œ
ɐ
ɑ
ɔ
ɕ
ɘ
ə
ɛ
ɡ
ɣ
ɤ
ɥ
ɦ
ɨ
ɪ
ɫ
ɯ
ɴ
ɵ
ɸ
ɻ
ɽ
ɾ
ʁ
ʂ
ʃ
ʈ
ʉ
ʊ
ʌ
ʏ
ʐ
ʑ
ʔ
ʕ
ʰ
ʲ
ʼ
ˈ
ˌ
ː
ˑ
ˤ
˥
˦
˧
˨
˩
̂
̃
̆
̌
̚
̥
̬
β
θ
χ
16 changes: 13 additions & 3 deletions data/template/tests.py
@@ -15,6 +15,7 @@
from rich.console import Console
from rich.theme import Theme
from rich.table import Table
from rich.text import Text

console = Console(theme=Theme({
"pass": "bold green",
@@ -23,7 +24,7 @@
"separator": "grey50",
"input": "bold cyan",
"output": "bold magenta",
"info": "bold blue"
"info": "bold blue",
}))


@@ -188,10 +189,10 @@ def test_custom_char_tokenizer_with_byte_fallback(self):
args = Namespace(custom_chars_file="custom_chars.txt")
# Create a custom characters file for testing
with open(args.custom_chars_file, 'w', encoding='utf-8') as f:
f.write('a\nb\nc\n')
f.write('a\nb\nc\n\\n')

tokenizer = CustomCharTokenizerWithByteFallback(args)
test_string = 'abc😊'
test_string = 'abc😊d\nefg'

ids = tokenizer.tokenize(test_string)
detokenized = tokenizer.detokenize(ids)
@@ -201,6 +202,15 @@ def test_custom_char_tokenizer_with_byte_fallback(self):
console.print("[output]Detokenized Output:[/output]")
console.print(detokenized, style="output")

console.print("[info]Characters that used byte fallback:[/info]")
bft = []  # Byte Fallback Tokens
for char in detokenized:
    if char not in tokenizer.custom_chars:
        char = repr(char)
        bft.append(char)

console.print(", ".join(bft), style="info")

self.assertEqual(test_string, detokenized)
print("CustomCharTokenizerWithByteFallback test passed.")

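As background on what the test above exercises: with byte fallback, characters in the custom vocabulary get their own token IDs, and anything else is emitted as raw UTF-8 bytes, so every string round-trips. The sketch below illustrates the idea only; it is not the repository's CustomCharTokenizerWithByteFallback, and the class name and ID layout are assumptions:

# Illustrative byte-fallback tokenizer (toy example, not the repo implementation).
class ToyByteFallbackTokenizer:
    def __init__(self, custom_chars):
        self.custom_chars = list(custom_chars)
        self.stoi = {c: i for i, c in enumerate(self.custom_chars)}
        self.offset = len(self.custom_chars)  # byte tokens start after the custom IDs

    def tokenize(self, text):
        ids = []
        for ch in text:
            if ch in self.stoi:
                ids.append(self.stoi[ch])  # known custom character
            else:
                ids.extend(self.offset + b for b in ch.encode("utf-8"))  # byte fallback
        return ids

    def detokenize(self, ids):
        out, pending = [], bytearray()
        for i in ids:
            if i < self.offset:  # custom character: flush any pending bytes first
                if pending:
                    out.append(pending.decode("utf-8"))
                    pending = bytearray()
                out.append(self.custom_chars[i])
            else:                # raw UTF-8 byte
                pending.append(i - self.offset)
        if pending:
            out.append(pending.decode("utf-8"))
        return "".join(out)

# Round-trip check in the spirit of the test above:
# tok = ToyByteFallbackTokenizer(["a", "b", "c", "\n"])
# assert tok.detokenize(tok.tokenize("abc😊d\nefg")) == "abc😊d\nefg"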
25 changes: 23 additions & 2 deletions data/template/utils/meta_util.py
@@ -58,7 +58,21 @@ def create_meta_from_text(text_file, output_path, special_chars={"<ukn>": 0}):
pickle.dump(meta, f)
print(f"Meta created from text and saved to {output_path}.")


def export_tokens(meta_path, output_path):
    meta = load_meta(meta_path)
    with open(output_path, "w") as f:
        for i in range(meta["vocab_size"]):
            token = meta["itos"][i]
            if token == "\n":
                token = "\\n"
            elif token == "\t":
                token = "\\t"
            elif token == "\r":
                token = "\\r"
            # Note: Add more special character handling here as needed
            f.write(token + "\n")
    print(f"Tokens exported to {output_path}")

def main():
parser = argparse.ArgumentParser(description="Utility for handling token metadata.")

@@ -73,6 +87,12 @@ def main():
nargs=2,
help="Path to the input text file and the output meta.pkl file for creation.",
)

parser.add_argument(
"--export",
nargs=2,
help="Path to the meta.pkl file and the output text file for exporting tokens.",
)

args = parser.parse_args()

@@ -82,7 +102,8 @@ def main():
merge_metas(args.merge[0], args.merge[1], "merged_meta.pkl")
elif args.create:
create_meta_from_text(args.create[0], args.create[1])

elif args.export:
export_tokens(args.export[0], args.export[1])

if __name__ == "__main__":
main()
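
For reference, the new export flag can be invoked as follows (paths are illustrative):

# export every token in a meta.pkl to a one-token-per-line text file
python3 data/template/utils/meta_util.py --export path/to/meta.pkl tokens.txt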