Commit
Merge branch 'master' into modify_ja2ipa_to_json_centered_flow
klei22 authored Feb 1, 2025
2 parents fa63ac7 + f28a296 commit ddbc21b
Showing 14 changed files with 935 additions and 139 deletions.
56 changes: 56 additions & 0 deletions benchmarks/bleu.py
@@ -0,0 +1,56 @@
"""
Implements the evaluation metrics based on BLEU score
example:
import sacrebleu
translated_sentences = ['The dog had bit the man.', "It wasn't surprising.", 'The man had bitten the dog.']
target_sentences = ['The dog bit the man.', "It wasn't surprising.", 'The man had just bitten him.']
bleu_score = sacrebleu.corpus_bleu(translated_sentences, [target_sentences]).score
print(f'Test BLEU: {bleu_score}')
"""

import numpy as np
from typing import List, Optional

import sacrebleu

def corpus_bleu(sys_sents: List[str],
                refs_sents: List[List[str]],
                smooth_method: str = 'exp',
                smooth_value: Optional[float] = None,
                force: bool = True,
                lowercase: bool = False,
                tokenizer: str = '13a',
                use_effective_order: bool = False):

    return sacrebleu.corpus_bleu(sys_sents, refs_sents, smooth_method, smooth_value, force,
                                 lowercase=lowercase, tokenize=tokenizer,
                                 use_effective_order=use_effective_order).score


def sentence_bleu(sys_sent: str,
                  ref_sents: List[str],
                  smooth_method: str = 'floor',
                  smooth_value: Optional[float] = None,
                  lowercase: bool = False,
                  tokenizer: str = '13a',
                  use_effective_order: bool = True):

    return corpus_bleu([sys_sent], [[ref] for ref in ref_sents], smooth_method, smooth_value, force=True,
                       lowercase=lowercase, tokenizer=tokenizer, use_effective_order=use_effective_order)


def corpus_averaged_sentence_bleu(sys_sents: List[str],
                                  refs_sents: List[List[str]],
                                  smooth_method: str = 'floor',
                                  smooth_value: Optional[float] = None,
                                  lowercase: bool = False,
                                  tokenizer: str = '13a',
                                  use_effective_order: bool = True):

    scores = []
    for sys_sent, *ref_sents in zip(sys_sents, *refs_sents):
        scores.append(sentence_bleu(sys_sent, ref_sents, smooth_method, smooth_value,
                                    lowercase=lowercase, tokenizer=tokenizer,
                                    use_effective_order=use_effective_order))
    return np.mean(scores)
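
For reference, a minimal usage sketch of these helpers (assuming sacrebleu is installed and benchmarks/ is importable as a package; the sentences are illustrative):

from benchmarks import bleu

hyps = ['The dog had bit the man.', "It wasn't surprising."]
refs = [['The dog bit the man.', "It wasn't surprising."]]  # one reference stream, aligned with hyps

print(bleu.corpus_bleu(hyps, refs))                    # corpus-level BLEU
print(bleu.sentence_bleu(hyps[0], [refs[0][0]]))       # BLEU for a single sentence
print(bleu.corpus_averaged_sentence_bleu(hyps, refs))  # mean of per-sentence BLEU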
28 changes: 17 additions & 11 deletions data/commonvoice_ko/get_dataset.sh
@@ -1,7 +1,7 @@
#!/bin/bash

# Set strict error handling
set -euo pipefail
# Show lines before execution and exit on errors
set -xe

# Install python dependencies for Hugging face
pip install -U "huggingface_hub[cli]"
@@ -10,7 +10,13 @@ pip install -U "huggingface_hub[cli]"
# Replace with your hugging face tokens
##### You can find and create your own tokens here: https://huggingface.co/settings/tokens ######
##### "Token Type" of "Read" is recommended. ########
HF_TOKEN=""
if [[ -f ~/.cache/huggingface/token && -s ~/.cache/huggingface/token ]]; then
    export HF_TOKEN=$(cat ~/.cache/huggingface/token)
else
    echo "Consider running 'python3 ./utils/save_hf_token.py' to automate finding HF_TOKEN"
    read -s -p "To continue, please enter your Hugging Face token: " HF_TOKEN
    echo "" # Add a newline for better readability
fi

# Authenticate with hugging face
echo "Authenticating with Hugging Face..."
@@ -28,12 +34,12 @@ fi

# Download transcription files under "transcription" directory.
pushd "${out_dir}"
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "dev.tsv" "${url}/resolve/main/transcript/ko/dev.tsv?download=true"
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "invalidated.tsv" "${url}/resolve/main/transcript/ko/invalidated.tsv?download=true"
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "other.tsv" "${url}/resolve/main/transcript/ko/other.tsv?download=true"
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "test.tsv" "${url}/resolve/main/transcript/ko/test.tsv?download=true"
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "train.tsv" "${url}/resolve/main/transcript/ko/train.tsv?download=true"
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "validated.tsv" "${url}/resolve/main/transcript/ko/validated.tsv?download=true"
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "dev.tsv" "${url}/resolve/main/transcript/ko/dev.tsv?download=true" || true
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "invalidated.tsv" "${url}/resolve/main/transcript/ko/invalidated.tsv?download=true" || true
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "other.tsv" "${url}/resolve/main/transcript/ko/other.tsv?download=true" || true
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "test.tsv" "${url}/resolve/main/transcript/ko/test.tsv?download=true" || true
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "train.tsv" "${url}/resolve/main/transcript/ko/train.tsv?download=true" || true
wget --header="Authorization: Bearer ${HF_TOKEN}" -nc -O "validated.tsv" "${url}/resolve/main/transcript/ko/validated.tsv?download=true" || true

echo "transcripts downloaded and saved to transcription."
popd
@@ -54,11 +60,11 @@ echo "All .tsv files have been processed."

# Run program to convert sentences into IPA format.
echo "Converting sentences to IPA..."
python3 ./utils/ko_en_to_ipa.py "$output_file" --input_json_key "sentence" --output_json_key "phonetic"
python3 ./utils/ko_en_to_ipa.py "$output_file" --input_json_key "sentence" --output_json_key "sentence_ipa"

output_ipa="ko_ipa.txt"
echo "export IPA to txt file"
python3 ./utils/extract_json_values.py "$output_file" "phonetic" "$output_ipa"
python3 ./utils/extract_json_values.py "$output_file" "sentence_ipa" "$output_ipa"

echo "IPA conversion finished."

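For context, the IPA conversion step above adds a new key to each JSON record: the text read from the "sentence" field is converted and written back under "sentence_ipa". A plausible record after processing (values are illustrative; the exact layout depends on ko_en_to_ipa.py) looks like:

{"sentence": "안녕하세요", "sentence_ipa": "annjʌŋhasejo"}
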
8 changes: 8 additions & 0 deletions data/template/parallel_embeddings/part_of_speech_zh.py
@@ -0,0 +1,8 @@
import jieba.posseg as pseg

# Sample sentence: "He's reading at the Peking University library today, studying very diligently.
# This book is very interesting; it covers history, philosophy, and science."
text = "他今天在北京大学的图书馆里看书,学习非常认真。这本书很有意思,内容包括历史、哲学和科学。"

words = pseg.cut(text)

for word, flag in words:
    print(f"{word}: {flag}")
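
For context, pseg.cut yields (word, flag) pairs, where flag is a jieba part-of-speech code (e.g. r = pronoun, t = time word, p = preposition, nt = organization name). The first few printed lines for this sentence would look roughly like the following; exact segmentation and tags depend on the jieba dictionary in use:

他: r
今天: t
在: p
北京大学: nt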
165 changes: 88 additions & 77 deletions data/template/phoneme_list.txt
@@ -1,87 +1,98 @@
i:
I
iI
eI

\n
\t
.
[
]
_
a
A:
Q
0
'
O:
U
u:
V
@
eI
aI
OI
aU
oU
p
ä
æ
b
t
c
ç
d
k
g
e
f
v
T
D
s
z
S
Z
g
h
i
j
k
l
m
n
N
l
r
w
j
iu
i
e
o
u
W
A
y
E
ME
O
oo
ou
ye

\n
\r
:
,
F
C
Y
?
.
B
c
R
M
L
c
;
!
H
P
ø
p
q

G
-
r
s
t
u
v
w
x
$
&
3
J
K
X
_
y
z
ð
ħ
ŋ
œ
ɐ
ɑ
ɔ
ɕ
ɘ
ə
ɛ
ɡ
ɣ
ɤ
ɥ
ɦ
ɨ
ɪ
ɫ
ɯ
ɴ
ɵ
ɸ
ɻ
ɽ
ɾ
ʁ
ʂ
ʃ
ʈ
ʉ
ʊ
ʌ
ʏ
ʐ
ʑ
ʔ
ʕ
ʰ
ʲ
ʼ
ˈ
ˌ
ː
ˑ
ˤ
˥
˦
˧
˨
˩
̂
̃
̆
̌
̚
̥
̬
β
θ
χ
16 changes: 13 additions & 3 deletions data/template/tests.py
@@ -15,6 +15,7 @@
from rich.console import Console
from rich.theme import Theme
from rich.table import Table
from rich.text import Text

console = Console(theme=Theme({
"pass": "bold green",
@@ -23,7 +24,7 @@
"separator": "grey50",
"input": "bold cyan",
"output": "bold magenta",
"info": "bold blue"
"info": "bold blue",
}))


@@ -188,10 +189,10 @@ def test_custom_char_tokenizer_with_byte_fallback(self):
args = Namespace(custom_chars_file="custom_chars.txt")
# Create a custom characters file for testing
with open(args.custom_chars_file, 'w', encoding='utf-8') as f:
f.write('a\nb\nc\n')
f.write('a\nb\nc\n\\n')

tokenizer = CustomCharTokenizerWithByteFallback(args)
test_string = 'abc😊'
test_string = 'abc😊d\nefg'

ids = tokenizer.tokenize(test_string)
detokenized = tokenizer.detokenize(ids)
@@ -201,6 +202,15 @@ def test_custom_char_tokenizer_with_byte_fallback(self):
console.print("[output]Detokenized Output:[/output]")
console.print(detokenized, style="output")

console.print("[info]Characters that used byte fallback:[/info]")
bft = []  # Byte Fallback Tokens
for char in detokenized:
    if char not in tokenizer.custom_chars:
        char = repr(char)
        bft.append(char)

console.print(", ".join(bft), style="info")

self.assertEqual(test_string, detokenized)
print("CustomCharTokenizerWithByteFallback test passed.")

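As background on what the test above exercises: with byte fallback, characters in the custom vocabulary get their own token IDs, and anything else is emitted as raw UTF-8 bytes, so every string round-trips. The sketch below illustrates the idea only; it is not the repository's CustomCharTokenizerWithByteFallback, and the class name and ID layout are assumptions:

# Illustrative byte-fallback tokenizer (toy example, not the repo implementation).
class ToyByteFallbackTokenizer:
    def __init__(self, custom_chars):
        self.custom_chars = list(custom_chars)
        self.stoi = {c: i for i, c in enumerate(self.custom_chars)}
        self.offset = len(self.custom_chars)  # byte tokens start after the custom IDs

    def tokenize(self, text):
        ids = []
        for ch in text:
            if ch in self.stoi:
                ids.append(self.stoi[ch])  # known custom character
            else:
                ids.extend(self.offset + b for b in ch.encode("utf-8"))  # byte fallback
        return ids

    def detokenize(self, ids):
        out, pending = [], bytearray()
        for i in ids:
            if i < self.offset:  # custom character: flush any pending bytes first
                if pending:
                    out.append(pending.decode("utf-8"))
                    pending = bytearray()
                out.append(self.custom_chars[i])
            else:                # raw UTF-8 byte
                pending.append(i - self.offset)
        if pending:
            out.append(pending.decode("utf-8"))
        return "".join(out)

# Round-trip check in the spirit of the test above:
# tok = ToyByteFallbackTokenizer(["a", "b", "c", "\n"])
# assert tok.detokenize(tok.tokenize("abc😊d\nefg")) == "abc😊d\nefg"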
25 changes: 23 additions & 2 deletions data/template/utils/meta_util.py
@@ -58,7 +58,21 @@ def create_meta_from_text(text_file, output_path, special_chars={"<ukn>": 0}):
pickle.dump(meta, f)
print(f"Meta created from text and saved to {output_path}.")


def export_tokens(meta_path, output_path):
    meta = load_meta(meta_path)
    with open(output_path, "w") as f:
        for i in range(meta["vocab_size"]):
            token = meta["itos"][i]
            if token == "\n":
                token = "\\n"
            elif token == "\t":
                token = "\\t"
            elif token == "\r":
                token = "\\r"
            # Note: Add more special character handling here as needed
            f.write(token + "\n")
    print(f"Tokens exported to {output_path}")

def main():
parser = argparse.ArgumentParser(description="Utility for handling token metadata.")

@@ -73,6 +87,12 @@ def main():
nargs=2,
help="Path to the input text file and the output meta.pkl file for creation.",
)

parser.add_argument(
"--export",
nargs=2,
help="Path to the meta.pkl file and the output text file for exporting tokens.",
)

args = parser.parse_args()

@@ -82,7 +102,8 @@ def main():
merge_metas(args.merge[0], args.merge[1], "merged_meta.pkl")
elif args.create:
create_meta_from_text(args.create[0], args.create[1])

elif args.export:
export_tokens(args.export[0], args.export[1])

if __name__ == "__main__":
main()
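
For reference, the new export flag can be invoked as follows (paths are illustrative):

# export every token in a meta.pkl to a one-token-per-line text file
python3 data/template/utils/meta_util.py --export path/to/meta.pkl tokens.txt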