forked from karpathy/nanoGPT
-
Notifications
You must be signed in to change notification settings - Fork 19
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'master' into modify_ja2ipa_to_json_centered_flow
- Loading branch information
Showing
14 changed files
with
935 additions
and
139 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
""" | ||
Implements the evaluation metrics based on BLEU score | ||
example: | ||
import sacrebleu | ||
translated_sentences = ['The dog had bit the man.', "It wasn't surprising.", 'The man had bitten the dog.'] | ||
target_sentences = ['The dog bit the man.', "It wasn't surprising.", 'The man had just bitten him.'] | ||
bleu_score = sacrebleu.corpus_bleu(translated_sentences, [target_sentences]).score | ||
print(f'Test BLEU: {bleu_score}') | ||
""" | ||
|
||
import numpy as np | ||
from typing import List | ||
|
||
import sacrebleu | ||
|
||
def corpus_bleu(sys_sents: List[str],
                refs_sents: List[List[str]],
                smooth_method: str = 'exp',
                smooth_value: float = None,
                force: bool = True,
                lowercase: bool = False,
                tokenizer: str = '13a',
                use_effective_order: bool = False):
    """Return the corpus-level BLEU score computed by ``sacrebleu.corpus_bleu``.

    Args:
        sys_sents: System outputs, passed to sacrebleu as the hypotheses.
        refs_sents: Reference sentences, passed to sacrebleu unchanged as
            its references argument.
        smooth_method: Forwarded to sacrebleu as the smoothing method.
        smooth_value: Forwarded to sacrebleu as the smoothing value.
        force: Forwarded to sacrebleu.
        lowercase: Forwarded to sacrebleu so that a caller asking for
            case-insensitive scoring actually gets it.
        tokenizer: Accepted for signature compatibility only; it is NOT
            applied. sacrebleu is always called with ``tokenize='none'``,
            i.e. the inputs are scored exactly as given.
        use_effective_order: Forwarded to sacrebleu.

    Returns:
        The ``score`` attribute of the object returned by sacrebleu.
    """
    # NOTE(review): `tokenizer` is intentionally left unused here. Passing it
    # through would switch the default from 'none' to '13a' and change the
    # scores of every existing caller -- confirm the desired default before
    # wiring it in.
    result = sacrebleu.corpus_bleu(sys_sents, refs_sents, smooth_method, smooth_value, force,
                                   lowercase=lowercase, tokenize='none',
                                   use_effective_order=use_effective_order)
    return result.score
|
||
|
||
def sentence_bleu(sys_sent: str,
                  ref_sents: List[str],
                  smooth_method: str = 'floor',
                  smooth_value: float = None,
                  lowercase: bool = False,
                  tokenizer: str = '13a',
                  use_effective_order: bool = True):
    """Score a single system sentence against its references with BLEU.

    The sentence is wrapped into a one-element corpus and each reference
    into its own one-element list, then everything is delegated to
    ``corpus_bleu`` with ``force=True``.

    Args:
        sys_sent: The system output to score.
        ref_sents: The reference sentences for ``sys_sent``.
        smooth_method: Passed through to ``corpus_bleu``.
        smooth_value: Passed through to ``corpus_bleu``.
        lowercase: Passed through to ``corpus_bleu``.
        tokenizer: Passed through to ``corpus_bleu``.
        use_effective_order: Passed through to ``corpus_bleu``.

    Returns:
        Whatever ``corpus_bleu`` returns for the one-sentence corpus.
    """
    hypotheses = [sys_sent]
    # One single-sentence list per reference.
    references = [[reference] for reference in ref_sents]
    return corpus_bleu(
        hypotheses,
        references,
        smooth_method,
        smooth_value,
        force=True,
        lowercase=lowercase,
        tokenizer=tokenizer,
        use_effective_order=use_effective_order,
    )
|
||
|
||
def corpus_averaged_sentence_bleu(sys_sents: List[str],
                                  refs_sents: List[List[str]],
                                  smooth_method: str = 'floor',
                                  smooth_value: float = None,
                                  lowercase: bool = False,
                                  tokenizer: str = '13a',
                                  use_effective_order: bool = True):
    """Average the sentence-level BLEU scores over a corpus.

    ``sys_sents`` is zipped with every list in ``refs_sents``, so the i-th
    system sentence is scored against the i-th entry of each reference
    list. ``zip`` stops at the shortest input, so surplus entries in longer
    lists are ignored.

    Args:
        sys_sents: System outputs.
        refs_sents: Reference lists, each iterated in step with
            ``sys_sents``.
        smooth_method: Passed through to ``sentence_bleu``.
        smooth_value: Passed through to ``sentence_bleu``.
        lowercase: Passed through to ``sentence_bleu``.
        tokenizer: Passed through to ``sentence_bleu``.
        use_effective_order: Passed through to ``sentence_bleu``.

    Returns:
        ``np.mean`` of the per-sentence scores.
    """
    per_sentence = [
        sentence_bleu(hypothesis, references, smooth_method, smooth_value,
                      lowercase=lowercase, tokenizer=tokenizer,
                      use_effective_order=use_effective_order)
        # The starred target collects this sentence's references into a list.
        for hypothesis, *references in zip(sys_sents, *refs_sents)
    ]
    return np.mean(per_sentence)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
import jieba.posseg as pseg

# Sample Chinese text used to demonstrate part-of-speech tagging.
text = "他今天在北京大学的图书馆里看书,学习非常认真。这本书很有意思,内容包括历史、哲学和科学。"

# Print each (word, flag) pair produced by pseg.cut, one pair per line.
for word, flag in pseg.cut(text):
    print(f"{word}: {flag}")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,87 +1,98 @@ | ||
i: | ||
I | ||
iI | ||
eI | ||
|
||
\n | ||
\t | ||
. | ||
[ | ||
] | ||
_ | ||
a | ||
A: | ||
Q | ||
0 | ||
' | ||
O: | ||
U | ||
u: | ||
V | ||
@ | ||
eI | ||
aI | ||
OI | ||
aU | ||
oU | ||
p | ||
ä | ||
æ | ||
b | ||
t | ||
c | ||
ç | ||
d | ||
k | ||
g | ||
e | ||
f | ||
v | ||
T | ||
D | ||
s | ||
z | ||
S | ||
Z | ||
g | ||
h | ||
i | ||
j | ||
k | ||
l | ||
m | ||
n | ||
N | ||
l | ||
r | ||
w | ||
j | ||
iu | ||
i | ||
e | ||
o | ||
u | ||
W | ||
A | ||
y | ||
E | ||
ME | ||
O | ||
oo | ||
ou | ||
ye | ||
|
||
\n | ||
\r | ||
: | ||
, | ||
F | ||
C | ||
Y | ||
? | ||
. | ||
B | ||
c | ||
R | ||
M | ||
L | ||
c | ||
; | ||
! | ||
H | ||
P | ||
ø | ||
p | ||
q | ||
|
||
G | ||
- | ||
r | ||
s | ||
t | ||
u | ||
v | ||
w | ||
x | ||
$ | ||
& | ||
3 | ||
J | ||
K | ||
X | ||
_ | ||
y | ||
z | ||
ð | ||
ħ | ||
ŋ | ||
œ | ||
ɐ | ||
ɑ | ||
ɔ | ||
ɕ | ||
ɘ | ||
ə | ||
ɛ | ||
ɡ | ||
ɣ | ||
ɤ | ||
ɥ | ||
ɦ | ||
ɨ | ||
ɪ | ||
ɫ | ||
ɯ | ||
ɴ | ||
ɵ | ||
ɸ | ||
ɻ | ||
ɽ | ||
ɾ | ||
ʁ | ||
ʂ | ||
ʃ | ||
ʈ | ||
ʉ | ||
ʊ | ||
ʌ | ||
ʏ | ||
ʐ | ||
ʑ | ||
ʔ | ||
ʕ | ||
ʰ | ||
ʲ | ||
ʼ | ||
ˈ | ||
ˌ | ||
ː | ||
ˑ | ||
ˤ | ||
˥ | ||
˦ | ||
˧ | ||
˨ | ||
˩ | ||
̂ | ||
̃ | ||
̆ | ||
̌ | ||
̚ | ||
̥ | ||
̬ | ||
β | ||
θ | ||
χ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.