# tokenization_test.py
# Taken from a fork of yyDing1/GNER.
from transformers import AutoTokenizer
# Identifier of the checkpoint whose tokenizer is exercised by this script.
# NOTE(review): presumably resolved against the Hugging Face Hub (or a local
# cache) by from_pretrained() -- confirm network/cache requirements.
GNER_CHECKPOINT = "dyyyyyyyy/GNER-LLaMA-7B"

# Instantiate the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(GNER_CHECKPOINT)
# Test cases: (raw sentence, expected token sequence) pairs that stress
# hyphenated and multi-piece words.  Each expected sequence is written as a
# space-separated string and split into a list of tokens.
# NOTE(review): the expected tokens are lowercase and use "##" continuation
# markers, which looks like WordPiece-style output; confirm that the tokenizer
# under test can actually produce these pieces.
inputs = [
    (
        "Who is directing the Hobbit?",
        "who is directing the ho ##bb ##it ?".split(),
    ),
    (
        "Analyzing multi-token words like re-directing",
        "analyzing multi - token words like re - directing".split(),
    ),
    (
        "Analyzing multi-token words like 'high-profile'",
        "analyzing multi - token words like high - profile".split(),
    ),
    (
        "Identifying end-to-end encryption methods",
        "identifying end - to - end encryption methods".split(),
    ),
]
# Run every test case through the tokenizer and report the outcome.
# For each (sentence, expected tokens) pair this prints the sentence, the
# expected tokens, the tokens actually produced, and whether the two lists
# are exactly equal.  A mismatch is only reported, it does not abort the run.
for input_text, expected_tokens in inputs:
    tokenized_output = tokenizer.tokenize(input_text)
    # Exact list equality: same tokens, same order, same length.
    match = tokenized_output == expected_tokens
    print(f"Input: {input_text}")
    print(f"Expected Tokens: {expected_tokens}")
    print(f"Tokenized Output: {tokenized_output}")
    # Trailing newline leaves a blank line between consecutive reports.
    print(f"Match: {match}\n")