Skip to content

Commit

Permalink
Merge pull request #236 from ggerganov/compilade/fix-mpt-pretok
Browse files Browse the repository at this point in the history
test-tokenizer-random : reduce potential conflicts with ggerganov#8379
  • Loading branch information
Nexesenex authored Jul 13, 2024
2 parents f950d48 + 59ce853 commit bc02f64
Showing 1 changed file with 3 additions and 2 deletions.
5 changes: 3 additions & 2 deletions tests/test-tokenizer-random.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,7 @@ def generator_custom_text_edge_cases() -> Iterator[str]:
'a\na', # bert fail
'"`', # falcon
' \u2e4e', # falcon
'\n\x0b ', # falcon
'a\xa0\xa0\x00b', # jina-v2-es
'one <mask>', # jina-v2-es <mask> lstrip=true
'a </s> b', # rstrip phi-3
Expand Down Expand Up @@ -458,8 +459,8 @@ def check_detokenizer(text: str, text1: str, text2: str) -> bool:
i = find_first_mismatch(ids1, ids2)
ids1 = list(ids1)[max(0, i - 2) : i + 5 + 1]
ids2 = list(ids2)[max(0, i - 2) : i + 5 + 1]
logger.error(" Expected: " + str(ids1) + f" {[tokenizer1.decode([id]) for id in ids1]}")
logger.error(" Result: " + str(ids2) + f" {[tokenizer2.decode([id]) for id in ids2]}")
logger.error(" Expected: " + str(ids1))
logger.error(" Result: " + str(ids2))
encode_errors += 1
logger.error(f" {encode_errors=}")
if decode_errors < MAX_ERRORS and not check_detokenizer(text, text1, text2):
Expand Down

0 comments on commit bc02f64

Please sign in to comment.