Skip to content

Commit

Permalink
add assertions for numba behavior re: UnicodeCharSeq / unicode_type
Browse files Browse the repository at this point in the history
  • Loading branch information
lapp0 committed Jun 4, 2024
1 parent ae9635c commit a759728
Showing 1 changed file with 29 additions and 0 deletions.
29 changes: 29 additions & 0 deletions tests/fsm/test_regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -658,3 +658,32 @@ def convert_token_to_string(self, token):
)
is_accepted = len(state_seq) >= len(token_trans_key_seq)
assert should_accept == is_accepted


def test_numba_leading_null_byte_UnicodeCharSeq_remains_broken():
    r"""Pin the known-broken handling of a leading \x00 with UnicodeCharSeq.

    Two behaviors are asserted:

    * rendering a typed Dict keyed by ``UnicodeCharSeq(1)`` that holds the
      key "一" raises ``KeyError``;
    * a lone "\x00" stored in a fixed-width unicode numpy array reads back
      as the empty string, while "\x99" reads back unchanged.
    """
    # Background:
    # https://github.com/outlines-dev/outlines/pull/930#issuecomment-2143535968

    # Reproduction taken from https://github.com/numba/numba/issues/9542
    char_seq_dict = numba.typed.typeddict.Dict.empty(
        numba.types.UnicodeCharSeq(1), numba.int64
    )
    char_seq_dict["一"] = 10  # \xe4\xb8\x80
    with pytest.raises(KeyError):
        str(char_seq_dict)

    # Most characters round-trip fine, but "\x00" is converted to "".
    fixed_width = np.fromiter(["\x99", "\x00"], dtype=np.dtype("U2"))
    intact, null_char = fixed_width[0], fixed_width[1]
    assert str(intact) == "\x99"  # fine
    assert str(null_char) == ""  # 1-byte null converted to 0-bytes


@pytest.mark.parametrize("input_key", ["一", "\x00"])
def test_numba_leading_null_byte_unicode_type_sane(input_key):
    r"""Assert numba unicode_type keys w/ leading \x00 are working.

    Each parametrized key is inserted into a typed Dict keyed by
    ``unicode_type`` and the dict is then rendered with ``str``. The test
    passes when neither step raises.
    """
    # EXPLANATION:
    # https://github.com/outlines-dev/outlines/pull/930#issuecomment-2143535968

    # from https://github.com/numba/numba/issues/9542
    d = numba.typed.typeddict.Dict.empty(numba.types.unicode_type, numba.int64)
    # Insert the parametrized key so that both the "一" (\xe4\xb8\x80 in
    # UTF-8) and the "\x00" cases are actually exercised.
    d[input_key] = 10
    str(d)  # assert successfully interprets

0 comments on commit a759728

Please sign in to comment.