From c314cb8a8eeb54d3c21932bb4a535b974c6841ea Mon Sep 17 00:00:00 2001
From: Andrew Lapp <andrew@github.rew.la>
Date: Sat, 1 Jun 2024 13:12:44 -0500
Subject: [PATCH] add assertions for numba behavior re: UnicodeCharSeq /
 unicode_type

---
 tests/fsm/test_regex.py | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/tests/fsm/test_regex.py b/tests/fsm/test_regex.py
index 14f606f47..eeeafcb07 100644
--- a/tests/fsm/test_regex.py
+++ b/tests/fsm/test_regex.py
@@ -655,3 +655,32 @@ def convert_token_to_string(self, token):
         )
         is_accepted = len(state_seq) >= len(token_trans_key_seq)
         assert should_accept == is_accepted
+
+
+def test_numba_leading_null_byte_UnicodeCharSeq_remains_broken():
+    """Assert numba UnicodeCharSeq with a leading null byte is still broken."""
+    # Background on why this is pinned by a test:
+    # https://github.com/outlines-dev/outlines/pull/930#issuecomment-2143535968
+
+    # reproducer taken from https://github.com/numba/numba/issues/9542
+    key_type = numba.types.UnicodeCharSeq(1)
+    char_seq_dict = numba.typed.typeddict.Dict.empty(key_type, numba.int64)
+    char_seq_dict["一"] = 10  # UTF-8 bytes: \xe4\xb8\x80
+    with pytest.raises(KeyError):
+        str(char_seq_dict)
+
+    # most characters round-trip through a "U2" array, but "\x00" comes back as ""
+    chars = np.fromiter(["\x99", "\x00"], dtype=np.dtype("U2"))
+    assert [str(chars[0]), str(chars[1])] == ["\x99", ""]
+
+
+@pytest.mark.parametrize("input_key", ["一", "\x00"])
+def test_numba_leading_null_byte_unicode_type_sane(input_key):
+    """Assert a numba unicode_type dict key with a leading null byte works."""
+    # EXPLANATION:
+    # https://github.com/outlines-dev/outlines/pull/930#issuecomment-2143535968
+
+    # from https://github.com/numba/numba/issues/9542; exercise each parametrized key
+    d = numba.typed.typeddict.Dict.empty(numba.types.unicode_type, numba.int64)
+    d[input_key] = 10  # use the parameter so the "\x00" case is actually tested
+    str(d)  # must not raise: the dict can render its own keys