Skip to content

Commit

Permalink
Add tokenizer-only overconsumption tests, fix bug
Browse files Browse the repository at this point in the history
  • Loading branch information
smheidrich committed Dec 16, 2022
1 parent a7f347f commit 7396e2e
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 10 deletions.
7 changes: 1 addition & 6 deletions src/suitable_bytes_stream.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,7 @@ impl Utf8CharSource for SuitableBytesStream {
{
Char::Eof => None,
Char::Char(c) => Some(c),
Char::NoData => {
return io::Result::Err(io::Error::new(
io::ErrorKind::Other,
"should never happen",
));
}
Char::NoData => None, // for us this means the same as EOF I guess?
},
)
}
Expand Down
4 changes: 2 additions & 2 deletions src/suitable_text_stream.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,8 @@ impl ParkCursorChars for SuitableTextStream {
let chars_read_from_buf = self.chars_read_from_buf;
if let Some(buf_start_seek_pos) = self.buf_start_seek_pos {
self.inner.seek(OpaqueSeekFrom::Start(buf_start_seek_pos))?;
let buf = self.inner.read_string(chars_read_from_buf)?;
self.chars_iter = buf.into_chars();
self.inner.read_string(chars_read_from_buf)?;
self.chars_iter = OwnedChars::from_string("".to_owned());
}
Ok(())
}
Expand Down
34 changes: 32 additions & 2 deletions tests/test_overconsumption.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import pytest

from json_stream_rs_tokenizer import load
from json_stream_rs_tokenizer import RustTokenizer, load


@pytest.fixture(params=["str", "bytes"])
Expand All @@ -19,6 +19,7 @@ def to_bytes_or_str_buf(request):
assert False


# this test requires a version of json-stream that supports park_cursor()
@pytest.mark.parametrize(
"s,expected_cursor_pos",
[
Expand All @@ -29,9 +30,38 @@ def to_bytes_or_str_buf(request):
('{ "a": [1, 2, 3, 4, 5 ], "d": 4, "xyz": 99999 } { "b": 2 }', 47),
],
)
def test_overconsumption_multiple_documents(
def test_overconsumption_load_ends_at_doc_end(
    s, expected_cursor_pos, to_bytes_or_str_buf
):
    """Consuming one whole JSON document via load() must leave the buffer's
    cursor exactly at the end of that document — i.e. the tokenizer must not
    overconsume input belonging to whatever follows the first document.

    s: input containing (at least) one JSON document
    expected_cursor_pos: where buf.tell() must point after full consumption
    to_bytes_or_str_buf: fixture wrapping s in a StringIO or BytesIO
    """
    buf = to_bytes_or_str_buf(s)
    # drain the lazy load() result so the entire first document gets parsed
    for _ in load(buf):
        pass
    assert buf.tell() == expected_cursor_pos


@pytest.mark.parametrize(
    "s,expected_str_cursor_pos,expected_bytes_cursor_pos",
    [
        ('{ "a": 1 } | { "b": 2 }', 10, 10),
        ('{"a": 1} | { "b": 2 }', 8, 8),
        ('{"a":1} | { "b": 2 }', 7, 7),
        ('{ "a":1, "b": 2, "c": 3, "d": 4, "xyz": 9 } | { "b": 2 }', 43, 43),
        ('{ "æ": [1, 2, 3, 4, 5 ], "ð": 4, "xyz": 9 } | { "b": 2 }', 43, 45),
    ],
)
def test_overconsumption_park_cursor_skip_3_chars_and_continue(
    s, expected_str_cursor_pos, expected_bytes_cursor_pos, to_bytes_or_str_buf
):
    """Tokenize up to the end of the first document, park the cursor, and
    check the underlying buffer position; then skip the separator and resume
    tokenizing the second document with the same tokenizer.

    The expected position is parametrized twice because a str buffer counts
    characters while a bytes buffer counts bytes — the rows containing
    multi-byte UTF-8 characters ("æ", "ð") differ between the two.
    """
    buf = to_bytes_or_str_buf(s)
    tokenizer = RustTokenizer(buf)
    # consume tokens up to and including the 1st document's closing brace
    for _kind, token in tokenizer:
        if token == "}":
            break
    tokenizer.park_cursor()
    # which expected tell() applies depends on the buffer flavor
    if isinstance(buf, StringIO):
        assert buf.tell() == expected_str_cursor_pos
    elif isinstance(buf, BytesIO):
        assert buf.tell() == expected_bytes_cursor_pos
    else:
        assert False, "what"
    buf.read(3)  # skip ahead 3 chars
    # the same tokenizer instance must now yield the 2nd document's tokens
    assert "".join(str(val) for kind, val in tokenizer) == "{b:2}"

0 comments on commit 7396e2e

Please sign in to comment.