Skip to content

Commit

Permalink
Add tokenizer-only overconsumption tests, fix bug
Browse files Browse the repository at this point in the history
  • Loading branch information
smheidrich committed Dec 16, 2022
1 parent a7f347f commit 7396e2e
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 10 deletions.
7 changes: 1 addition & 6 deletions src/suitable_bytes_stream.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,7 @@ impl Utf8CharSource for SuitableBytesStream {
{
Char::Eof => None,
Char::Char(c) => Some(c),
Char::NoData => {
return io::Result::Err(io::Error::new(
io::ErrorKind::Other,
"should never happen",
));
}
Char::NoData => None, // for us this means the same as EOF I guess?
},
)
}
Expand Down
4 changes: 2 additions & 2 deletions src/suitable_text_stream.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,8 @@ impl ParkCursorChars for SuitableTextStream {
let chars_read_from_buf = self.chars_read_from_buf;
if let Some(buf_start_seek_pos) = self.buf_start_seek_pos {
self.inner.seek(OpaqueSeekFrom::Start(buf_start_seek_pos))?;
let buf = self.inner.read_string(chars_read_from_buf)?;
self.chars_iter = buf.into_chars();
self.inner.read_string(chars_read_from_buf)?;
self.chars_iter = OwnedChars::from_string("".to_owned());
}
Ok(())
}
Expand Down
34 changes: 32 additions & 2 deletions tests/test_overconsumption.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import pytest

from json_stream_rs_tokenizer import load
from json_stream_rs_tokenizer import RustTokenizer, load


@pytest.fixture(params=["str", "bytes"])
Expand All @@ -19,6 +19,7 @@ def to_bytes_or_str_buf(request):
assert False


# this test requires a version of json-stream that supports park_cursor()
@pytest.mark.parametrize(
"s,expected_cursor_pos",
[
Expand All @@ -29,9 +30,38 @@ def to_bytes_or_str_buf(request):
('{ "a": [1, 2, 3, 4, 5 ], "d": 4, "xyz": 99999 } { "b": 2 }', 47),
],
)
def test_overconsumption_multiple_documents(
def test_overconsumption_load_ends_at_doc_end(
    s, expected_cursor_pos, to_bytes_or_str_buf
):
    """Consuming one whole JSON document via load() must leave the buffer's
    cursor exactly at the end of that document — i.e. the tokenizer must not
    overconsume input belonging to whatever follows the first document.

    s: input containing (at least) one JSON document
    expected_cursor_pos: where buf.tell() must point after full consumption
    to_bytes_or_str_buf: fixture wrapping s in a StringIO or BytesIO
    """
    buf = to_bytes_or_str_buf(s)
    # drain the lazy load() result so the entire first document gets parsed
    for _ in load(buf):
        pass
    assert buf.tell() == expected_cursor_pos


@pytest.mark.parametrize(
    "s,expected_str_cursor_pos,expected_bytes_cursor_pos",
    [
        ('{ "a": 1 } | { "b": 2 }', 10, 10),
        ('{"a": 1} | { "b": 2 }', 8, 8),
        ('{"a":1} | { "b": 2 }', 7, 7),
        ('{ "a":1, "b": 2, "c": 3, "d": 4, "xyz": 9 } | { "b": 2 }', 43, 43),
        ('{ "æ": [1, 2, 3, 4, 5 ], "ð": 4, "xyz": 9 } | { "b": 2 }', 43, 45),
    ],
)
def test_overconsumption_park_cursor_skip_3_chars_and_continue(
    s, expected_str_cursor_pos, expected_bytes_cursor_pos, to_bytes_or_str_buf
):
    """Tokenize up to the end of the first document, park the cursor, and
    check the underlying buffer position; then skip the separator and resume
    tokenizing the second document with the same tokenizer.

    The expected position is parametrized twice because a str buffer counts
    characters while a bytes buffer counts bytes — the rows containing
    multi-byte UTF-8 characters ("æ", "ð") differ between the two.
    """
    buf = to_bytes_or_str_buf(s)
    tokenizer = RustTokenizer(buf)
    # consume tokens up to and including the 1st document's closing brace
    for _kind, token in tokenizer:
        if token == "}":
            break
    tokenizer.park_cursor()
    # which expected tell() applies depends on the buffer flavor
    if isinstance(buf, StringIO):
        assert buf.tell() == expected_str_cursor_pos
    elif isinstance(buf, BytesIO):
        assert buf.tell() == expected_bytes_cursor_pos
    else:
        assert False, "what"
    buf.read(3)  # skip ahead 3 chars
    # the same tokenizer instance must now yield the 2nd document's tokens
    assert "".join(str(val) for kind, val in tokenizer) == "{b:2}"

0 comments on commit 7396e2e

Please sign in to comment.