forked from chapel-lang/chapel
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Clarify where fileReader offset is after reading invalid UTF-8 (chape…
…l-lang#24807) This PR adds a test to lock in the behavior and updates the I/O documentation to clarify the current behavior. Reviewed by @jeremiah-corrado - thanks!
- Loading branch information
Showing
3 changed files
with
105 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
use IO, CTypes; | ||
|
||
// This test checks that invalid UTF-8 errors from reading one | ||
// codepoint at a time leave the fileReader in the expected position. | ||
// At the time of this writing, that is just past the first byte | ||
// that is problematic. | ||
|
||
// When set to true, the test prints each offset and codepoint it reads, | ||
// plus the caught error and the final reader position. | ||
config const verbose = false; | ||
|
||
// Externally defined variable; presumably the I/O buffer size used by the | ||
// runtime (TODO confirm). The test overwrites it to vary the buffer size. | ||
extern var qbytes_iobuf_size:c_size_t; | ||
|
||
// Value of qbytes_iobuf_size at startup, kept so it can be restored later. | ||
var defaultBufSz = qbytes_iobuf_size: int ; | ||
|
||
// Writes b"abc" + invalid + b"xyz" to an in-memory file, then reads it back | ||
// one codepoint at a time with qbytes_iobuf_size set to bufsz. | ||
// | ||
// invalid:      byte sequence expected to trigger a decoding error | ||
// expectOffset: expected reader offset when the error is caught, counted | ||
//               from the start of `invalid` (the 3 bytes of "abc" are added) | ||
// bufsz:        value assigned to qbytes_iobuf_size for this run; it is not | ||
//               restored here, so callers reset it when they are done | ||
// | ||
// Fails an assertion if the offset is wrong or if no error is raised at all. | ||
proc testWithBuf(invalid: bytes, expectOffset: int, bufsz: int = defaultBufSz) { | ||
  qbytes_iobuf_size = bufsz: c_size_t; | ||
|
||
  var f = openMemFile(); | ||
  f.writer(locking=false).write(b"abc" + invalid + b"xyz"); | ||
|
||
  var r = f.reader(locking=false); | ||
  var sawError = false; | ||
  try { | ||
    var codepoint: int; | ||
    while true { | ||
      if verbose then | ||
        writef("offset %i ", r.offset()); | ||
      // The ref-argument overload of readCodepoint signals EOF by returning | ||
      // false instead of throwing; stop here so the loop cannot spin | ||
      // forever if the data was (unexpectedly) decoded without an error. | ||
      if !r.readCodepoint(codepoint) then | ||
        break; | ||
      if verbose then | ||
        writef("codepoint %c (%xi)\n", codepoint, codepoint); | ||
    } | ||
  } catch e { | ||
    sawError = true; | ||
    if verbose { | ||
      writeln("caught expected error ", e); | ||
      writeln("reader position is ", r.offset()); | ||
    } | ||
    assert(r.offset() == expectOffset + 3); // 3 to pass "abc" | ||
  } | ||
|
||
  // Without this check, input that decodes cleanly would pass silently. | ||
  assert(sawError, "expected an error while reading invalid UTF-8"); | ||
} | ||
|
||
// Prints the invalid byte sequence in uppercase hexadecimal, then runs | ||
// testWithBuf once with the default buffer size and once for every buffer | ||
// size in 1..10. Finally resets qbytes_iobuf_size to defaultBufSz. | ||
proc test(invalid: bytes, expectOffset: int) { | ||
  const hex = invalid.toHexadecimal(uppercase=true); | ||
  writeln("testing ", hex); | ||
|
||
  // Leaving out bufsz selects testWithBuf's default, defaultBufSz. | ||
  testWithBuf(invalid, expectOffset); | ||
  for sz in 1..10 do | ||
    testWithBuf(invalid, expectOffset, sz); | ||
|
||
  qbytes_iobuf_size = defaultBufSz: c_size_t; | ||
} | ||
|
||
// these are the byte patterns for valid UTF-8 | ||
// 1 byte: 0xxxxxxx | ||
// 2 byte: 110xxxxx 10xxxxxx | ||
// 3 byte: 1110xxxx 10xxxxxx 10xxxxxx | ||
// 4 byte: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx | ||
|
||
// testing invalid UTF-8 patterns | ||
// Each call passes an invalid byte sequence and the reader offset expected | ||
// when the error is caught, measured from the start of that sequence. | ||
|
||
// 1 byte: | ||
// 11111111 -- FF -- not a legal start byte | ||
test(b"\xFF", 1); | ||
// 10001000 -- 88 -- unexpected continuation byte | ||
test(b"\x88", 1); | ||
|
||
// 2 byte: | ||
// 11000010 00100100 -- C2 24 -- (missing continuation) | ||
test(b"\xC2\x24", 2); | ||
|
||
// 3 byte: | ||
// 11100000 00100100 00100100 -- E0 24 24 -- (missing continuation) | ||
test(b"\xE0\x24\x24", 2); | ||
// 11100000 10100100 00100100 -- E0 A4 24 -- (missing continuation) | ||
test(b"\xE0\xA4\x24", 3); | ||
|
||
// 4 byte: (missing continuation) | ||
// 11110000 00100100 00100100 00100100 -- F0 24 24 24 | ||
test(b"\xF0\x24\x24\x24", 2); | ||
// 11110000 10010000 00100100 00100100 -- F0 90 24 24 | ||
test(b"\xF0\x90\x24\x24", 3); | ||
// 11110000 10010000 10001101 00100100 -- F0 90 8D 24 | ||
test(b"\xF0\x90\x8D\x24", 4); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
testing FF | ||
testing 88 | ||
testing C224 | ||
testing E02424 | ||
testing E0A424 | ||
testing F0242424 | ||
testing F0902424 | ||
testing F0908D24 |