Skip to content

Commit

Permalink
Clarify where fileReader offset is after reading invalid UTF-8 (chape…
Browse files Browse the repository at this point in the history
…l-lang#24807)

This PR adds a test to lock in the behavior and updates the I/O
documentation to clarify the current behavior.

Reviewed by @jeremiah-corrado - thanks!
  • Loading branch information
mppf authored Apr 10, 2024
2 parents 134020f + d660ab1 commit f0a23dc
Show file tree
Hide file tree
Showing 3 changed files with 105 additions and 0 deletions.
19 changes: 19 additions & 0 deletions modules/standard/IO.chpl
Original file line number Diff line number Diff line change
Expand Up @@ -511,6 +511,25 @@ operating system streams standard input, standard output, and standard error.
All three are safe to use concurrently.
Unicode Support
---------------
Most I/O operations default to working with textual data in the UTF-8 encoding.
This choice of UTF-8 matches the encoding used by the ``string`` type (see
:ref:`Chapter-Strings`).
To work with non-UTF-8 data, it's necessary to use binary I/O routines (e.g.
:proc:`fileReader.readByte`, :proc:`fileReader.readBytes`,
:proc:`fileReader.readBinary`, :proc:`fileReader.readBits`) or do I/O with a
serializer or deserializer that uses a binary format, such as
:record:`~IO.binaryDeserializer`.

Generally speaking, if invalid UTF-8 is encountered when reading textual data, a
``SystemError`` will be thrown with ``EILSEQ`` and the channel position will be
left just after the first byte of UTF-8 that was determined to be invalid. Some
routines have other error handling behavior as described in their documentation
(for example, see :proc:`fileReader.readThrough`).

.. _about-io-error-handling:

Error Handling
Expand Down
78 changes: 78 additions & 0 deletions test/io/ferguson/utf8/offset-after-invalid-read.chpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
use IO, CTypes;

// This test checks that invalid UTF-8 errors from reading a
// character-at-a-time leave the channel in the expected position.
// At the time of this writing, that is just after the first byte
// that was determined to be invalid.

// When true, testWithBuf prints each offset/codepoint it reads and the
// error it catches.
config const verbose = false;

// Runtime global overwritten by the tests below.
// NOTE(review): presumably the size of the runtime's I/O buffers -- confirm
// against the qio runtime sources.
extern var qbytes_iobuf_size:c_size_t;

// Value of qbytes_iobuf_size captured at startup, before any test changes it.
// Never reassigned, so it is declared const.
const defaultBufSz = qbytes_iobuf_size: int;

// Writes b"abc" + invalid + b"xyz" to an in-memory file, reads it back one
// codepoint at a time until a read throws, and asserts that the reader's
// offset at that point is expectOffset bytes past the "abc" prefix.
//
//   invalid:      byte sequence expected to trigger an error while reading
//   expectOffset: expected reader offset after the error, counted from the
//                 first byte of `invalid`
//   bufsz:        value stored into qbytes_iobuf_size before the file is
//                 opened
//
// The assertions fail if the offset is wrong or if no error is thrown at all.
proc testWithBuf(invalid: bytes, expectOffset: int, bufsz: int = defaultBufSz) {
  qbytes_iobuf_size = bufsz: c_size_t;

  var f = openMemFile();
  f.writer(locking=false).write(b"abc" + invalid + b"xyz");

  var r = f.reader(locking=false);
  var sawError = false;
  try {
    var codepoint: int;
    while true {
      if verbose then
        writef("offset %i ", r.offset());
      // readCodepoint reports EOF by returning false; stop there instead of
      // looping forever when the expected error never shows up.
      if !r.readCodepoint(codepoint) then
        break;
      if verbose then
        writef("codepoint %c (%xi)\n", codepoint, codepoint);
    }
  } catch e {
    sawError = true;
    if verbose {
      writeln("caught expected error ", e);
      writeln("reader position is ", r.offset());
    }
    assert(r.offset() == expectOffset + 3); // 3 to pass "abc"
  }

  // Reaching EOF without an error means the invalid bytes were accepted.
  assert(sawError);
}

// Prints the sequence under test in uppercase hex, runs testWithBuf once with
// the default buffer size and once for each size in 1..10, and finally puts
// qbytes_iobuf_size back to the default.
proc test(invalid: bytes, expectOffset: int) {
  const hex = invalid.toHexadecimal(uppercase=true);
  writeln("testing ", hex);

  testWithBuf(invalid, expectOffset, bufsz=defaultBufSz);
  for sz in 1..10 do
    testWithBuf(invalid, expectOffset, bufsz=sz);

  // testWithBuf leaves the last size it was given in place; restore it.
  qbytes_iobuf_size = defaultBufSz: c_size_t;
}

// These are the byte patterns for valid UTF-8:
//   1 byte: 0xxxxxxx
//   2 byte: 110xxxxx 10xxxxxx
//   3 byte: 1110xxxx 10xxxxxx 10xxxxxx
//   4 byte: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

// Invalid UTF-8 patterns to test. Each entry pairs a byte sequence with the
// reader offset expected after the error, counted from the first byte of the
// sequence.
const invalidCases = [
  // 1 byte:
  // 11111111 -- FF -- not a legal start byte
  (b"\xFF", 1),
  // 10001000 -- 88 -- unexpected continuation byte
  (b"\x88", 1),

  // 2 byte:
  // 11000010 00100100 -- C2 24 -- (missing continuation)
  (b"\xC2\x24", 2),

  // 3 byte:
  // 11100000 00100100 00100100 -- E0 24 24 -- (missing continuation)
  (b"\xE0\x24\x24", 2),
  // 11100000 10100100 00100100 -- E0 A4 24 -- (missing continuation)
  (b"\xE0\xA4\x24", 3),

  // 4 byte: (missing continuation)
  // 11110000 00100100 00100100 00100100 -- F0 24 24 24
  (b"\xF0\x24\x24\x24", 2),
  // 11110000 10010000 00100100 00100100 -- F0 90 24 24
  (b"\xF0\x90\x24\x24", 3),
  // 11110000 10010000 10001101 00100100 -- F0 90 8D 24
  (b"\xF0\x90\x8D\x24", 4)
];

// Run the cases in the order listed above.
for (pattern, offsetAfterError) in invalidCases do
  test(pattern, offsetAfterError);
8 changes: 8 additions & 0 deletions test/io/ferguson/utf8/offset-after-invalid-read.good
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
testing FF
testing 88
testing C224
testing E02424
testing E0A424
testing F0242424
testing F0902424
testing F0908D24

0 comments on commit f0a23dc

Please sign in to comment.