Clarify where fileReader offset is after reading invalid UTF-8 (chape…

…l-lang#24807) This PR adds a test to lock in the behavior and updates the I/O documentation to clarify the current behavior. Reviewed by @jeremiah-corrado - thanks!
arezaii · Apr 10, 2024 · f0a23dc · f0a23dc
2 parents 134020f + d660ab1
commit f0a23dc
Show file tree

Hide file tree

Showing 3 changed files with 105 additions and 0 deletions.
diff --git a/modules/standard/IO.chpl b/modules/standard/IO.chpl
@@ -511,6 +511,25 @@ operating system streams standard input, standard output, and standard error.
 
 All three are safe to use concurrently.
 
+Unicode Support
+---------------
+
+Most I/O operations default to working with textual data in the UTF-8 encoding.
+This choice of UTF-8 matches the encoding used by the ``string`` type (see
+:ref:`Chapter-Strings`).
+
+To work with non-UTF-8 data, it's necessary to use binary I/O routines (e.g.
+:proc:`fileReader.readByte`, :proc:`fileReader.readBytes`,
+:proc:`fileReader.readBinary`, :proc:`fileReader.readBits`) or do I/O with a
+serializer or deserializer that uses a binary format, such as
+:record:`~IO.binaryDeserializer`.
+
+Generally speaking, if invalid UTF-8 is encountered when reading textual data, a
+``SystemError`` will be thrown with ``EILSEQ`` and the channel position will be
+left just after the first byte of UTF-8 that was determined to be invalid. Some
+routines have other error handling behavior as described in their documentation
+(for example, see :proc:`fileReader.readThrough`).
+
 .. _about-io-error-handling:
 
 Error Handling

diff --git a/test/io/ferguson/utf8/offset-after-invalid-read.chpl b/test/io/ferguson/utf8/offset-after-invalid-read.chpl
@@ -0,0 +1,78 @@
+use IO, CTypes;
+
+// This test checks that invalid UTF-8 errors from reading a
+// character-at-a-time leave the channel in the expected position.
+// At the time of this writing, that is just after the first byte
+// that is problematic.
+
+config const verbose = false; // when true, trace offsets, codepoints, and the caught error
+
+extern var qbytes_iobuf_size:c_size_t; // declared externally; presumably the runtime's I/O buffer size (TODO confirm)
+
+var defaultBufSz = qbytes_iobuf_size: int ; // initial value, saved so test() can restore it
+
+proc testWithBuf(invalid: bytes, expectOffset: int, bufsz: int = defaultBufSz) { // checks the reader offset after decoding fails
+  qbytes_iobuf_size = bufsz: c_size_t; // overwrite the extern buffer-size variable before opening the file
+
+  var f = openMemFile();
+  f.writer(locking=false).write(b"abc" + invalid + b"xyz"); // ASCII on both sides of the invalid bytes
+
+  var r = f.reader(locking=false);
+  try {
+    var codepoint: int;
+    while true { // no break: the loop ends only when a read throws
+      if verbose then 
+        writef("offset %i ", r.offset());
+      r.readCodepoint(codepoint);
+      if verbose then
+        writef("codepoint %c (%xi)\n", codepoint, codepoint);
+    }
+  } catch e { // catches any error; the offset assertion below is the actual check
+    if verbose {
+      writeln("caught expected error ", e);
+      writeln("reader position is ", r.offset());
+    }
+    assert(r.offset() == expectOffset + 3); // expectOffset is relative to `invalid`; add 3 to pass "abc"
+  }
+}
+
+proc test(invalid: bytes, expectOffset: int) { // runs testWithBuf at the default buffer size, then at sizes 1..10
+  writeln("testing ", invalid.toHexadecimal(uppercase=true)); // prints the "testing <HEX>" line for this pattern
+  testWithBuf(invalid, expectOffset, defaultBufSz);
+  for buffsz in 1..10 { // presumably so the invalid bytes straddle buffer boundaries (TODO confirm)
+    testWithBuf(invalid, expectOffset, buffsz);
+  }
+  qbytes_iobuf_size = defaultBufSz: c_size_t; // restore the saved default after testWithBuf changed it
+}
+
+// these are the byte patterns for valid UTF-8
+// 1 byte: 0xxxxxxx
+// 2 byte: 110xxxxx 10xxxxxx
+// 3 byte: 1110xxxx 10xxxxxx 10xxxxxx
+// 4 byte: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+
+// testing invalid UTF-8 patterns; 2nd argument = expected offset after the error, relative to the pattern start
+
+// 1 byte:
+//   11111111 -- FF -- not a legal start byte
+test(b"\xFF", 1);
+//   10001000 -- 88 -- unexpected continuation byte
+test(b"\x88", 1);
+
+// 2 byte:
+//   11000010 00100100 -- C2 24 -- (missing continuation)
+test(b"\xC2\x24", 2);
+
+// 3 byte:
+//   11100000 00100100 00100100 -- E0 24 24 -- (missing continuation)
+test(b"\xE0\x24\x24", 2);
+//   11100000 10100100 00100100 -- E0 A4 24 -- (missing continuation)
+test(b"\xE0\xA4\x24", 3);
+
+// 4 byte: (missing continuation)
+//   11110000 00100100 00100100 00100100 -- F0 24 24 24
+test(b"\xF0\x24\x24\x24", 2);
+//   11110000 10010000 00100100 00100100 -- F0 90 24 24
+test(b"\xF0\x90\x24\x24", 3);
+//   11110000 10010000 10001101 00100100 -- F0 90 8D 24
+test(b"\xF0\x90\x8D\x24", 4);
diff --git a/test/io/ferguson/utf8/offset-after-invalid-read.good b/test/io/ferguson/utf8/offset-after-invalid-read.good
@@ -0,0 +1,8 @@
+testing FF
+testing 88
+testing C224
+testing E02424
+testing E0A424
+testing F0242424
+testing F0902424
+testing F0908D24