vnmabus · trossi · Sep 9, 2024 · Sep 9, 2024 · Sep 9, 2024 · Sep 11, 2024
diff --git a/rdata/parser/_ascii.py b/rdata/parser/_ascii.py
@@ -60,15 +60,20 @@ def _parse_array_values(
         return array
 
     def parse_string(self, length: int) -> bytes:
-        # Non-ascii characters in strings are written using octal byte codes,
+        # Read the ascii string
+        s = self._readline()
+
+        # R escapes question marks ('?'), so they always appear as r'\?'.
+        # Let's start unescaping those.
+        s = s.replace(r"\?", "?")
+
+        # Non-ascii characters and space are written using octal byte codes,
         # for example, a string 'aä' (2 chars) in UTF-8 is written as an ascii
         # string r'a\303\244' (9 chars). We want to transform this to a byte
         # string b'a\303\244' (3 bytes) corresponding to the byte
         # representation of the original UTF-8 string.
         # Let's use this string as an example to go through the code below
 
-        # Read the ascii string
-        s = self._readline()
         # Now s = r'a\303\244' (9 chars)
 
         # Convert characters to bytes (all characters are ascii)

diff --git a/rdata/tests/data/test_ascii_ascii_chars.rds b/rdata/tests/data/test_ascii_ascii_chars.rds
@@ -0,0 +1,11 @@
+A
+3
+263168
+197888
+5
+UTF-8
+16
+1
+262153
+102
+0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&\'()*+,-./:;<=>\?@[\\]^_`{|}~\040\t\n\r\v\f\r\n
diff --git a/rdata/tests/data/test_ascii_chars.rds b/rdata/tests/data/test_ascii_chars.rds
diff --git a/rdata/tests/data/test_ascii_na_string.rds b/rdata/tests/data/test_ascii_na_string.rds
@@ -0,0 +1,10 @@
+A
+3
+263168
+197888
+5
+UTF-8
+16
+1
+9
+-1
diff --git a/rdata/tests/test_rdata.py b/rdata/tests/test_rdata.py
@@ -102,6 +102,13 @@ def test_na_string(self) -> None:
             "test_na_string": [None],
         })
 
+    def test_ascii_na_string(self) -> None:
+        """Test that the NA string is parsed correctly from an ascii file."""
+        # File created in R with
+        # saveRDS(as.character(NA), file="test_ascii_na_string.rds", ascii=TRUE, compress=FALSE)  # noqa: E501
+        data = rdata.read_rds(TESTDATA_PATH / "test_ascii_na_string.rds")
+        np.testing.assert_equal(data, [None])
+
     def test_complex(self) -> None:
         """Test that complex numbers can be parsed."""
         data = rdata.read_rda(TESTDATA_PATH / "test_complex.rda")
@@ -708,6 +715,20 @@ def test_ascii(self) -> None:
                 np.testing.assert_equal(ma.get_fill_value(),
                                         ref_ma.get_fill_value())
 
+    def test_ascii_characters(self) -> None:
+        """Test reading string with all ascii printable characters."""
+        # File created in R with
+        # saveRDS("0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\v\f\r\n", file="test_ascii_chars.rds")  # noqa: E501,ERA001
+        data = rdata.read_rds(TESTDATA_PATH / "test_ascii_chars.rds")
+        assert data == "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\v\f\r\n", data  # noqa: E501
+
+    def test_ascii_ascii_characters(self) -> None:
+        """Test reading all ascii printable characters from an ascii file."""
+        # File created in R with
+        # saveRDS("0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\v\f\r\n", file="test_ascii_ascii_chars.rds", ascii=TRUE, compress=FALSE)  # noqa: E501,ERA001
+        data = rdata.read_rds(TESTDATA_PATH / "test_ascii_ascii_chars.rds")
+        assert data == "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\v\f\r\n", data  # noqa: E501
+
     def test_nan_inf(self) -> None:
         """Test reading nan and inf."""
         data = rdata.read_rds(TESTDATA_PATH / "test_nan_inf.rds")

diff --git a/rdata/unparser/_ascii.py b/rdata/unparser/_ascii.py
@@ -68,16 +68,50 @@ def _unparse_array_values(self, array: npt.NDArray[Any]) -> None:
 
             self._add_line(line)
 
-    def unparse_string(self, value: bytes) -> None:
-        """Unparse a string."""
-        self.unparse_int(len(value))
-
+    def _unparse_string_characters(self, value: bytes) -> None:
         # Ideally we could do here the reverse of parsing,
-        # i.e., value = value.decode('latin1').encode('unicode_escape').decode('ascii')
+        # i.e., output = value.decode('latin1').encode('unicode_escape').decode('ascii')
         # This would produce byte representation in hex such as '\xc3\xa4',
         # but we need to have the equivalent octal presentation '\303\244'.
-        # So, we do somewhat manual conversion instead:
-        s = "".join(chr(byte) if chr(byte) in string.printable else rf"\{byte:03o}"
-                    for byte in value)
+        # So, we need to do somewhat manual conversion instead.
+
+        # List of ascii characters that are written directly;
+        # this is all printable ascii except
+        # - ' '  that Python writes as ' ',    but R as '\040'
+        # - '\v' that Python writes as '\x0b', but R as '\v'
+        # - '\f' that Python writes as '\x0c', but R as '\f'
+        write_raw = string.printable.replace(" ", "")\
+                                    .replace("\v", "")\
+                                    .replace("\f", "")
+
+        def escape(b: bytes) -> str:
+            r"""Escape string, e.g., b'\n' -> r'\\n'."""
+            return b.decode("latin1").encode("unicode_escape").decode("ascii")
+
+        # Go through the string byte-by-byte as we need to convert
+        # every character that is not written raw separately
+        output = ""
+        ascii_buffer = b""
+        for byte in value:
+            if chr(byte) in write_raw:
+                # Collect ascii characters to substring buffer
+                ascii_buffer += bytes([byte])
+            else:
+                # Encountered a character that is not written raw!
+                # Escape and add the ascii buffer
+                output += escape(ascii_buffer)
+                ascii_buffer = b""
+                # Add '\v', '\f', or the octal code of any other character
+                if chr(byte) == "\v":
+                    output += r"\v"
+                elif chr(byte) == "\f":
+                    output += r"\f"
+                else:
+                    output += rf"\{byte:03o}"
+        # Escape and add the remaining ascii buffer
+        output += escape(ascii_buffer)
+
+        # Escape some more characters like R does
+        output = output.replace('"', r'\"').replace("'", r"\'").replace("?", r"\?")
 
-        self._add_line(s)
+        self._add_line(output)
diff --git a/rdata/unparser/_unparser.py b/rdata/unparser/_unparser.py
@@ -73,9 +73,17 @@ def unparse_array(self, array: npt.NDArray[Any]) -> None:
     def _unparse_array_values(self, array: npt.NDArray[Any]) -> None:
         """Unparse the values of an array."""
 
-    @abc.abstractmethod
-    def unparse_string(self, value: bytes) -> None:
+    def unparse_string(self, value: bytes | None) -> None:
         """Unparse a string."""
+        if value is None:
+            self.unparse_int(-1)
+            return
+        self.unparse_int(len(value))
+        self._unparse_string_characters(value)
+
+    @abc.abstractmethod
+    def _unparse_string_characters(self, value: bytes) -> None:
+        """Unparse characters of a string (not None)."""
 
     def unparse_r_data(self, r_data: RData) -> None:
         """Unparse an RData object."""

diff --git a/rdata/unparser/_xdr.py b/rdata/unparser/_xdr.py
@@ -56,10 +56,5 @@ def _unparse_array_values(self, array: npt.NDArray[Any]) -> None:
         data = array.data if array.flags["C_CONTIGUOUS"] else array.tobytes()
         self.file.write(data)
 
-    def unparse_string(self, value: bytes) -> None:
-        """Unparse a string."""
-        if value is None:
-            self.unparse_int(-1)
-        else:
-            self.unparse_int(len(value))
-            self.file.write(value)
+    def _unparse_string_characters(self, value: bytes) -> None:
+        self.file.write(value)