From 6c0849ac7224ad637dafa63fa561f400d1957378 Mon Sep 17 00:00:00 2001
From: Tuomas Rossi <tuomas.rossi@csc.fi>
Date: Mon, 9 Sep 2024 09:54:48 +0300
Subject: [PATCH 1/7] Add test for all ascii characters

---
 rdata/tests/data/test_ascii_ascii_chars.rds |  11 +++++++++++
 rdata/tests/data/test_ascii_chars.rds       | Bin 0 -> 154 bytes
 rdata/tests/test_rdata.py                   |  10 ++++++++++
 3 files changed, 21 insertions(+)
 create mode 100644 rdata/tests/data/test_ascii_ascii_chars.rds
 create mode 100644 rdata/tests/data/test_ascii_chars.rds

diff --git a/rdata/tests/data/test_ascii_ascii_chars.rds b/rdata/tests/data/test_ascii_ascii_chars.rds
new file mode 100644
index 0000000..0ea9427
--- /dev/null
+++ b/rdata/tests/data/test_ascii_ascii_chars.rds
@@ -0,0 +1,11 @@
+A
+3
+263168
+197888
+5
+UTF-8
+16
+1
+262153
+102
+0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&\'()*+,-./:;<=>\?@[\\]^_`{|}~\040\t\n\r\v\f\r\n
diff --git a/rdata/tests/data/test_ascii_chars.rds b/rdata/tests/data/test_ascii_chars.rds
new file mode 100644
index 0000000000000000000000000000000000000000..2922e49b20c2837ad04952e82038c6e9f3dd5439
GIT binary patch
literal 154
zcmV;L0A>FliwFP!000001B>8dU|?WoU||80tUx9MYiNj@t_6@G0K|+8EDW4LI?ce)
z$k@cx%-q5<F)2AEH7z|OGb=kMH!r`Su&B7Cw5+_MvdYoP*~Qh(-NVz%+sD_>KOitD
zI3zSIJR&kmQAt@vRZU$(Q%hS%S5M!{+Q!z--XS_BHZDG)x~8^Hfs>1un}?SR0M41(
I(v1KB02MStE&u=k

literal 0
HcmV?d00001

diff --git a/rdata/tests/test_rdata.py b/rdata/tests/test_rdata.py
index ccffead..53c3f29 100644
--- a/rdata/tests/test_rdata.py
+++ b/rdata/tests/test_rdata.py
@@ -708,6 +708,16 @@ def test_ascii(self) -> None:
                 np.testing.assert_equal(ma.get_fill_value(),
                                         ref_ma.get_fill_value())
 
+    def test_ascii_characters(self) -> None:
+        """Test reading string with all ascii printable characters."""
+        data = rdata.read_rds(TESTDATA_PATH / "test_ascii_chars.rds")
+        assert data == "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\v\f\r\n", data
+
+    def test_ascii_ascii_characters(self) -> None:
+        """Test reading string with all ascii printable characters."""
+        data = rdata.read_rds(TESTDATA_PATH / "test_ascii_ascii_chars.rds")
+        assert data == "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\v\f\r\n", data
+
     def test_nan_inf(self) -> None:
         """Test reading nan and inf."""
         data = rdata.read_rds(TESTDATA_PATH / "test_nan_inf.rds")

From 3645e149b60d929afe739cff272323eee2ab3bfc Mon Sep 17 00:00:00 2001
From: Tuomas Rossi <tuomas.rossi@csc.fi>
Date: Mon, 9 Sep 2024 09:55:35 +0300
Subject: [PATCH 2/7] Unescape question mark

---
 rdata/parser/_ascii.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/rdata/parser/_ascii.py b/rdata/parser/_ascii.py
index 15f59a7..22afa7e 100644
--- a/rdata/parser/_ascii.py
+++ b/rdata/parser/_ascii.py
@@ -60,15 +60,20 @@ def _parse_array_values(
         return array
 
     def parse_string(self, length: int) -> bytes:
-        # Non-ascii characters in strings are written using octal byte codes,
+        # Read the ascii string
+        s = self._readline()
+
+        # R escapes question marks ('?'), so they always appear as r'\?'.
+        # Start by unescaping those.
+        s = s.replace(r"\?", "?")
+
+        # Non-ascii characters and space are written using octal byte codes,
         # for example, a string 'aä' (2 chars) in UTF-8 is written as an ascii
         # string r'a\303\244' (9 chars). We want to transform this to a byte
         # string b'a\303\244' (3 bytes) corresponding to the byte
         # representation of the original UTF-8 string.
         # Let's use this string as an example to go through the code below
 
-        # Read the ascii string
-        s = self._readline()
         # Now s = r'a\303\244' (9 chars)
 
         # Convert characters to bytes (all characters are ascii)

From 26a626caf90a82082704e0457c1b8429621e6cec Mon Sep 17 00:00:00 2001
From: Tuomas Rossi <tuomas.rossi@csc.fi>
Date: Mon, 9 Sep 2024 10:33:15 +0300
Subject: [PATCH 3/7] Fix escaping various characters for R output

---
 rdata/unparser/_ascii.py | 45 +++++++++++++++++++++++++++++++++++-----
 1 file changed, 40 insertions(+), 5 deletions(-)

diff --git a/rdata/unparser/_ascii.py b/rdata/unparser/_ascii.py
index a20f8fc..bc1e2cc 100644
--- a/rdata/unparser/_ascii.py
+++ b/rdata/unparser/_ascii.py
@@ -73,11 +73,46 @@ def unparse_string(self, value: bytes) -> None:
         self.unparse_int(len(value))
 
         # Ideally we could do here the reverse of parsing,
-        # i.e., value = value.decode('latin1').encode('unicode_escape').decode('ascii')
+        # i.e., output = value.decode('latin1').encode('unicode_escape').decode('ascii')
         # This would produce byte representation in hex such as '\xc3\xa4',
         # but we need to have the equivalent octal presentation '\303\244'.
-        # So, we do somewhat manual conversion instead:
-        s = "".join(chr(byte) if chr(byte) in string.printable else rf"\{byte:03o}"
-                    for byte in value)
+        # So, we need to do somewhat manual conversion instead.
+
+        # List of ascii characters that are written directly;
+        # this is all printable ascii except
+        # - ' '  that Python writes as ' ',    but R as '\040'
+        # - '\v' that Python writes as '\x0b', but R as '\v'
+        # - '\f' that Python writes as '\x0c', but R as '\f'
+        write_raw = string.printable.replace(' ', '').replace('\v', '').replace('\f', '')
+
+        def escape(b: bytes) -> str:
+            """Escape string, e.g., b'\n' -> r'\\n'"""
+            return b.decode('latin1').encode('unicode_escape').decode('ascii')
+
+        # Go though the string byte-by-byte as we need to
+        # convert every non-ascii character separately
+        output = ""
+        ascii_buffer = b""
+        for byte in value:
+            if chr(byte) in write_raw:
+                # Collect ascii characters to substring buffer
+                ascii_buffer += bytes([byte])
+            else:
+                # Encountered a byte that is not written raw (see above):
+                # escape and flush the collected ascii buffer first
+                output += escape(ascii_buffer)
+                ascii_buffer = b""
+                # Add '\v', '\f', or any other byte in octal representation
+                if chr(byte) == "\v":
+                    output += r"\v"
+                elif chr(byte) == "\f":
+                    output += r"\f"
+                else:
+                    output += rf"\{byte:03o}"
+        # Escape and add the remaining ascii buffer
+        output += escape(ascii_buffer)
+
+        # Escape some more characters like R does
+        output = output.replace('"', r'\"').replace("'", r"\'").replace("?", r"\?")
 
-        self._add_line(s)
+        self._add_line(output)

From 4c825b4002b723f619d1f1b03c2b536f44eea976 Mon Sep 17 00:00:00 2001
From: Tuomas Rossi <tuomas.rossi@csc.fi>
Date: Wed, 11 Sep 2024 15:45:10 +0300
Subject: [PATCH 4/7] Fix ruff

---
 rdata/tests/test_rdata.py | 4 ++--
 rdata/unparser/_ascii.py  | 8 +++++---
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/rdata/tests/test_rdata.py b/rdata/tests/test_rdata.py
index 53c3f29..5783d19 100644
--- a/rdata/tests/test_rdata.py
+++ b/rdata/tests/test_rdata.py
@@ -711,12 +711,12 @@ def test_ascii(self) -> None:
     def test_ascii_characters(self) -> None:
         """Test reading string with all ascii printable characters."""
         data = rdata.read_rds(TESTDATA_PATH / "test_ascii_chars.rds")
-        assert data == "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\v\f\r\n", data
+        assert data == "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\v\f\r\n", data  # noqa: E501
 
     def test_ascii_ascii_characters(self) -> None:
         """Test reading string with all ascii printable characters."""
         data = rdata.read_rds(TESTDATA_PATH / "test_ascii_ascii_chars.rds")
-        assert data == "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\v\f\r\n", data
+        assert data == "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\v\f\r\n", data  # noqa: E501
 
     def test_nan_inf(self) -> None:
         """Test reading nan and inf."""
diff --git a/rdata/unparser/_ascii.py b/rdata/unparser/_ascii.py
index bc1e2cc..50d9bc0 100644
--- a/rdata/unparser/_ascii.py
+++ b/rdata/unparser/_ascii.py
@@ -83,11 +83,13 @@ def unparse_string(self, value: bytes) -> None:
         # - ' '  that Python writes as ' ',    but R as '\040'
         # - '\v' that Python writes as '\x0b', but R as '\v'
         # - '\f' that Python writes as '\x0c', but R as '\f'
-        write_raw = string.printable.replace(' ', '').replace('\v', '').replace('\f', '')
+        write_raw = string.printable.replace(" ", "")\
+                                    .replace("\v", "")\
+                                    .replace("\f", "")
 
         def escape(b: bytes) -> str:
-            """Escape string, e.g., b'\n' -> r'\\n'"""
-            return b.decode('latin1').encode('unicode_escape').decode('ascii')
+            r"""Escape string, e.g., b'\n' -> r'\\n'."""
+            return b.decode("latin1").encode("unicode_escape").decode("ascii")
 
         # Go though the string byte-by-byte as we need to
         # convert every non-ascii character separately

From 77d726dd2779a3b46ac3e8db79f536582271e64f Mon Sep 17 00:00:00 2001
From: Tuomas Rossi <tuomas.rossi@csc.fi>
Date: Fri, 13 Sep 2024 12:56:23 +0300
Subject: [PATCH 5/7] Add test for ascii file with NA string

---
 rdata/tests/data/test_ascii_na_string.rds | 10 ++++++++++
 rdata/tests/test_rdata.py                 |  7 +++++++
 2 files changed, 17 insertions(+)
 create mode 100644 rdata/tests/data/test_ascii_na_string.rds

diff --git a/rdata/tests/data/test_ascii_na_string.rds b/rdata/tests/data/test_ascii_na_string.rds
new file mode 100644
index 0000000..f1ef747
--- /dev/null
+++ b/rdata/tests/data/test_ascii_na_string.rds
@@ -0,0 +1,10 @@
+A
+3
+263168
+197888
+5
+UTF-8
+16
+1
+9
+-1
diff --git a/rdata/tests/test_rdata.py b/rdata/tests/test_rdata.py
index 5783d19..6d925ce 100644
--- a/rdata/tests/test_rdata.py
+++ b/rdata/tests/test_rdata.py
@@ -102,6 +102,13 @@ def test_na_string(self) -> None:
             "test_na_string": [None],
         })
 
+    def test_ascii_na_string(self) -> None:
+        """Test that the NA string is parsed correctly."""
+        # File created in R with
+        # saveRDS(as.character(NA), file="test_ascii_na_string.rds", ascii=TRUE, compress=FALSE)  # noqa: E501
+        data = rdata.read_rds(TESTDATA_PATH / "test_ascii_na_string.rds")
+        np.testing.assert_equal(data, [None])
+
     def test_complex(self) -> None:
         """Test that complex numbers can be parsed."""
         data = rdata.read_rda(TESTDATA_PATH / "test_complex.rda")

From 833db9a9d51e63f5aeabf91b4194766378efcd00 Mon Sep 17 00:00:00 2001
From: Tuomas Rossi <tuomas.rossi@csc.fi>
Date: Fri, 13 Sep 2024 13:00:08 +0300
Subject: [PATCH 6/7] Fix ascii unparser for NA string

---
 rdata/unparser/_ascii.py    |  5 +----
 rdata/unparser/_unparser.py | 12 ++++++++++--
 rdata/unparser/_xdr.py      |  9 ++-------
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/rdata/unparser/_ascii.py b/rdata/unparser/_ascii.py
index 50d9bc0..5aba32e 100644
--- a/rdata/unparser/_ascii.py
+++ b/rdata/unparser/_ascii.py
@@ -68,10 +68,7 @@ def _unparse_array_values(self, array: npt.NDArray[Any]) -> None:
 
             self._add_line(line)
 
-    def unparse_string(self, value: bytes) -> None:
-        """Unparse a string."""
-        self.unparse_int(len(value))
-
+    def _unparse_string_characters(self, value: bytes) -> None:
         # Ideally we could do here the reverse of parsing,
         # i.e., output = value.decode('latin1').encode('unicode_escape').decode('ascii')
         # This would produce byte representation in hex such as '\xc3\xa4',
diff --git a/rdata/unparser/_unparser.py b/rdata/unparser/_unparser.py
index 7361b65..b2b073e 100644
--- a/rdata/unparser/_unparser.py
+++ b/rdata/unparser/_unparser.py
@@ -73,9 +73,17 @@ def unparse_array(self, array: npt.NDArray[Any]) -> None:
     def _unparse_array_values(self, array: npt.NDArray[Any]) -> None:
         """Unparse the values of an array."""
 
-    @abc.abstractmethod
-    def unparse_string(self, value: bytes) -> None:
+    def unparse_string(self, value: bytes | None) -> None:
         """Unparse a string."""
+        if value is None:
+            self.unparse_int(-1)
+            return
+        self.unparse_int(len(value))
+        self._unparse_string_characters(value)
+
+    @abc.abstractmethod
+    def _unparse_string_characters(self, value: bytes) -> None:
+        """Unparse characters of a string (not None)."""
 
     def unparse_r_data(self, r_data: RData) -> None:
         """Unparse an RData object."""
diff --git a/rdata/unparser/_xdr.py b/rdata/unparser/_xdr.py
index 8bea3f0..742aa87 100644
--- a/rdata/unparser/_xdr.py
+++ b/rdata/unparser/_xdr.py
@@ -56,10 +56,5 @@ def _unparse_array_values(self, array: npt.NDArray[Any]) -> None:
         data = array.data if array.flags["C_CONTIGUOUS"] else array.tobytes()
         self.file.write(data)
 
-    def unparse_string(self, value: bytes) -> None:
-        """Unparse a string."""
-        if value is None:
-            self.unparse_int(-1)
-        else:
-            self.unparse_int(len(value))
-            self.file.write(value)
+    def _unparse_string_characters(self, value: bytes) -> None:
+        self.file.write(value)

From 2d8c1f4ce0e709cf83c594c068fca8a062e5a2a4 Mon Sep 17 00:00:00 2001
From: Tuomas Rossi <tuomas.rossi@csc.fi>
Date: Fri, 13 Sep 2024 13:04:44 +0300
Subject: [PATCH 7/7] Add R code creating the files

---
 rdata/tests/test_rdata.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/rdata/tests/test_rdata.py b/rdata/tests/test_rdata.py
index 6d925ce..c2e6436 100644
--- a/rdata/tests/test_rdata.py
+++ b/rdata/tests/test_rdata.py
@@ -717,11 +717,15 @@ def test_ascii(self) -> None:
 
     def test_ascii_characters(self) -> None:
         """Test reading string with all ascii printable characters."""
+        # File created in R with
+        # saveRDS("0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\v\f\r\n", file="test_ascii_chars.rds")  # noqa: E501,ERA001
         data = rdata.read_rds(TESTDATA_PATH / "test_ascii_chars.rds")
         assert data == "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\v\f\r\n", data  # noqa: E501
 
     def test_ascii_ascii_characters(self) -> None:
         """Test reading string with all ascii printable characters."""
+        # File created in R with
+        # saveRDS("0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\v\f\r\n", file="test_ascii_ascii_chars.rds", ascii=TRUE, compress=FALSE)  # noqa: E501,ERA001
         data = rdata.read_rds(TESTDATA_PATH / "test_ascii_ascii_chars.rds")
         assert data == "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\v\f\r\n", data  # noqa: E501