From 6c0849ac7224ad637dafa63fa561f400d1957378 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Mon, 9 Sep 2024 09:54:48 +0300 Subject: [PATCH 1/7] Add test for all ascii characters --- rdata/tests/data/test_ascii_ascii_chars.rds | 11 +++++++++++ rdata/tests/data/test_ascii_chars.rds | Bin 0 -> 154 bytes rdata/tests/test_rdata.py | 10 ++++++++++ 3 files changed, 21 insertions(+) create mode 100644 rdata/tests/data/test_ascii_ascii_chars.rds create mode 100644 rdata/tests/data/test_ascii_chars.rds diff --git a/rdata/tests/data/test_ascii_ascii_chars.rds b/rdata/tests/data/test_ascii_ascii_chars.rds new file mode 100644 index 0000000..0ea9427 --- /dev/null +++ b/rdata/tests/data/test_ascii_ascii_chars.rds @@ -0,0 +1,11 @@ +A +3 +263168 +197888 +5 +UTF-8 +16 +1 +262153 +102 +0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&\'()*+,-./:;<=>\?@[\\]^_`{|}~\040\t\n\r\v\f\r\n diff --git a/rdata/tests/data/test_ascii_chars.rds b/rdata/tests/data/test_ascii_chars.rds new file mode 100644 index 0000000000000000000000000000000000000000..2922e49b20c2837ad04952e82038c6e9f3dd5439 GIT binary patch literal 154 zcmV;L0A>FliwFP!000001B>8dU|?WoU||80tUx9MYiNj@t_6@G0K|+8EDW4LI?ce) z$k@cx%-q5KOitD zI3zSIJR&kmQAt@vRZU$(Q%hS%S5M!{+Q!z--XS_BHZDG)x~8^Hfs>1un}?SR0M41( I(v1KB02MStE&u=k literal 0 HcmV?d00001 diff --git a/rdata/tests/test_rdata.py b/rdata/tests/test_rdata.py index ccffead..53c3f29 100644 --- a/rdata/tests/test_rdata.py +++ b/rdata/tests/test_rdata.py @@ -708,6 +708,16 @@ def test_ascii(self) -> None: np.testing.assert_equal(ma.get_fill_value(), ref_ma.get_fill_value()) + def test_ascii_characters(self) -> None: + """Test reading string with all ascii printable characters.""" + data = rdata.read_rds(TESTDATA_PATH / "test_ascii_chars.rds") + assert data == "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\v\f\r\n", data + + def test_ascii_ascii_characters(self) -> None: + """Test reading string with all 
ascii printable characters.""" + data = rdata.read_rds(TESTDATA_PATH / "test_ascii_ascii_chars.rds") + assert data == "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\v\f\r\n", data + def test_nan_inf(self) -> None: """Test reading nan and inf.""" data = rdata.read_rds(TESTDATA_PATH / "test_nan_inf.rds") From 3645e149b60d929afe739cff272323eee2ab3bfc Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Mon, 9 Sep 2024 09:55:35 +0300 Subject: [PATCH 2/7] Unescape question mark --- rdata/parser/_ascii.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/rdata/parser/_ascii.py b/rdata/parser/_ascii.py index 15f59a7..22afa7e 100644 --- a/rdata/parser/_ascii.py +++ b/rdata/parser/_ascii.py @@ -60,15 +60,20 @@ def _parse_array_values( return array def parse_string(self, length: int) -> bytes: - # Non-ascii characters in strings are written using octal byte codes, + # Read the ascii string + s = self._readline() + + # R escapes question marks ('?') so they come always as r'\?'. + # Let's start unescaping those. + s = s.replace(r"\?", "?") + + # Non-ascii characters and space are written using octal byte codes, # for example, a string 'aä' (2 chars) in UTF-8 is written as an ascii # string r'a\303\244' (9 chars). We want to transform this to a byte # string b'a\303\244' (3 bytes) corresponding to the byte # representation of the original UTF-8 string. 
# Let's use this string as an example to go through the code below - # Read the ascii string - s = self._readline() # Now s = r'a\303\244' (9 chars) # Convert characters to bytes (all characters are ascii) From 26a626caf90a82082704e0457c1b8429621e6cec Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Mon, 9 Sep 2024 10:33:15 +0300 Subject: [PATCH 3/7] Fix escaping various characters for R output --- rdata/unparser/_ascii.py | 45 +++++++++++++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 5 deletions(-) diff --git a/rdata/unparser/_ascii.py b/rdata/unparser/_ascii.py index a20f8fc..bc1e2cc 100644 --- a/rdata/unparser/_ascii.py +++ b/rdata/unparser/_ascii.py @@ -73,11 +73,46 @@ def unparse_string(self, value: bytes) -> None: self.unparse_int(len(value)) # Ideally we could do here the reverse of parsing, - # i.e., value = value.decode('latin1').encode('unicode_escape').decode('ascii') + # i.e., output = value.decode('latin1').encode('unicode_escape').decode('ascii') # This would produce byte representation in hex such as '\xc3\xa4', # but we need to have the equivalent octal presentation '\303\244'. - # So, we do somewhat manual conversion instead: - s = "".join(chr(byte) if chr(byte) in string.printable else rf"\{byte:03o}" - for byte in value) + # So, we need to do somewhat manual conversion instead. 
+ + # List of ascii characters that are written directly; + # this is all printable ascii except + # - ' ' that Python writes as ' ', but R as '\040' + # - '\v' that Python writes as '\x0b', but R as '\v' + # - '\f' that Python writes as '\x0c', but R as '\f' + write_raw = string.printable.replace(' ', '').replace('\v', '').replace('\f', '') + + def escape(b: bytes) -> str: + """Escape string, e.g., b'\n' -> r'\\n'""" + return b.decode('latin1').encode('unicode_escape').decode('ascii') + + # Go though the string byte-by-byte as we need to + # convert every non-ascii character separately + output = "" + ascii_buffer = b"" + for byte in value: + if chr(byte) in write_raw: + # Collect ascii characters to substring buffer + ascii_buffer += bytes([byte]) + else: + # Encountered a non-ascii character! + # Escape and add the ascii buffer + output += escape(ascii_buffer) + ascii_buffer = b"" + # Add '\v' or '\f' or non-ascii character in octal presentation + if chr(byte) == "\v": + output += r"\v" + elif chr(byte) == "\f": + output += r"\f" + else: + output += rf"\{byte:03o}" + # Escape and add the remaining ascii buffer + output += escape(ascii_buffer) + + # Escape some more characters like R does + output = output.replace('"', r'\"').replace("'", r"\'").replace("?", r"\?") - self._add_line(s) + self._add_line(output) From 4c825b4002b723f619d1f1b03c2b536f44eea976 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Wed, 11 Sep 2024 15:45:10 +0300 Subject: [PATCH 4/7] Fix ruff --- rdata/tests/test_rdata.py | 4 ++-- rdata/unparser/_ascii.py | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/rdata/tests/test_rdata.py b/rdata/tests/test_rdata.py index 53c3f29..5783d19 100644 --- a/rdata/tests/test_rdata.py +++ b/rdata/tests/test_rdata.py @@ -711,12 +711,12 @@ def test_ascii(self) -> None: def test_ascii_characters(self) -> None: """Test reading string with all ascii printable characters.""" data = rdata.read_rds(TESTDATA_PATH / "test_ascii_chars.rds") - 
assert data == "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\v\f\r\n", data + assert data == "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\v\f\r\n", data # noqa: E501 def test_ascii_ascii_characters(self) -> None: """Test reading string with all ascii printable characters.""" data = rdata.read_rds(TESTDATA_PATH / "test_ascii_ascii_chars.rds") - assert data == "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\v\f\r\n", data + assert data == "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\v\f\r\n", data # noqa: E501 def test_nan_inf(self) -> None: """Test reading nan and inf.""" diff --git a/rdata/unparser/_ascii.py b/rdata/unparser/_ascii.py index bc1e2cc..50d9bc0 100644 --- a/rdata/unparser/_ascii.py +++ b/rdata/unparser/_ascii.py @@ -83,11 +83,13 @@ def unparse_string(self, value: bytes) -> None: # - ' ' that Python writes as ' ', but R as '\040' # - '\v' that Python writes as '\x0b', but R as '\v' # - '\f' that Python writes as '\x0c', but R as '\f' - write_raw = string.printable.replace(' ', '').replace('\v', '').replace('\f', '') + write_raw = string.printable.replace(" ", "")\ + .replace("\v", "")\ + .replace("\f", "") def escape(b: bytes) -> str: - """Escape string, e.g., b'\n' -> r'\\n'""" - return b.decode('latin1').encode('unicode_escape').decode('ascii') + r"""Escape string, e.g., b'\n' -> r'\\n'.""" + return b.decode("latin1").encode("unicode_escape").decode("ascii") # Go though the string byte-by-byte as we need to # convert every non-ascii character separately From 77d726dd2779a3b46ac3e8db79f536582271e64f Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Fri, 13 Sep 2024 12:56:23 +0300 Subject: [PATCH 5/7] Add test for ascii file with NA string --- rdata/tests/data/test_ascii_na_string.rds | 10 ++++++++++ 
rdata/tests/test_rdata.py | 7 +++++++ 2 files changed, 17 insertions(+) create mode 100644 rdata/tests/data/test_ascii_na_string.rds diff --git a/rdata/tests/data/test_ascii_na_string.rds b/rdata/tests/data/test_ascii_na_string.rds new file mode 100644 index 0000000..f1ef747 --- /dev/null +++ b/rdata/tests/data/test_ascii_na_string.rds @@ -0,0 +1,10 @@ +A +3 +263168 +197888 +5 +UTF-8 +16 +1 +9 +-1 diff --git a/rdata/tests/test_rdata.py b/rdata/tests/test_rdata.py index 5783d19..6d925ce 100644 --- a/rdata/tests/test_rdata.py +++ b/rdata/tests/test_rdata.py @@ -102,6 +102,13 @@ def test_na_string(self) -> None: "test_na_string": [None], }) + def test_ascii_na_string(self) -> None: + """Test that the NA string is parsed correctly.""" + # File created in R with + # saveRDS(as.character(NA), file="test_ascii_na_string.rds", ascii=TRUE, compress=FALSE) # noqa: E501 + data = rdata.read_rds(TESTDATA_PATH / "test_ascii_na_string.rds") + np.testing.assert_equal(data, [None]) + def test_complex(self) -> None: """Test that complex numbers can be parsed.""" data = rdata.read_rda(TESTDATA_PATH / "test_complex.rda") From 833db9a9d51e63f5aeabf91b4194766378efcd00 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Fri, 13 Sep 2024 13:00:08 +0300 Subject: [PATCH 6/7] Fix ascii unparser for NA string --- rdata/unparser/_ascii.py | 5 +---- rdata/unparser/_unparser.py | 12 ++++++++++-- rdata/unparser/_xdr.py | 9 ++------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/rdata/unparser/_ascii.py b/rdata/unparser/_ascii.py index 50d9bc0..5aba32e 100644 --- a/rdata/unparser/_ascii.py +++ b/rdata/unparser/_ascii.py @@ -68,10 +68,7 @@ def _unparse_array_values(self, array: npt.NDArray[Any]) -> None: self._add_line(line) - def unparse_string(self, value: bytes) -> None: - """Unparse a string.""" - self.unparse_int(len(value)) - + def _unparse_string_characters(self, value: bytes) -> None: # Ideally we could do here the reverse of parsing, # i.e., output = 
value.decode('latin1').encode('unicode_escape').decode('ascii') # This would produce byte representation in hex such as '\xc3\xa4', diff --git a/rdata/unparser/_unparser.py b/rdata/unparser/_unparser.py index 7361b65..b2b073e 100644 --- a/rdata/unparser/_unparser.py +++ b/rdata/unparser/_unparser.py @@ -73,9 +73,17 @@ def unparse_array(self, array: npt.NDArray[Any]) -> None: def _unparse_array_values(self, array: npt.NDArray[Any]) -> None: """Unparse the values of an array.""" - @abc.abstractmethod - def unparse_string(self, value: bytes) -> None: + def unparse_string(self, value: bytes | None) -> None: """Unparse a string.""" + if value is None: + self.unparse_int(-1) + return + self.unparse_int(len(value)) + self._unparse_string_characters(value) + + @abc.abstractmethod + def _unparse_string_characters(self, value: bytes) -> None: + """Unparse characters of a string (not None).""" def unparse_r_data(self, r_data: RData) -> None: """Unparse an RData object.""" diff --git a/rdata/unparser/_xdr.py b/rdata/unparser/_xdr.py index 8bea3f0..742aa87 100644 --- a/rdata/unparser/_xdr.py +++ b/rdata/unparser/_xdr.py @@ -56,10 +56,5 @@ def _unparse_array_values(self, array: npt.NDArray[Any]) -> None: data = array.data if array.flags["C_CONTIGUOUS"] else array.tobytes() self.file.write(data) - def unparse_string(self, value: bytes) -> None: - """Unparse a string.""" - if value is None: - self.unparse_int(-1) - else: - self.unparse_int(len(value)) - self.file.write(value) + def _unparse_string_characters(self, value: bytes) -> None: + self.file.write(value) From 2d8c1f4ce0e709cf83c594c068fca8a062e5a2a4 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Fri, 13 Sep 2024 13:04:44 +0300 Subject: [PATCH 7/7] Add R code creating the files --- rdata/tests/test_rdata.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/rdata/tests/test_rdata.py b/rdata/tests/test_rdata.py index 6d925ce..c2e6436 100644 --- a/rdata/tests/test_rdata.py +++ b/rdata/tests/test_rdata.py @@ -717,11 
+717,15 @@ def test_ascii(self) -> None: def test_ascii_characters(self) -> None: """Test reading string with all ascii printable characters.""" + # File created in R with + # saveRDS("0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\v\f\r\n", file="test_ascii_chars.rds") # noqa: E501,ERA001 data = rdata.read_rds(TESTDATA_PATH / "test_ascii_chars.rds") assert data == "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\v\f\r\n", data # noqa: E501 def test_ascii_ascii_characters(self) -> None: """Test reading string with all ascii printable characters.""" + # File created in R with + # saveRDS("0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\v\f\r\n", file="test_ascii_ascii_chars.rds", ascii=TRUE, compress=FALSE) # noqa: E501,ERA001 data = rdata.read_rds(TESTDATA_PATH / "test_ascii_ascii_chars.rds") assert data == "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\v\f\r\n", data # noqa: E501