Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix parsing and unparsing ascii files with escaped characters and NA strings #44

Open
wants to merge 7 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions rdata/parser/_ascii.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,15 +60,20 @@ def _parse_array_values(
return array

def parse_string(self, length: int) -> bytes:
# Non-ascii characters in strings are written using octal byte codes,
# Read the ascii string
s = self._readline()

# R escapes question marks ('?'), so they always appear as r'\?'.
# Let's start unescaping those.
s = s.replace(r"\?", "?")

# Non-ascii characters and space are written using octal byte codes,
# for example, a string 'aä' (2 chars) in UTF-8 is written as an ascii
# string r'a\303\244' (9 chars). We want to transform this to a byte
# string b'a\303\244' (3 bytes) corresponding to the byte
# representation of the original UTF-8 string.
# Let's use this string as an example to go through the code below

# Read the ascii string
s = self._readline()
# Now s = r'a\303\244' (9 chars)

# Convert characters to bytes (all characters are ascii)
Expand Down
11 changes: 11 additions & 0 deletions rdata/tests/data/test_ascii_ascii_chars.rds
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
A
3
263168
197888
5
UTF-8
16
1
262153
102
0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&\'()*+,-./:;<=>\?@[\\]^_`{|}~\040\t\n\r\v\f\r\n
Binary file added rdata/tests/data/test_ascii_chars.rds
Binary file not shown.
10 changes: 10 additions & 0 deletions rdata/tests/data/test_ascii_na_string.rds
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
A
3
263168
197888
5
UTF-8
16
1
9
-1
21 changes: 21 additions & 0 deletions rdata/tests/test_rdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,13 @@ def test_na_string(self) -> None:
"test_na_string": [None],
})

def test_ascii_na_string(self) -> None:
"""Test that the NA string is parsed correctly from an ascii-format file."""
# File created in R with
# saveRDS(as.character(NA), file="test_ascii_na_string.rds", ascii=TRUE, compress=FALSE) # noqa: E501
data = rdata.read_rds(TESTDATA_PATH / "test_ascii_na_string.rds")
np.testing.assert_equal(data, [None])

def test_complex(self) -> None:
"""Test that complex numbers can be parsed."""
data = rdata.read_rda(TESTDATA_PATH / "test_complex.rda")
Expand Down Expand Up @@ -708,6 +715,20 @@ def test_ascii(self) -> None:
np.testing.assert_equal(ma.get_fill_value(),
ref_ma.get_fill_value())

def test_ascii_characters(self) -> None:
"""Test reading string with all ascii printable characters."""
# File created in R with
# saveRDS("0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\v\f\r\n", file="test_ascii_chars.rds") # noqa: E501,ERA001
data = rdata.read_rds(TESTDATA_PATH / "test_ascii_chars.rds")
assert data == "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\v\f\r\n", data # noqa: E501

def test_ascii_ascii_characters(self) -> None:
"""Test reading string with all ascii printable characters (ascii format)."""
# File created in R with
# saveRDS("0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\v\f\r\n", file="test_ascii_ascii_chars.rds", ascii=TRUE, compress=FALSE) # noqa: E501,ERA001
data = rdata.read_rds(TESTDATA_PATH / "test_ascii_ascii_chars.rds")
assert data == "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\v\f\r\n", data # noqa: E501

def test_nan_inf(self) -> None:
"""Test reading nan and inf."""
data = rdata.read_rds(TESTDATA_PATH / "test_nan_inf.rds")
Expand Down
52 changes: 43 additions & 9 deletions rdata/unparser/_ascii.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,16 +68,50 @@ def _unparse_array_values(self, array: npt.NDArray[Any]) -> None:

self._add_line(line)

def unparse_string(self, value: bytes) -> None:
"""Unparse a string."""
self.unparse_int(len(value))

def _unparse_string_characters(self, value: bytes) -> None:
# Ideally we could do here the reverse of parsing,
# i.e., value = value.decode('latin1').encode('unicode_escape').decode('ascii')
# i.e., output = value.decode('latin1').encode('unicode_escape').decode('ascii')
# This would produce byte representation in hex such as '\xc3\xa4',
# but we need to have the equivalent octal representation '\303\244'.
# So, we do somewhat manual conversion instead:
s = "".join(chr(byte) if chr(byte) in string.printable else rf"\{byte:03o}"
for byte in value)
# So, we need to do somewhat manual conversion instead.

# List of ascii characters that are written directly;
# this is all printable ascii except
# - ' ' that Python writes as ' ', but R as '\040'
# - '\v' that Python writes as '\x0b', but R as '\v'
# - '\f' that Python writes as '\x0c', but R as '\f'
write_raw = string.printable.replace(" ", "")\
.replace("\v", "")\
.replace("\f", "")

def escape(b: bytes) -> str:
r"""Escape a byte string, e.g., the newline byte b'\n' -> r'\n' (2 chars)."""
return b.decode("latin1").encode("unicode_escape").decode("ascii")

# Go through the string byte by byte, as we need to
# convert every non-ascii character separately
output = ""
ascii_buffer = b""
for byte in value:
if chr(byte) in write_raw:
# Collect ascii characters to substring buffer
ascii_buffer += bytes([byte])
else:
# Encountered a character that cannot be written directly!
# Escape and add the ascii buffer
output += escape(ascii_buffer)
ascii_buffer = b""
# Add '\v' or '\f' or non-ascii character in octal representation
if chr(byte) == "\v":
output += r"\v"
elif chr(byte) == "\f":
output += r"\f"
else:
output += rf"\{byte:03o}"
# Escape and add the remaining ascii buffer
output += escape(ascii_buffer)

# Escape some more characters like R does
output = output.replace('"', r'\"').replace("'", r"\'").replace("?", r"\?")

self._add_line(s)
self._add_line(output)
12 changes: 10 additions & 2 deletions rdata/unparser/_unparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,17 @@ def unparse_array(self, array: npt.NDArray[Any]) -> None:
def _unparse_array_values(self, array: npt.NDArray[Any]) -> None:
"""Unparse the values of an array."""

@abc.abstractmethod
def unparse_string(self, value: bytes) -> None:
def unparse_string(self, value: bytes | None) -> None:
"""Unparse a string."""
if value is None:
self.unparse_int(-1)
return
self.unparse_int(len(value))
self._unparse_string_characters(value)

@abc.abstractmethod
def _unparse_string_characters(self, value: bytes) -> None:
"""Unparse characters of a string (not None)."""

def unparse_r_data(self, r_data: RData) -> None:
"""Unparse an RData object."""
Expand Down
9 changes: 2 additions & 7 deletions rdata/unparser/_xdr.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,5 @@ def _unparse_array_values(self, array: npt.NDArray[Any]) -> None:
data = array.data if array.flags["C_CONTIGUOUS"] else array.tobytes()
self.file.write(data)

def unparse_string(self, value: bytes) -> None:
"""Unparse a string."""
if value is None:
self.unparse_int(-1)
else:
self.unparse_int(len(value))
self.file.write(value)
def _unparse_string_characters(self, value: bytes) -> None:
self.file.write(value)
Loading