From ca7a6881ebc46835b90e9dff708842e263c65bc0 Mon Sep 17 00:00:00 2001 From: angie Date: Thu, 25 Apr 2024 21:42:45 -0400 Subject: [PATCH] More string decoding fixing --- spimdisasm/common/Utils.py | 74 ++++++++++++++++++-------------------- 1 file changed, 34 insertions(+), 40 deletions(-) diff --git a/spimdisasm/common/Utils.py b/spimdisasm/common/Utils.py index 1050994..9350cf1 100644 --- a/spimdisasm/common/Utils.py +++ b/spimdisasm/common/Utils.py @@ -250,12 +250,6 @@ def getMaybeBooleyFromMaybeStr(booley: str|None) -> bool|None: 0x8D, } -escapeCharactersMaybeRealLookAhead = { - 0x8C, - 0x8D, - 0xC9, -} - def decodeBytesToStrings(buf: bytes, offset: int, stringEncoding: str, terminator: int=0) -> tuple[list[str], int]: result = [] @@ -266,48 +260,48 @@ def decodeBytesToStrings(buf: bytes, offset: int, stringEncoding: str, terminato if char in bannedEscapeCharacters: return [], -10 - # Some of the escape characters are real Japanese characters, so we need to properly check them - theEscapeCharacterWasARealChar = False - if char in escapeCharactersMaybeRealLookAhead: + validNonAsciiSequence = False + if char > 0x7F: dst.append(char) try: decoded = dst.decode(stringEncoding) - theEscapeCharacterWasARealChar = True + validNonAsciiSequence = True except UnicodeDecodeError: if offset + i + 1 < len(buf): nextChar = buf[offset + i + 1] - dst.append(nextChar) - try: - decoded = dst.decode(stringEncoding) - theEscapeCharacterWasARealChar = True - except UnicodeDecodeError: - pass - dst.pop() + if nextChar != 0x5C: # '\\' + dst.append(nextChar) + try: + decoded = dst.decode(stringEncoding) + validNonAsciiSequence = True + except UnicodeDecodeError: + pass + dst.pop() dst.pop() - if not theEscapeCharacterWasARealChar and char > 0x7F and offset + i + 1 < len(buf): - nextChar = buf[offset + i + 1] - if nextChar == 0x5C: # '\\' - # If the second part of a Japanese character is the 0x5C value ('\\') then we need to - # special handle it. Otherwise when it gets iconv'd then the compiler will get confused - # and think it should try to escape the next character instead. - # So we break down the string here, add this two characters as individual characters and - # we skip them - - if dst: - try: - decoded = dst.decode(stringEncoding) - except UnicodeDecodeError: - return [], -60 - result.append(rabbitizer.Utils.escapeString(decoded)) - dst.clear() - result.append(f"\\x{char:02X}") - result.append(f"\\x{nextChar:02X}") - - i += 2 - continue - - if not theEscapeCharacterWasARealChar and char in escapeCharactersSpecialCases: + if not validNonAsciiSequence and offset + i + 1 < len(buf): + nextChar = buf[offset + i + 1] + if nextChar == 0x5C: # '\\' + # If the second byte of a Japanese character is the 0x5C value ('\\') then we need to + # handle it specially. Otherwise, when it gets iconv'd, the compiler gets confused + # and thinks it should try to escape the next character instead. + # So we break down the string here, add these two characters as individual characters and + # skip them + + if dst: + try: + decoded = dst.decode(stringEncoding) + except UnicodeDecodeError: + return [], -60 + result.append(rabbitizer.Utils.escapeString(decoded)) + dst.clear() + result.append(f"\\x{char:02X}") + result.append(f"\\x{nextChar:02X}") + + i += 2 + continue + + if not validNonAsciiSequence and char in escapeCharactersSpecialCases: if dst: try: decoded = dst.decode(stringEncoding)