From a391443bbb4e100fda9f60da99429a20e775a858 Mon Sep 17 00:00:00 2001 From: robojumper Date: Sun, 6 Oct 2024 18:42:57 +0200 Subject: [PATCH 1/8] Script for moving decomp symbols to Ghidra --- tools/ghidra_scripts/DecompMapToGhidra.py | 190 ++++++++++++++++++++++ 1 file changed, 190 insertions(+) create mode 100644 tools/ghidra_scripts/DecompMapToGhidra.py diff --git a/tools/ghidra_scripts/DecompMapToGhidra.py b/tools/ghidra_scripts/DecompMapToGhidra.py new file mode 100644 index 00000000..77f722ca --- /dev/null +++ b/tools/ghidra_scripts/DecompMapToGhidra.py @@ -0,0 +1,190 @@ +#Take symbols from the decomp's map files, treat them as authoritative, and import them in Ghidra. +#Requires cwdemangle in the PATH +#Must run `python .\configure.py --map && ninja` beforehand! +#@author robojumper +#@category GameCube/Wii +#@keybinding +#@menupath +#@toolbar + +import os +import re +import subprocess + +from ghidra.app.util import NamespaceUtils +from ghidra.program.model.symbol import SymbolUtilities +from ghidra.program.model.symbol.SourceType import * +from ghidra.program.model.symbol.Namespace import * +from ghidra.program.model.listing.CodeUnit import * + +AF = currentProgram.getAddressFactory() +mem = currentProgram.getMemory() +listing = currentProgram.getListing() + +allowed_sections = [ + '.text', + '.data', + '.sdata', + '.sdata2', + '.bss', + '.sbss', + '.sbss2', + '.rodata', +] + +def demangle(name): + # try demangling + if "__" in name: + try: + output = subprocess.check_output(["cwdemangle", name], stderr=subprocess.STDOUT) + return output.strip().split("(")[0] + except subprocess.CalledProcessError as e: + if "Failed to demangle symbol" not in e.output: + raise + # otherwise we try to undo the effects of the original + # ghidra -> symbols.txt export here + if not "$" in name and not "arraydtor" in name and not name.startswith("__"): + name = name.replace("__", "::") + name = name.replace("::::", "::__") + return name + +def parse_symbol(line): + if "entry of" in line: + return None + objs = line.strip().split() + vAddr = objs[2] + name = objs[5] + + if name.startswith("gap_") or name == "*fill*" or name.startswith(".") or "........" in vAddr: + return None + if default_sym_re.match(name): + return None + + return { + 'name': name, + 'vAddr': int(vAddr, 16), + } + +default_sym_re = re.compile(".*_[0-9A-Za-z]{8}$") + +def parse_map_file(file): + lines = [line for line in file] + i = 0 + sections = {} + while i < len(lines): + line = lines[i] + if "section layout" in line: + section_name = line.split(' ')[0] + if section_name in allowed_sections: + sections[section_name] = [] + i += 4 # go to symbols + while lines[i].strip() != "": + sym = parse_symbol(lines[i]) + if sym is not None: + sections[section_name].append(sym) + i += 1 + i += 1 + + return sections + +# This script works incrementally by recording +# the mangled name in a special plate comment. +# If the mangled name is the same, we don't even bother +# shelling out to cwdemangle, speeding up the whole process +# quite a bit. +mangled_prefix = "mangled-decomp-name-v1: " +def update_addr(addr, mangled_name): + unit = listing.getCodeUnitAt(addr) + if not unit: + return + + new_comment_line = mangled_prefix + mangled_name + + comment = unit.getComment(PLATE_COMMENT) + update_symbol = False + new_comment = None + if not comment: + # no plate comment here, add one and set symbol + update_symbol = True + new_comment = [new_comment_line] + else: + comment_lines = comment.splitlines() + if any (line.startswith(mangled_prefix) for line in comment_lines): + # replace with new mangled name + new_comment = [] + for line in comment_lines: + if line.startswith(mangled_prefix) and new_comment_line not in line: + update_symbol = True + new_comment.append(new_comment_line) + else: + new_comment.append(line) + + else: + # existing plate comment without symbol, append + update_symbol = True + new_comment = [comment, new_comment_line] + + if update_symbol: + new_comment = '\n'.join(new_comment) + demangled_name = demangle(mangled_name) + + # print(new_comment, demangled_name) + + name_list = [SymbolUtilities.replaceInvalidChars(part, True) for part in demangled_name.split("::")] + symbol_str = name_list[-1] + namespace = None + if len(name_list) > 1: + namespace_str = "::".join(name_list[:-1]) + print(mangled_name) + namespace = NamespaceUtils.createNamespaceHierarchy(namespace_str, None, currentProgram, IMPORTED) + + sym = getSymbolAt(addr) + if sym: + if namespace: + sym.setNameAndNamespace(symbol_str, namespace, IMPORTED) + else: + sym.setName(symbol_str, IMPORTED) + else: + if namespace: + createLabel(addr, symbol_str, namespace, True, IMPORTED) + else: + createLabel(addr, symbol_str, True, IMPORTED) + + unit.setComment(PLATE_COMMENT, new_comment) + +def apply_symbols_map(symbols_map, file_name): + for section, syms in symbols_map.items(): + for sym in syms: + if file_name == "MAIN": + # in the main dol, each symbol is loaded at a fixed address + addr_obj = AF.getAddress("0x%08X" % sym["vAddr"]) + else: + # REL sections can't be reliably identified + if section != ".text": + continue + # in rels, every section is relocated indivdually, so treat + # this as an offset + block_name = file_name + "_" + section + "0" + block = mem.getBlock(block_name) + addr_obj = block.getStart().add(sym["vAddr"]) + update_addr(addr_obj, sym["name"]) + + +path = str(askDirectory("Program build directory (e.g. build/SOUE01)", "Import")) + +new_contents = None +main_symbols = os.path.join(path, "main.elf.MAP") +symbols_map = None +with open(main_symbols, "rt") as file: + symbols_map = parse_map_file(file) + +apply_symbols_map(symbols_map, "MAIN") + +for rel_name in os.listdir(path): + if rel_name.endswith("NP"): + rel_symbols = os.path.join(path, rel_name, rel_name + ".plf.MAP") + symbols_map = None + with open(rel_symbols, "rt") as file: + symbols_map = parse_map_file(file) + + apply_symbols_map(symbols_map, rel_name) From ba90c20b27d6def6f86bf0aace8a03d2aff9e4fe Mon Sep 17 00:00:00 2001 From: robojumper Date: Sun, 6 Oct 2024 19:13:17 +0200 Subject: [PATCH 2/8] Namespace fix --- tools/ghidra_scripts/DecompMapToGhidra.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/tools/ghidra_scripts/DecompMapToGhidra.py b/tools/ghidra_scripts/DecompMapToGhidra.py index 77f722ca..c31d2c5c 100644 --- a/tools/ghidra_scripts/DecompMapToGhidra.py +++ b/tools/ghidra_scripts/DecompMapToGhidra.py @@ -139,17 +139,12 @@ def update_addr(addr, mangled_name): namespace = NamespaceUtils.createNamespaceHierarchy(namespace_str, None, currentProgram, IMPORTED) sym = getSymbolAt(addr) + if namespace is None: + namespace = currentProgram.getGlobalNamespace() if sym: - if namespace: - sym.setNameAndNamespace(symbol_str, namespace, IMPORTED) - else: - sym.setName(symbol_str, IMPORTED) + sym.setNameAndNamespace(symbol_str, namespace, IMPORTED) else: - if namespace: - createLabel(addr, symbol_str, namespace, True, IMPORTED) - else: - createLabel(addr, symbol_str, True, IMPORTED) - + createLabel(addr, symbol_str, namespace, True, IMPORTED) unit.setComment(PLATE_COMMENT, new_comment) def apply_symbols_map(symbols_map, file_name): From befd5331ec9770c64178a4acbc43642a7fc26c4d Mon Sep 17 00:00:00 2001 From: robojumper Date: Sun, 6 Oct 2024 19:54:10 +0200 Subject: [PATCH 3/8] Exclude pad_ symbols --- tools/ghidra_scripts/DecompMapToGhidra.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ghidra_scripts/DecompMapToGhidra.py b/tools/ghidra_scripts/DecompMapToGhidra.py index c31d2c5c..abcc1aa2 100644 --- a/tools/ghidra_scripts/DecompMapToGhidra.py +++ b/tools/ghidra_scripts/DecompMapToGhidra.py @@ -55,7 +55,7 @@ def parse_symbol(line): vAddr = objs[2] name = objs[5] - if name.startswith("gap_") or name == "*fill*" or name.startswith(".") or "........" in vAddr: + if name.startswith("pad_") or name.startswith("gap_") or name == "*fill*" or name.startswith(".") or "........" in vAddr: return None if default_sym_re.match(name): return None From 10cda760205bfdc272b3fa6476f70231e1b06221 Mon Sep 17 00:00:00 2001 From: robojumper Date: Fri, 25 Oct 2024 22:28:30 +0200 Subject: [PATCH 4/8] Rework script --- .gitignore | 1 + tools/ghidra_scripts/DecompMapToGhidra.py | 257 +++++++--- tools/ghidra_scripts/GhidraToDtkSymbols.py | 2 +- tools/ghidra_scripts/demangle.py | 543 +++++++++++++++++++++ 4 files changed, 729 insertions(+), 74 deletions(-) create mode 100644 tools/ghidra_scripts/demangle.py diff --git a/.gitignore b/.gitignore index 275db754..ee38c248 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ __pycache__ .mypy_cache .cache/ +*$py.class # Original files orig/*/* diff --git a/tools/ghidra_scripts/DecompMapToGhidra.py b/tools/ghidra_scripts/DecompMapToGhidra.py index abcc1aa2..053edb14 100644 --- a/tools/ghidra_scripts/DecompMapToGhidra.py +++ b/tools/ghidra_scripts/DecompMapToGhidra.py @@ -1,5 +1,4 @@ #Take symbols from the decomp's map files, treat them as authoritative, and import them in Ghidra. -#Requires cwdemangle in the PATH #Must run `python .\configure.py --map && ninja` beforehand! #@author robojumper #@category GameCube/Wii @@ -9,7 +8,9 @@ import os import re -import subprocess + +import demangle +demangle.mode = 'demangle' from ghidra.app.util import NamespaceUtils from ghidra.program.model.symbol import SymbolUtilities @@ -32,22 +33,75 @@ '.rodata', ] -def demangle(name): +commit = None + + +def postprocess_demangled_name(demangled): + if demangled.startswith("vtable for "): + demangled = demangled[len("vtable for "):] + "::__vtable" + + if demangled.endswith(" const"): + demangled = demangled[:-len("const ")] + + thunk = False + guard = False + if demangled.startswith("non-virtual thunk to "): + thunk = True + demangled = demangled[len("non-virtual thunk to "):] + + if demangled.startswith("virtual thunk to "): + thunk = True + demangled = demangled[len("virtual thunk to "):] + + if demangled.startswith("guard variable for "): + guard = True + demangled = demangled[len("guard variable for "):] + + template_open = demangled.index("<") if "<" in demangled else None + first_space = demangled.index(" ") if " " in demangled else None + open_paren = demangled.index("(") if "(" in demangled else None + if template_open and first_space and open_paren and first_space < template_open and template_open < open_paren: + # this looks like a templated return type, so drop the return type + demangled = demangled[(first_space+1):] + + demangled = demangled.replace("(anonymous namespace)", "anonymous") + demangled = demangled.replace("operator ", "operator_") + demangled = demangled.replace(" ", "") + if ")::" in demangled: + # dFontMng_c::getFontPath(unsigned char)::TEMP_FONT_NAME + left = demangled.split("(")[0] + right = demangled.rsplit(")")[-1] + # dFontMng_c::getFontPath::TEMP_FONT_NAME + demangled = left + right + else: + demangled = demangled.split("(")[0] + + if thunk: + demangled += "_thunk" + if guard: + demangled += "_guard" + + return demangled + + +def do_demangle(name): # try demangling if "__" in name: try: - output = subprocess.check_output(["cwdemangle", name], stderr=subprocess.STDOUT) - return output.strip().split("(")[0] - except subprocess.CalledProcessError as e: - if "Failed to demangle symbol" not in e.output: - raise + output = demangle.demangle_try(name) + return output.strip() + except Exception: + pass # otherwise we try to undo the effects of the original # ghidra -> symbols.txt export here - if not "$" in name and not "arraydtor" in name and not name.startswith("__"): + if "$" not in name and "arraydtor" not in name and not name.startswith("__"): name = name.replace("__", "::") name = name.replace("::::", "::__") return name + +default_sym_re = re.compile(".*_[0-9A-Fa-f]{8}$") + def parse_symbol(line): if "entry of" in line: return None @@ -65,7 +119,6 @@ def parse_symbol(line): 'vAddr': int(vAddr, 16), } -default_sym_re = re.compile(".*_[0-9A-Za-z]{8}$") def parse_map_file(file): lines = [line for line in file] @@ -77,7 +130,7 @@ def parse_map_file(file): section_name = line.split(' ')[0] if section_name in allowed_sections: sections[section_name] = [] - i += 4 # go to symbols + i += 4 # go to symbols while lines[i].strip() != "": sym = parse_symbol(lines[i]) if sym is not None: @@ -87,93 +140,151 @@ def parse_map_file(file): return sections + +anon_static_re = re.compile("^@[0-9]+$") + + +def symbol_needs_history(name): + if anon_static_re.match(name) or "arraydtor" in name: + return False + + return True + + # This script works incrementally by recording # the mangled name in a special plate comment. # If the mangled name is the same, we don't even bother # shelling out to cwdemangle, speeding up the whole process # quite a bit. -mangled_prefix = "mangled-decomp-name-v1: " -def update_addr(addr, mangled_name): +mangled_prefix = "mangled-name" + +# We also keep a history of the original Ghidra name and previous +# decomp names in this plate comment, for future reference +original_prefix = "original-name" +previous_prefix = "full-name" + + +def parse_comment(plate_comment): + ret = { + "other": [], + "original": None, + "history": [], + "mangled": None, + } + + if plate_comment: + for line in plate_comment.splitlines(): + if line.startswith(mangled_prefix): + ret["mangled"] = line.split(" ", 1)[1].strip() + elif line.startswith(original_prefix): + ret["original"] = line.split(" ", 1)[1].strip() + elif line.startswith(previous_prefix): + ret["history"].append(line.split(" ", 1)[1].strip()) + else: + ret["other"].append(line.strip()) + return ret + + +def update_addr(addr, mangled_name, create_function=False): unit = listing.getCodeUnitAt(addr) if not unit: return - - new_comment_line = mangled_prefix + mangled_name - - comment = unit.getComment(PLATE_COMMENT) - update_symbol = False - new_comment = None - if not comment: - # no plate comment here, add one and set symbol - update_symbol = True - new_comment = [new_comment_line] + + comment_str = unit.getComment(PLATE_COMMENT) + comment = parse_comment(comment_str) + + existing_symbol = getSymbolAt(addr) + existing_name = existing_symbol.getName(True) if existing_symbol else None + + if comment["mangled"] and comment["mangled"] == mangled_name: + # skip updating + return + + if not comment["mangled"] and not comment["original"] and existing_name and not default_sym_re.match(existing_name): + comment["original"] = existing_name + + demangled_name = do_demangle(mangled_name) + postprocessed = postprocess_demangled_name(demangled_name) + comment["mangled"] = mangled_name + comment["history"].append(commit + " " + demangled_name) + + complete_plate_comment = comment["other"] + if comment["original"]: + complete_plate_comment.append(original_prefix + " " + comment["original"]) + for h in comment["history"]: + complete_plate_comment.append(previous_prefix + " " + h) + complete_plate_comment.append(mangled_prefix + " " + mangled_name) + + complete_plate_comment = "\n".join(complete_plate_comment) + + name_list = [SymbolUtilities.replaceInvalidChars(part, True) for part in postprocessed.split("::")] + symbol_str = name_list[-1] + namespace = None + if len(name_list) > 1: + namespace_str = "::".join(name_list[:-1]) + namespace = NamespaceUtils.createNamespaceHierarchy(namespace_str, None, currentProgram, IMPORTED) + + sym = getSymbolAt(addr) + if namespace is None: + namespace = currentProgram.getGlobalNamespace() + if sym: + sym.setNameAndNamespace(symbol_str, namespace, IMPORTED) else: - comment_lines = comment.splitlines() - if any (line.startswith(mangled_prefix) for line in comment_lines): - # replace with new mangled name - new_comment = [] - for line in comment_lines: - if line.startswith(mangled_prefix) and new_comment_line not in line: - update_symbol = True - new_comment.append(new_comment_line) - else: - new_comment.append(line) - - else: - # existing plate comment without symbol, append - update_symbol = True - new_comment = [comment, new_comment_line] - - if update_symbol: - new_comment = '\n'.join(new_comment) - demangled_name = demangle(mangled_name) - - # print(new_comment, demangled_name) - - name_list = [SymbolUtilities.replaceInvalidChars(part, True) for part in demangled_name.split("::")] - symbol_str = name_list[-1] - namespace = None - if len(name_list) > 1: - namespace_str = "::".join(name_list[:-1]) - print(mangled_name) - namespace = NamespaceUtils.createNamespaceHierarchy(namespace_str, None, currentProgram, IMPORTED) - - sym = getSymbolAt(addr) - if namespace is None: - namespace = currentProgram.getGlobalNamespace() - if sym: - sym.setNameAndNamespace(symbol_str, namespace, IMPORTED) - else: - createLabel(addr, symbol_str, namespace, True, IMPORTED) - unit.setComment(PLATE_COMMENT, new_comment) - -def apply_symbols_map(symbols_map, file_name): + createLabel(addr, symbol_str, namespace, True, IMPORTED) + + if create_function: + createFunction(addr, postprocessed) + + if symbol_needs_history(mangled_name): + unit.setComment(PLATE_COMMENT, complete_plate_comment) + + +def get_section_names(file_name, build_dir): + splits_path = build_dir.replace("build", "config") + splits_path = os.path.join(splits_path, "rels", file_name, "splits.txt") + section_names = [] + with open(splits_path, "rt") as file: + for line in file: + line = line.strip() + if line == "": + break + elif line == "Sections:": + continue + else: + section_names.append(line.split()[0]) + return section_names + + +def apply_symbols_map(symbols_map, file_name, build_dir): + if file_name != "MAIN": + section_names = get_section_names(file_name, build_dir) + blocks = mem.getBlocks() + blocks = [b for b in blocks if b.getName().startswith(file_name)] + for section, syms in symbols_map.items(): for sym in syms: if file_name == "MAIN": # in the main dol, each symbol is loaded at a fixed address addr_obj = AF.getAddress("0x%08X" % sym["vAddr"]) else: - # REL sections can't be reliably identified - if section != ".text": - continue - # in rels, every section is relocated indivdually, so treat + index = section_names.index(section) + block = blocks[index] + # in rels, every section is relocated individually, so treat # this as an offset - block_name = file_name + "_" + section + "0" - block = mem.getBlock(block_name) addr_obj = block.getStart().add(sym["vAddr"]) - update_addr(addr_obj, sym["name"]) + is_text = section == ".text" + update_addr(addr_obj, sym["name"], create_function=is_text) path = str(askDirectory("Program build directory (e.g. build/SOUE01)", "Import")) - +commit = askString("Commit hash for symbol history", "Confirm") new_contents = None main_symbols = os.path.join(path, "main.elf.MAP") symbols_map = None with open(main_symbols, "rt") as file: symbols_map = parse_map_file(file) -apply_symbols_map(symbols_map, "MAIN") +apply_symbols_map(symbols_map, "MAIN", path) for rel_name in os.listdir(path): if rel_name.endswith("NP"): @@ -182,4 +293,4 @@ def apply_symbols_map(symbols_map, file_name): with open(rel_symbols, "rt") as file: symbols_map = parse_map_file(file) - apply_symbols_map(symbols_map, rel_name) + apply_symbols_map(symbols_map, rel_name, path) diff --git a/tools/ghidra_scripts/GhidraToDtkSymbols.py b/tools/ghidra_scripts/GhidraToDtkSymbols.py index a8bcd913..23301f9c 100644 --- a/tools/ghidra_scripts/GhidraToDtkSymbols.py +++ b/tools/ghidra_scripts/GhidraToDtkSymbols.py @@ -12,7 +12,7 @@ mem = currentProgram.getMemory() listing = currentProgram.getListing() -sym_re = re.compile("(?:lbl|fn|FUN|DAT)_[0-9A-Fa-f_]+ = \.([a-z0-9]+):0x([0-9A-Fa-f]{8})(.*)\n") +sym_re = re.compile("(?:lbl|fn|FUN|DAT)_[0-9A-Fa-f_]+ = \\.([a-z0-9]+):0x([0-9A-Fa-f]{8})(.*)\n") default_sym_re = re.compile(".*_[0-9A-Za-z]+$") used_symbols = set() diff --git a/tools/ghidra_scripts/demangle.py b/tools/ghidra_scripts/demangle.py new file mode 100644 index 00000000..9b116b5d --- /dev/null +++ b/tools/ghidra_scripts/demangle.py @@ -0,0 +1,543 @@ +# https://gist.github.com/RootCubed/8f8102fe6cf4ed79a45f1dfe23020a06 + +# Demangler / Itanium remangler for the CodeWarrior ABI + +# Adapted from the NVIDIA demangler script by Ninji +# See https://gist.github.com/RootCubed/9ebecf21eec344f10164cdfabbf0bb41 (Python) +# and https://gist.github.com/RootCubed/d7e2629f4576059853505b7931ffd105 (C++) +# for those scripts + +# Ported to Ghidra's Jython / Python 2.7 by robojumper + +import argparse +import re +import sys + +mode = None +verbose = False + +def is_demangle(): + return mode == 'demangle' + +""" +The names of the types in the output +First demangled, then remangled for Itanium +""" +names_mapping = { + 'v': ('void', 'v'), + 'b': ('bool', 'b'), + 'c': ('char', 'c'), + 's': ('short', 's'), + 'i': ('int', 'i'), + 'l': ('long', 'l'), + 'x': ('long long', 'x'), + 'Sc': ('signed char', 'a'), + 'Uc': ('unsigned char', 'h'), + 'Us': ('unsigned short', 't'), + 'Ui': ('unsigned int', 'j'), + 'Ul': ('unsigned long', 'm'), + 'Ux': ('unsigned long long', 'y'), + 'f': ('float', 'f'), + 'd': ('double', 'd'), + 'r': ('long double', 'e'), + 'w': ('wchar_t', 'w'), + 'e': ('...', 'z') +} + +""" +The names of the methods in the output +First demangled, then remangled for Itanium +""" +method_mapping = { + '__dt': ('~$CLS$', 'D0'), + '__ct': ('$CLS$', 'C1'), + '__nw': ('operator new', 'nw'), + '__nwa': ('operator new[]', 'na'), + '__dl': ('operator delete', 'dl'), + '__dla': ('operator delete[]', 'da'), + '__pl': ('operator+', 'pl'), + '__mi': ('operator-', 'mi'), + '__ml': ('operator*', 'ml'), + '__dv': ('operator/', 'dv'), + '__md': ('operator%', 'rm'), + '__er': ('operator^', 'eo'), + '__ad': ('operator&', 'an'), + '__or': ('operator|', 'or'), + '__co': ('operator~', 'co'), + '__nt': ('operator!', 'nt'), + '__as': ('operator=', 'aS'), + '__lt': ('operator<', 'lt'), + '__gt': ('operator>', 'gt'), + '__apl': ('operator+=', 'pL'), + '__ami': ('operator-=', 'mI'), + '__amu': ('operator*=', 'mL'), + '__adv': ('operator/=', 'dV'), + '__amd': ('operator%=', 'rM'), + '__aer': ('operator^=', 'eO'), + '__aad': ('operator&=', 'aN'), + '__aor': ('operator|=', 'oR'), + '__ls': ('operator<<', 'ls'), + '__rs': ('operator>>', 'rs'), + '__ars': ('operator>>=', 'rS'), + '__als': ('operator<<=', 'lS'), + '__eq': ('operator==', 'eq'), + '__ne': ('operator!=', 'ne'), + '__le': ('operator<=', 'le'), + '__ge': ('operator>=', 'ge'), + '__aa': ('operator&&', 'aa'), + '__oo': ('operator||', 'oo'), + '__pp': ('operator++', 'pp'), + '__mm': ('operator--', 'mm'), + '__cm': ('operator,', 'cm'), + '__rm': ('operator->*', 'pm'), + '__rf': ('operator->', 'pt'), + '__cl': ('operator()', 'cl'), + '__vc': ('operator[]', 'ix'), +} + +def parse_number(s, i): + """ + Parses a number starting at position i. + Examples: + parse_number('123ABC', 0) -> (123, 3) + + Args: + s (str): The input string to parse. + i (int): The starting position in the input string. + + Returns: + Tuple[int, int]: The parsed number and the new position in the string. + """ + num = 0 + while s[i].isdigit(): + num = num * 10 + int(s[i]) + i += 1 + return num, i + +def parse_typename(s, i): + """ + Fully processes a mangled typename starting at index i. + Examples: + (demangle) parse_typename('Q23ABC3DEF', 0) -> ('ABC::DEF', 10) + (demangle) parse_typename('Q23ABC6DEF', 0) -> ('ABC::DEF', 13) + (remangle) parse_typename('Q23ABC3DEF', 0) -> ('3ABC3DEF', 10) + + Args: + s (str): The input string. + i (int): The starting index. + is_toplevel (bool): Whether the type is a global-level type. Used for remangling. + + Returns: + Tuple[str, int]:The parsed typename and the new position in the string. + """ + if s[i] == 'Q': + count = int(s[i + 1]) + i += 2 + bits = [] + for _ in range(count): + size, i = parse_number(s, i) + bits.append(resolve_templates(s[i:(i + size)], True)) + i += size + if is_demangle(): + return '::'.join(bits), i + else: + return ''.join(bits), i + else: + size, i = parse_number(s, i) + return resolve_templates(s[i:(i + size)], True), i + size + +def join_modifiers(modifiers): + """ + Joins the list of modifiers into a single string. + Modifiers are e.g. const, pointer, reference, etc. + In a demangled string these are right-to-left (e.g. int const * - pointer to const int) + whereas in a mangled string they are left-to-right (e.g. PKi - pointer to const int) + + Args: + modifiers (list[str]): The list of modifiers to join. + + Returns: + str: The joined string of modifiers. + """ + if is_demangle(): + return ''.join(modifiers[::-1]) + else: + return ''.join(modifiers) + +def parse_function(s, i, modifiers, name='', rettype_mode = 'show'): + """ + Parses a function from a demangled string. + Examples: + (demangle) parse_function('v_v', 0, ['*']) -> ('void (*) ()', 4) + (demangle) parse_function('s_b', 0, ['&']) -> ('bool (&) (short)', 4) + (remangle) parse_function('i_v', 0, ['*']) -> ('FviE', 4) + + Args: + s (str): The demangled string. + i (int): The current index in the string. + modifiers (list[str]): The list of modifiers. + name (str): An identifier, if available. This is the "main" symbol name. + rettype_mode (Literal['show', 'hide_in_demangle', 'remove']): How to handle the return type. + + Returns: + Tuple[str, int]: The transformed function signature and the new position in the string. + """ + # Parse the function args, return type handled later + args = [] + while i < len(s) and s[i] != '_' and s[i] != '@': + argtype, i = parse_type(s, i) + args.append(argtype) + + # Special case: const + # Note that if the function is const, it will be the last modifier + # because e.g. CPFv is a (const pointer) to a function + const_str = '' + if len(modifiers) > 0 and (modifiers[-1] == ' const' or modifiers[-1] == 'K'): + const_str = ' const' if is_demangle() else 'K' + modifiers.pop() + + mod_str = join_modifiers(modifiers) + + if is_demangle(): + if mod_str != '': + mod_str = '(%s)' % mod_str.strip() + arg_str = ', '.join(args) if args[0] != 'void' else '' + func_str = '%s%s(%s)%s' % (name, mod_str, arg_str, const_str) + if i >= len(s) or s[i] == '@': + return func_str, i + if rettype_mode == 'hide_in_demangle' or rettype_mode == 'remove': + _, i = parse_type(s, i + 1) + return func_str, i + else: + return parse_type(s, i + 1, [' ' + func_str]) + else: + if i < len(s) and s[i] != '@': + rettype, i = parse_type(s, i + 1) + else: + rettype, i = ('', i) + if rettype_mode == 'remove': + rettype = '' + func_encoding = '%s%s' % (rettype, ''.join(args)) if name != '' else 'F%s%sE' % (rettype, ''.join(args)) + if name != '': + func_encoding = 'N%s%sE%s' % (const_str, name, func_encoding) + else: + func_encoding = '%s%s' % (const_str, func_encoding) + return mod_str + func_encoding, i + +def parse_type(s, i, modifiers = None, name='', rettype_mode = 'show'): + """ + Parses a type from a string - main transformation function. + + Args: + s (str): The string to parse. + i (int): The starting index. + + Returns: + Tuple[str, int]: The transformed type name and the new position in the string. + """ + + if modifiers == None: + modifiers = [] + + # Type modifier is for unsigned/signed + type_modifier = '' + + while i < len(s) and s[i].isupper() and s[i] != 'Q': + c = s[i] + if c == 'C': # Const + modifiers.append(' const' if is_demangle() else 'K') + elif c == 'P': # Pointer + modifiers.append('*' if is_demangle() else 'P') + elif c == 'R': # Reference + modifiers.append('&' if is_demangle() else 'R') + elif c == 'V': # Volatile + modifiers.append(' volatile' if is_demangle() else 'V') + elif c == 'U' or c == 'S': # Unsigned/Signed + type_modifier = c + elif c == 'F': # Function, will return early + return parse_function(s, i + 1, modifiers, name, rettype_mode) + elif c == 'M': # Pointer-to-member + class_name, i = parse_type(s, i + 1) + + modifiers.append(' %s::*' % class_name if is_demangle() else 'M%s' % class_name) + if s[i] == 'F': + # CW includes the hidden pointer arguments in the PTMF signature + # and also uses this to communicate constness of the PTMF + if s[i:].startswith('FPCvPCv'): + modifiers.append(' const' if is_demangle() else 'K') + i += 7 + elif s[i:].startswith('FPCvPv'): + i += 6 + if s[i] == '_': + # small hack: simulate Fv_... by reusing the v from FPCvPCv/FPCvPv + i -= 1 + return parse_function(s, i, modifiers) + else: + # pointer-to-member-nonfunction, continue parsing as normal + continue + elif c == 'A': # Array + count, i = parse_number(s, i + 1) + # Automatically skips past the '_' after the number before the next iteration + if is_demangle(): + modstr = join_modifiers(modifiers) + if re.search(r'\[.*\]$', modstr) != None: + modifiers.insert(0, '[%d]' % count) + elif modstr == '': + modifiers.insert(0, ' [%d]' % count) + else: + # modifiers.insert(0, f' ({modstr}) [{count}]') + modifiers = [' (%s) [%d]' % (modstr, count)] + else: + modifiers.append('A%d_' % count) + else: + raise Exception('Invalid type modifier "' + c + '"') + i += 1 + + assert i < len(s) + assert s[i].isalpha() or s[i].isdigit() + + # Now we have either an identifier or a basic type + + if s[i] == 'Q' or s[i].isdigit(): + type_name, i = parse_typename(s, i) + if not is_demangle(): + type_name = 'N%sE' % type_name + else: + # Basic type - combine with type modifier and look up in mapping + actual_type = type_modifier + s[i] + if actual_type not in names_mapping: + raise Exception('Invalid type "' + actual_type + '"') + type_name = names_mapping[actual_type][0 if is_demangle() else 1] + i += 1 + + mod_str = join_modifiers(modifiers) + + if is_demangle(): + return '%s%s' % (type_name, mod_str), i + else: + return '%s%s' % (mod_str, type_name), i + +def resolve_templates(s, remangle_add_length): + """ + Resolves template types in a type string. + Examples: + (demangle) resolve_templates('std', false) -> 'std' + (remangle) resolve_templates('std', false) -> 'stdIcE' + (remangle) resolve_templates('std', true) -> '3stdIcE' + + Args: + s (str): The string to resolve. + remangle_add_length (bool): Whether to add the length prefix in remangling. + + Returns: + str: The resolved string. + """ + begin_pos = s.find('<') + if begin_pos == -1: + if re.match(r'^@unnamed@.+@$', s) != None: + if is_demangle(): + # name.split('@')[2] contains the name of the file the anonymous namespace is in, + # but we lose that information here since we follow c++filt's behavior. + return '(anonymous namespace)' + else: + return '12_GLOBAL__N_1' + unnamed_type_m = re.match(r'^@class\$(\d*).+$', s) + if unnamed_type_m != None: + typenum = int(unnamed_type_m.group(1)) if unnamed_type_m.group(1) != '' else -1 + if is_demangle(): + return '{unnamed type#%d}' % (typenum + 2) + else: + return 'Ut%s_' % (str(typenum) if typenum > -1 else '') + if not is_demangle() and remangle_add_length: + return '%d%s' % (len(s), s) + return s + template_str = '' + i = begin_pos + 1 + while i < len(s): + if s[i] == ',': + if is_demangle(): + template_str += ', ' + i += 1 + continue + if s[i] == '>': + break + elif re.match(r'[-\d]+[>,]', s[i:]) != None: + # Integer literal + literal = re.match(r'[-\d]+', s[i:])[0] + template_str += literal if is_demangle() else 'XLi%sEE' % literal.replace('-', 'n') + i += len(literal) + else: + type, i = parse_type(s, i) + template_str += type + if is_demangle(): + template_str = '<%s>' % template_str + # replicate c++filt behavior + if template_str[-2:] == '>>': + template_str = template_str[:-1] + ' >' + return s[0:begin_pos] + template_str + else: + if remangle_add_length: + return str(begin_pos) + s[0:begin_pos] + ('I%sE' % template_str) + return s[0:begin_pos] + ('I%sE' % template_str) + +def demangle(s): + """ + Demangles a mangled symbol name. + """ + + at_sym = '' + thunk_offsets = [] + m = re.match(r'^@([^@]+)@(.+)$', s) + if m != None: + m_thunk = re.match(r'^@(\d+)(?:@(\d+))?@(.+)$', s) + if m_thunk != None: + thunk_offsets = [int(m_thunk.group(1))] + if m_thunk.group(2) != None: + thunk_offsets.append(int(m_thunk.group(2))) + s = m_thunk.group(3) + else: + at_sym = m.group(1) + if at_sym not in ['LOCAL', 'GUARD', 'STRING']: + raise Exception('Invalid symbol name "' + s + '"') + s = m.group(2) + + template_depth = 0 + last_possible_end = -1 + for i in range(1, len(s)): + if s[i] == '<': + template_depth += 1 + elif s[i] == '>': + template_depth -= 1 + if template_depth == 0 and i + 2 < len(s) and s[i:i + 2] == '__' and s[i + 2] in 'CFQ0123456789': + last_possible_end = i + break + if last_possible_end == -1: + return s + + i = last_possible_end + + method, remainder = s[:i], s[i + 2:] + if remainder[0] == 'F': + # Global function without class + class_name = '' + i = 0 + else: + class_name, i = parse_typename(remainder, 0) + + if '<' in method: + template_start = method.find('<') + pre_template, template = method[:template_start], method[template_start:] + resolved_templates = resolve_templates(template, False) + else: + pre_template, resolved_templates = method, '' + + if pre_template in ['__ct', '__dt']: + rettype_mode = 'remove' + elif at_sym != '': + rettype_mode = 'hide_in_demangle' + else: + rettype_mode = 'show' + + if method == '__vt': + return 'vtable for %s' % class_name if is_demangle() else '_ZTVN%sE' % class_name + elif method.startswith('__op'): + # Use method because the type might contain templates + cv_type_name, _ = parse_type(method[4:], 0) + pre_template = 'operator %s' % cv_type_name if is_demangle() else 'cv%s' % cv_type_name + # __op cannot be templated + resolved_templates = '' + elif pre_template in method_mapping: + pre_template = method_mapping[pre_template][0 if is_demangle() else 1] + if is_demangle(): + # __ct should use the template of the function, not of the parent class + last_class_name = re.sub(r'<.+>', '', class_name).split('::')[-1] + pre_template = pre_template.replace('$CLS$', last_class_name) + else: + if not is_demangle(): + pre_template = '%d%s' % (len(pre_template), pre_template) + + method = '%s%s' % (pre_template, resolved_templates) + + if is_demangle(): + demangled = '::'.join(filter(None, [class_name, method])) + else: + demangled = class_name + method + + if i < len(remainder): + demangled, i = parse_type(remainder, i, name=demangled, rettype_mode=rettype_mode) + elif not is_demangle(): + demangled = 'N%sE' % demangled + + if i < len(remainder) and remainder[i] == '@' and at_sym in ['LOCAL', 'GUARD']: + subs = remainder[i + 1:].split('@') + local_sym_name = subs[0] + local_sym_extra = ('_' + subs[1]) if len(subs) > 1 else '' + if not is_demangle(): + local_sym_name = str(len(local_sym_name)) + local_sym_name + local_sym_extra + elif at_sym == 'GUARD' and i >= len(remainder): + local_sym_name = method + elif at_sym == 'STRING' and i < len(remainder) and remainder[i] == '@' and not is_demangle(): + local_sym_name = '_' + remainder[i + 1] + else: + local_sym_name = '' + + if is_demangle(): + if local_sym_name != '': + demangled += '::%s' % local_sym_name + + # c++filt removes spaces in (* ) -> (*), try to replicate this + while True: + m = re.search(r'\((?:[*&]|const| )+ (\w+.+)$', demangled) + if m == None or m.group(1).startswith('const'): + break + demangled = demangled[:m.start(1) - 1] + m.group(1) + + if at_sym == 'GUARD': + return 'guard variable for %s' % demangled + elif at_sym == 'STRING': + return '%s::string literal' % demangled + elif len(thunk_offsets) > 0: + thunk_type = 'virtual' if len(thunk_offsets) == 2 else 'non-virtual' + return '%s thunk to %s' % (thunk_type, demangled) + else: + return demangled + else: + if len(thunk_offsets) == 1: + demangled = 'Th%d_%s' % (thunk_offsets[0], demangled) + elif len(thunk_offsets) == 2: + demangled = 'Tv%d_n%d_%s' % (thunk_offsets[0], thunk_offsets[1], demangled) + if at_sym == 'LOCAL': + demangled = 'Z%sE%s' % (demangled, local_sym_name) + if at_sym == 'GUARD': + demangled = 'GVZ%sE%s' % (demangled, local_sym_name) + if at_sym == 'STRING': + demangled = 'Z%sEs%s' % (demangled, local_sym_name) + return '_Z%s' % demangled + +def demangle_try(s): + try: + return demangle(s) + except Exception as e: + sys.stderr.write('Demangler error: ' + str(e) + '\n') + raise e + +def main(): + global mode + global verbose + parser = argparse.ArgumentParser() + parser.add_argument('symbol', type=str, nargs='?') + parser.add_argument('-m', '--mode', choices=['demangle', 'remangle_itanium'], required=True) + parser.add_argument('-v', '--verbose', action='store_true', default=False) + args = parser.parse_args() + mode = args.mode + verbose = args.verbose + if args.symbol is None: + while True: + sym = input() + print(demangle_try(sym)) + else: + print(demangle_try(args.symbol)) + return + +if __name__ == '__main__': + main() From 38db6c087dd2d04b9d68fcc6e1a357bc5c9f0845 Mon Sep 17 00:00:00 2001 From: robojumper Date: Fri, 25 Oct 2024 22:57:11 +0200 Subject: [PATCH 5/8] Small fixes --- config/SOUE01/symbols.txt | 2 +- tools/ghidra_scripts/DecompMapToGhidra.py | 2 +- tools/ghidra_scripts/demangle.py | 6 ++++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/config/SOUE01/symbols.txt b/config/SOUE01/symbols.txt index 260f1e70..a2dc7796 100644 --- a/config/SOUE01/symbols.txt +++ b/config/SOUE01/symbols.txt @@ -34304,7 +34304,7 @@ __vt__58sFState_c = .data:0x805 __vt__Q39dCsGame_c15lytItemCursor_c16lytPachinkoCsr_c = .data:0x8052F0A8; // type:object size:0x10 __vt__84sFStateMgr_c = .data:0x8052F0B8; // type:object size:0x30 __vt__114sStateMgr_c = .data:0x8052F0E8; // type:object size:0x30 -__vt__61sFStateFct_c = .data:0x8052F118; // type:object size:0x18 +__vt__61sFStateFct_c = .data:0x8052F118; // type:object size:0x18 __vt__58sFState_c = .data:0x8052F130; // type:object size:0x18 __vt__Q39dCsGame_c15lytItemCursor_c15lytDowsingCsr_c = .data:0x8052F148; // type:object size:0x10 __vt__83sFStateMgr_c = .data:0x8052F158; // type:object size:0x30 diff --git a/tools/ghidra_scripts/DecompMapToGhidra.py b/tools/ghidra_scripts/DecompMapToGhidra.py index 053edb14..2ad0074b 100644 --- a/tools/ghidra_scripts/DecompMapToGhidra.py +++ b/tools/ghidra_scripts/DecompMapToGhidra.py @@ -233,7 +233,7 @@ def update_addr(addr, mangled_name, create_function=False): createLabel(addr, symbol_str, namespace, True, IMPORTED) if create_function: - createFunction(addr, postprocessed) + createFunction(addr, name_list[-1]) if symbol_needs_history(mangled_name): unit.setComment(PLATE_COMMENT, complete_plate_comment) diff --git a/tools/ghidra_scripts/demangle.py b/tools/ghidra_scripts/demangle.py index 9b116b5d..320d1564 100644 --- a/tools/ghidra_scripts/demangle.py +++ b/tools/ghidra_scripts/demangle.py @@ -363,7 +363,8 @@ def resolve_templates(s, remangle_add_length): break elif re.match(r'[-\d]+[>,]', s[i:]) != None: # Integer literal - literal = re.match(r'[-\d]+', s[i:])[0] + # ss/robojumper: fix [0] -> .group(0) + literal = re.match(r'[-\d]+', s[i:]).group(0) template_str += literal if is_demangle() else 'XLi%sEE' % literal.replace('-', 'n') i += len(literal) else: @@ -518,7 +519,8 @@ def demangle_try(s): try: return demangle(s) except Exception as e: - sys.stderr.write('Demangler error: ' + str(e) + '\n') + # ss/robojumper: more context + sys.stderr.write('Demangler error: ' + str(e) + ' trying to demangle ' + s + '\n') raise e def main(): From ae39aa26483f18a144b826bd24f99ec1f35d3d59 Mon Sep 17 00:00:00 2001 From: robojumper Date: Fri, 25 Oct 2024 23:02:20 +0200 Subject: [PATCH 6/8] Recover more fake mangled symbols --- tools/ghidra_scripts/DecompMapToGhidra.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/ghidra_scripts/DecompMapToGhidra.py b/tools/ghidra_scripts/DecompMapToGhidra.py index 2ad0074b..0484abb9 100644 --- a/tools/ghidra_scripts/DecompMapToGhidra.py +++ b/tools/ghidra_scripts/DecompMapToGhidra.py @@ -89,7 +89,9 @@ def do_demangle(name): if "__" in name: try: output = demangle.demangle_try(name) - return output.strip() + output = output.strip() + if output != name: + return output except Exception: pass # otherwise we try to undo the effects of the original From b70ca7683bf5b6be59cf98225fd77013f22dee0f Mon Sep 17 00:00:00 2001 From: robojumper Date: Sun, 3 Nov 2024 23:56:46 +0100 Subject: [PATCH 7/8] Fix creating function names without namespace --- tools/ghidra_scripts/DecompMapToGhidra.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ghidra_scripts/DecompMapToGhidra.py b/tools/ghidra_scripts/DecompMapToGhidra.py index 0484abb9..1def00c3 100644 --- a/tools/ghidra_scripts/DecompMapToGhidra.py +++ b/tools/ghidra_scripts/DecompMapToGhidra.py @@ -235,7 +235,7 @@ def update_addr(addr, mangled_name, create_function=False): createLabel(addr, symbol_str, namespace, True, IMPORTED) if create_function: - createFunction(addr, name_list[-1]) + createFunction(addr, None) if symbol_needs_history(mangled_name): unit.setComment(PLATE_COMMENT, complete_plate_comment) From 2eb363c83b8bccc6df3c3017ad64cedec6278505 Mon Sep 17 00:00:00 2001 From: robojumper Date: Thu, 28 Nov 2024 20:16:19 +0100 Subject: [PATCH 8/8] Check commit hash length --- tools/ghidra_scripts/DecompMapToGhidra.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/ghidra_scripts/DecompMapToGhidra.py b/tools/ghidra_scripts/DecompMapToGhidra.py index 1def00c3..d17d97a7 100644 --- a/tools/ghidra_scripts/DecompMapToGhidra.py +++ b/tools/ghidra_scripts/DecompMapToGhidra.py @@ -280,6 +280,9 @@ def apply_symbols_map(symbols_map, file_name, build_dir): path = str(askDirectory("Program build directory (e.g. build/SOUE01)", "Import")) commit = askString("Commit hash for symbol history", "Confirm") +if len(commit) < 7: + raise ValueError("commit hash " + commit + " is too short") +commit = commit[:7] new_contents = None main_symbols = os.path.join(path, "main.elf.MAP") symbols_map = None