diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..dfe0770 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +# Auto detect text files and perform LF normalization +* text=auto diff --git a/.gitignore b/.gitignore new file mode 100755 index 0000000..0205d62 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +*.pyc +.DS_Store diff --git a/LICENSE b/LICENSE new file mode 100755 index 0000000..2e29e3c --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2017 Battelle Memorial Institute + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
\ No newline at end of file diff --git a/README.md b/README.md new file mode 100755 index 0000000..b319898 --- /dev/null +++ b/README.md @@ -0,0 +1,62 @@ + ____ __ __ _ _ + | _ \ __ _ _ __ ___ _ __ | \/ | __ _ ___| |__ ___| |_ ___ ________ + | |_) / _` | '_ \/ _ \ '__| | |\/| |/ _` |/ __| '_ \ / _ \ __/ _ \ /_______/ + | __/ (_| | |_)| __/ | | | | | (_| | (__| | | | __/ || __/ \_______\ + |_| \__,_| .__/\___|_| |_| |_|\__,_|\___|_| |_|\___|\__\___| /_______/ + |_| @==|;;;;;;> + +## About +Paper Machete (PM) orchestrates [Binary Ninja](https://binary.ninja) and [Grakn.ai](https://grakn.ai) to aid static binary analysis for the purpose of finding bugs in software. PM leverages the Binary Ninja MLIL SSA to extract semantic meaning about individual instructions, operations, register/variable state, and overall control flow. + +PM migrates this data into Grakn - a knowledge graph that gives us the ability to define domain-specific ontologies for data and write powerful inference rules to form relationships between data we don't want to (or can't) explicitly store. [Heeh, how neat is that](https://www.youtube.com/watch?v=Hm3JodBR-vs)? + +This project was released in conjunction with a DerbyCon 2017 talk titled "Aiding Static Analysis: Discovering Vulnerabilities in Binary Targets through Knowledge Graph Inferences." You can watch that talk [here](http://www.irongeek.com/i.php?page=videos/derbycon7/t116-aiding-static-analysis-discovering-vulnerabilities-in-binary-targets-through-knowledge-graph-inferences-john-toterhi). + +Paper Machete's initial prototype and public codebase were developed by security researchers at the [Battelle Memorial Institute](https://www.battelle.org/government-offerings/national-security/cyber/mission-focused-tools). As this project matures, we hope that you will find it useful in your own research and consider contributing to the project. + +## Why BNIL? +The BNIL suite of ILs is easy to work with, pleasantly verbose, and human-readable. 
At any point we can decide to leverage other levels and forms of the IL with little development effort on our part. When you add to that the ability to [lift multiple architectures](https://binary.ninja/faq/) and [write custom lifters](https://github.com/joshwatson/binaryninja-msp430), we have little reason not to use BNIL. + +## Why Grakn? +Grakn's query language (Graql) is easy to learn and intuitive, which is extremely important in the early stages of this research while we're still hand-writing queries to model the patterns vulnerability researchers look for when performing static analysis. + +The ability to write our own domain-specific ontologies lets us quickly experiment with new query ideas and ways of making our queries less complex. When we run into a case where we think "gee, if I just had access to the relationship between..." we can modify our ontology and inference rules to get that information. + +While the end game for PM is to eliminate the need for human-written queries, the fact is we're starting from square one. Which means hand-jamming a lot of queries to model the patterns human vulnerability researchers look for when bug hunting. + +## Dependencies +Paper Machete requires [BinaryNinja v1.1](https://binary.ninja), [Grakn v1.0.0](https://github.com/graknlabs/grakn/releases/tag/v1.0.0), the [Grakn Python Driver](http://github.com/graknlabs/grakn-python), and the [Java JRE](http://www.oracle.com/technetwork/java/javase/downloads/index.html). + + +## Query Scripts +We've included some basic queries to get you started if you want to play around with PM. As you can imagine, there is no "silver bullet" query that will find all manifestations of a specific vulnerability class. Because of this, we've included versions for each CWE query. As we add new methods of finding the same CWE, we'll add scripts with incremented version numbers to differentiate them. 
+ +`cwe_120_v1.py` - Tests for use of unsafe 'gets()' function ([CWE-120](https://cwe.mitre.org/data/definitions/120.html)) + +`cwe_121_v1.py` - Tests for buffer overflows ([CWE-121](https://cwe.mitre.org/data/definitions/121.html)) + +`cwe_129_v1.py` - Tests for missing bounds checks ([CWE-129](https://cwe.mitre.org/data/definitions/129.html)) + +`cwe_134_v1.py` - Tests for format string vulnerabilities ([CWE-134](https://cwe.mitre.org/data/definitions/134.html)) + +`cwe_788_v1.py` - Tests for missing bounds check on array indexes ([CWE-788](https://cwe.mitre.org/data/definitions/788.html)) + +## How Do I Use It? + +For basic use, run the `paper_machete.py` script and follow the prompts. For more advanced use, please [read the wiki](https://github.com/cetfor/PaperMachete/wiki). + +Typically you'll start with option `[1]` and work your way down to option `[3]`. If you run into any issues with Grakn use option `[4]` to reset Grakn to a clean state and try again. +``` +... banner ... +[1] Analyze a binary file +[2] Migrate a JSON file into Grakn +[3] Run all CWE queries +[4] Clean and restart Grakn +[5] Quit +``` + +Option `[1]` lists all executable files in the `/analysis` directory. So place any executables you want to analyze in `/analysis`. This option will run `pmanalyze.py` and generate a JSON file in the `/analysis` directory. + +Once you've analyzed files with `[1]` and produced resulting JSON files, they will appear as a choice in option `[2]`. Selecting a JSON file in option `[2]` will migrate the data into Grakn. + +Now that you have data in Grakn, you can use option `[3]`. This will kick off all scripts in `/queries` against the keyspace of your choice. If you write your own query patterns, just throw them in `/queries` and option `[3]` will run them too. 
diff --git a/analysis/about_this_folder b/analysis/about_this_folder new file mode 100755 index 0000000..a543957 --- /dev/null +++ b/analysis/about_this_folder @@ -0,0 +1,9 @@ +This folder serves two purposes: + 1. It's where you put the binaries or Binary Ninja databases you want to analyze (PE, ELF, Mach-O, .bndb) + 2. It's where analysis files (JSON) are stored after being processed by Paper Machete. + +The Paper Machete CLI `paper_machete.py` enumerates this folder when presenting you with analysis/migration options. + +FAQ: +Q: What if my target isn't a PE/ELF/Mach-O executable? It's a binary blob! +A: Analyze it with Binary Ninja and save your analysis as a .bndb file in this folder. diff --git a/config b/config new file mode 100755 index 0000000..cb9be31 --- /dev/null +++ b/config @@ -0,0 +1,2 @@ +[PATHS] +GRAKN=/home/user/PaperMachete/grakn-dist-1.0.0 diff --git a/img/grakn-start.png b/img/grakn-start.png new file mode 100755 index 0000000..ba43846 Binary files /dev/null and b/img/grakn-start.png differ diff --git a/img/grakn_crash.png b/img/grakn_crash.png new file mode 100755 index 0000000..8b75e7f Binary files /dev/null and b/img/grakn_crash.png differ diff --git a/img/grakn_crash_2.png b/img/grakn_crash_2.png new file mode 100755 index 0000000..078392e Binary files /dev/null and b/img/grakn_crash_2.png differ diff --git a/paper_machete.py b/paper_machete.py new file mode 100755 index 0000000..83592f0 --- /dev/null +++ b/paper_machete.py @@ -0,0 +1,284 @@ +import sys +import subprocess +from os import listdir +from os.path import abspath, isdir, isfile, join, splitext +from ConfigParser import RawConfigParser +from mimetypes import guess_type +from urllib2 import urlopen +from ast import literal_eval +import pmanalyze + +ENTER = '\nPress ENTER to continue' +MACHETE = abspath('.') +query_path = join(MACHETE, "queries") +configParser = RawConfigParser() +configParser.read('config') +GRAKN = configParser.get('PATHS', 'GRAKN') +ANALYSIS = join(MACHETE, 
"analysis") + +MAX_ACTIVE = 25 # migration knob: max number of migration workers running at once +MAX_BATCHES = 1000000000 # migration knob: max number of rows to execute in one transation + +MENU1 = "[1] Analyze a binary file" +MENU2 = "[2] Migrate a JSON file into Grakn" +MENU3 = "[3] Run all CWE queries" +MENU4 = "[4] Clean and restart Grakn" +MENU5 = "[5] Quit" + +TEMPLATE_DESC = [ + '', # n/a + 'Migrating functions.', # template 1 + 'Migrating basic-blocks.', # template 2 + 'Linking basic-blocks to their functions.', # template 3 + 'Migrating instructions.', # template 4 + 'Linking instructions to their basic-blocks.', # template 5 + 'Migrating all AST nodes.', # template 6 + 'Linking AST nodes.' # template 7 +] + +def print_banner(title=""): + subprocess.call("clear") + print(""" + ____ __ __ _ _ +| _ \ __ _ _ __ ___ _ __ | \/ | __ _ ___| |__ ___| |_ ___ ________ +| |_) / _` | '_ \/ _ \ '__| | |\/| |/ _` |/ __| '_ \ / _ \ __/ _ \ /_______/ +| __/ (_| | |_)| __/ | | | | | (_| | (__| | | | __/ || __/ \_______\\ +|_| \__,_| .__/\___|_| |_| |_|\__,_|\___|_| |_|\___|\__\___| /_______/ + |_| @==|;;;;;;> +""") + total_len = 80 + if title: + padding = total_len - len(title) - 4 + print("== {} {}\n".format(title, "=" * padding)) + else: + print("{}\n".format("=" * total_len)) + +def run_script(query_path, query, keyspace): + try: + subprocess.call(["python3.6", join(query_path, query), keyspace]) + except OSError: + print("It looks like you don't have Python3.6 installed. 
" \ + "The Grakn Python driver requires it.") + return -1 + return 0 + +def run_queries(query, keyspace): + if query == 'all_queries': + print("Running all CWE queries against the '{}' keyspace...".format(keyspace)) + queries = [f for f in listdir(query_path) if isfile(join(query_path, f))] + for query in queries: + if ".py" not in query: continue + if run_script(query_path, query, keyspace): return + print("Script " + query + " complete.") + print("All queries complete.") + else: + if isfile(join(query_path, query)): + if run_script(query_path, query, keyspace): return + else: + print("Could not find the python script " + query) + print("Please make sure it is located in " + query_path) + return + + +def get_file_selection(types): + file_list = listdir(ANALYSIS) + filtered = [] + for file in file_list: + if types == "json" and guess_type(join(ANALYSIS, file))[0] == "application/json": + filtered.append(file) + elif types == "bin": + filecmd = (subprocess.check_output(["file", join(ANALYSIS, file)])).lower() + filecmd = filecmd.split(": ")[1] # remove file path returned by 'file' utility + if "elf" in filecmd or "mach-o" in filecmd or "pe" in filecmd or ".bndb" in file.lower(): + filtered.append(file) + else: + pass # not json or executable binary + + # print file choices + if len(filtered) == 0: + if types == "json": + print("No json files were found in {}".format(ANALYSIS)) + elif types == "bin": + print("No executable files were found in {}".format(ANALYSIS)) + raw_input(ENTER) + return "quit" + else: + for i, file in enumerate(filtered): + print "[{}] {}".format(i, file) + + index = raw_input("\nSelect a file number to analyze ([q]uit): ").lower() + if index == "q" or index == "quit": + return "quit" + + try: + index = int(index) + if index in range(0, len(filtered)): + return filtered[int(index)] + except ValueError: + pass + + if index != "": + print("\nThat is not a valid file selection. 
Try again.") + raw_input(ENTER) + if types == "bin": + print_banner(MENU1) + elif types == "json": + print_banner(MENU2) + else: + print_banner() + + return False + + +def main(): + menu = True + while menu: + print_banner() + + # check directories + if not isdir(GRAKN): + if GRAKN == '': + print("Please set the path to your Grakn installation in the config file.\n") + print("Open the file called 'config' in your paper machete folder, and set") + print("the variable 'GRAKN' to the full file path of your Grakn installation.") + else: + print("Grakn directory not found\n") + print("Please ensure grakn is located in {}".format(GRAKN)) + sys.exit() + + if not isdir(MACHETE): + print("Paper Machete directory not found") + print("Please ensure Paper Machete is located in {}".format(MACHETE)) + sys.exit() + + if not isdir(ANALYSIS): + print("Creating directory '{}'".format(ANALYSIS)) + subprocess.call(["mkdir", "analysis"]) + + menu_option = raw_input("{}\n{}\n{}\n{}\n{}\n\n>> ".format(MENU1,MENU2,MENU3,MENU4,MENU5)) + + try: + menu_option = int(menu_option) + except ValueError: + if menu_option != "": + print("'{}' is not a valid option.".format(menu_option)) + raw_input(ENTER) + continue + + # analyze a binary file + if menu_option == 1: + + # display supported binary files in ./analysis + binary = False + while binary == False: + print_banner(MENU1) + binary = get_file_selection("bin") + if binary == "quit": + break + if binary == "quit": + continue + + # check to see if the file exists, if it does, process it + if not isfile(join(ANALYSIS, binary)): + print("File '{}' not found.".format(binary)) + else: + functions = str(raw_input('Specify a list of functions examine seperated by spaces (ENTER for all): ')).split() + if len(functions) == 0: + pmanalyze.main(join(ANALYSIS, binary)) + else: + print functions + pmanalyze.main(join(ANALYSIS, binary), functions) + raw_input(ENTER) + + # migrate a json file into Grakn + elif menu_option == 2: + + # display supported binary 
files in ./analysis + json = False + while json == False: + print_banner(MENU2) + json = get_file_selection("json") + if json == "quit": + break + if json == "quit": + continue + + # check to see if the keyspace already exists for this file + try: + keyspace = json.lower().replace('.json', '') + keyspaces = literal_eval(urlopen('http://127.0.0.1:4567/kb').read()) + + inc = 1 + finding_name = True + while finding_name: + inc += 1 + if keyspace not in keyspaces: + finding_name = False # keyspace name is not in use + else: + keyspace = "{}_{}".format(keyspace, inc) # add a _# suffix and try again + except: + print("Unable to query keyspace names. Is Grakn running?\nContinuing assuming keyspace '{}' is OK to use.".format(keyspace)) + + try: + # insert the ontology + print("Inserting ontology into the '{}' keyspace...".format(keyspace)) + subprocess.call([join(GRAKN,"graql"),"console", "-f", join(MACHETE, "templates", "binja_mlil_ssa.gql"), "-k", keyspace]) + + + # migrate data into Grakn + print("\nMigrating data from '{}' into the '{}' keyspace...".format(json, keyspace)) + + # loop over all 7 templates + for num in range(1,8): + print(">> Migration step {} of 7: {}".format(num, TEMPLATE_DESC[num])) + subprocess.call([join(GRAKN, "graql"), "migrate", "json", "--template", join(MACHETE, "templates", "binja_mlil_ssa_{}.tpl".format(num)), "--input", join(ANALYSIS, json), "--keyspace", keyspace]) + + print("Data successfully migrated into Grakn. You can now run CWE query scripts against '{}' to check for vulnerabilities".format(keyspace)) + raw_input(ENTER) + except: + print("Upload failed... 
please try agin.") + raw_input(ENTER) + + # run CWE queries + elif menu_option == 3: + keyspace = None + keyspaces = literal_eval(urlopen('http://127.0.0.1:4567/kb').read())['keyspaces'] + + print_banner(MENU3) + + for i, ks in enumerate(keyspaces): + print("[{}] {}".format(i, ks['name'])) + + index = raw_input("\nSelect a keyspace to run all queries against ([q]uit): ").lower() + if index == "q" or index == "quit": + continue + + try: + index = int(index) + if index in range(0, len(keyspaces)): + keyspace = keyspaces[int(index)]['name'] + except ValueError: + continue + + run_queries('all_queries', keyspace) + raw_input(ENTER) + + # clean and restart Grakn + elif menu_option == 4: + print("Restarting Grakn. Press \"Y\" when prompted.\nWait until you see the Grakn banner before continuing!") + raw_input(ENTER) + + subprocess.call([join(GRAKN, "grakn"), "server", "stop"]) + subprocess.call([join(GRAKN, "grakn"), "server", "clean"]) + subprocess.call([join(GRAKN, "grakn"), "server", "start"]) + + # quit + elif menu_option == 5: + menu = False + + else: + print("Invalid option!\n") + raw_input(ENTER) + +if __name__ == "__main__": + main() diff --git a/pmanalyze.py b/pmanalyze.py new file mode 100755 index 0000000..4bdf089 --- /dev/null +++ b/pmanalyze.py @@ -0,0 +1,505 @@ +import sys +import json +from struct import pack, unpack +from os.path import basename, join, isfile +from operator import attrgetter +from collections import defaultdict +import binaryninja as binja + +PM = None +vars_and_sizes = {} + +class PaperMachete(): + def __init__(self): + self.functions = [] + +class PMFunction(): + def __init__(self, func_name, asm_addr): + self.func_name = func_name + self.asm_addr = asm_addr + self.basic_blocks = [] + self.bb_edges = [] + +class PMBasicBlock(): + def __init__(self, bb_name, bb_start, bb_end): + self.bb_name = bb_name + self.bb_start = bb_start + self.bb_end = bb_end - 1 # set end as last il index (not +1 like binja gives us) + self.instructions = [] + 
+class PMInstruction(): + def __init__(self, name, il_index, asm_address, operation_type, in_bb): + self.name = name + self.il_index = il_index + self.asm_address = asm_address + self.operation_type = operation_type + self.in_bb = in_bb + self.nodes = [] + +class PMOperation(): + def __init__(self, name, depth, node_type, edge_label, parent_hash): + self.name = name + self.depth = depth + self.node_type = node_type + self.edge_label = edge_label + self.parent_hash = parent_hash + +class PMNodeList(): + def __init__(self, name, depth, node_type, edge_label, parent_hash, list_size): + self.name = name + self.depth = depth + self.node_type = node_type + self.edge_label = edge_label + self.parent_hash = parent_hash + self.list_size = list_size + +class PMEndNodeConstant(): + def __init__(self, name, depth, node_type, edge_label, parent_hash, constant_value): + self.name = name + self.depth = depth + self.node_type = node_type + self.edge_label = edge_label + self.parent_hash = parent_hash + self.constant_value = constant_value + +class PMEndNodeVarSSA(): + def __init__(self, name, depth, node_type, edge_label, parent_hash, var, version, var_type, var_size, var_func): + self.name = name + self.depth = depth + self.node_type = node_type + self.edge_label = edge_label + self.parent_hash = parent_hash + self.var = var + self.version = version + self.var_type = var_type + self.var_size = var_size + self.var_func = var_func + +class PMEndNodeVariable(): + def __init__(self, name, depth, node_type, edge_label, parent_hash, var, var_type, var_size, var_func): + self.name = name + self.depth = depth + self.node_type = node_type + self.edge_label = edge_label + self.parent_hash = parent_hash + self.var = var + self.var_type = var_type + self.var_size = var_size + self.var_func = var_func + +class PMBBEdge(): + def __init__(self, source, target): + self.source = source + self.target = target + + +def process_function(func): + global insn_list + global vars_and_sizes + + insn_list 
= [] + vars_and_sizes = {} + + stack = str(binja.function.Function.stack_layout.__get__(func)) + vars_and_sizes = get_variable_sizes(stack) + + func_name = func.name.replace('.', '_') + asm_addr = hex(func.start).strip('L') + + PM.functions.append(PMFunction(func_name, asm_addr)) + + +def process_basic_block(func, block): + func_name = func.name.replace('.', '_') + bb_name = "bb_{}_{}_{}".format(block.start, block.end-1, func_name) + + for func in PM.functions: + if func.func_name == func_name: + func.basic_blocks.append(PMBasicBlock(bb_name, block.start, block.end)) + + +def process_instruction(func, block, insn): + global insn_list + + func_name = func.name.replace('.', '_') + + # A single ISA instruction can map to many IL instructions. + # This can cause the same instruction to be processed many times. + # To avoid this, we track instructions in a function and only + # process them once. We clear this global list in process_function(). + + # To complicate this more, MLIL_GOTO operations always seem to have + # address => 0x0. So we have to process 0x0 addresses multiple times until + # this behavior changes in Binary Ninja (this may actually be expected). + + if (insn.address not in insn_list) or (insn.address == 0x0): + ast_parse([func, block, insn]) + insn_list.append(insn.address) + + # sort the 'nodes' list in each instruction by 'depth' + # This is extremely important for Grakn's migration template + # since nodes at depth 1 need to exist before nodes at depth + # 2 can be linked to them (and so on). 
+ + for func in PM.functions: + for bb in func.basic_blocks: + for inst in bb.instructions: + (inst.nodes).sort(key=attrgetter('depth')) + + +def ast_build_json(args, name, il, level=0, edge=""): + global insn_list + global vars_and_sizes + + func = args[0] + block = args[1] + insn = args[2] + + func_name = func.name.replace('.', '_') + + # slice off the last "_#" and rejoin to get the parent reference hash + parent = "_".join(name.split('_')[:-1]) + + # Hashes of instruction nodes in the AST look like: "N_8735918103813_4195908" + # One element down from an instruction will look like: "N_8735918103813_4195908_0" + # So if there are two "_" in the hash, the node is an instruction. List nodes have + # the letter 'L' appended to them. (Yeah, I LOL'd when I wrote this too.) + depth = name.count("_") - 2 + if 'L' in parent: + parent_type = "list" + name = name.replace('L', 'N') # reset node status + elif parent.count("_") == 2: + parent_type = "instruction" + else: + parent_type = "operation" + + # get the instruction hash this node belongs in + inst_hash = "_".join(name.split('_')[:3]) + + # get the basic-block this node belongs in + inbb = "bb_{}_{}_{}".format(block.start, block.end-1, func_name) + + if isinstance(il, binja.MediumLevelILInstruction): + + # instruction + if level == 0: + il_index = il.instr_index + asm_address = hex(il.address).strip('L') + operation_type = str(il.operation).split('.')[1] + + for func in PM.functions: + for bb in func.basic_blocks: + if bb.bb_name == inbb: + # This next if statement is to avoid issues with MLIL_GOTO nodes + # being placed in the wrong basic blocks. This is because all MLIL_GOTO + # nodes have and asm_address of 0x0, so we leave them out of the insn_list global. + # This also means, the same instruction can be added twice! So we need to check if + # the same node already exists. If it does, we don't add it. 
+ if il_index >= bb.bb_start and il_index <= bb.bb_end: + if operation_type == "MLIL_GOTO": + if (inst_hash not in insn_list): + insn_list.append(inst_hash) + else: + continue # don't add this again! + bb.instructions.append(PMInstruction(inst_hash, il_index, asm_address, operation_type, inbb)) + + # operation + else: + node_type = str(il.operation).split('.')[1] + edge_label = str(edge) + parent_hash = parent + + for func in PM.functions: + for bb in func.basic_blocks: + for inst in bb.instructions: + if inst.name == inst_hash: + inst.nodes.append(PMOperation(name, depth, node_type, edge_label, parent_hash)) + + # edge + for i, o in enumerate(il.operands): + try: + edge_label = str(il.ILOperations[il.operation][i][0]) + except IndexError: + # Addresses issue in binja v1.1 stable with MLIL_SET_VAR_ALIASED + # operations in the Python bindings. + # See: https://github.com/Vector35/binaryninja-api/issues/787 + edge_label = "unimplemented" + child_name = "{}_{}".format(name, i) + ast_build_json(args, child_name, o, level+1, edge_label) + + + # list of operands / nodes + elif isinstance(il, list): + node_type = "list" + edge_label = str(edge) + parent_hash = parent + name = name.replace('N', 'L') # list hashes have an 'L' prefix to distinguish from nodes ('N'). 
+ list_size = len(il) + + for func in PM.functions: + for bb in func.basic_blocks: + for inst in bb.instructions: + if inst.name == inst_hash: + inst.nodes.append(PMNodeList(name, depth, node_type, edge_label, parent_hash, list_size)) + + + # add elements from + for i, item in enumerate(il): + edge_label = str(i) + item_name = "{}_{}".format(name, i) + ast_build_json(args, item_name, item, level+1, edge_label) + + # end node + else: + parent_hash = parent + edge_label = str(edge) + + # constant + if isinstance(il, long): + node_type = "constant" + constant_value = str(il) + + for func in PM.functions: + for bb in func.basic_blocks: + for inst in bb.instructions: + if inst.name == inst_hash: + inst.nodes.append(PMEndNodeConstant(name, depth, node_type, edge_label, parent_hash, constant_value)) + + + # SSAVariable (not using type information) + elif isinstance(il, binja.mediumlevelil.SSAVariable): + node_type = "variable-ssa" + var = str(il.var) + version = il.version + + var_type = str(il.var.type) + var_size = vars_and_sizes.get(str(il.var), 4) + var_func = func_name + + for func in PM.functions: + for bb in func.basic_blocks: + for inst in bb.instructions: + if inst.name == inst_hash: + inst.nodes.append(PMEndNodeVarSSA(name, depth, node_type, edge_label, parent_hash, var, version, var_type, var_size, var_func)) + + + # Variable (contains more information than we currently use) + elif isinstance(il, binja.function.Variable): + node_type = "variable" + var = str(il) + + var_type = str(il.type) + var_size = vars_and_sizes.get(str(il), 4) + var_func = func_name + + for func in PM.functions: + for bb in func.basic_blocks: + for inst in bb.instructions: + if inst.name == inst_hash: + inst.nodes.append(PMEndNodeVariable(name, depth, node_type, edge_label, parent_hash, var, var_type, var_size, var_func)) + + + # Unknown terminating node (this should not be reached) + else: + print "A terminating node was encountered that was not expected: '{}'".format(type(il)) + raise 
ValueError + + +def ast_name_element(args, il_type, il): + h = hash(il) + name = "N_{}_{}".format(h, il.address) + ast_build_json(args, name, il) + + +def ast_parse(args): + func = args[0] + block = args[1] + insn = args[2] + + print " function: {} (asm-addr: {})".format(func.name, hex(insn.address).strip('L')) + lookup = defaultdict(lambda: defaultdict(list)) + + for block in func.medium_level_il.ssa_form: + for mil in block: + lookup['MediumLevelILSSA'][mil.address].append(mil) + + for il_type in sorted(lookup): + ils = lookup[il_type][insn.address] + for il in sorted(ils): + ast_name_element(args, il_type, il) + + +def process_edges(func): + func_name = (func.name).replace('.', '_') + + for block in func.medium_level_il.ssa_form: + if len(block.outgoing_edges) > 0: + for edge in block.outgoing_edges: + source = "bb_{}_{}_{}".format(edge.source.start, edge.source.end-1, func_name) + target = "bb_{}_{}_{}".format(edge.target.start, edge.target.end-1, func_name) + for func in PM.functions: + if func.func_name == func_name: + func.bb_edges.append(PMBBEdge(source, target)) + + +def get_offset_from_var(var): + """ + Helper for get_variable_sizes)_ + Use this to calculate var offset. + e.g. var_90, __saved_edi --> 144, -1 + """ + instance = False + i=0 + + # Parse string + i = var.rfind(' ')+1 + tmp = var[i:-1] + + # Parse var + if tmp[0] == 'v': + tmp = tmp[4:] + j = tmp.find('_') + + # Handles SSA var instances (var_14_1) and converts c, 58, 88 --> 12, 88, 136 + if (j != -1): + tmp = tmp[:j] + instance = True + else: + instance = False + + try: + tmp = int(tmp, 16) + except: + tmp = -1 + + # -1 for non vars + else: + tmp = -1 + + return tmp, instance + + +def get_variable_sizes(stack): + """ + Called from process_function. This function Accepts a string + of stack variables and returns a dict of var names and sizes. 
+ """ + prev_offset = 0 + offset = 0 + counter = 0 + i=0 + var_dict = {} + str_list = list(reversed(stack[1:-1].split(', '))) + + # Loop through each item on stack backwards + for item in str_list: + size=0 + tmp=0 + instance = False + + # Handle args and return addr + if (('arg' in item) or ('return' in item)): + size = 4 + + elif('int32' in item): + size = 4 + tmp, instance = get_offset_from_var(str_list[counter]) + if tmp != -1: + offset = tmp + if not instance: + offset = prev_offset+4 + + elif ('int64' in item): + size = 8 + tmp, instance = get_offset_from_var(str_list[counter]) + if not instance: + offset = prev_offset+8 + if tmp != -1: + offset = tmp + + else: + offset, instance = get_offset_from_var(str_list[counter]) + if instance: + offset = offset-4 + + if size == 0: + size = offset-prev_offset + if (not instance): + prev_offset = offset + + # Parse string + i = item.rfind(' ')+1 + key = item[i:-1] + + var_dict.update({key:size}) + counter = counter+1 + + return var_dict + + +def analyze(bv, func_list=[]): + + list_len = len(func_list) + + ## process functions + for func in bv.functions: + if list_len > 0 and func.name not in func_list: continue + process_function(func) + + ## process basic blocks + for block in func.medium_level_il.ssa_form: + process_basic_block(func, block) + + ## process instructions + for insn in block: + process_instruction(func, block, insn) + + ## process basic block edges + # all edges need to exist in Grakn before we can do this + # because edges stemming from loops wont have an associated + # basic block inserted to create a relationship for. + process_edges(func) + + +def main(target, func_list=[]): + global PM + + PM = PaperMachete() + + if not isfile(target): + print "The specified target '{}' is not a file. 
Try again.".format(target) + return + + print "Invoking Binary Ninja and analyzing file: {}".format(target) + bv = binja.BinaryViewType.get_view_of_file(target) + bv.add_analysis_option('linearsweep') + print "Performing linear sweep..." + bv.update_analysis_and_wait() + print "Linear sweep complete. Collecting BNIL data..." + analyze(bv, func_list) + + # pretty printed json (pretty printed files are much larger than compact files!) + target_json = json.dumps(PM, default=lambda o: o.__dict__, indent=4, sort_keys=True) + + # compact / minified json + #target_json = json.dumps(PM, default=lambda o: o.__dict__) + + try: + jf = None + if __name__ == "__main__": + jf = open("{}.json".format(basename(target)), "w") + else: + jf = open(join("analysis", "{}.json".format(basename(target))), "w") + jf.write(target_json) + jf.close() + except IOError: + print "ERROR: Unable to open/write to {}.json.".format(basename(target)) + return + +if __name__ == "__main__": + if len(sys.argv) > 1: + target = sys.argv[1] + func_list = sys.argv[2:] + else: + print "Usage: %s [function1 function2 ...]" % sys.argv[0] + main(target, func_list) diff --git a/queries/cwe_120_v1.py b/queries/cwe_120_v1.py new file mode 100755 index 0000000..336a204 --- /dev/null +++ b/queries/cwe_120_v1.py @@ -0,0 +1,46 @@ +#============================================================================================================ +# CWE-120: Buffer Copy without Checking Size of Input +# +# Vuln Info: A trivial way to cause this vulnerability is using the gets() function which is not secure. +# Ex: +# bytes_received = gets(input); <--Bad +# bytes_received = receive_until(input, sizeof(input), '\n'); <--Good +# +# Methodology: +# 1. Find gets instruction +# 2. 
def main(keyspace):
    """Report every call to a function whose name contains "gets" (CWE-120).

    keyspace -- name of the Grakn keyspace to query.

    Prints one line per matching MLIL_CALL_SSA instruction; returns None.
    """
    client = grakn.Client(uri='http://localhost:4567', keyspace=keyspace)

    # Step 1: look up the address of the unsafe function.
    target_name = 'gets'
    lookup = 'match $func isa function, has func-name contains "{}", has asm-address $a; get $a;'.format(target_name)
    functions_found = client.execute(lookup)

    # Nothing to report when the binary never references the function.
    if not functions_found:
        return

    # Step 2: find call instructions whose constant pointer equals that
    # address. Only the first matching function is used.
    address = int(functions_found[0]['a']['value'], 16)
    call_query = 'match $x has operation-type "MLIL_CALL_SSA" has asm-address $a; $y isa"MLIL_CONST_PTR"; ($x,$y); $z isa constant, has constant-value {}; ($y,$z); get $x, $a;'.format(address)

    for call in client.execute(call_query):
        print("CWE-120: Buffer Copy Without Checking Size of Input at {}\n".format(call['a']['value']))


if __name__ == "__main__":
    keyspace = sys.argv[1] if len(sys.argv) > 1 else "grakn"
    main(keyspace)
def main(keyspace):
    """Flag calls whose size argument differs from the destination's size (CWE-121).

    keyspace -- name of the Grakn keyspace to query.

    For each known copy/read function, every MLIL_CALL_SSA that targets it is
    inspected: the size argument is compared with the `var-size` of the
    variable whose address was assigned to the destination argument in the
    same basic block. A mismatch prints one "CWE-121" line. Returns None.
    """
    graph = grakn.Client(uri='http://localhost:4567', keyspace=keyspace)

    # Functions with indexes for (dest, sizeof(dest)) stored in dict
    functions = {"receive_delim": (2,3), "fgets": (0,1), "strncpy": (0,2), "receive_until": (0,2), "memcpy": (0,2), "freaduntil": (1,2), "read":(1,2)}

    # Check for potential vuln in each function
    for function_name, (buff_index, size_index) in functions.items():
        # Get address of function to use for next query
        query1 = 'match $func isa function, has func-name contains "{}", has asm-address $a; get $a;'.format(function_name)
        result1 = graph.execute(query1)

        # If the function is not found there is nothing to check
        if not result1:
            continue

        # Get all instructions that call the function (first match only)
        func_addr = int(result1[0]['a']['value'], 16)
        query2 = 'match $x has operation-type "MLIL_CALL_SSA"; $y isa"MLIL_CONST_PTR"; ($x,$y); $z isa constant, has constant-value {}; ($y,$z); get $x;'.format(func_addr)
        result2 = graph.execute(query2)

        for instr in result2 or []:
            Id = instr['x']['id']
            query3 = 'match $x id "' + Id + '"; $l isa list; ($x,$l); (from-node: $l, $q); $q has edge-label $e; (from-node: $q, $v); {$v has var $s;} or {$v has constant-value $s;}; get $e, $s;'
            result3 = graph.execute(query3)

            # Collect the call's parameters keyed by position. A dict avoids
            # the IndexError a fixed 8-slot list raised for longer calls.
            params = {}
            for ele in result3:
                try:
                    index = int(ele['e']['value'])
                except (TypeError, ValueError):
                    # Edge label is not a parameter position; ignore it.
                    continue
                params[index] = ele['s']['value']

            # Get var name - This is done to determine how many bytes the variable is
            var_name = params.get(buff_index)
            if not hasattr(var_name, 'split'):
                # Destination argument missing or not a name: cannot resolve it.
                continue
            var_name = var_name.split('#', 1)[0].lstrip()

            # NOTE Enhancement Make finding buff_size the same as string_size
            # This assumes that buffer_size is a number, breaks when its a var or register
            # A missing size argument keeps the historical default of 0.
            try:
                buff_size = int(params.get(size_index, 0))
            except (TypeError, ValueError):
                continue

            # Get size of string by finding initialization Ex. var_88 = &var_58
            query4 = 'match $x id "{}"; $y isa basic-block; ($x,$y); $z isa instruction, has operation-type "MLIL_SET_VAR_SSA"; ($y,$z); {{$v1 isa variable, has var "{}";}} or {{$v1 isa variable-ssa, has var "{}";}}; ($z, $v1); $w isa MLIL_ADDRESS_OF; ($w, $z); $v isa variable, has var-size $s; ($w, $v); get $s, $x;'.format(Id, var_name, var_name)
            result4 = graph.execute(query4)
            if not result4:
                continue

            string_size = result4[0]['s']['value']
            try:
                # Compare numbers with numbers even if the client returns text.
                string_size = int(string_size)
            except (TypeError, ValueError):
                pass

            # Finally determine if buffer size == sizeof(str)
            if string_size != buff_size:
                instruction_ID = result4[0]['x']['id']
                query5 = 'match $i id {}, has asm-address $a; get $a;'.format(instruction_ID)
                result5 = graph.execute(query5)
                if not result5:
                    # No address recorded for the instruction; nothing to print.
                    continue
                instr_addr = result5[0]['a']['value']

                print("CWE-121: Stack-based Overflow possible at {}".format(instr_addr))


if __name__ == "__main__":
    if len(sys.argv) > 1:
        keyspace = sys.argv[1]
    else:
        keyspace = "grakn"
    main(keyspace)
#Exits the script
def fail():
    sys.exit()

#Finds comparisons that are acting as a lower bounds check
def lowerCheck():
    query = graph.execute('match {$comp isa MLIL_CMP_SGE;} or {$comp isa MLIL_CMP_SGT;};$node isa MLIL_VAR_SSA;$cons isa MLIL_CONST;($comp, $node);($comp, $cons);$varssa isa variable-ssa has var $var;($node, $varssa);get $comp, $var;')
    return query

#Finds comparisons that are acting as an upper bounds check
def upperCheck():
    query = graph.execute('match {$comp isa MLIL_CMP_SLE;} or {$comp isa MLIL_CMP_SLT;};$node isa MLIL_VAR_SSA;$cons isa MLIL_CONST;($comp, $node);($comp, $cons);$varssa isa variable-ssa has var $var;($node, $varssa);get $comp, $var;')
    return query

#Returns the addresss of a comparison instruction
def get_addr(comp):
    query = graph.execute('match $comp id "' + comp + '";$inst isa instruction, has asm-address $addr;($comp, $inst);get $addr;')
    return query

#Prints one finding for the comparison `comp_id`; `bound` is 'upper' or 'lower'
def _report_missing(bound, comp_id):
    addr = get_addr(comp_id)
    # A comparison without a recorded instruction address cannot be located.
    if addr:
        print('CWE-129: Missing ' + bound + ' bound check at ' + str(addr[0]['addr']['value']))

def main(keyspace):
    """Report variables that are bounds-checked on one side only (CWE-129).

    keyspace -- name of the Grakn keyspace to query.

    A variable compared with SGE/SGT against a constant counts as having a
    lower bound check, and SLE/SLT as an upper bound check. Each comparison
    whose variable never appears in a comparison of the opposite kind is
    printed. The script exits through fail() when there are no signed
    comparisons at all.
    """
    global graph
    graph = grakn.Client(uri='http://localhost:4567', keyspace=keyspace)

    # Classify comparisons by the query that returned them. The previous
    # code tested the concept id for the substring 'SGE', which said nothing
    # about the comparison kind, and `('SGE' or 'SGT')` ignored 'SGT'.
    lower = lowerCheck() or []
    upper = upperCheck() or []
    if not lower and not upper:
        fail()

    lower_vars = set(item['var']['value'] for item in lower)
    upper_vars = set(item['var']['value'] for item in upper)

    # Lower bound present: look for the matching upper bound.
    for item in lower:
        if item['var']['value'] not in upper_vars:
            _report_missing('upper', item['comp']['id'])

    # Upper bound present: look for the matching lower bound.
    for item in upper:
        if item['var']['value'] not in lower_vars:
            _report_missing('lower', item['comp']['id'])

if __name__ == "__main__":
    if len(sys.argv) > 1:
        keyspace = sys.argv[1]
    else:
        keyspace = "grakn"
    main(keyspace)
def main(keyspace):
    """Report printf-like calls that take a single variable argument (CWE-134).

    keyspace -- name of the Grakn keyspace to query.

    Lists the address of every function whose name contains "printf", then for
    each one prints the calls whose parameter list has size 1 and holds an
    MLIL_VAR_SSA node. Returns None.
    """
    client = grakn.Client(uri='http://localhost:4567', keyspace=keyspace)

    # Step 1: addresses of all printf-like functions.
    lookup ='match $func isa function, has func-name contains "printf", has asm-address $a; offset 0; limit 100; get $a;'
    printf_like = client.execute(lookup)
    if len(printf_like) != 0:
        print("Found potential calls at the following addresses:")
        for match in printf_like:
            print(match['a']['value'])

    # Step 2: per function, calls made with a lone variable argument.
    for match in printf_like:
        address = int(match['a']['value'], 16)
        print("Scanning address {}".format(hex(address)))
        call_query = 'match $x isa instruction, has operation-type "MLIL_CALL_SSA", has asm-address $a; $y isa "MLIL_CONST_PTR"; ($x,$y); $z isa constant, has constant-value {}; ($y,$z); $l isa list, has list-size 1; ($x,$l); $s isa "MLIL_VAR_SSA"; ($l,$s); offset 0; limit 500; get $x, $a;'.format(address)
        calls = client.execute(call_query)

        if not calls:
            continue
        for call in calls:
            print("CWE-134: Uncontrolled Format String possible at {} ".format(call['a']['value']))
#Exits script
def fail():
    # The previous body returned 0 before reaching sys.exit(), so it never exited.
    sys.exit()

#Searches for potential array declarations
def query1():
    result = graph.execute('match $set isa instruction, has operation-type "MLIL_SET_VAR_SSA";$ptr isa MLIL_CONST_PTR;($set, $ptr);$reg isa variable-ssa, has var $index;($set, $reg); get $index;')
    return result

#Finds potential loops
def query2():
    result = graph.execute('match $block isa basic-block;($block, $inst);$inst isa instruction;$reg isa variable-ssa, has var $index, has edge-label "dest";($inst, $reg);get $index, $block;')
    return result

#Checks query2 for if statements
def query3(item):
    result = graph.execute('match $block isa basic-block, id "' + item + '";($block, $inst);$inst isa instruction, has operation-type "MLIL_IF";offset 0; get $inst;')
    return result

#Finds and returns various information about the loops, including the counting variable
def query4(entry):
    result = graph.execute('match $block isa basic-block, id "' + entry + '";($block, contains-instruction:$inst);$inst isa instruction, has operation-type "MLIL_SET_VAR_SSA";($inst, to-node:$add);$add isa MLIL_ADD;$var isa MLIL_VAR_SSA;($add, $var);$const isa MLIL_CONST;($add, $const);$one isa constant has constant-value 1;($const, $one);$reg isa variable-ssa, has var $index, has version $version, has edge-label "dest";($inst, $reg);get $index, $reg, $version;')
    return result

#Checks if the bounds on the counting variable (array index) are ever checked
def query5():
    result = graph.execute('match $block isa basic-block;$inst isa instruction, has operation-type "MLIL_IF";($block, $inst);{$comp isa MLIL_CMP_SGE;} or {$comp isa MLIL_CMP_SLE;} or {$comp isa MLIL_CMP_SLT;} or {$comp isa MLIL_CMP_SGT;} or {$comp isa MLIL_CMP_UGE;} or {$comp isa MLIL_CMP_ULE;} or {$comp isa MLIL_CMP_ULT;} or {$comp isa MLIL_CMP_UGT;};($inst, $comp);$reg isa MLIL_VAR_SSA;($comp, $reg);$index isa variable-ssa, has var $var, has version $version;($reg, $index);get $var, $version;')
    return result

#Returns asm-address of vulnerability
def query6(reg_type, reg):
    result = graph.execute('match $inst isa instruction, has asm-address $adr;$var isa '+ reg_type + ', id "' + reg + '";($inst, $var);get $adr;')
    return result

def main(keyspace):
    """Report loop counters that are never used in a comparison (CWE-788).

    keyspace -- name of the Grakn keyspace to query.

    Pipeline: candidate arrays (query1) -> basic blocks writing those
    variables (query2) -> blocks containing an MLIL_IF (query3) -> variables
    incremented by the constant 1 in those blocks (query4) -> those whose name
    does not show up in any MLIL_IF comparison (query5) are printed with the
    address from query6. The script exits through fail() when query1 or
    query2 return nothing.
    """
    global graph
    graph = grakn.Client(uri='http://localhost:4567', keyspace=keyspace)

    # Find possible arrays
    q1 = query1()
    if not q1:
        fail()
    array_ids = set(item['index']['id'] for item in q1)

    # Find loops involving the array
    q2 = query2()
    if not q2:
        fail()
    blocks = [item['block']['id'] for item in q2 if item['index']['id'] in array_ids]

    # Do the 'loop' blocks contain if statements?
    blocks = [block_id for block_id in blocks if query3(block_id)]

    # Find the loop counters
    counters = []
    for block_id in blocks:
        for item in query4(block_id) or []:
            # Diagnostic output kept from the original script.
            print(item['reg']['type']['label'])
            counters.append({
                'reg': item['reg']['id'],
                'reg_type': item['reg']['type']['label'],
                'var': item['index']['value'],
                'version': item['version']['value'],
                'var_id': item['index']['id'],
            })

    # Find if the bounds of the loop counter are checked
    checked_vars = set(entry['var']['value'] for entry in query5() or [])

    # Any counter whose variable is never compared is a potential vulnerability
    for counter in counters:
        if counter['var'] in checked_vars:
            continue
        q6 = query6(counter['reg_type'], counter['reg'])
        # An empty lookup used to raise IndexError; still report the finding.
        address = q6[0]['adr']['value'] if q6 else 'unknown'
        print('CWE-788: Array index missing bounds check at ' + address + ' associated with '+ counter['var'] + '#' + str(counter['version']) + ' id = ' + counter['var_id'] + ' sub of ' + counter['reg_type'] + ' id = ' + counter['reg'])

if __name__ == "__main__":
    if len(sys.argv) > 1:
        keyspace = sys.argv[1]
    else:
        keyspace = "grakn"
    main(keyspace)
from-node + plays to-node + plays in-operation + plays contains-operation + has name + has parent-hash + has edge-label; + +constant sub entity + plays from-node + plays to-node + has name + has parent-hash + has constant-value + has edge-label; + +variable sub entity + plays from-node + plays to-node + has name + has parent-hash + has var + has edge-label + has var-type + has var-size + has var-func; + +variable-ssa sub entity + plays from-node + plays to-node + plays trace + has name + has parent-hash + has var + has version + has edge-label + has var-type + has var-size + has var-func; + +list sub entity + plays from-node + plays to-node + has name + has parent-hash + has list-size + has edge-label; + + +## SUB ENTITIES ################################# +#### OPERATIONS ################################# +MLIL_NOP sub operation; +MLIL_SET_VAR sub operation; +MLIL_SET_VAR_FIELD sub operation; +MLIL_SET_VAR_SPLIT sub operation; +MLIL_LOAD sub operation; +MLIL_STORE sub operation; +MLIL_VAR sub operation; +MLIL_VAR_FIELD sub operation; +MLIL_ADDRESS_OF sub operation; +MLIL_ADDRESS_OF_FIELD sub operation; +MLIL_CONST sub operation; +MLIL_CONST_PTR sub operation; +MLIL_ADD sub operation; +MLIL_ADC sub operation; +MLIL_SUB sub operation; +MLIL_SBB sub operation; +MLIL_AND sub operation; +MLIL_OR sub operation; +MLIL_XOR sub operation; +MLIL_LSL sub operation; +MLIL_LSR sub operation; +MLIL_ASR sub operation; +MLIL_ROL sub operation; +MLIL_RLC sub operation; +MLIL_ROR sub operation; +MLIL_RRC sub operation; +MLIL_MUL sub operation; +MLIL_MULU_DP sub operation; +MLIL_MULS_DP sub operation; +MLIL_DIVU sub operation; +MLIL_DIVU_DP sub operation; +MLIL_DIVS sub operation; +MLIL_DIVS_DP sub operation; +MLIL_MODU sub operation; +MLIL_MODU_DP sub operation; +MLIL_MODS sub operation; +MLIL_MODS_DP sub operation; +MLIL_NEG sub operation; +MLIL_NOT sub operation; +MLIL_SX sub operation; +MLIL_ZX sub operation; +MLIL_LOW_PART sub operation; +MLIL_JUMP sub operation; +MLIL_JUMP_TO 
sub operation; +MLIL_CALL sub operation; +MLIL_CALL_UNTYPED sub operation; +MLIL_CALL_OUTPUT sub operation; +MLIL_CALL_PARAM sub operation; +MLIL_RET sub operation; +MLIL_NORET sub operation; +MLIL_IF sub operation; +MLIL_GOTO sub operation; +MLIL_CMP_E sub operation; +MLIL_CMP_NE sub operation; +MLIL_CMP_SLT sub operation; +MLIL_CMP_ULT sub operation; +MLIL_CMP_SLE sub operation; +MLIL_CMP_ULE sub operation; +MLIL_CMP_SGE sub operation; +MLIL_CMP_UGE sub operation; +MLIL_CMP_SGT sub operation; +MLIL_CMP_UGT sub operation; +MLIL_TEST_BIT sub operation; +MLIL_BOOL_TO_INT sub operation; +MLIL_ADD_OVERFLOW sub operation; +MLIL_SYSCALL sub operation; +MLIL_SYSCALL_UNTYPED sub operation; +MLIL_BP sub operation; +MLIL_TRAP sub operation; +MLIL_UNDEF sub operation; +MLIL_UNIMPL sub operation; +MLIL_UNIMPL_MEM sub operation; +MLIL_IMPORT sub operation; +MLIL_SET_VAR_SSA sub operation; +MLIL_SET_VAR_SSA_FIELD sub operation; +MLIL_SET_VAR_SPLIT_SSA sub operation; +MLIL_SET_VAR_ALIASED sub operation; +MLIL_SET_VAR_ALIASED_FIELD sub operation; +MLIL_VAR_SSA sub operation; +MLIL_VAR_SSA_FIELD sub operation; +MLIL_VAR_ALIASED sub operation; +MLIL_VAR_ALIASED_FIELD sub operation; +MLIL_CALL_SSA sub operation; +MLIL_CALL_UNTYPED_SSA sub operation; +MLIL_SYSCALL_SSA sub operation; +MLIL_SYSCALL_UNTYPED_SSA sub operation; +MLIL_CALL_OUTPUT_SSA sub operation; +MLIL_CALL_PARAM_SSA sub operation; +MLIL_LOAD_SSA sub operation; +MLIL_STORE_SSA sub operation; +MLIL_VAR_PHI sub operation; +MLIL_MEM_PHI sub operation; + + +## Attribute (has) ############################## +stack sub attribute datatype string; +operation-type sub attribute datatype string; +ins-text sub attribute datatype string; +func-name sub attribute datatype string; +bb-name sub attribute datatype string; +name sub attribute datatype string; +in-bb sub attribute datatype string; +asm-address sub attribute datatype string; +edge-label sub attribute datatype string; +constant-value sub attribute datatype string; 
+parent-hash sub attribute datatype string; +var sub attribute datatype string; +var-type sub attribute datatype string; +var-func sub attribute datatype string; +var-size sub attribute datatype long; +bb-start sub attribute datatype long; +bb-end sub attribute datatype long; +il-index sub attribute datatype long; +list-size sub attribute datatype long; +int sub attribute datatype long; +version sub attribute datatype long; +size sub attribute datatype long; +if-true sub attribute datatype long; +if-false sub attribute datatype long; + + +## ROLES (plays) ################################ +in-function sub role; +from-basic-block sub role; +to-basic-block sub role; +in-basic-block sub role; +contains-basic-block sub role; +in-instruction sub role; +contains-instruction sub role; +in-operation sub role; +contains-operation sub role; +from-node sub role; +to-node sub role; +trace sub role; + +## RELATIONSHIP #################################### +has-basic-block sub relationship + relates in-function + relates contains-basic-block; + +basic-block-edge sub relationship + relates from-basic-block + relates to-basic-block; + +has-instruction sub relationship + relates contains-instruction + relates in-basic-block; + +instruction-has-operation sub relationship + relates contains-operation + relates in-instruction; + +operation-has-operation sub relationship + relates contains-operation + relates in-operation; + +node-link sub relationship + relates from-node + relates to-node; + +trace-link sub relationship + relates trace; + +trace-instruction sub relationship + relates trace; + +trace-full sub relationship + relates trace; + +## INFERRENCE RULES ############################# +share-var sub rule +when { + $v1 isa variable-ssa, has var $var; + $v2 isa variable-ssa, has var $var; + $v1 != $v2; +}, +then { + (trace:$v1, trace:$v2) isa trace-link; +}; + +share-instruction sub rule +when { + $v1 isa variable-ssa; + $v2 isa MLIL_VAR_SSA; + (to-node:$v1, from-node:$v2); + $v4 isa 
variable-ssa; + $v3 isa MLIL_VAR_SSA; + (to-node:$v4, from-node:$v3); + (from-node:$inst, to-node:$v2); + (from-node:$inst, to-node:$v3); + $inst isa instruction; + $v1 != $v2;$v2 != $v3;$v3 != $v4;$v1 != $v4;$v2 != $v4;$v1 != $v3; +}, +then { + (trace:$v1, trace:$v4) isa trace-instruction; +}; + +trace-goal sub rule +when { + (trace:$v1,trace:$v2) isa trace-link; + (trace:$v2,trace:$v3) isa trace-instruction; + $v1 != $v2;$v2 != $v3;$v1 != $v3; +}, +then { + (trace:$v1, trace:$v3) isa trace-full; +}; + diff --git a/templates/binja_mlil_ssa_1.tpl b/templates/binja_mlil_ssa_1.tpl new file mode 100755 index 0000000..ee83e0e --- /dev/null +++ b/templates/binja_mlil_ssa_1.tpl @@ -0,0 +1,9 @@ +## Grakn JSON migration template for binja_mlil_ssa.gql : inserts functions + +## Loop over all functions in the binary +for() do { + insert + $f isa function + has func-name + has asm-address ; +} diff --git a/templates/binja_mlil_ssa_2.tpl b/templates/binja_mlil_ssa_2.tpl new file mode 100755 index 0000000..ece9502 --- /dev/null +++ b/templates/binja_mlil_ssa_2.tpl @@ -0,0 +1,19 @@ +## Grakn JSON migration template for binja_mlil_ssa.gql : inserts basic-blocks + +## Loop over all functions in the binary +for() do { + match + $f isa function + has func-name + has asm-address ; + + ## Loop over all basic-blocks in this function and link basic-blocks to the function they are in + insert + for() do { + $ isa basic-block + has bb-name + has bb-start + has bb-end ; + (contains-basic-block: $, in-function: $f) isa has-basic-block; + } +} diff --git a/templates/binja_mlil_ssa_3.tpl b/templates/binja_mlil_ssa_3.tpl new file mode 100755 index 0000000..55851e1 --- /dev/null +++ b/templates/binja_mlil_ssa_3.tpl @@ -0,0 +1,17 @@ +## Grakn JSON migration template for binja_mlil_ssa.gql : links basic-blocks + +## Loop over all functions in the binary +for() do { + + ## Now loop over bb-edges and link the source and target basic-blocks in this function + for() do { + match + $ isa basic-block + 
has bb-name ; + $ isa basic-block + has bb-name ; + + insert + (from-basic-block: $, to-basic-block: $) isa basic-block-edge; + } +} diff --git a/templates/binja_mlil_ssa_4.tpl b/templates/binja_mlil_ssa_4.tpl new file mode 100755 index 0000000..3c02f07 --- /dev/null +++ b/templates/binja_mlil_ssa_4.tpl @@ -0,0 +1,19 @@ +## Grakn JSON migration template for binja_mlil_ssa.gql : inserts instructions + +## Loop over all functions in the binary +for() do { + + ## Loop over all basic-blocks in this function and link basic-blocks to the function they are in + for() do { + + ## Loop over all instructions in this basic-block, add them, and link them to their basic-block + for() do { + insert + $ins isa instruction + has name + has il-index + has asm-address + has operation-type ; + } + } +} diff --git a/templates/binja_mlil_ssa_5.tpl b/templates/binja_mlil_ssa_5.tpl new file mode 100755 index 0000000..0752168 --- /dev/null +++ b/templates/binja_mlil_ssa_5.tpl @@ -0,0 +1,24 @@ +## Grakn JSON migration template for binja_mlil_ssa.gql : link instructions to their basic-blocks + +## Loop over all functions in the binary +for() do { + + ## Loop over all basic-blocks in this function and link basic-blocks to the function they are in + for() do { + + ## Loop over all instructions in this basic-block, add them, and link them to their basic-block + ## in_bb is a resource of 'instruction' that helps locate a basic-block by it's hash name + for() do { + match + + $bb isa basic-block + has bb-name ; + + $ins isa instruction + has name ; + + insert + (contains-instruction: $ins, in-basic-block: $bb) isa has-instruction; + } + } +} diff --git a/templates/binja_mlil_ssa_6.tpl b/templates/binja_mlil_ssa_6.tpl new file mode 100755 index 0000000..f076319 --- /dev/null +++ b/templates/binja_mlil_ssa_6.tpl @@ -0,0 +1,68 @@ +## Grakn JSON migration template for binja_mlil_ssa.gql : inserts instruction nodes (AST nodes) + +## Loop over all functions in the binary +for() do { + + ## Loop over 
all basic-blocks in this function and link basic-blocks to the function they are in + for() do { + + ## Loop over all instructions in this basic-block, add them, and link them to their basic-block + for() do { + + ## Loop over all nodes in this instruction and add them + for() do { + insert + ## list nodes + if (@equals(, "list")) do { + $ isa + has name + has parent-hash + has edge-label + has list-size ; + } + + ## constant nodes + elseif (@equals(, "constant")) do { + $ isa + has name + has parent-hash + has edge-label + has constant-value ; + } + + ## variable-ssa nodes + elseif (@equals(, "variable-ssa")) do { + $ isa + has name + has parent-hash + has edge-label + has var + has version + has var-type + has var-size + has var-func ; + } + + ## variable nodes + elseif (@equals(, "variable")) do { + $ isa + has name + has parent-hash + has edge-label + has var + has var-type + has var-size + has var-func ; + } + + ## all other nodes (operations) + else { + $ isa + has name + has parent-hash + has edge-label ; + } + } + } + } +} diff --git a/templates/binja_mlil_ssa_7.tpl b/templates/binja_mlil_ssa_7.tpl new file mode 100755 index 0000000..0e5b4a3 --- /dev/null +++ b/templates/binja_mlil_ssa_7.tpl @@ -0,0 +1,25 @@ +## Grakn JSON migration template for binja_mlil_ssa.gql : links instruction nodes (AST nodes) + +## Loop over all functions in the binary +for() do { + + ## Loop over all basic-blocks in this function and link basic-blocks to the function they are in + for() do { + + ## Loop over all instructions in this basic-block, add them, and link them to their basic-block + for() do { + + ## Loop over all nodes in this instruction and add them + for() do { + match + $ isa entity + has name ; + $ isa entity + has name ; + + insert + (from-node: $, to-node: $) isa node-link; + } + } + } +}