diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e0cfbc931..8b77d3524 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -89,7 +89,7 @@ repos: - "--config" - ".github/flake8.ini" - "--extend-exclude" - - "capa/render/proto/capa_pb2.py" + - "capa/render/proto/capa_pb2.py,capa/features/extractors/binexport2/binexport2_pb2.py" - "capa/" - "scripts/" - "tests/" diff --git a/CHANGELOG.md b/CHANGELOG.md index d7c7c0a2f..8e5ec4041 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,9 @@ Unlock powerful malware analysis with capa's new [VMRay sandbox](https://www.vmr - dynamic: add support for VMRay dynamic sandbox traces #2208 @mike-hunhoff @r-sm2024 @mr-tz - cli: use modern terminal features to hyperlink to the rules website #2337 @williballenthin - update IDAPython to IDA Pro 9.0 @mr-tz +- support analyzing BinExport2 files generated by Ghidra #1950 @williballenthin @mehunhoff @mr-tz +- add support for Android OS #1950 @williballenthin @mehunhoff @mr-tz +- add support for aarch64 architecture via BinExport2 backend #1950 @williballenthin @mehunhoff @mr-tz ### Breaking Changes diff --git a/capa/features/common.py b/capa/features/common.py index 18c5b9e58..e3401f7c8 100644 --- a/capa/features/common.py +++ b/capa/features/common.py @@ -424,10 +424,11 @@ def __init__(self, value: str, description=None): OS_WINDOWS = "windows" OS_LINUX = "linux" OS_MACOS = "macos" +OS_ANDROID = "android" # dotnet OS_ANY = "any" VALID_OS = {os.value for os in capa.features.extractors.elf.OS} -VALID_OS.update({OS_WINDOWS, OS_LINUX, OS_MACOS, OS_ANY}) +VALID_OS.update({OS_WINDOWS, OS_LINUX, OS_MACOS, OS_ANY, OS_ANDROID}) # internal only, not to be used in rules OS_AUTO = "auto" @@ -463,6 +464,7 @@ def evaluate(self, features: "capa.engine.FeatureSet", short_circuit=True): FORMAT_CAPE = "cape" FORMAT_DRAKVUF = "drakvuf" FORMAT_VMRAY = "vmray" +FORMAT_BINEXPORT2 = "binexport2" FORMAT_FREEZE = "freeze" FORMAT_RESULT = "result" STATIC_FORMATS = { @@ 
-473,6 +475,7 @@ def evaluate(self, features: "capa.engine.FeatureSet", short_circuit=True): FORMAT_DOTNET, FORMAT_FREEZE, FORMAT_RESULT, + FORMAT_BINEXPORT2, } DYNAMIC_FORMATS = { FORMAT_CAPE, diff --git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py new file mode 100644 index 000000000..d3ce77d22 --- /dev/null +++ b/capa/features/extractors/binexport2/__init__.py @@ -0,0 +1,416 @@ +# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. +""" +Proto files generated via protobuf v24.4: + + protoc --python_out=. --mypy_out=. 
binexport2.proto + +from BinExport2 at 6916731d5f6693c4a4f0a052501fd3bd92cfd08b +https://github.com/google/binexport/blob/6916731/binexport2.proto +""" +import io +import hashlib +import logging +import contextlib +from typing import Set, Dict, List, Tuple, Iterator +from pathlib import Path +from collections import defaultdict +from dataclasses import dataclass + +from pefile import PE +from elftools.elf.elffile import ELFFile + +import capa.features.common +import capa.features.extractors.common +import capa.features.extractors.binexport2.helpers +from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 + +logger = logging.getLogger(__name__) + + +def get_binexport2(sample: Path) -> BinExport2: + be2: BinExport2 = BinExport2() + be2.ParseFromString(sample.read_bytes()) + return be2 + + +def compute_common_prefix_length(m: str, n: str) -> int: + # ensure #m < #n + if len(n) < len(m): + m, n = n, m + + for i, c in enumerate(m): + if n[i] != c: + return i + + return len(m) + + +def get_sample_from_binexport2(input_file: Path, be2: BinExport2, search_paths: List[Path]) -> Path: + """attempt to find the sample file, given a BinExport2 file. + + searches in the same directory as the BinExport2 file, and then in search_paths. + """ + + def filename_similarity_key(p: Path) -> Tuple[int, str]: + # note closure over input_file. + # sort first by length of common prefix, then by name (for stability) + return (compute_common_prefix_length(p.name, input_file.name), p.name) + + wanted_sha256: str = be2.meta_information.executable_id.lower() + + input_directory: Path = input_file.parent + siblings: List[Path] = [p for p in input_directory.iterdir() if p.is_file()] + siblings.sort(key=filename_similarity_key, reverse=True) + for sibling in siblings: + # e.g. 
with open IDA files in the same directory on Windows + with contextlib.suppress(PermissionError): + if hashlib.sha256(sibling.read_bytes()).hexdigest().lower() == wanted_sha256: + return sibling + + for search_path in search_paths: + candidates: List[Path] = [p for p in search_path.iterdir() if p.is_file()] + candidates.sort(key=filename_similarity_key, reverse=True) + for candidate in candidates: + with contextlib.suppress(PermissionError): + if hashlib.sha256(candidate.read_bytes()).hexdigest().lower() == wanted_sha256: + return candidate + + raise ValueError("cannot find sample, you may specify the path using the CAPA_SAMPLES_DIR environment variable") + + +class BinExport2Index: + def __init__(self, be2: BinExport2): + self.be2: BinExport2 = be2 + + self.callers_by_vertex_index: Dict[int, List[int]] = defaultdict(list) + self.callees_by_vertex_index: Dict[int, List[int]] = defaultdict(list) + + # note: flow graph != call graph (vertex) + self.flow_graph_index_by_address: Dict[int, int] = {} + self.flow_graph_address_by_index: Dict[int, int] = {} + + # edges that come from the given basic block + self.source_edges_by_basic_block_index: Dict[int, List[BinExport2.FlowGraph.Edge]] = defaultdict(list) + # edges that end up at the given basic block + self.target_edges_by_basic_block_index: Dict[int, List[BinExport2.FlowGraph.Edge]] = defaultdict(list) + + self.vertex_index_by_address: Dict[int, int] = {} + + self.data_reference_index_by_source_instruction_index: Dict[int, List[int]] = defaultdict(list) + self.data_reference_index_by_target_address: Dict[int, List[int]] = defaultdict(list) + self.string_reference_index_by_source_instruction_index: Dict[int, List[int]] = defaultdict(list) + + self.insn_address_by_index: Dict[int, int] = {} + self.insn_index_by_address: Dict[int, int] = {} + self.insn_by_address: Dict[int, BinExport2.Instruction] = {} + + # must index instructions first + self._index_insn_addresses() + self._index_vertex_edges() + 
self._index_flow_graph_nodes() + self._index_flow_graph_edges() + self._index_call_graph_vertices() + self._index_data_references() + self._index_string_references() + + def get_insn_address(self, insn_index: int) -> int: + assert insn_index in self.insn_address_by_index, f"insn must be indexed, missing {insn_index}" + return self.insn_address_by_index[insn_index] + + def get_basic_block_address(self, basic_block_index: int) -> int: + basic_block: BinExport2.BasicBlock = self.be2.basic_block[basic_block_index] + first_instruction_index: int = next(self.instruction_indices(basic_block)) + return self.get_insn_address(first_instruction_index) + + def _index_vertex_edges(self): + for edge in self.be2.call_graph.edge: + if not edge.source_vertex_index: + continue + if not edge.target_vertex_index: + continue + + self.callers_by_vertex_index[edge.target_vertex_index].append(edge.source_vertex_index) + self.callees_by_vertex_index[edge.source_vertex_index].append(edge.target_vertex_index) + + def _index_flow_graph_nodes(self): + for flow_graph_index, flow_graph in enumerate(self.be2.flow_graph): + function_address: int = self.get_basic_block_address(flow_graph.entry_basic_block_index) + self.flow_graph_index_by_address[function_address] = flow_graph_index + self.flow_graph_address_by_index[flow_graph_index] = function_address + + def _index_flow_graph_edges(self): + for flow_graph in self.be2.flow_graph: + for edge in flow_graph.edge: + if not edge.HasField("source_basic_block_index") or not edge.HasField("target_basic_block_index"): + continue + + self.source_edges_by_basic_block_index[edge.source_basic_block_index].append(edge) + self.target_edges_by_basic_block_index[edge.target_basic_block_index].append(edge) + + def _index_call_graph_vertices(self): + for vertex_index, vertex in enumerate(self.be2.call_graph.vertex): + if not vertex.HasField("address"): + continue + + vertex_address: int = vertex.address + self.vertex_index_by_address[vertex_address] = vertex_index 
+ + def _index_data_references(self): + for data_reference_index, data_reference in enumerate(self.be2.data_reference): + self.data_reference_index_by_source_instruction_index[data_reference.instruction_index].append( + data_reference_index + ) + self.data_reference_index_by_target_address[data_reference.address].append(data_reference_index) + + def _index_string_references(self): + for string_reference_index, string_reference in enumerate(self.be2.string_reference): + self.string_reference_index_by_source_instruction_index[string_reference.instruction_index].append( + string_reference_index + ) + + def _index_insn_addresses(self): + # see https://github.com/google/binexport/blob/39f6445c232bb5caf5c4a2a996de91dfa20c48e8/binexport.cc#L45 + if len(self.be2.instruction) == 0: + return + + assert self.be2.instruction[0].HasField("address"), "first insn must have explicit address" + + addr: int = 0 + next_addr: int = 0 + for idx, insn in enumerate(self.be2.instruction): + if insn.HasField("address"): + addr = insn.address + next_addr = addr + len(insn.raw_bytes) + else: + addr = next_addr + next_addr += len(insn.raw_bytes) + self.insn_address_by_index[idx] = addr + self.insn_index_by_address[addr] = idx + self.insn_by_address[addr] = insn + + @staticmethod + def instruction_indices(basic_block: BinExport2.BasicBlock) -> Iterator[int]: + """ + For a given basic block, enumerate the instruction indices. + """ + for index_range in basic_block.instruction_index: + if not index_range.HasField("end_index"): + yield index_range.begin_index + continue + else: + yield from range(index_range.begin_index, index_range.end_index) + + def basic_block_instructions( + self, basic_block: BinExport2.BasicBlock + ) -> Iterator[Tuple[int, BinExport2.Instruction, int]]: + """ + For a given basic block, enumerate the instruction indices, + the instruction instances, and their addresses. 
+ """ + for instruction_index in self.instruction_indices(basic_block): + instruction: BinExport2.Instruction = self.be2.instruction[instruction_index] + instruction_address: int = self.get_insn_address(instruction_index) + + yield instruction_index, instruction, instruction_address + + def get_function_name_by_vertex(self, vertex_index: int) -> str: + vertex: BinExport2.CallGraph.Vertex = self.be2.call_graph.vertex[vertex_index] + name: str = f"sub_{vertex.address:x}" + if vertex.HasField("mangled_name"): + name = vertex.mangled_name + + if vertex.HasField("demangled_name"): + name = vertex.demangled_name + + if vertex.HasField("library_index"): + library: BinExport2.Library = self.be2.library[vertex.library_index] + if library.HasField("name"): + name = f"{library.name}!{name}" + + return name + + def get_function_name_by_address(self, address: int) -> str: + if address not in self.vertex_index_by_address: + return "" + + vertex_index: int = self.vertex_index_by_address[address] + return self.get_function_name_by_vertex(vertex_index) + + def get_instruction_by_address(self, address: int) -> BinExport2.Instruction: + assert address in self.insn_by_address, f"address must be indexed, missing {address:x}" + return self.insn_by_address[address] + + +class BinExport2Analysis: + def __init__(self, be2: BinExport2, idx: BinExport2Index, buf: bytes): + self.be2: BinExport2 = be2 + self.idx: BinExport2Index = idx + self.buf: bytes = buf + self.base_address: int = 0 + self.thunks: Dict[int, int] = {} + + self._find_base_address() + self._compute_thunks() + + def _find_base_address(self): + sections_with_perms: Iterator[BinExport2.Section] = filter( + lambda s: s.flag_r or s.flag_w or s.flag_x, self.be2.section + ) + # assume the lowest address is the base address. + # this works as long as BinExport doesn't record other + # libraries mapped into memory. 
+ self.base_address = min(s.address for s in sections_with_perms) + + logger.debug("found base address: %x", self.base_address) + + def _compute_thunks(self): + for addr, idx in self.idx.vertex_index_by_address.items(): + vertex: BinExport2.CallGraph.Vertex = self.be2.call_graph.vertex[idx] + if not capa.features.extractors.binexport2.helpers.is_vertex_type( + vertex, BinExport2.CallGraph.Vertex.Type.THUNK + ): + continue + + curr_idx: int = idx + for _ in range(capa.features.common.THUNK_CHAIN_DEPTH_DELTA): + thunk_callees: List[int] = self.idx.callees_by_vertex_index[curr_idx] + # if this doesn't hold, then it doesn't seem like this is a thunk, + # because either, len is: + # 0 and the thunk doesn't point to anything, or + # >1 and the thunk may end up at many functions. + assert len(thunk_callees) == 1, f"thunk @ {hex(addr)} failed" + + thunked_idx: int = thunk_callees[0] + thunked_vertex: BinExport2.CallGraph.Vertex = self.be2.call_graph.vertex[thunked_idx] + + if not capa.features.extractors.binexport2.helpers.is_vertex_type( + thunked_vertex, BinExport2.CallGraph.Vertex.Type.THUNK + ): + assert thunked_vertex.HasField("address") + + self.thunks[addr] = thunked_vertex.address + break + + curr_idx = thunked_idx + + +@dataclass +class MemoryRegion: + # location of the bytes, potentially relative to a base address + address: int + buf: bytes + + @property + def end(self) -> int: + return self.address + len(self.buf) + + def contains(self, address: int) -> bool: + # note: address must be relative to any base address + return self.address <= address < self.end + + +class ReadMemoryError(ValueError): ... + + +class AddressNotMappedError(ReadMemoryError): ... + + +@dataclass +class AddressSpace: + base_address: int + memory_regions: Tuple[MemoryRegion, ...] 
+ + def read_memory(self, address: int, length: int) -> bytes: + rva: int = address - self.base_address + for region in self.memory_regions: + if region.contains(rva): + offset: int = rva - region.address + return region.buf[offset : offset + length] + + raise AddressNotMappedError(address) + + @classmethod + def from_pe(cls, pe: PE, base_address: int): + regions: List[MemoryRegion] = [] + for section in pe.sections: + address: int = section.VirtualAddress + size: int = section.Misc_VirtualSize + buf: bytes = section.get_data() + + if len(buf) != size: + # pad the section with NULLs + # assume page alignment is already handled. + # might need more hardening here. + buf += b"\x00" * (size - len(buf)) + + regions.append(MemoryRegion(address, buf)) + + return cls(base_address, tuple(regions)) + + @classmethod + def from_elf(cls, elf: ELFFile, base_address: int): + regions: List[MemoryRegion] = [] + + # ELF segments are for runtime data, + # ELF sections are for link-time data. + for segment in elf.iter_segments(): + # assume p_align is consistent with addresses here. + # otherwise, should harden this loader. + segment_rva: int = segment.header.p_vaddr + segment_size: int = segment.header.p_memsz + segment_data: bytes = segment.data() + + if len(segment_data) < segment_size: + # pad the section with NULLs + # assume page alignment is already handled. + # might need more hardening here. 
+ segment_data += b"\x00" * (segment_size - len(segment_data)) + + regions.append(MemoryRegion(segment_rva, segment_data)) + + return cls(base_address, tuple(regions)) + + @classmethod + def from_buf(cls, buf: bytes, base_address: int): + if buf.startswith(capa.features.extractors.common.MATCH_PE): + pe: PE = PE(data=buf) + return cls.from_pe(pe, base_address) + elif buf.startswith(capa.features.extractors.common.MATCH_ELF): + elf: ELFFile = ELFFile(io.BytesIO(buf)) + return cls.from_elf(elf, base_address) + else: + raise NotImplementedError("file format address space") + + +@dataclass +class AnalysisContext: + sample_bytes: bytes + be2: BinExport2 + idx: BinExport2Index + analysis: BinExport2Analysis + address_space: AddressSpace + + +@dataclass +class FunctionContext: + ctx: AnalysisContext + flow_graph_index: int + format: Set[str] + os: Set[str] + arch: Set[str] + + +@dataclass +class BasicBlockContext: + basic_block_index: int + + +@dataclass +class InstructionContext: + instruction_index: int diff --git a/capa/features/extractors/binexport2/arch/__init__.py b/capa/features/extractors/binexport2/arch/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/capa/features/extractors/binexport2/arch/arm/__init__.py b/capa/features/extractors/binexport2/arch/arm/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/capa/features/extractors/binexport2/arch/arm/helpers.py b/capa/features/extractors/binexport2/arch/arm/helpers.py new file mode 100644 index 000000000..13e1f8b64 --- /dev/null +++ b/capa/features/extractors/binexport2/arch/arm/helpers.py @@ -0,0 +1,15 @@ +# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. + +from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 + + +def is_stack_register_expression(be2: BinExport2, expression: BinExport2.Expression) -> bool: + return bool( + expression and expression.type == BinExport2.Expression.REGISTER and expression.symbol.lower().endswith("sp") + ) diff --git a/capa/features/extractors/binexport2/arch/arm/insn.py b/capa/features/extractors/binexport2/arch/arm/insn.py new file mode 100644 index 000000000..7af93aaff --- /dev/null +++ b/capa/features/extractors/binexport2/arch/arm/insn.py @@ -0,0 +1,155 @@ +# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+import logging +from typing import List, Tuple, Iterator, Optional + +import capa.features.extractors.binexport2.helpers +from capa.features.insn import MAX_STRUCTURE_SIZE, Number, Offset, OperandNumber, OperandOffset +from capa.features.common import Feature, Characteristic +from capa.features.address import Address +from capa.features.extractors.binexport2 import FunctionContext, InstructionContext +from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle +from capa.features.extractors.binexport2.helpers import ( + BinExport2InstructionPatternMatcher, + mask_immediate, + is_address_mapped, + get_instruction_mnemonic, + get_operand_register_expression, + get_operand_immediate_expression, +) +from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 +from capa.features.extractors.binexport2.arch.arm.helpers import is_stack_register_expression + +logger = logging.getLogger(__name__) + + +def extract_insn_number_features( + fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle +) -> Iterator[Tuple[Feature, Address]]: + fhi: FunctionContext = fh.inner + ii: InstructionContext = ih.inner + + be2: BinExport2 = fhi.ctx.be2 + + instruction_index: int = ii.instruction_index + instruction: BinExport2.Instruction = be2.instruction[instruction_index] + + if len(instruction.operand_index) == 0: + # skip things like: + # .text:0040116e leave + return + + mnemonic: str = get_instruction_mnemonic(be2, instruction) + + if mnemonic in ("add", "sub"): + assert len(instruction.operand_index) == 3 + + operand1_expression: Optional[BinExport2.Expression] = get_operand_register_expression( + be2, be2.operand[instruction.operand_index[1]] + ) + if operand1_expression and is_stack_register_expression(be2, operand1_expression): + # skip things like: + # add x0,sp,#0x8 + return + + for i, operand_index in enumerate(instruction.operand_index): + operand: BinExport2.Operand = be2.operand[operand_index] + + immediate_expression: 
Optional[BinExport2.Expression] = get_operand_immediate_expression(be2, operand) + if not immediate_expression: + continue + + value: int = mask_immediate(fhi.arch, immediate_expression.immediate) + if is_address_mapped(be2, value): + continue + + yield Number(value), ih.address + yield OperandNumber(i, value), ih.address + + if mnemonic == "add" and i == 2: + if 0 < value < MAX_STRUCTURE_SIZE: + yield Offset(value), ih.address + yield OperandOffset(i, value), ih.address + + +OFFSET_PATTERNS = BinExport2InstructionPatternMatcher.from_str( + """ + ldr|ldrb|ldrh|ldrsb|ldrsh|ldrex|ldrd|str|strb|strh|strex|strd reg, [reg(not-stack), #int] ; capture #int + ldr|ldrb|ldrh|ldrsb|ldrsh|ldrex|ldrd|str|strb|strh|strex|strd reg, [reg(not-stack), #int]! ; capture #int + ldr|ldrb|ldrh|ldrsb|ldrsh|ldrex|ldrd|str|strb|strh|strex|strd reg, [reg(not-stack)], #int ; capture #int + ldp|ldpd|stp|stpd reg, reg, [reg(not-stack), #int] ; capture #int + ldp|ldpd|stp|stpd reg, reg, [reg(not-stack), #int]! ; capture #int + ldp|ldpd|stp|stpd reg, reg, [reg(not-stack)], #int ; capture #int + """ +) + + +def extract_insn_offset_features( + fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle +) -> Iterator[Tuple[Feature, Address]]: + fhi: FunctionContext = fh.inner + ii: InstructionContext = ih.inner + + be2: BinExport2 = fhi.ctx.be2 + + match = OFFSET_PATTERNS.match_with_be2(be2, ii.instruction_index) + if not match: + return + + value = match.expression.immediate + + value = mask_immediate(fhi.arch, value) + if not is_address_mapped(be2, value): + value = capa.features.extractors.binexport2.helpers.twos_complement(fhi.arch, value) + yield Offset(value), ih.address + yield OperandOffset(match.operand_index, value), ih.address + + +NZXOR_PATTERNS = BinExport2InstructionPatternMatcher.from_str( + """ + eor reg, reg, reg + eor reg, reg, #int + """ +) + + +def extract_insn_nzxor_characteristic_features( + fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle +) -> Iterator[Tuple[Feature, Address]]: + 
fhi: FunctionContext = fh.inner + ii: InstructionContext = ih.inner + be2: BinExport2 = fhi.ctx.be2 + + if NZXOR_PATTERNS.match_with_be2(be2, ii.instruction_index) is None: + return + + instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index] + # guaranteed to be simple int/reg operands + # so we don't have to realize the tree/list. + operands: List[BinExport2.Operand] = [be2.operand[operand_index] for operand_index in instruction.operand_index] + + if operands[1] != operands[2]: + yield Characteristic("nzxor"), ih.address + + +INDIRECT_CALL_PATTERNS = BinExport2InstructionPatternMatcher.from_str( + """ + blx|bx|blr reg + """ +) + + +def extract_function_indirect_call_characteristic_features( + fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle +) -> Iterator[Tuple[Feature, Address]]: + fhi: FunctionContext = fh.inner + ii: InstructionContext = ih.inner + be2: BinExport2 = fhi.ctx.be2 + + if INDIRECT_CALL_PATTERNS.match_with_be2(be2, ii.instruction_index) is not None: + yield Characteristic("indirect call"), ih.address diff --git a/capa/features/extractors/binexport2/arch/intel/__init__.py b/capa/features/extractors/binexport2/arch/intel/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/capa/features/extractors/binexport2/arch/intel/helpers.py b/capa/features/extractors/binexport2/arch/intel/helpers.py new file mode 100644 index 000000000..3696c0d93 --- /dev/null +++ b/capa/features/extractors/binexport2/arch/intel/helpers.py @@ -0,0 +1,135 @@ +# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and limitations under the License. +from typing import List, Optional +from dataclasses import dataclass + +from capa.features.extractors.binexport2.helpers import get_operand_expressions +from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 + +# security cookie checks may perform non-zeroing XORs, these are expected within a certain +# byte range within the first and returning basic blocks, this helps to reduce FP features +SECURITY_COOKIE_BYTES_DELTA: int = 0x40 + + +@dataclass +class OperandPhraseInfo: + scale: Optional[BinExport2.Expression] = None + index: Optional[BinExport2.Expression] = None + base: Optional[BinExport2.Expression] = None + displacement: Optional[BinExport2.Expression] = None + + +def get_operand_phrase_info(be2: BinExport2, operand: BinExport2.Operand) -> Optional[OperandPhraseInfo]: + # assume the following (see https://blog.yossarian.net/2020/06/13/How-x86_64-addresses-memory): + # + # Scale: A 2-bit constant factor + # Index: Any general purpose register + # Base: Any general purpose register + # Displacement: An integral offset + + expressions: List[BinExport2.Expression] = get_operand_expressions(be2, operand) + + # skip expression up to and including BinExport2.Expression.DEREFERENCE, assume caller + # has checked for BinExport2.Expression.DEREFERENCE + for i, expression in enumerate(expressions): + if expression.type == BinExport2.Expression.DEREFERENCE: + expressions = expressions[i + 1 :] + break + + expression0: BinExport2.Expression + expression1: BinExport2.Expression + expression2: BinExport2.Expression + expression3: BinExport2.Expression + expression4: BinExport2.Expression + + if len(expressions) == 1: + expression0 = expressions[0] + + assert ( + expression0.type == BinExport2.Expression.IMMEDIATE_INT + or expression0.type == BinExport2.Expression.REGISTER + ) + + if expression0.type == BinExport2.Expression.IMMEDIATE_INT: + # 
Displacement + return OperandPhraseInfo(displacement=expression0) + elif expression0.type == BinExport2.Expression.REGISTER: + # Base + return OperandPhraseInfo(base=expression0) + + elif len(expressions) == 3: + expression0 = expressions[0] + expression1 = expressions[1] + expression2 = expressions[2] + + assert expression0.type == BinExport2.Expression.REGISTER + assert expression1.type == BinExport2.Expression.OPERATOR + assert ( + expression2.type == BinExport2.Expression.IMMEDIATE_INT + or expression2.type == BinExport2.Expression.REGISTER + ) + + if expression2.type == BinExport2.Expression.REGISTER: + # Base + Index + return OperandPhraseInfo(base=expression0, index=expression2) + elif expression2.type == BinExport2.Expression.IMMEDIATE_INT: + # Base + Displacement + return OperandPhraseInfo(base=expression0, displacement=expression2) + + elif len(expressions) == 5: + expression0 = expressions[0] + expression1 = expressions[1] + expression2 = expressions[2] + expression3 = expressions[3] + expression4 = expressions[4] + + assert expression0.type == BinExport2.Expression.REGISTER + assert expression1.type == BinExport2.Expression.OPERATOR + assert ( + expression2.type == BinExport2.Expression.REGISTER + or expression2.type == BinExport2.Expression.IMMEDIATE_INT + ) + assert expression3.type == BinExport2.Expression.OPERATOR + assert expression4.type == BinExport2.Expression.IMMEDIATE_INT + + if expression1.symbol == "+" and expression3.symbol == "+": + # Base + Index + Displacement + return OperandPhraseInfo(base=expression0, index=expression2, displacement=expression4) + elif expression1.symbol == "+" and expression3.symbol == "*": + # Base + (Index * Scale) + return OperandPhraseInfo(base=expression0, index=expression2, scale=expression3) + elif expression1.symbol == "*" and expression3.symbol == "+": + # (Index * Scale) + Displacement + return OperandPhraseInfo(index=expression0, scale=expression2, displacement=expression3) + else: + raise 
NotImplementedError(expression1.symbol, expression3.symbol) + + elif len(expressions) == 7: + expression0 = expressions[0] + expression1 = expressions[1] + expression2 = expressions[2] + expression3 = expressions[3] + expression4 = expressions[4] + expression5 = expressions[5] + expression6 = expressions[6] + + assert expression0.type == BinExport2.Expression.REGISTER + assert expression1.type == BinExport2.Expression.OPERATOR + assert expression2.type == BinExport2.Expression.REGISTER + assert expression3.type == BinExport2.Expression.OPERATOR + assert expression4.type == BinExport2.Expression.IMMEDIATE_INT + assert expression5.type == BinExport2.Expression.OPERATOR + assert expression6.type == BinExport2.Expression.IMMEDIATE_INT + + # Base + (Index * Scale) + Displacement + return OperandPhraseInfo(base=expression0, index=expression2, scale=expression4, displacement=expression6) + + else: + raise NotImplementedError(len(expressions)) + + return None diff --git a/capa/features/extractors/binexport2/arch/intel/insn.py b/capa/features/extractors/binexport2/arch/intel/insn.py new file mode 100644 index 000000000..efb4a6fe5 --- /dev/null +++ b/capa/features/extractors/binexport2/arch/intel/insn.py @@ -0,0 +1,248 @@ +# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+import logging +from typing import List, Tuple, Iterator + +import capa.features.extractors.strings +import capa.features.extractors.binexport2.helpers +from capa.features.insn import MAX_STRUCTURE_SIZE, Number, Offset, OperandNumber, OperandOffset +from capa.features.common import Feature, Characteristic +from capa.features.address import Address +from capa.features.extractors.binexport2 import BinExport2Index, FunctionContext, BasicBlockContext, InstructionContext +from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle +from capa.features.extractors.binexport2.helpers import ( + BinExport2InstructionPatternMatcher, + mask_immediate, + is_address_mapped, + get_instruction_mnemonic, +) +from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 +from capa.features.extractors.binexport2.arch.intel.helpers import SECURITY_COOKIE_BYTES_DELTA + +logger = logging.getLogger(__name__) + + +IGNORE_NUMBER_PATTERNS = BinExport2InstructionPatternMatcher.from_str( + """ + ret #int + retn #int + add reg(stack), #int + sub reg(stack), #int + """ +) + +NUMBER_PATTERNS = BinExport2InstructionPatternMatcher.from_str( + """ + push #int0 ; capture #int0 + + # its a little tedious to enumerate all the address forms + # but at least we are explicit + cmp|and|or|test|mov|add|adc|sub|shl|shr|sal|sar reg, #int0 ; capture #int0 + cmp|and|or|test|mov|add|adc|sub|shl|shr|sal|sar [reg], #int0 ; capture #int0 + cmp|and|or|test|mov|add|adc|sub|shl|shr|sal|sar [#int], #int0 ; capture #int0 + cmp|and|or|test|mov|add|adc|sub|shl|shr|sal|sar [reg + #int], #int0 ; capture #int0 + cmp|and|or|test|mov|add|adc|sub|shl|shr|sal|sar [reg + reg + #int], #int0 ; capture #int0 + cmp|and|or|test|mov|add|adc|sub|shl|shr|sal|sar [reg + reg * #int], #int0 ; capture #int0 + cmp|and|or|test|mov|add|adc|sub|shl|shr|sal|sar [reg + reg * #int + #int], #int0 ; capture #int0 + + imul reg, reg, #int ; capture #int + # note that int is first + cmp|test #int0, reg ; 
capture #int0 + + # imagine reg is zero'd out, then this is like `mov reg, #int` + # which is not uncommon. + lea reg, [reg + #int] ; capture #int + """ +) + + +def extract_insn_number_features( + fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle +) -> Iterator[Tuple[Feature, Address]]: + fhi: FunctionContext = fh.inner + ii: InstructionContext = ih.inner + + be2: BinExport2 = fhi.ctx.be2 + + if IGNORE_NUMBER_PATTERNS.match_with_be2(be2, ii.instruction_index): + return + + match = NUMBER_PATTERNS.match_with_be2(be2, ii.instruction_index) + if not match: + return + + value: int = mask_immediate(fhi.arch, match.expression.immediate) + if is_address_mapped(be2, value): + return + + yield Number(value), ih.address + yield OperandNumber(match.operand_index, value), ih.address + + instruction_index: int = ii.instruction_index + instruction: BinExport2.Instruction = be2.instruction[instruction_index] + + mnemonic: str = get_instruction_mnemonic(be2, instruction) + if mnemonic.startswith("add"): + if 0 < value < MAX_STRUCTURE_SIZE: + yield Offset(value), ih.address + yield OperandOffset(match.operand_index, value), ih.address + + +OFFSET_PATTERNS = BinExport2InstructionPatternMatcher.from_str( + """ + mov|movzx|movsb|cmp [reg + reg * #int + #int0], #int ; capture #int0 + mov|movzx|movsb|cmp [reg * #int + #int0], #int ; capture #int0 + mov|movzx|movsb|cmp [reg + reg + #int0], #int ; capture #int0 + mov|movzx|movsb|cmp [reg(not-stack) + #int0], #int ; capture #int0 + mov|movzx|movsb|cmp [reg + reg * #int + #int0], reg ; capture #int0 + mov|movzx|movsb|cmp [reg * #int + #int0], reg ; capture #int0 + mov|movzx|movsb|cmp [reg + reg + #int0], reg ; capture #int0 + mov|movzx|movsb|cmp [reg(not-stack) + #int0], reg ; capture #int0 + mov|movzx|movsb|cmp|lea reg, [reg + reg * #int + #int0] ; capture #int0 + mov|movzx|movsb|cmp|lea reg, [reg * #int + #int0] ; capture #int0 + mov|movzx|movsb|cmp|lea reg, [reg + reg + #int0] ; capture #int0 + mov|movzx|movsb|cmp|lea reg, 
[reg(not-stack) + #int0] ; capture #int0 + """ +) + +# these are patterns that access offset 0 from some pointer +# (pointer is not the stack pointer). +OFFSET_ZERO_PATTERNS = BinExport2InstructionPatternMatcher.from_str( + """ + mov|movzx|movsb [reg(not-stack)], reg + mov|movzx|movsb [reg(not-stack)], #int + lea reg, [reg(not-stack)] + """ +) + + +def extract_insn_offset_features( + fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle +) -> Iterator[Tuple[Feature, Address]]: + fhi: FunctionContext = fh.inner + ii: InstructionContext = ih.inner + + be2: BinExport2 = fhi.ctx.be2 + + match = OFFSET_PATTERNS.match_with_be2(be2, ii.instruction_index) + if not match: + match = OFFSET_ZERO_PATTERNS.match_with_be2(be2, ii.instruction_index) + if not match: + return + + yield Offset(0), ih.address + yield OperandOffset(match.operand_index, 0), ih.address + + value = mask_immediate(fhi.arch, match.expression.immediate) + if is_address_mapped(be2, value): + return + + value = capa.features.extractors.binexport2.helpers.twos_complement(fhi.arch, value, 32) + yield Offset(value), ih.address + yield OperandOffset(match.operand_index, value), ih.address + + +def is_security_cookie( + fhi: FunctionContext, + bbi: BasicBlockContext, + instruction_address: int, + instruction: BinExport2.Instruction, +) -> bool: + """ + check if an instruction is related to security cookie checks. + """ + be2: BinExport2 = fhi.ctx.be2 + idx: BinExport2Index = fhi.ctx.idx + + # security cookie check should use SP or BP + op1: BinExport2.Operand = be2.operand[instruction.operand_index[1]] + op1_exprs: List[BinExport2.Expression] = [be2.expression[expr_i] for expr_i in op1.expression_index] + if all(expr.symbol.lower() not in ("bp", "esp", "ebp", "rbp", "rsp") for expr in op1_exprs): + return False + + # check_nzxor_security_cookie_delta + # if insn falls at the start of first entry block of the parent function. 
+ flow_graph: BinExport2.FlowGraph = be2.flow_graph[fhi.flow_graph_index] + basic_block_index: int = bbi.basic_block_index + bb: BinExport2.BasicBlock = be2.basic_block[basic_block_index] + if flow_graph.entry_basic_block_index == basic_block_index: + first_addr: int = min((idx.insn_address_by_index[ir.begin_index] for ir in bb.instruction_index)) + if instruction_address < first_addr + SECURITY_COOKIE_BYTES_DELTA: + return True + # or insn falls at the end before return in a terminal basic block. + if basic_block_index not in (e.source_basic_block_index for e in flow_graph.edge): + last_addr: int = max((idx.insn_address_by_index[ir.end_index - 1] for ir in bb.instruction_index)) + if instruction_address > last_addr - SECURITY_COOKIE_BYTES_DELTA: + return True + return False + + +NZXOR_PATTERNS = BinExport2InstructionPatternMatcher.from_str( + """ + xor|xorpd|xorps|pxor reg, reg + xor|xorpd|xorps|pxor reg, #int + """ +) + + +def extract_insn_nzxor_characteristic_features( + fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle +) -> Iterator[Tuple[Feature, Address]]: + """ + parse non-zeroing XOR instruction from the given instruction. + ignore expected non-zeroing XORs, e.g. security cookies. + """ + fhi: FunctionContext = fh.inner + ii: InstructionContext = ih.inner + + be2: BinExport2 = fhi.ctx.be2 + idx: BinExport2Index = fhi.ctx.idx + + if NZXOR_PATTERNS.match_with_be2(be2, ii.instruction_index) is None: + return + + instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index] + # guaranteed to be simple int/reg operands + # so we don't have to realize the tree/list. 
+ operands: List[BinExport2.Operand] = [be2.operand[operand_index] for operand_index in instruction.operand_index] + + if operands[0] == operands[1]: + return + + instruction_address: int = idx.insn_address_by_index[ii.instruction_index] + if is_security_cookie(fhi, bbh.inner, instruction_address, instruction): + return + + yield Characteristic("nzxor"), ih.address + + +INDIRECT_CALL_PATTERNS = BinExport2InstructionPatternMatcher.from_str( + """ + call|jmp reg0 + call|jmp [reg + reg * #int + #int] + call|jmp [reg + reg * #int] + call|jmp [reg * #int + #int] + call|jmp [reg + reg + #int] + call|jmp [reg + #int] + call|jmp [reg] + """ +) + + +def extract_function_indirect_call_characteristic_features( + fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle +) -> Iterator[Tuple[Feature, Address]]: + fhi: FunctionContext = fh.inner + ii: InstructionContext = ih.inner + be2: BinExport2 = fhi.ctx.be2 + + match = INDIRECT_CALL_PATTERNS.match_with_be2(be2, ii.instruction_index) + if match is None: + return + + yield Characteristic("indirect call"), ih.address diff --git a/capa/features/extractors/binexport2/basicblock.py b/capa/features/extractors/binexport2/basicblock.py new file mode 100644 index 000000000..bcb7977b4 --- /dev/null +++ b/capa/features/extractors/binexport2/basicblock.py @@ -0,0 +1,40 @@ +# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+ +from typing import List, Tuple, Iterator + +from capa.features.common import Feature, Characteristic +from capa.features.address import Address, AbsoluteVirtualAddress +from capa.features.basicblock import BasicBlock +from capa.features.extractors.binexport2 import FunctionContext, BasicBlockContext +from capa.features.extractors.base_extractor import BBHandle, FunctionHandle +from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 + + +def extract_bb_tight_loop(fh: FunctionHandle, bbh: BBHandle) -> Iterator[Tuple[Feature, Address]]: + fhi: FunctionContext = fh.inner + bbi: BasicBlockContext = bbh.inner + + idx = fhi.ctx.idx + + basic_block_index: int = bbi.basic_block_index + target_edges: List[BinExport2.FlowGraph.Edge] = idx.target_edges_by_basic_block_index[basic_block_index] + if basic_block_index in (e.source_basic_block_index for e in target_edges): + basic_block_address: int = idx.get_basic_block_address(basic_block_index) + yield Characteristic("tight loop"), AbsoluteVirtualAddress(basic_block_address) + + +def extract_features(fh: FunctionHandle, bbh: BBHandle) -> Iterator[Tuple[Feature, Address]]: + """extract basic block features""" + for bb_handler in BASIC_BLOCK_HANDLERS: + for feature, addr in bb_handler(fh, bbh): + yield feature, addr + yield BasicBlock(), bbh.address + + +BASIC_BLOCK_HANDLERS = (extract_bb_tight_loop,) diff --git a/capa/features/extractors/binexport2/binexport2_pb2.py b/capa/features/extractors/binexport2/binexport2_pb2.py new file mode 100644 index 000000000..4d11d1a6e --- /dev/null +++ b/capa/features/extractors/binexport2/binexport2_pb2.py @@ -0,0 +1,72 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! 
+# source: binexport2.proto +"""Generated protocol buffer code.""" +from google.protobuf.internal import builder as _builder +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import symbol_database as _symbol_database +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x10\x62inexport2.proto\"\xa5\x17\n\nBinExport2\x12*\n\x10meta_information\x18\x01 \x01(\x0b\x32\x10.BinExport2.Meta\x12*\n\nexpression\x18\x02 \x03(\x0b\x32\x16.BinExport2.Expression\x12$\n\x07operand\x18\x03 \x03(\x0b\x32\x13.BinExport2.Operand\x12&\n\x08mnemonic\x18\x04 \x03(\x0b\x32\x14.BinExport2.Mnemonic\x12,\n\x0binstruction\x18\x05 \x03(\x0b\x32\x17.BinExport2.Instruction\x12+\n\x0b\x62\x61sic_block\x18\x06 \x03(\x0b\x32\x16.BinExport2.BasicBlock\x12)\n\nflow_graph\x18\x07 \x03(\x0b\x32\x15.BinExport2.FlowGraph\x12)\n\ncall_graph\x18\x08 \x01(\x0b\x32\x15.BinExport2.CallGraph\x12\x14\n\x0cstring_table\x18\t \x03(\t\x12\x32\n\x0f\x61\x64\x64ress_comment\x18\n \x03(\x0b\x32\x15.BinExport2.ReferenceB\x02\x18\x01\x12$\n\x07\x63omment\x18\x11 \x03(\x0b\x32\x13.BinExport2.Comment\x12/\n\x10string_reference\x18\x0b \x03(\x0b\x32\x15.BinExport2.Reference\x12\x36\n\x17\x65xpression_substitution\x18\x0c \x03(\x0b\x32\x15.BinExport2.Reference\x12$\n\x07section\x18\r \x03(\x0b\x32\x13.BinExport2.Section\x12$\n\x07library\x18\x0e \x03(\x0b\x32\x13.BinExport2.Library\x12\x31\n\x0e\x64\x61ta_reference\x18\x0f \x03(\x0b\x32\x19.BinExport2.DataReference\x12\"\n\x06module\x18\x10 \x03(\x0b\x32\x12.BinExport2.Module\x1aj\n\x04Meta\x12\x17\n\x0f\x65xecutable_name\x18\x01 \x01(\t\x12\x15\n\rexecutable_id\x18\x02 \x01(\t\x12\x19\n\x11\x61rchitecture_name\x18\x03 \x01(\t\x12\x11\n\ttimestamp\x18\x04 \x01(\x03J\x04\x08\x05\x10\x06\x1a\x9c\x03\n\tCallGraph\x12,\n\x06vertex\x18\x01 
\x03(\x0b\x32\x1c.BinExport2.CallGraph.Vertex\x12(\n\x04\x65\x64ge\x18\x02 \x03(\x0b\x32\x1a.BinExport2.CallGraph.Edge\x1a\xf4\x01\n\x06Vertex\x12\x0f\n\x07\x61\x64\x64ress\x18\x01 \x01(\x04\x12\x37\n\x04type\x18\x02 \x01(\x0e\x32!.BinExport2.CallGraph.Vertex.Type:\x06NORMAL\x12\x14\n\x0cmangled_name\x18\x03 \x01(\t\x12\x16\n\x0e\x64\x65mangled_name\x18\x04 \x01(\t\x12\x15\n\rlibrary_index\x18\x05 \x01(\x05\x12\x14\n\x0cmodule_index\x18\x06 \x01(\x05\"E\n\x04Type\x12\n\n\x06NORMAL\x10\x00\x12\x0b\n\x07LIBRARY\x10\x01\x12\x0c\n\x08IMPORTED\x10\x02\x12\t\n\x05THUNK\x10\x03\x12\x0b\n\x07INVALID\x10\x04\x1a@\n\x04\x45\x64ge\x12\x1b\n\x13source_vertex_index\x18\x01 \x01(\x05\x12\x1b\n\x13target_vertex_index\x18\x02 \x01(\x05\x1a\x90\x02\n\nExpression\x12\x38\n\x04type\x18\x01 \x01(\x0e\x32\x1b.BinExport2.Expression.Type:\rIMMEDIATE_INT\x12\x0e\n\x06symbol\x18\x02 \x01(\t\x12\x11\n\timmediate\x18\x03 \x01(\x04\x12\x14\n\x0cparent_index\x18\x04 \x01(\x05\x12\x15\n\ris_relocation\x18\x05 \x01(\x08\"x\n\x04Type\x12\n\n\x06SYMBOL\x10\x01\x12\x11\n\rIMMEDIATE_INT\x10\x02\x12\x13\n\x0fIMMEDIATE_FLOAT\x10\x03\x12\x0c\n\x08OPERATOR\x10\x04\x12\x0c\n\x08REGISTER\x10\x05\x12\x0f\n\x0bSIZE_PREFIX\x10\x06\x12\x0f\n\x0b\x44\x45REFERENCE\x10\x07\x1a#\n\x07Operand\x12\x18\n\x10\x65xpression_index\x18\x01 \x03(\x05\x1a\x18\n\x08Mnemonic\x12\x0c\n\x04name\x18\x01 \x01(\t\x1a\x8f\x01\n\x0bInstruction\x12\x0f\n\x07\x61\x64\x64ress\x18\x01 \x01(\x04\x12\x13\n\x0b\x63\x61ll_target\x18\x02 \x03(\x04\x12\x19\n\x0emnemonic_index\x18\x03 \x01(\x05:\x01\x30\x12\x15\n\roperand_index\x18\x04 \x03(\x05\x12\x11\n\traw_bytes\x18\x05 \x01(\x0c\x12\x15\n\rcomment_index\x18\x06 \x03(\x05\x1a\x80\x01\n\nBasicBlock\x12<\n\x11instruction_index\x18\x01 \x03(\x0b\x32!.BinExport2.BasicBlock.IndexRange\x1a\x34\n\nIndexRange\x12\x13\n\x0b\x62\x65gin_index\x18\x01 \x01(\x05\x12\x11\n\tend_index\x18\x02 \x01(\x05\x1a\xe9\x02\n\tFlowGraph\x12\x19\n\x11\x62\x61sic_block_index\x18\x01 
\x03(\x05\x12\x1f\n\x17\x65ntry_basic_block_index\x18\x03 \x01(\x05\x12(\n\x04\x65\x64ge\x18\x02 \x03(\x0b\x32\x1a.BinExport2.FlowGraph.Edge\x1a\xf5\x01\n\x04\x45\x64ge\x12 \n\x18source_basic_block_index\x18\x01 \x01(\x05\x12 \n\x18target_basic_block_index\x18\x02 \x01(\x05\x12<\n\x04type\x18\x03 \x01(\x0e\x32\x1f.BinExport2.FlowGraph.Edge.Type:\rUNCONDITIONAL\x12\x1b\n\x0cis_back_edge\x18\x04 \x01(\x08:\x05\x66\x61lse\"N\n\x04Type\x12\x12\n\x0e\x43ONDITION_TRUE\x10\x01\x12\x13\n\x0f\x43ONDITION_FALSE\x10\x02\x12\x11\n\rUNCONDITIONAL\x10\x03\x12\n\n\x06SWITCH\x10\x04\x1a\x8d\x01\n\tReference\x12\x19\n\x11instruction_index\x18\x01 \x01(\x05\x12$\n\x19instruction_operand_index\x18\x02 \x01(\x05:\x01\x30\x12#\n\x18operand_expression_index\x18\x03 \x01(\x05:\x01\x30\x12\x1a\n\x12string_table_index\x18\x04 \x01(\x05\x1a;\n\rDataReference\x12\x19\n\x11instruction_index\x18\x01 \x01(\x05\x12\x0f\n\x07\x61\x64\x64ress\x18\x02 \x01(\x04\x1a\xd4\x02\n\x07\x43omment\x12\x19\n\x11instruction_index\x18\x01 \x01(\x05\x12$\n\x19instruction_operand_index\x18\x02 \x01(\x05:\x01\x30\x12#\n\x18operand_expression_index\x18\x03 \x01(\x05:\x01\x30\x12\x1a\n\x12string_table_index\x18\x04 \x01(\x05\x12\x12\n\nrepeatable\x18\x05 \x01(\x08\x12/\n\x04type\x18\x06 \x01(\x0e\x32\x18.BinExport2.Comment.Type:\x07\x44\x45\x46\x41ULT\"\x81\x01\n\x04Type\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\x0c\n\x08\x41NTERIOR\x10\x01\x12\r\n\tPOSTERIOR\x10\x02\x12\x0c\n\x08\x46UNCTION\x10\x03\x12\x08\n\x04\x45NUM\x10\x04\x12\x0c\n\x08LOCATION\x10\x05\x12\x14\n\x10GLOBAL_REFERENCE\x10\x06\x12\x13\n\x0fLOCAL_REFERENCE\x10\x07\x1aX\n\x07Section\x12\x0f\n\x07\x61\x64\x64ress\x18\x01 \x01(\x04\x12\x0c\n\x04size\x18\x02 \x01(\x04\x12\x0e\n\x06\x66lag_r\x18\x03 \x01(\x08\x12\x0e\n\x06\x66lag_w\x18\x04 \x01(\x08\x12\x0e\n\x06\x66lag_x\x18\x05 \x01(\x08\x1a\x43\n\x07Library\x12\x11\n\tis_static\x18\x01 \x01(\x08\x12\x17\n\x0cload_address\x18\x02 \x01(\x04:\x01\x30\x12\x0c\n\x04name\x18\x03 
\x01(\t\x1a\x16\n\x06Module\x12\x0c\n\x04name\x18\x01 \x01(\t*\x0b\x08\x80\xc2\xd7/\x10\x80\x80\x80\x80\x02\x42)\n\x1c\x63om.google.security.zynamicsB\tBinExport') + +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'binexport2_pb2', globals()) +if _descriptor._USE_C_DESCRIPTORS == False: + + DESCRIPTOR._options = None + DESCRIPTOR._serialized_options = b'\n\034com.google.security.zynamicsB\tBinExport' + _BINEXPORT2.fields_by_name['address_comment']._options = None + _BINEXPORT2.fields_by_name['address_comment']._serialized_options = b'\030\001' + _BINEXPORT2._serialized_start=21 + _BINEXPORT2._serialized_end=3002 + _BINEXPORT2_META._serialized_start=758 + _BINEXPORT2_META._serialized_end=864 + _BINEXPORT2_CALLGRAPH._serialized_start=867 + _BINEXPORT2_CALLGRAPH._serialized_end=1279 + _BINEXPORT2_CALLGRAPH_VERTEX._serialized_start=969 + _BINEXPORT2_CALLGRAPH_VERTEX._serialized_end=1213 + _BINEXPORT2_CALLGRAPH_VERTEX_TYPE._serialized_start=1144 + _BINEXPORT2_CALLGRAPH_VERTEX_TYPE._serialized_end=1213 + _BINEXPORT2_CALLGRAPH_EDGE._serialized_start=1215 + _BINEXPORT2_CALLGRAPH_EDGE._serialized_end=1279 + _BINEXPORT2_EXPRESSION._serialized_start=1282 + _BINEXPORT2_EXPRESSION._serialized_end=1554 + _BINEXPORT2_EXPRESSION_TYPE._serialized_start=1434 + _BINEXPORT2_EXPRESSION_TYPE._serialized_end=1554 + _BINEXPORT2_OPERAND._serialized_start=1556 + _BINEXPORT2_OPERAND._serialized_end=1591 + _BINEXPORT2_MNEMONIC._serialized_start=1593 + _BINEXPORT2_MNEMONIC._serialized_end=1617 + _BINEXPORT2_INSTRUCTION._serialized_start=1620 + _BINEXPORT2_INSTRUCTION._serialized_end=1763 + _BINEXPORT2_BASICBLOCK._serialized_start=1766 + _BINEXPORT2_BASICBLOCK._serialized_end=1894 + _BINEXPORT2_BASICBLOCK_INDEXRANGE._serialized_start=1842 + _BINEXPORT2_BASICBLOCK_INDEXRANGE._serialized_end=1894 + _BINEXPORT2_FLOWGRAPH._serialized_start=1897 + _BINEXPORT2_FLOWGRAPH._serialized_end=2258 + 
_BINEXPORT2_FLOWGRAPH_EDGE._serialized_start=2013 + _BINEXPORT2_FLOWGRAPH_EDGE._serialized_end=2258 + _BINEXPORT2_FLOWGRAPH_EDGE_TYPE._serialized_start=2180 + _BINEXPORT2_FLOWGRAPH_EDGE_TYPE._serialized_end=2258 + _BINEXPORT2_REFERENCE._serialized_start=2261 + _BINEXPORT2_REFERENCE._serialized_end=2402 + _BINEXPORT2_DATAREFERENCE._serialized_start=2404 + _BINEXPORT2_DATAREFERENCE._serialized_end=2463 + _BINEXPORT2_COMMENT._serialized_start=2466 + _BINEXPORT2_COMMENT._serialized_end=2806 + _BINEXPORT2_COMMENT_TYPE._serialized_start=2677 + _BINEXPORT2_COMMENT_TYPE._serialized_end=2806 + _BINEXPORT2_SECTION._serialized_start=2808 + _BINEXPORT2_SECTION._serialized_end=2896 + _BINEXPORT2_LIBRARY._serialized_start=2898 + _BINEXPORT2_LIBRARY._serialized_end=2965 + _BINEXPORT2_MODULE._serialized_start=2967 + _BINEXPORT2_MODULE._serialized_end=2989 +# @@protoc_insertion_point(module_scope) diff --git a/capa/features/extractors/binexport2/binexport2_pb2.pyi b/capa/features/extractors/binexport2/binexport2_pb2.pyi new file mode 100644 index 000000000..1620aee7a --- /dev/null +++ b/capa/features/extractors/binexport2/binexport2_pb2.pyi @@ -0,0 +1,784 @@ +""" +@generated by mypy-protobuf. Do not edit manually! +isort:skip_file +The representation is generic to accommodate various source architectures. +In particular 32 and 64 bit versions of x86, ARM, PowerPC and MIPS have been +tested. + +Multiple levels of deduping have been applied to make the format more compact +and avoid redundant data duplication. Some of this due to hard-earned +experience trying to cope with intentionally obfuscated malicious binaries. +Note in particular that the same instruction may occur in multiple basic +blocks and the same basic block in multiple functions (instruction and basic +block sharing). Implemented naively, malware can use this to cause +combinatorial explosion in memory usage, DOSing the analyst. 
This format +should store every unique expression, mnemonic, operand, instruction and +basic block only once instead of duplicating the information for every +instance of it. + +This format does _not_ try to be 100% backwards compatible with the old +version. In particular, we do not store IDA's comment types, making lossless +porting of IDA comments impossible. We do however, store comments and +expression substitutions, so porting the actual data is possible, just not +the exact IDA type. + +While it would be more natural to use addresses when defining call graph and +flow graph edges and other such references, it is more efficient to employ +one more level of indirection and use indices into the basic block or +function arrays instead. This is because addresses will usually use most of +the available 64 bit space while indices will be much smaller and compress +much better (less randomly distributed). + +We omit all fields that are set to their default value anyways. Note that +this has two side effects: + - changing the defaults in this proto file will, in effect, change what's + read from disk + - the generated code has_* methods are somewhat less useful +WARNING: We omit the defaults manually in the code writing the data. Do not + change the defaults here without changing the code! + +TODO(cblichmann): Link flow graphs to call graph nodes. The connection is + there via the address, but tricky to extract. 
+""" +import builtins +import collections.abc +import google.protobuf.descriptor +import google.protobuf.internal.containers +import google.protobuf.internal.enum_type_wrapper +import google.protobuf.message +import sys +import typing + +if sys.version_info >= (3, 10): + import typing as typing_extensions +else: + import typing_extensions + +DESCRIPTOR: google.protobuf.descriptor.FileDescriptor + +@typing_extensions.final +class BinExport2(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + @typing_extensions.final + class Meta(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + EXECUTABLE_NAME_FIELD_NUMBER: builtins.int + EXECUTABLE_ID_FIELD_NUMBER: builtins.int + ARCHITECTURE_NAME_FIELD_NUMBER: builtins.int + TIMESTAMP_FIELD_NUMBER: builtins.int + executable_name: builtins.str + """Input binary filename including file extension but excluding file path. + example: "insider_gcc.exe" + """ + executable_id: builtins.str + """Application defined executable id. Often the SHA256 hash of the input + binary. + """ + architecture_name: builtins.str + """Input architecture name, e.g. x86-32.""" + timestamp: builtins.int + """When did this file get created? Unix time. This may be used for some + primitive versioning in case the file format ever changes. + """ + def __init__( + self, + *, + executable_name: builtins.str | None = ..., + executable_id: builtins.str | None = ..., + architecture_name: builtins.str | None = ..., + timestamp: builtins.int | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["architecture_name", b"architecture_name", "executable_id", b"executable_id", "executable_name", b"executable_name", "timestamp", b"timestamp"]) -> builtins.bool: ... 
+ def ClearField(self, field_name: typing_extensions.Literal["architecture_name", b"architecture_name", "executable_id", b"executable_id", "executable_name", b"executable_name", "timestamp", b"timestamp"]) -> None: ... + + @typing_extensions.final + class CallGraph(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + @typing_extensions.final + class Vertex(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + class _Type: + ValueType = typing.NewType("ValueType", builtins.int) + V: typing_extensions.TypeAlias = ValueType + + class _TypeEnumTypeWrapper(google.protobuf.internal.enum_type_wrapper._EnumTypeWrapper[BinExport2.CallGraph.Vertex._Type.ValueType], builtins.type): + DESCRIPTOR: google.protobuf.descriptor.EnumDescriptor + NORMAL: BinExport2.CallGraph.Vertex._Type.ValueType # 0 + """Regular function with full disassembly.""" + LIBRARY: BinExport2.CallGraph.Vertex._Type.ValueType # 1 + """This function is a well known library function.""" + IMPORTED: BinExport2.CallGraph.Vertex._Type.ValueType # 2 + """Imported from a dynamic link library (e.g. dll).""" + THUNK: BinExport2.CallGraph.Vertex._Type.ValueType # 3 + """A thunk function, forwarding its work via an unconditional jump.""" + INVALID: BinExport2.CallGraph.Vertex._Type.ValueType # 4 + """An invalid function (a function that contained invalid code or was + considered invalid by some heuristics). + """ + + class Type(_Type, metaclass=_TypeEnumTypeWrapper): ... + NORMAL: BinExport2.CallGraph.Vertex.Type.ValueType # 0 + """Regular function with full disassembly.""" + LIBRARY: BinExport2.CallGraph.Vertex.Type.ValueType # 1 + """This function is a well known library function.""" + IMPORTED: BinExport2.CallGraph.Vertex.Type.ValueType # 2 + """Imported from a dynamic link library (e.g. 
dll).""" + THUNK: BinExport2.CallGraph.Vertex.Type.ValueType # 3 + """A thunk function, forwarding its work via an unconditional jump.""" + INVALID: BinExport2.CallGraph.Vertex.Type.ValueType # 4 + """An invalid function (a function that contained invalid code or was + considered invalid by some heuristics). + """ + + ADDRESS_FIELD_NUMBER: builtins.int + TYPE_FIELD_NUMBER: builtins.int + MANGLED_NAME_FIELD_NUMBER: builtins.int + DEMANGLED_NAME_FIELD_NUMBER: builtins.int + LIBRARY_INDEX_FIELD_NUMBER: builtins.int + MODULE_INDEX_FIELD_NUMBER: builtins.int + address: builtins.int + """The function's entry point address. Messages need to be sorted, see + comment below on `vertex`. + """ + type: global___BinExport2.CallGraph.Vertex.Type.ValueType + mangled_name: builtins.str + """If the function has a user defined, real name it will be given here. + main() is a proper name, sub_BAADF00D is not (auto generated dummy + name). + """ + demangled_name: builtins.str + """Demangled name if the function is a mangled C++ function and we could + demangle it. + """ + library_index: builtins.int + """If this is a library function, what is its index in library arrays.""" + module_index: builtins.int + """If module name, such as class name for DEX files, is present - index in + module table. + """ + def __init__( + self, + *, + address: builtins.int | None = ..., + type: global___BinExport2.CallGraph.Vertex.Type.ValueType | None = ..., + mangled_name: builtins.str | None = ..., + demangled_name: builtins.str | None = ..., + library_index: builtins.int | None = ..., + module_index: builtins.int | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["address", b"address", "demangled_name", b"demangled_name", "library_index", b"library_index", "mangled_name", b"mangled_name", "module_index", b"module_index", "type", b"type"]) -> builtins.bool: ... 
+ def ClearField(self, field_name: typing_extensions.Literal["address", b"address", "demangled_name", b"demangled_name", "library_index", b"library_index", "mangled_name", b"mangled_name", "module_index", b"module_index", "type", b"type"]) -> None: ... + + @typing_extensions.final + class Edge(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + SOURCE_VERTEX_INDEX_FIELD_NUMBER: builtins.int + TARGET_VERTEX_INDEX_FIELD_NUMBER: builtins.int + source_vertex_index: builtins.int + """source and target index into the vertex repeated field.""" + target_vertex_index: builtins.int + def __init__( + self, + *, + source_vertex_index: builtins.int | None = ..., + target_vertex_index: builtins.int | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["source_vertex_index", b"source_vertex_index", "target_vertex_index", b"target_vertex_index"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["source_vertex_index", b"source_vertex_index", "target_vertex_index", b"target_vertex_index"]) -> None: ... + + VERTEX_FIELD_NUMBER: builtins.int + EDGE_FIELD_NUMBER: builtins.int + @property + def vertex(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___BinExport2.CallGraph.Vertex]: + """vertices == functions in the call graph. + Important: Most downstream tooling (notably BinDiff), need these to be + sorted by `Vertex::address` (ascending). For C++, the + `BinExport2Writer` class enforces this invariant. + """ + @property + def edge(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___BinExport2.CallGraph.Edge]: + """edges == calls in the call graph.""" + def __init__( + self, + *, + vertex: collections.abc.Iterable[global___BinExport2.CallGraph.Vertex] | None = ..., + edge: collections.abc.Iterable[global___BinExport2.CallGraph.Edge] | None = ..., + ) -> None: ... 
+ def ClearField(self, field_name: typing_extensions.Literal["edge", b"edge", "vertex", b"vertex"]) -> None: ... + + @typing_extensions.final + class Expression(google.protobuf.message.Message): + """An operand consists of 1 or more expressions, linked together as a tree.""" + + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + class _Type: + ValueType = typing.NewType("ValueType", builtins.int) + V: typing_extensions.TypeAlias = ValueType + + class _TypeEnumTypeWrapper(google.protobuf.internal.enum_type_wrapper._EnumTypeWrapper[BinExport2.Expression._Type.ValueType], builtins.type): + DESCRIPTOR: google.protobuf.descriptor.EnumDescriptor + SYMBOL: BinExport2.Expression._Type.ValueType # 1 + IMMEDIATE_INT: BinExport2.Expression._Type.ValueType # 2 + IMMEDIATE_FLOAT: BinExport2.Expression._Type.ValueType # 3 + OPERATOR: BinExport2.Expression._Type.ValueType # 4 + REGISTER: BinExport2.Expression._Type.ValueType # 5 + SIZE_PREFIX: BinExport2.Expression._Type.ValueType # 6 + DEREFERENCE: BinExport2.Expression._Type.ValueType # 7 + + class Type(_Type, metaclass=_TypeEnumTypeWrapper): ... + SYMBOL: BinExport2.Expression.Type.ValueType # 1 + IMMEDIATE_INT: BinExport2.Expression.Type.ValueType # 2 + IMMEDIATE_FLOAT: BinExport2.Expression.Type.ValueType # 3 + OPERATOR: BinExport2.Expression.Type.ValueType # 4 + REGISTER: BinExport2.Expression.Type.ValueType # 5 + SIZE_PREFIX: BinExport2.Expression.Type.ValueType # 6 + DEREFERENCE: BinExport2.Expression.Type.ValueType # 7 + + TYPE_FIELD_NUMBER: builtins.int + SYMBOL_FIELD_NUMBER: builtins.int + IMMEDIATE_FIELD_NUMBER: builtins.int + PARENT_INDEX_FIELD_NUMBER: builtins.int + IS_RELOCATION_FIELD_NUMBER: builtins.int + type: global___BinExport2.Expression.Type.ValueType + """IMMEDIATE_INT is by far the most common type and thus we can save some + space by omitting it as the default. + """ + symbol: builtins.str + """Symbol for this expression. Interpretation depends on type. 
Examples + include: "eax", "[", "+" + """ + immediate: builtins.int + """If the expression can be interpreted as an integer value (IMMEDIATE_INT) + the value is given here. + """ + parent_index: builtins.int + """The parent expression. Example expression tree for the second operand of: + mov eax, b4 [ebx + 12] + "b4" --- "[" --- "+" --- "ebx" + \\ "12" + """ + is_relocation: builtins.bool + """true if the expression has entry in relocation table""" + def __init__( + self, + *, + type: global___BinExport2.Expression.Type.ValueType | None = ..., + symbol: builtins.str | None = ..., + immediate: builtins.int | None = ..., + parent_index: builtins.int | None = ..., + is_relocation: builtins.bool | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["immediate", b"immediate", "is_relocation", b"is_relocation", "parent_index", b"parent_index", "symbol", b"symbol", "type", b"type"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["immediate", b"immediate", "is_relocation", b"is_relocation", "parent_index", b"parent_index", "symbol", b"symbol", "type", b"type"]) -> None: ... + + @typing_extensions.final + class Operand(google.protobuf.message.Message): + """An instruction may have 0 or more operands.""" + + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + EXPRESSION_INDEX_FIELD_NUMBER: builtins.int + @property + def expression_index(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.int]: + """Contains all expressions constituting this operand. All expressions + should be linked into a single tree, i.e. there should only be one + expression in this list with parent_index == NULL and all others should + descend from that. Rendering order for expressions on the same tree level + (siblings) is implicitly given by the order they are referenced in this + repeated field. 
+ Implicit: expression sequence + """ + def __init__( + self, + *, + expression_index: collections.abc.Iterable[builtins.int] | None = ..., + ) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["expression_index", b"expression_index"]) -> None: ... + + @typing_extensions.final + class Mnemonic(google.protobuf.message.Message): + """An instruction has exactly 1 mnemonic.""" + + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + NAME_FIELD_NUMBER: builtins.int + name: builtins.str + """Literal representation of the mnemonic, e.g.: "mov".""" + def __init__( + self, + *, + name: builtins.str | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["name", b"name"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["name", b"name"]) -> None: ... + + @typing_extensions.final + class Instruction(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + ADDRESS_FIELD_NUMBER: builtins.int + CALL_TARGET_FIELD_NUMBER: builtins.int + MNEMONIC_INDEX_FIELD_NUMBER: builtins.int + OPERAND_INDEX_FIELD_NUMBER: builtins.int + RAW_BYTES_FIELD_NUMBER: builtins.int + COMMENT_INDEX_FIELD_NUMBER: builtins.int + address: builtins.int + """This will only be filled for instructions that do not just flow from the + immediately preceding instruction. Regular instructions will have to + calculate their own address by adding raw_bytes.size() to the previous + instruction's address. + """ + @property + def call_target(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.int]: + """If this is a call instruction and call targets could be determined + they'll be given here. Note that we may or may not have a flow graph for + the target and thus cannot use an index into the flow graph table here. + We could potentially use call graph nodes, but linking instructions to + the call graph directly does not seem a good choice. 
+ """ + mnemonic_index: builtins.int + """Index into the mnemonic array of strings. Used for de-duping the data. + The default value is used for the most common mnemonic in the executable. + """ + @property + def operand_index(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.int]: + """Indices into the operand tree. On X86 this can be 0, 1 or 2 elements + long, 3 elements with VEX/EVEX. + Implicit: operand sequence + """ + raw_bytes: builtins.bytes + """The unmodified input bytes corresponding to this instruction.""" + @property + def comment_index(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.int]: + """Implicit: comment sequence""" + def __init__( + self, + *, + address: builtins.int | None = ..., + call_target: collections.abc.Iterable[builtins.int] | None = ..., + mnemonic_index: builtins.int | None = ..., + operand_index: collections.abc.Iterable[builtins.int] | None = ..., + raw_bytes: builtins.bytes | None = ..., + comment_index: collections.abc.Iterable[builtins.int] | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["address", b"address", "mnemonic_index", b"mnemonic_index", "raw_bytes", b"raw_bytes"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["address", b"address", "call_target", b"call_target", "comment_index", b"comment_index", "mnemonic_index", b"mnemonic_index", "operand_index", b"operand_index", "raw_bytes", b"raw_bytes"]) -> None: ... + + @typing_extensions.final + class BasicBlock(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + @typing_extensions.final + class IndexRange(google.protobuf.message.Message): + """This is a space optimization. The instructions for an individual basic + block will usually be in a continuous index range. Thus it is more + efficient to store the range instead of individual indices. 
However, this + does not hold true for all basic blocks, so we need to be able to store + multiple index ranges per block. + """ + + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + BEGIN_INDEX_FIELD_NUMBER: builtins.int + END_INDEX_FIELD_NUMBER: builtins.int + begin_index: builtins.int + """These work like begin and end iterators, i.e. the sequence is + [begin_index, end_index). If the sequence only contains a single + element end_index will be omitted. + """ + end_index: builtins.int + def __init__( + self, + *, + begin_index: builtins.int | None = ..., + end_index: builtins.int | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["begin_index", b"begin_index", "end_index", b"end_index"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["begin_index", b"begin_index", "end_index", b"end_index"]) -> None: ... + + INSTRUCTION_INDEX_FIELD_NUMBER: builtins.int + @property + def instruction_index(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___BinExport2.BasicBlock.IndexRange]: + """Implicit: instruction sequence""" + def __init__( + self, + *, + instruction_index: collections.abc.Iterable[global___BinExport2.BasicBlock.IndexRange] | None = ..., + ) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["instruction_index", b"instruction_index"]) -> None: ... 
+ + @typing_extensions.final + class FlowGraph(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + @typing_extensions.final + class Edge(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + class _Type: + ValueType = typing.NewType("ValueType", builtins.int) + V: typing_extensions.TypeAlias = ValueType + + class _TypeEnumTypeWrapper(google.protobuf.internal.enum_type_wrapper._EnumTypeWrapper[BinExport2.FlowGraph.Edge._Type.ValueType], builtins.type): + DESCRIPTOR: google.protobuf.descriptor.EnumDescriptor + CONDITION_TRUE: BinExport2.FlowGraph.Edge._Type.ValueType # 1 + CONDITION_FALSE: BinExport2.FlowGraph.Edge._Type.ValueType # 2 + UNCONDITIONAL: BinExport2.FlowGraph.Edge._Type.ValueType # 3 + SWITCH: BinExport2.FlowGraph.Edge._Type.ValueType # 4 + + class Type(_Type, metaclass=_TypeEnumTypeWrapper): ... + CONDITION_TRUE: BinExport2.FlowGraph.Edge.Type.ValueType # 1 + CONDITION_FALSE: BinExport2.FlowGraph.Edge.Type.ValueType # 2 + UNCONDITIONAL: BinExport2.FlowGraph.Edge.Type.ValueType # 3 + SWITCH: BinExport2.FlowGraph.Edge.Type.ValueType # 4 + + SOURCE_BASIC_BLOCK_INDEX_FIELD_NUMBER: builtins.int + TARGET_BASIC_BLOCK_INDEX_FIELD_NUMBER: builtins.int + TYPE_FIELD_NUMBER: builtins.int + IS_BACK_EDGE_FIELD_NUMBER: builtins.int + source_basic_block_index: builtins.int + """Source instruction will always be the last instruction of the source + basic block, target instruction the first instruction of the target + basic block. 
+ """ + target_basic_block_index: builtins.int + type: global___BinExport2.FlowGraph.Edge.Type.ValueType + is_back_edge: builtins.bool + """Indicates whether this is a loop edge as determined by Lengauer-Tarjan.""" + def __init__( + self, + *, + source_basic_block_index: builtins.int | None = ..., + target_basic_block_index: builtins.int | None = ..., + type: global___BinExport2.FlowGraph.Edge.Type.ValueType | None = ..., + is_back_edge: builtins.bool | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["is_back_edge", b"is_back_edge", "source_basic_block_index", b"source_basic_block_index", "target_basic_block_index", b"target_basic_block_index", "type", b"type"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["is_back_edge", b"is_back_edge", "source_basic_block_index", b"source_basic_block_index", "target_basic_block_index", b"target_basic_block_index", "type", b"type"]) -> None: ... + + BASIC_BLOCK_INDEX_FIELD_NUMBER: builtins.int + ENTRY_BASIC_BLOCK_INDEX_FIELD_NUMBER: builtins.int + EDGE_FIELD_NUMBER: builtins.int + @property + def basic_block_index(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.int]: + """Basic blocks are sorted by address.""" + entry_basic_block_index: builtins.int + """The flow graph's entry point address is the first instruction of the + entry_basic_block. + """ + @property + def edge(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___BinExport2.FlowGraph.Edge]: ... + def __init__( + self, + *, + basic_block_index: collections.abc.Iterable[builtins.int] | None = ..., + entry_basic_block_index: builtins.int | None = ..., + edge: collections.abc.Iterable[global___BinExport2.FlowGraph.Edge] | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["entry_basic_block_index", b"entry_basic_block_index"]) -> builtins.bool: ... 
+ def ClearField(self, field_name: typing_extensions.Literal["basic_block_index", b"basic_block_index", "edge", b"edge", "entry_basic_block_index", b"entry_basic_block_index"]) -> None: ... + + @typing_extensions.final + class Reference(google.protobuf.message.Message): + """Generic reference class used for address comments (deprecated), string + references and expression substitutions. It allows referencing from an + instruction, operand, expression subtree tuple to a de-duped string in the + string table. + """ + + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + INSTRUCTION_INDEX_FIELD_NUMBER: builtins.int + INSTRUCTION_OPERAND_INDEX_FIELD_NUMBER: builtins.int + OPERAND_EXPRESSION_INDEX_FIELD_NUMBER: builtins.int + STRING_TABLE_INDEX_FIELD_NUMBER: builtins.int + instruction_index: builtins.int + """Index into the global instruction table.""" + instruction_operand_index: builtins.int + """Index into the operand array local to an instruction.""" + operand_expression_index: builtins.int + """Index into the expression array local to an operand.""" + string_table_index: builtins.int + """Index into the global string table.""" + def __init__( + self, + *, + instruction_index: builtins.int | None = ..., + instruction_operand_index: builtins.int | None = ..., + operand_expression_index: builtins.int | None = ..., + string_table_index: builtins.int | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["instruction_index", b"instruction_index", "instruction_operand_index", b"instruction_operand_index", "operand_expression_index", b"operand_expression_index", "string_table_index", b"string_table_index"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["instruction_index", b"instruction_index", "instruction_operand_index", b"instruction_operand_index", "operand_expression_index", b"operand_expression_index", "string_table_index", b"string_table_index"]) -> None: ... 
+ + @typing_extensions.final + class DataReference(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + INSTRUCTION_INDEX_FIELD_NUMBER: builtins.int + ADDRESS_FIELD_NUMBER: builtins.int + instruction_index: builtins.int + """Index into the global instruction table.""" + address: builtins.int + """Address being referred.""" + def __init__( + self, + *, + instruction_index: builtins.int | None = ..., + address: builtins.int | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["address", b"address", "instruction_index", b"instruction_index"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["address", b"address", "instruction_index", b"instruction_index"]) -> None: ... + + @typing_extensions.final + class Comment(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + class _Type: + ValueType = typing.NewType("ValueType", builtins.int) + V: typing_extensions.TypeAlias = ValueType + + class _TypeEnumTypeWrapper(google.protobuf.internal.enum_type_wrapper._EnumTypeWrapper[BinExport2.Comment._Type.ValueType], builtins.type): + DESCRIPTOR: google.protobuf.descriptor.EnumDescriptor + DEFAULT: BinExport2.Comment._Type.ValueType # 0 + """A regular instruction comment. Typically displayed next to the + instruction disassembly. + """ + ANTERIOR: BinExport2.Comment._Type.ValueType # 1 + """A comment line that is typically displayed before (above) the + instruction it refers to. + """ + POSTERIOR: BinExport2.Comment._Type.ValueType # 2 + """Like ANTERIOR, but a typically displayed after (below).""" + FUNCTION: BinExport2.Comment._Type.ValueType # 3 + """Similar to an ANTERIOR comment, but applies to the beginning of an + identified function. Programs displaying the proto may choose to render + these differently (e.g. above an inferred function signature). 
+ """ + ENUM: BinExport2.Comment._Type.ValueType # 4 + """Named constants, bitfields and similar.""" + LOCATION: BinExport2.Comment._Type.ValueType # 5 + """Named locations, usually the target of a jump.""" + GLOBAL_REFERENCE: BinExport2.Comment._Type.ValueType # 6 + """Data cross references.""" + LOCAL_REFERENCE: BinExport2.Comment._Type.ValueType # 7 + """Local/stack variables.""" + + class Type(_Type, metaclass=_TypeEnumTypeWrapper): ... + DEFAULT: BinExport2.Comment.Type.ValueType # 0 + """A regular instruction comment. Typically displayed next to the + instruction disassembly. + """ + ANTERIOR: BinExport2.Comment.Type.ValueType # 1 + """A comment line that is typically displayed before (above) the + instruction it refers to. + """ + POSTERIOR: BinExport2.Comment.Type.ValueType # 2 + """Like ANTERIOR, but a typically displayed after (below).""" + FUNCTION: BinExport2.Comment.Type.ValueType # 3 + """Similar to an ANTERIOR comment, but applies to the beginning of an + identified function. Programs displaying the proto may choose to render + these differently (e.g. above an inferred function signature). + """ + ENUM: BinExport2.Comment.Type.ValueType # 4 + """Named constants, bitfields and similar.""" + LOCATION: BinExport2.Comment.Type.ValueType # 5 + """Named locations, usually the target of a jump.""" + GLOBAL_REFERENCE: BinExport2.Comment.Type.ValueType # 6 + """Data cross references.""" + LOCAL_REFERENCE: BinExport2.Comment.Type.ValueType # 7 + """Local/stack variables.""" + + INSTRUCTION_INDEX_FIELD_NUMBER: builtins.int + INSTRUCTION_OPERAND_INDEX_FIELD_NUMBER: builtins.int + OPERAND_EXPRESSION_INDEX_FIELD_NUMBER: builtins.int + STRING_TABLE_INDEX_FIELD_NUMBER: builtins.int + REPEATABLE_FIELD_NUMBER: builtins.int + TYPE_FIELD_NUMBER: builtins.int + instruction_index: builtins.int + """Index into the global instruction table. This is here to enable + comment processing without having to iterate over all instructions. 
+ There is an N:M mapping of instructions to comments. + """ + instruction_operand_index: builtins.int + """Index into the operand array local to an instruction.""" + operand_expression_index: builtins.int + """Index into the expression array local to an operand, like in Reference. + This is not currently used, but allows to implement expression + substitutions. + """ + string_table_index: builtins.int + """Index into the global string table.""" + repeatable: builtins.bool + """Comment is propagated to all locations that reference the original + location. + """ + type: global___BinExport2.Comment.Type.ValueType + def __init__( + self, + *, + instruction_index: builtins.int | None = ..., + instruction_operand_index: builtins.int | None = ..., + operand_expression_index: builtins.int | None = ..., + string_table_index: builtins.int | None = ..., + repeatable: builtins.bool | None = ..., + type: global___BinExport2.Comment.Type.ValueType | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["instruction_index", b"instruction_index", "instruction_operand_index", b"instruction_operand_index", "operand_expression_index", b"operand_expression_index", "repeatable", b"repeatable", "string_table_index", b"string_table_index", "type", b"type"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["instruction_index", b"instruction_index", "instruction_operand_index", b"instruction_operand_index", "operand_expression_index", b"operand_expression_index", "repeatable", b"repeatable", "string_table_index", b"string_table_index", "type", b"type"]) -> None: ... 
+ + @typing_extensions.final + class Section(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + ADDRESS_FIELD_NUMBER: builtins.int + SIZE_FIELD_NUMBER: builtins.int + FLAG_R_FIELD_NUMBER: builtins.int + FLAG_W_FIELD_NUMBER: builtins.int + FLAG_X_FIELD_NUMBER: builtins.int + address: builtins.int + """Section start address.""" + size: builtins.int + """Section size.""" + flag_r: builtins.bool + """Read flag of the section, True when section is readable.""" + flag_w: builtins.bool + """Write flag of the section, True when section is writable.""" + flag_x: builtins.bool + """Execute flag of the section, True when section is executable.""" + def __init__( + self, + *, + address: builtins.int | None = ..., + size: builtins.int | None = ..., + flag_r: builtins.bool | None = ..., + flag_w: builtins.bool | None = ..., + flag_x: builtins.bool | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["address", b"address", "flag_r", b"flag_r", "flag_w", b"flag_w", "flag_x", b"flag_x", "size", b"size"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["address", b"address", "flag_r", b"flag_r", "flag_w", b"flag_w", "flag_x", b"flag_x", "size", b"size"]) -> None: ... + + @typing_extensions.final + class Library(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + IS_STATIC_FIELD_NUMBER: builtins.int + LOAD_ADDRESS_FIELD_NUMBER: builtins.int + NAME_FIELD_NUMBER: builtins.int + is_static: builtins.bool + """If this library is statically linked.""" + load_address: builtins.int + """Address where this library was loaded, 0 if unknown.""" + name: builtins.str + """Name of the library (format is platform-dependent).""" + def __init__( + self, + *, + is_static: builtins.bool | None = ..., + load_address: builtins.int | None = ..., + name: builtins.str | None = ..., + ) -> None: ... 
+ def HasField(self, field_name: typing_extensions.Literal["is_static", b"is_static", "load_address", b"load_address", "name", b"name"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["is_static", b"is_static", "load_address", b"load_address", "name", b"name"]) -> None: ... + + @typing_extensions.final + class Module(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + NAME_FIELD_NUMBER: builtins.int + name: builtins.str + """Name, such as Java class name. Platform-dependent.""" + def __init__( + self, + *, + name: builtins.str | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["name", b"name"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["name", b"name"]) -> None: ... + + META_INFORMATION_FIELD_NUMBER: builtins.int + EXPRESSION_FIELD_NUMBER: builtins.int + OPERAND_FIELD_NUMBER: builtins.int + MNEMONIC_FIELD_NUMBER: builtins.int + INSTRUCTION_FIELD_NUMBER: builtins.int + BASIC_BLOCK_FIELD_NUMBER: builtins.int + FLOW_GRAPH_FIELD_NUMBER: builtins.int + CALL_GRAPH_FIELD_NUMBER: builtins.int + STRING_TABLE_FIELD_NUMBER: builtins.int + ADDRESS_COMMENT_FIELD_NUMBER: builtins.int + COMMENT_FIELD_NUMBER: builtins.int + STRING_REFERENCE_FIELD_NUMBER: builtins.int + EXPRESSION_SUBSTITUTION_FIELD_NUMBER: builtins.int + SECTION_FIELD_NUMBER: builtins.int + LIBRARY_FIELD_NUMBER: builtins.int + DATA_REFERENCE_FIELD_NUMBER: builtins.int + MODULE_FIELD_NUMBER: builtins.int + @property + def meta_information(self) -> global___BinExport2.Meta: ... + @property + def expression(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___BinExport2.Expression]: ... + @property + def operand(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___BinExport2.Operand]: ... 
+ @property + def mnemonic(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___BinExport2.Mnemonic]: ... + @property + def instruction(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___BinExport2.Instruction]: ... + @property + def basic_block(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___BinExport2.BasicBlock]: ... + @property + def flow_graph(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___BinExport2.FlowGraph]: ... + @property + def call_graph(self) -> global___BinExport2.CallGraph: ... + @property + def string_table(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.str]: ... + @property + def address_comment(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___BinExport2.Reference]: + """No longer written. This is here so that BinDiff can work with older + BinExport files. + """ + @property + def comment(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___BinExport2.Comment]: + """Rich comment index used for BinDiff's comment porting.""" + @property + def string_reference(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___BinExport2.Reference]: ... + @property + def expression_substitution(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___BinExport2.Reference]: ... + @property + def section(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___BinExport2.Section]: ... + @property + def library(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___BinExport2.Library]: ... + @property + def data_reference(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___BinExport2.DataReference]: ... 
+ @property + def module(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___BinExport2.Module]: ... + def __init__( + self, + *, + meta_information: global___BinExport2.Meta | None = ..., + expression: collections.abc.Iterable[global___BinExport2.Expression] | None = ..., + operand: collections.abc.Iterable[global___BinExport2.Operand] | None = ..., + mnemonic: collections.abc.Iterable[global___BinExport2.Mnemonic] | None = ..., + instruction: collections.abc.Iterable[global___BinExport2.Instruction] | None = ..., + basic_block: collections.abc.Iterable[global___BinExport2.BasicBlock] | None = ..., + flow_graph: collections.abc.Iterable[global___BinExport2.FlowGraph] | None = ..., + call_graph: global___BinExport2.CallGraph | None = ..., + string_table: collections.abc.Iterable[builtins.str] | None = ..., + address_comment: collections.abc.Iterable[global___BinExport2.Reference] | None = ..., + comment: collections.abc.Iterable[global___BinExport2.Comment] | None = ..., + string_reference: collections.abc.Iterable[global___BinExport2.Reference] | None = ..., + expression_substitution: collections.abc.Iterable[global___BinExport2.Reference] | None = ..., + section: collections.abc.Iterable[global___BinExport2.Section] | None = ..., + library: collections.abc.Iterable[global___BinExport2.Library] | None = ..., + data_reference: collections.abc.Iterable[global___BinExport2.DataReference] | None = ..., + module: collections.abc.Iterable[global___BinExport2.Module] | None = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["call_graph", b"call_graph", "meta_information", b"meta_information"]) -> builtins.bool: ... 
+ def ClearField(self, field_name: typing_extensions.Literal["address_comment", b"address_comment", "basic_block", b"basic_block", "call_graph", b"call_graph", "comment", b"comment", "data_reference", b"data_reference", "expression", b"expression", "expression_substitution", b"expression_substitution", "flow_graph", b"flow_graph", "instruction", b"instruction", "library", b"library", "meta_information", b"meta_information", "mnemonic", b"mnemonic", "module", b"module", "operand", b"operand", "section", b"section", "string_reference", b"string_reference", "string_table", b"string_table"]) -> None: ... + +global___BinExport2 = BinExport2 diff --git a/capa/features/extractors/binexport2/extractor.py b/capa/features/extractors/binexport2/extractor.py new file mode 100644 index 000000000..40d61e694 --- /dev/null +++ b/capa/features/extractors/binexport2/extractor.py @@ -0,0 +1,130 @@ +# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
import logging
from typing import Set, List, Tuple, Iterator

import capa.features.extractors.elf
import capa.features.extractors.common
import capa.features.extractors.binexport2.file
import capa.features.extractors.binexport2.insn
import capa.features.extractors.binexport2.helpers
import capa.features.extractors.binexport2.function
import capa.features.extractors.binexport2.basicblock
from capa.features.common import OS, Arch, Format, Feature
from capa.features.address import Address, AbsoluteVirtualAddress
from capa.features.extractors.binexport2 import (
    AddressSpace,
    AnalysisContext,
    BinExport2Index,
    FunctionContext,
    BasicBlockContext,
    BinExport2Analysis,
    InstructionContext,
)
from capa.features.extractors.base_extractor import (
    BBHandle,
    InsnHandle,
    SampleHashes,
    FunctionHandle,
    StaticFeatureExtractor,
)
from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2

logger = logging.getLogger(__name__)


class BinExport2FeatureExtractor(StaticFeatureExtractor):
    """static feature extractor backed by a BinExport2 protobuf document plus the raw input file bytes."""

    def __init__(self, be2: BinExport2, buf: bytes):
        """
        args:
          be2: parsed BinExport2 protobuf document describing the analyzed program.
          buf: raw bytes of the analyzed input file, used for global/file features
               and for reading data at virtual addresses.
        """
        super().__init__(hashes=SampleHashes.from_bytes(buf))
        self.be2: BinExport2 = be2
        self.buf: bytes = buf
        self.idx: BinExport2Index = BinExport2Index(self.be2)
        self.analysis: BinExport2Analysis = BinExport2Analysis(self.be2, self.idx, self.buf)
        address_space: AddressSpace = AddressSpace.from_buf(buf, self.analysis.base_address)
        self.ctx: AnalysisContext = AnalysisContext(self.buf, self.be2, self.idx, self.analysis, address_space)

        # global features are computed once, up front, from the raw file bytes
        # and replayed by extract_global_features().
        # note: extend() accepts any iterable; no need to materialize intermediate lists.
        self.global_features: List[Tuple[Feature, Address]] = []
        self.global_features.extend(capa.features.extractors.common.extract_format(self.buf))
        self.global_features.extend(capa.features.extractors.common.extract_os(self.buf))
        self.global_features.extend(capa.features.extractors.common.extract_arch(self.buf))

        # cache the string values of the detected format/OS/arch so that
        # downstream handlers can specialize their behavior per platform.
        self.format: Set[str] = set()
        self.os: Set[str] = set()
        self.arch: Set[str] = set()

        for feature, _ in self.global_features:
            assert isinstance(feature.value, str)

            if isinstance(feature, Format):
                self.format.add(feature.value)
            elif isinstance(feature, OS):
                self.os.add(feature.value)
            elif isinstance(feature, Arch):
                self.arch.add(feature.value)
            else:
                # fix: was ValueError("unexpected global feature: %s", feature), which
                # passes printf-style args to the exception constructor and therefore
                # never interpolates the message (the exception carries a tuple).
                raise ValueError(f"unexpected global feature: {feature}")

    def get_base_address(self) -> AbsoluteVirtualAddress:
        return AbsoluteVirtualAddress(self.analysis.base_address)

    def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]:
        yield from self.global_features

    def extract_file_features(self) -> Iterator[Tuple[Feature, Address]]:
        yield from capa.features.extractors.binexport2.file.extract_features(self.be2, self.buf)

    def get_functions(self) -> Iterator[FunctionHandle]:
        """enumerate the functions (flow graphs) in the BinExport2 document, skipping thunks."""
        for flow_graph_index, flow_graph in enumerate(self.be2.flow_graph):
            entry_basic_block_index: int = flow_graph.entry_basic_block_index
            # a flow graph's address is the address of the first instruction
            # of its entry basic block.
            flow_graph_address: int = self.idx.get_basic_block_address(entry_basic_block_index)

            vertex_idx: int = self.idx.vertex_index_by_address[flow_graph_address]
            be2_vertex: BinExport2.CallGraph.Vertex = self.be2.call_graph.vertex[vertex_idx]

            # skip thunks
            if capa.features.extractors.binexport2.helpers.is_vertex_type(
                be2_vertex, BinExport2.CallGraph.Vertex.Type.THUNK
            ):
                continue

            yield FunctionHandle(
                AbsoluteVirtualAddress(flow_graph_address),
                inner=FunctionContext(self.ctx, flow_graph_index, self.format, self.os, self.arch),
            )

    def extract_function_features(self, fh: FunctionHandle) -> Iterator[Tuple[Feature, Address]]:
        yield from capa.features.extractors.binexport2.function.extract_features(fh)

    def get_basic_blocks(self, fh: FunctionHandle) -> Iterator[BBHandle]:
        fhi: FunctionContext = fh.inner
        flow_graph_index: int = fhi.flow_graph_index
        flow_graph: BinExport2.FlowGraph = self.be2.flow_graph[flow_graph_index]

        for basic_block_index in flow_graph.basic_block_index:
            basic_block_address: int = self.idx.get_basic_block_address(basic_block_index)
            yield BBHandle(
                address=AbsoluteVirtualAddress(basic_block_address),
                inner=BasicBlockContext(basic_block_index),
            )

    def extract_basic_block_features(self, fh: FunctionHandle, bbh: BBHandle) -> Iterator[Tuple[Feature, Address]]:
        yield from capa.features.extractors.binexport2.basicblock.extract_features(fh, bbh)

    def get_instructions(self, fh: FunctionHandle, bbh: BBHandle) -> Iterator[InsnHandle]:
        bbi: BasicBlockContext = bbh.inner
        basic_block: BinExport2.BasicBlock = self.be2.basic_block[bbi.basic_block_index]
        for instruction_index, _, instruction_address in self.idx.basic_block_instructions(basic_block):
            yield InsnHandle(
                address=AbsoluteVirtualAddress(instruction_address),
                inner=InstructionContext(instruction_index),
            )

    def extract_insn_features(
        self, fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
    ) -> Iterator[Tuple[Feature, Address]]:
        yield from capa.features.extractors.binexport2.insn.extract_features(fh, bbh, ih)
import io
import logging
from typing import Tuple, Callable, Iterator

import pefile
from elftools.elf.elffile import ELFFile

import capa.features.common
import capa.features.extractors.common
import capa.features.extractors.pefile
import capa.features.extractors.elffile
from capa.features.common import Feature
from capa.features.address import Address
from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2

logger = logging.getLogger(__name__)


def _extract_by_format(
    buf: bytes,
    pe_handler: Callable[[pefile.PE], Iterator[Tuple[Feature, Address]]],
    elf_handler: Callable[[ELFFile], Iterator[Tuple[Feature, Address]]],
) -> Iterator[Tuple[Feature, Address]]:
    """detect the backing file format from its magic bytes and dispatch to the matching handler.

    this consolidates the PE/ELF dispatch logic that was previously duplicated
    across the export/import/section handlers below.
    """
    if buf.startswith(capa.features.extractors.common.MATCH_PE):
        pe: pefile.PE = pefile.PE(data=buf)
        yield from pe_handler(pe)
    elif buf.startswith(capa.features.extractors.common.MATCH_ELF):
        elf: ELFFile = ELFFile(io.BytesIO(buf))
        yield from elf_handler(elf)
    else:
        logger.warning("unsupported format")


def extract_file_export_names(_be2: BinExport2, buf: bytes) -> Iterator[Tuple[Feature, Address]]:
    """extract export name features from the backing PE/ELF file."""
    yield from _extract_by_format(
        buf,
        capa.features.extractors.pefile.extract_file_export_names,
        capa.features.extractors.elffile.extract_file_export_names,
    )


def extract_file_import_names(_be2: BinExport2, buf: bytes) -> Iterator[Tuple[Feature, Address]]:
    """extract import name features from the backing PE/ELF file."""
    yield from _extract_by_format(
        buf,
        capa.features.extractors.pefile.extract_file_import_names,
        capa.features.extractors.elffile.extract_file_import_names,
    )


def extract_file_section_names(_be2: BinExport2, buf: bytes) -> Iterator[Tuple[Feature, Address]]:
    """extract section name features from the backing PE/ELF file."""
    yield from _extract_by_format(
        buf,
        capa.features.extractors.pefile.extract_file_section_names,
        capa.features.extractors.elffile.extract_file_section_names,
    )


def extract_file_strings(_be2: BinExport2, buf: bytes) -> Iterator[Tuple[Feature, Address]]:
    """extract ASCII/UTF-16 string features from the raw file bytes."""
    yield from capa.features.extractors.common.extract_file_strings(buf)


def extract_file_format(_be2: BinExport2, buf: bytes) -> Iterator[Tuple[Feature, Address]]:
    """extract the file format feature (PE/ELF/...) from the raw file bytes."""
    yield from capa.features.extractors.common.extract_format(buf)


def extract_features(be2: BinExport2, buf: bytes) -> Iterator[Tuple[Feature, Address]]:
    """extract file features"""
    for file_handler in FILE_HANDLERS:
        for feature, addr in file_handler(be2, buf):
            yield feature, addr


FILE_HANDLERS = (
    extract_file_export_names,
    extract_file_import_names,
    extract_file_strings,
    extract_file_section_names,
    extract_file_format,
)
+from typing import List, Tuple, Iterator + +from capa.features.file import FunctionName +from capa.features.common import Feature, Characteristic +from capa.features.address import Address, AbsoluteVirtualAddress +from capa.features.extractors import loops +from capa.features.extractors.binexport2 import BinExport2Index, FunctionContext +from capa.features.extractors.base_extractor import FunctionHandle +from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 + + +def extract_function_calls_to(fh: FunctionHandle) -> Iterator[Tuple[Feature, Address]]: + fhi: FunctionContext = fh.inner + + be2: BinExport2 = fhi.ctx.be2 + idx: BinExport2Index = fhi.ctx.idx + + flow_graph_index: int = fhi.flow_graph_index + flow_graph_address: int = idx.flow_graph_address_by_index[flow_graph_index] + vertex_index: int = idx.vertex_index_by_address[flow_graph_address] + + for caller_index in idx.callers_by_vertex_index[vertex_index]: + caller: BinExport2.CallGraph.Vertex = be2.call_graph.vertex[caller_index] + caller_address: int = caller.address + yield Characteristic("calls to"), AbsoluteVirtualAddress(caller_address) + + +def extract_function_loop(fh: FunctionHandle) -> Iterator[Tuple[Feature, Address]]: + fhi: FunctionContext = fh.inner + + be2: BinExport2 = fhi.ctx.be2 + + flow_graph_index: int = fhi.flow_graph_index + flow_graph: BinExport2.FlowGraph = be2.flow_graph[flow_graph_index] + + edges: List[Tuple[int, int]] = [] + for edge in flow_graph.edge: + edges.append((edge.source_basic_block_index, edge.target_basic_block_index)) + + if loops.has_loop(edges): + yield Characteristic("loop"), fh.address + + +def extract_function_name(fh: FunctionHandle) -> Iterator[Tuple[Feature, Address]]: + fhi: FunctionContext = fh.inner + + be2: BinExport2 = fhi.ctx.be2 + idx: BinExport2Index = fhi.ctx.idx + flow_graph_index: int = fhi.flow_graph_index + + flow_graph_address: int = idx.flow_graph_address_by_index[flow_graph_index] + vertex_index: int = 
idx.vertex_index_by_address[flow_graph_address] + vertex: BinExport2.CallGraph.Vertex = be2.call_graph.vertex[vertex_index] + + if vertex.HasField("mangled_name"): + yield FunctionName(vertex.mangled_name), fh.address + + +def extract_features(fh: FunctionHandle) -> Iterator[Tuple[Feature, Address]]: + for func_handler in FUNCTION_HANDLERS: + for feature, addr in func_handler(fh): + yield feature, addr + + +FUNCTION_HANDLERS = (extract_function_calls_to, extract_function_loop, extract_function_name) diff --git a/capa/features/extractors/binexport2/helpers.py b/capa/features/extractors/binexport2/helpers.py new file mode 100644 index 000000000..e4e7f7b76 --- /dev/null +++ b/capa/features/extractors/binexport2/helpers.py @@ -0,0 +1,650 @@ +# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+import re +from typing import Set, Dict, List, Tuple, Union, Iterator, Optional +from collections import defaultdict +from dataclasses import dataclass + +import capa.features.extractors.helpers +import capa.features.extractors.binexport2.helpers +from capa.features.common import ARCH_I386, ARCH_AMD64, ARCH_AARCH64 +from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 + +HAS_ARCH32 = {ARCH_I386} +HAS_ARCH64 = {ARCH_AARCH64, ARCH_AMD64} + +HAS_ARCH_INTEL = {ARCH_I386, ARCH_AMD64} +HAS_ARCH_ARM = {ARCH_AARCH64} + + +def mask_immediate(arch: Set[str], immediate: int) -> int: + if arch & HAS_ARCH64: + immediate &= 0xFFFFFFFFFFFFFFFF + elif arch & HAS_ARCH32: + immediate &= 0xFFFFFFFF + return immediate + + +def twos_complement(arch: Set[str], immediate: int, default: Optional[int] = None) -> int: + if default is not None: + return capa.features.extractors.helpers.twos_complement(immediate, default) + elif arch & HAS_ARCH64: + return capa.features.extractors.helpers.twos_complement(immediate, 64) + elif arch & HAS_ARCH32: + return capa.features.extractors.helpers.twos_complement(immediate, 32) + return immediate + + +def is_address_mapped(be2: BinExport2, address: int) -> bool: + """return True if the given address is mapped""" + sections_with_perms: Iterator[BinExport2.Section] = filter(lambda s: s.flag_r or s.flag_w or s.flag_x, be2.section) + return any(section.address <= address < section.address + section.size for section in sections_with_perms) + + +def is_vertex_type(vertex: BinExport2.CallGraph.Vertex, type_: BinExport2.CallGraph.Vertex.Type.ValueType) -> bool: + return vertex.HasField("type") and vertex.type == type_ + + +# internal to `build_expression_tree` +# this is unstable: it is subject to change, so don't rely on it! 
+def _prune_expression_tree_empty_shifts( + be2: BinExport2, + operand: BinExport2.Operand, + expression_tree: List[List[int]], + tree_index: int, +): + expression_index = operand.expression_index[tree_index] + expression = be2.expression[expression_index] + children_tree_indexes: List[int] = expression_tree[tree_index] + + if expression.type == BinExport2.Expression.OPERATOR: + if len(children_tree_indexes) == 0 and expression.symbol in ("lsl", "lsr"): + # Ghidra may emit superfluous lsl nodes with no children. + # https://github.com/mandiant/capa/pull/2340/files#r1750003919 + # Which is maybe: https://github.com/NationalSecurityAgency/ghidra/issues/6821#issuecomment-2295394697 + # + # Which seems to be as if the shift wasn't there (shift of #0) + # so we want to remove references to this node from any parent nodes. + for tree_node in expression_tree: + if tree_index in tree_node: + tree_node.remove(tree_index) + + return + + for child_tree_index in children_tree_indexes: + _prune_expression_tree_empty_shifts(be2, operand, expression_tree, child_tree_index) + + +# internal to `build_expression_tree` +# this is unstable: it is subject to change, so don't rely on it! +def _prune_expression_tree_empty_commas( + be2: BinExport2, + operand: BinExport2.Operand, + expression_tree: List[List[int]], + tree_index: int, +): + expression_index = operand.expression_index[tree_index] + expression = be2.expression[expression_index] + children_tree_indexes: List[int] = expression_tree[tree_index] + + if expression.type == BinExport2.Expression.OPERATOR: + if len(children_tree_indexes) == 1 and expression.symbol == ",": + # Due to the above pruning of empty LSL or LSR expressions, + # the parents might need to be fixed up. + # + # Specifically, if the pruned node was part of a comma list with two children, + # now there's only a single child, which renders as an extra comma, + # so we replace references to the comma node with the immediate child. 
+            #
+            # A more correct way of doing this might be to walk up the parents and do fixups,
+            # but I'm not quite sure how to do this yet. Just do two passes right now.
+            child = children_tree_indexes[0]
+
+            # replace every reference to the now-single-child comma node
+            # with a direct reference to that child.
+            for tree_node in expression_tree:
+                if tree_index in tree_node:
+                    tree_node[tree_node.index(tree_index)] = child
+
+            return
+
+    for child_tree_index in children_tree_indexes:
+        _prune_expression_tree_empty_commas(be2, operand, expression_tree, child_tree_index)
+
+
+# internal to `build_expression_tree`
+# this is unstable: it is subject to change, so don't rely on it!
+def _prune_expression_tree(
+    be2: BinExport2,
+    operand: BinExport2.Operand,
+    expression_tree: List[List[int]],
+):
+    _prune_expression_tree_empty_shifts(be2, operand, expression_tree, 0)
+    _prune_expression_tree_empty_commas(be2, operand, expression_tree, 0)
+
+
+# this is unstable: it is subject to change, so don't rely on it!
+def _build_expression_tree(
+    be2: BinExport2,
+    operand: BinExport2.Operand,
+) -> List[List[int]]:
+    # The reconstructed expression tree layout, linking parent nodes to their children.
+    #
+    # There is one list of integers for each expression in the operand.
+    # These integers are indexes of other expressions in the same operand,
+    # which are the children of that expression.
+    #
+    # So:
+    #
+    #   [ [1, 3], [2], [], [4], [5], []]
+    #
+    # means the first expression has two children, at index 1 and 3,
+    # and the tree looks like:
+    #
+    #        0
+    #       / \
+    #      1   3
+    #      |   |
+    #      2   4
+    #          |
+    #          5
+    #
+    # Remember, these are the indices into the entries in operand.expression_index.
+ if len(operand.expression_index) == 0: + # Ghidra bug where empty operands (no expressions) may + # exist (see https://github.com/NationalSecurityAgency/ghidra/issues/6817) + return [] + + tree: List[List[int]] = [] + for i, expression_index in enumerate(operand.expression_index): + children = [] + + # scan all subsequent expressions, looking for those that have parent_index == current.expression_index + for j, candidate_index in enumerate(operand.expression_index[i + 1 :]): + candidate = be2.expression[candidate_index] + + if candidate.parent_index == expression_index: + children.append(i + j + 1) + + tree.append(children) + + _prune_expression_tree(be2, operand, tree) + _prune_expression_tree(be2, operand, tree) + + return tree + + +def _fill_operand_expression_list( + be2: BinExport2, + operand: BinExport2.Operand, + expression_tree: List[List[int]], + tree_index: int, + expression_list: List[BinExport2.Expression], +): + """ + Walk the given expression tree and collect the expression nodes in-order. + """ + expression_index = operand.expression_index[tree_index] + expression = be2.expression[expression_index] + children_tree_indexes: List[int] = expression_tree[tree_index] + + if expression.type == BinExport2.Expression.REGISTER: + assert len(children_tree_indexes) == 0 + expression_list.append(expression) + return + + elif expression.type == BinExport2.Expression.SYMBOL: + assert len(children_tree_indexes) <= 1 + expression_list.append(expression) + + if len(children_tree_indexes) == 0: + return + elif len(children_tree_indexes) == 1: + # like: v + # from: mov v0.D[0x1], x9 + # | + # 0 + # . 
+ # | + # D + child_index = children_tree_indexes[0] + _fill_operand_expression_list(be2, operand, expression_tree, child_index, expression_list) + return + else: + raise NotImplementedError(len(children_tree_indexes)) + + elif expression.type == BinExport2.Expression.IMMEDIATE_INT: + assert len(children_tree_indexes) == 0 + expression_list.append(expression) + return + + elif expression.type == BinExport2.Expression.SIZE_PREFIX: + # like: b4 + # + # We might want to use this occasionally, such as to disambiguate the + # size of MOVs into/out of memory. But I'm not sure when/where we need that yet. + # + # IDA spams this size prefix hint *everywhere*, so we can't rely on the exporter + # to provide it only when necessary. + assert len(children_tree_indexes) == 1 + child_index = children_tree_indexes[0] + _fill_operand_expression_list(be2, operand, expression_tree, child_index, expression_list) + return + + elif expression.type == BinExport2.Expression.OPERATOR: + if len(children_tree_indexes) == 1: + # prefix operator, like "ds:" + expression_list.append(expression) + child_index = children_tree_indexes[0] + _fill_operand_expression_list(be2, operand, expression_tree, child_index, expression_list) + return + + elif len(children_tree_indexes) == 2: + # infix operator: like "+" in "ebp+10" + child_a = children_tree_indexes[0] + child_b = children_tree_indexes[1] + _fill_operand_expression_list(be2, operand, expression_tree, child_a, expression_list) + expression_list.append(expression) + _fill_operand_expression_list(be2, operand, expression_tree, child_b, expression_list) + return + + elif len(children_tree_indexes) == 3: + # infix operator: like "+" in "ebp+ecx+10" + child_a = children_tree_indexes[0] + child_b = children_tree_indexes[1] + child_c = children_tree_indexes[2] + _fill_operand_expression_list(be2, operand, expression_tree, child_a, expression_list) + expression_list.append(expression) + _fill_operand_expression_list(be2, operand, expression_tree, 
child_b, expression_list) + expression_list.append(expression) + _fill_operand_expression_list(be2, operand, expression_tree, child_c, expression_list) + return + + else: + raise NotImplementedError(len(children_tree_indexes)) + + elif expression.type == BinExport2.Expression.DEREFERENCE: + assert len(children_tree_indexes) == 1 + expression_list.append(expression) + + child_index = children_tree_indexes[0] + _fill_operand_expression_list(be2, operand, expression_tree, child_index, expression_list) + return + + elif expression.type == BinExport2.Expression.IMMEDIATE_FLOAT: + raise NotImplementedError(expression.type) + + else: + raise NotImplementedError(expression.type) + + +def get_operand_expressions(be2: BinExport2, op: BinExport2.Operand) -> List[BinExport2.Expression]: + tree = _build_expression_tree(be2, op) + + expressions: List[BinExport2.Expression] = [] + _fill_operand_expression_list(be2, op, tree, 0, expressions) + + return expressions + + +def get_operand_register_expression(be2: BinExport2, operand: BinExport2.Operand) -> Optional[BinExport2.Expression]: + if len(operand.expression_index) == 1: + expression: BinExport2.Expression = be2.expression[operand.expression_index[0]] + if expression.type == BinExport2.Expression.REGISTER: + return expression + return None + + +def get_operand_immediate_expression(be2: BinExport2, operand: BinExport2.Operand) -> Optional[BinExport2.Expression]: + if len(operand.expression_index) == 1: + # - type: IMMEDIATE_INT + # immediate: 20588728364 + # parent_index: 0 + expression: BinExport2.Expression = be2.expression[operand.expression_index[0]] + if expression.type == BinExport2.Expression.IMMEDIATE_INT: + return expression + + elif len(operand.expression_index) == 2: + # from IDA, which provides a size hint for every operand, + # we get the following pattern for immediate constants: + # + # - type: SIZE_PREFIX + # symbol: "b8" + # - type: IMMEDIATE_INT + # immediate: 20588728364 + # parent_index: 0 + expression0: 
BinExport2.Expression = be2.expression[operand.expression_index[0]]
+        expression1: BinExport2.Expression = be2.expression[operand.expression_index[1]]
+
+        if expression0.type == BinExport2.Expression.SIZE_PREFIX:
+            if expression1.type == BinExport2.Expression.IMMEDIATE_INT:
+                return expression1
+
+    return None
+
+
+def get_instruction_mnemonic(be2: BinExport2, instruction: BinExport2.Instruction) -> str:
+    return be2.mnemonic[instruction.mnemonic_index].name.lower()
+
+
+def get_instruction_operands(be2: BinExport2, instruction: BinExport2.Instruction) -> List[BinExport2.Operand]:
+    return [be2.operand[operand_index] for operand_index in instruction.operand_index]
+
+
+def split_with_delimiters(s: str, delimiters: Tuple[str, ...]) -> Iterator[str]:
+    """
+    Splits a string by any of the provided delimiter characters,
+    including the delimiters in the results.
+
+    Args:
+        s: The string to split.
+        delimiters: The single-character delimiters to split on.
+    """
+    start = 0
+    for i, char in enumerate(s):
+        if char in delimiters:
+            yield s[start:i]
+            yield char
+            start = i + 1
+
+    if start < len(s):
+        yield s[start:]
+
+
+BinExport2OperandPattern = Union[str, Tuple[str, ...]]
+
+
+@dataclass
+class BinExport2InstructionPattern:
+    """
+    This describes a way to match disassembled instructions, with mnemonics and operands.
+
+    You can specify constraints on the instruction, via:
+      - the mnemonics, like "mov",
+      - number of operands, and
+      - format of each operand, "[reg, reg, #int]".
+
+    During matching, you can also capture a single element, to see its concrete value.
+    For example, given the pattern:
+
+        mov  reg0, #int0  ; capture int0
+
+    and the instruction:
+
+        mov  eax, 1
+
+    Then the capture will contain the immediate integer 1.
+
+    This matcher uses the BinExport2 data layout under the hood.
+    """
+
+    mnemonics: Tuple[str, ...]
+    operands: Tuple[Union[str, BinExport2OperandPattern], ...]
+ capture: Optional[str] + + @classmethod + def from_str(cls, query: str): + """ + Parse a pattern string into a Pattern instance. + The supported syntax is like this: + + br reg + br reg ; capture reg + br reg(stack) ; capture reg + br reg(not-stack) ; capture reg + mov reg0, reg1 ; capture reg0 + adrp reg, #int ; capture #int + add reg, reg, #int ; capture #int + ldr reg0, [reg1] ; capture reg1 + ldr|str reg, [reg, #int] ; capture #int + ldr|str reg, [reg(stack), #int] ; capture #int + ldr|str reg, [reg(not-stack), #int] ; capture #int + ldr|str reg, [reg, #int]! ; capture #int + ldr|str reg, [reg], #int ; capture #int + ldp|stp reg, reg, [reg, #int] ; capture #int + ldp|stp reg, reg, [reg, #int]! ; capture #int + ldp|stp reg, reg, [reg], #int ; capture #int + """ + # + # The implementation of the parser here is obviously ugly. + # Its handwritten and probably fragile. But since we don't + # expect this to be widely used, its probably ok. + # Don't hesitate to rewrite this if it becomes more important. + # + # Note that this doesn't have to be very performant. + # We expect these patterns to be parsed once upfront and then reused + # (globally at the module level?) rather than within any loop. + # + + pattern, _, comment = query.strip().partition(";") + + # we don't support fs: yet + assert ":" not in pattern + + # from "capture #int" to "#int" + if comment: + comment = comment.strip() + assert comment.startswith("capture ") + capture = comment[len("capture ") :] + else: + capture = None + + # from "ldr|str ..." to ["ldr", "str"] + pattern = pattern.strip() + mnemonic, _, rest = pattern.partition(" ") + mnemonics = mnemonic.split("|") + + operands: List[Union[str, Tuple[str, ...]]] = [] + while rest: + rest = rest.strip() + if not rest.startswith("["): + # If its not a dereference, which looks like `[op, op, op, ...]`, + # then its a simple operand, which we can split by the next comma. 
+ operand, _, rest = rest.partition(", ") + rest = rest.strip() + operands.append(operand) + + else: + # This looks like a dereference, something like `[op, op, op, ...]`. + # Since these can't be nested, look for the next ] and then parse backwards. + deref_end = rest.index("]") + try: + deref_end = rest.index(", ", deref_end) + deref_end += len(", ") + except ValueError: + deref = rest + rest = "" + else: + deref = rest[:deref_end] + rest = rest[deref_end:] + rest = rest.strip() + deref = deref.rstrip(" ") + deref = deref.rstrip(",") + + # like: [reg, #int]! + has_postindex_writeback = deref.endswith("!") + + deref = deref.rstrip("!") + deref = deref.rstrip("]") + deref = deref.lstrip("[") + + parts = tuple(split_with_delimiters(deref, (",", "+", "*"))) + parts = tuple(s.strip() for s in parts) + + # emit operands in this order to match + # how BinExport2 expressions are flatted + # by get_operand_expressions + if has_postindex_writeback: + operands.append(("!", "[") + parts) + else: + operands.append(("[",) + parts) + + for operand in operands: # type: ignore + # Try to ensure we've parsed the operands correctly. + # This is just sanity checking. + for o in (operand,) if isinstance(operand, str) else operand: + # operands can look like: + # - reg + # - reg0 + # - reg(stack) + # - reg0(stack) + # - reg(not-stack) + # - reg0(not-stack) + # - #int + # - #int0 + # and a limited set of supported operators. + # use an inline regex so that its easy to read. not perf critical. + assert re.match(r"^(reg|#int)[0-9]?(\(stack\)|\(not-stack\))?$", o) or o in ("[", ",", "!", "+", "*") + + return cls(tuple(mnemonics), tuple(operands), capture) + + @dataclass + class MatchResult: + operand_index: int + expression_index: int + expression: BinExport2.Expression + + def match( + self, mnemonic: str, operand_expressions: List[List[BinExport2.Expression]] + ) -> Optional["BinExport2InstructionPattern.MatchResult"]: + """ + Match the given BinExport2 data against this pattern. 
+ + The BinExport2 expression tree must have been flattened, such as with + capa.features.extractors.binexport2.helpers.get_operand_expressions. + + If there's a match, the captured Expression instance is returned. + Otherwise, you get None back. + """ + if mnemonic not in self.mnemonics: + return None + + if len(self.operands) != len(operand_expressions): + return None + + captured = None + + for operand_index, found_expressions in enumerate(operand_expressions): + wanted_expressions = self.operands[operand_index] + + # from `"reg"` to `("reg", )` + if isinstance(wanted_expressions, str): + wanted_expressions = (wanted_expressions,) + assert isinstance(wanted_expressions, tuple) + + if len(wanted_expressions) != len(found_expressions): + return None + + for expression_index, (wanted_expression, found_expression) in enumerate( + zip(wanted_expressions, found_expressions) + ): + if wanted_expression.startswith("reg"): + if found_expression.type != BinExport2.Expression.REGISTER: + return None + + if wanted_expression.endswith(")"): + if wanted_expression.endswith("(not-stack)"): + # intel 64: rsp, esp, sp, + # intel 32: ebp, ebp, bp + # arm: sp + register_name = found_expression.symbol.lower() + if register_name in ("rsp", "esp", "sp", "rbp", "ebp", "bp"): + return None + + elif wanted_expression.endswith("(stack)"): + register_name = found_expression.symbol.lower() + if register_name not in ("rsp", "esp", "sp", "rbp", "ebp", "bp"): + return None + + else: + raise ValueError("unexpected expression suffix", wanted_expression) + + if self.capture == wanted_expression: + captured = BinExport2InstructionPattern.MatchResult( + operand_index, expression_index, found_expression + ) + + elif wanted_expression.startswith("#int"): + if found_expression.type != BinExport2.Expression.IMMEDIATE_INT: + return None + + if self.capture == wanted_expression: + captured = BinExport2InstructionPattern.MatchResult( + operand_index, expression_index, found_expression + ) + + elif 
wanted_expression == "[": + if found_expression.type != BinExport2.Expression.DEREFERENCE: + return None + + elif wanted_expression in (",", "!", "+", "*"): + if found_expression.type != BinExport2.Expression.OPERATOR: + return None + + if found_expression.symbol != wanted_expression: + return None + + else: + raise ValueError(found_expression) + + if captured: + return captured + else: + # There were no captures, so + # return arbitrary non-None expression + return BinExport2InstructionPattern.MatchResult(operand_index, expression_index, found_expression) + + +class BinExport2InstructionPatternMatcher: + """Index and match a collection of instruction patterns.""" + + def __init__(self, queries: List[BinExport2InstructionPattern]): + self.queries = queries + # shard the patterns by (mnemonic, #operands) + self._index: Dict[Tuple[str, int], List[BinExport2InstructionPattern]] = defaultdict(list) + + for query in queries: + for mnemonic in query.mnemonics: + self._index[(mnemonic.lower(), len(query.operands))].append(query) + + @classmethod + def from_str(cls, patterns: str): + return cls( + [ + BinExport2InstructionPattern.from_str(line) + for line in filter( + lambda line: not line.startswith("#"), (line.strip() for line in patterns.split("\n")) + ) + ] + ) + + def match( + self, mnemonic: str, operand_expressions: List[List[BinExport2.Expression]] + ) -> Optional[BinExport2InstructionPattern.MatchResult]: + queries = self._index.get((mnemonic.lower(), len(operand_expressions)), []) + for query in queries: + captured = query.match(mnemonic.lower(), operand_expressions) + if captured: + return captured + + return None + + def match_with_be2( + self, be2: BinExport2, instruction_index: int + ) -> Optional[BinExport2InstructionPattern.MatchResult]: + instruction: BinExport2.Instruction = be2.instruction[instruction_index] + mnemonic: str = get_instruction_mnemonic(be2, instruction) + + if (mnemonic.lower(), len(instruction.operand_index)) not in self._index: + # 
verify that we might have a hit before we realize the operand expression list + return None + + operands = [] + for operand_index in instruction.operand_index: + operands.append(get_operand_expressions(be2, be2.operand[operand_index])) + + return self.match(mnemonic, operands) diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py new file mode 100644 index 000000000..8f2e6af99 --- /dev/null +++ b/capa/features/extractors/binexport2/insn.py @@ -0,0 +1,254 @@ +# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+import logging +from typing import List, Tuple, Iterator + +import capa.features.extractors.helpers +import capa.features.extractors.strings +import capa.features.extractors.binexport2.helpers +import capa.features.extractors.binexport2.arch.arm.insn +import capa.features.extractors.binexport2.arch.intel.insn +from capa.features.insn import API, Mnemonic +from capa.features.common import Bytes, String, Feature, Characteristic +from capa.features.address import Address, AbsoluteVirtualAddress +from capa.features.extractors.binexport2 import ( + AddressSpace, + AnalysisContext, + BinExport2Index, + FunctionContext, + ReadMemoryError, + BinExport2Analysis, + InstructionContext, +) +from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle +from capa.features.extractors.binexport2.helpers import HAS_ARCH_ARM, HAS_ARCH_INTEL +from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 + +logger = logging.getLogger(__name__) + + +def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: + fhi: FunctionContext = fh.inner + ii: InstructionContext = ih.inner + + be2: BinExport2 = fhi.ctx.be2 + be2_index: BinExport2Index = fhi.ctx.idx + be2_analysis: BinExport2Analysis = fhi.ctx.analysis + insn: BinExport2.Instruction = be2.instruction[ii.instruction_index] + + for addr in insn.call_target: + addr = be2_analysis.thunks.get(addr, addr) + + if addr not in be2_index.vertex_index_by_address: + # disassembler did not define function at address + logger.debug("0x%x is not a vertex", addr) + continue + + vertex_idx: int = be2_index.vertex_index_by_address[addr] + vertex: BinExport2.CallGraph.Vertex = be2.call_graph.vertex[vertex_idx] + + if not capa.features.extractors.binexport2.helpers.is_vertex_type( + vertex, BinExport2.CallGraph.Vertex.Type.IMPORTED + ): + continue + + if not vertex.HasField("mangled_name"): + logger.debug("vertex %d does not have mangled_name", 
vertex_idx) + continue + + api_name: str = vertex.mangled_name + for name in capa.features.extractors.helpers.generate_symbols("", api_name): + yield API(name), ih.address + + +def extract_insn_number_features( + fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle +) -> Iterator[Tuple[Feature, Address]]: + fhi: FunctionContext = fh.inner + + if fhi.arch & HAS_ARCH_INTEL: + yield from capa.features.extractors.binexport2.arch.intel.insn.extract_insn_number_features(fh, bbh, ih) + elif fhi.arch & HAS_ARCH_ARM: + yield from capa.features.extractors.binexport2.arch.arm.insn.extract_insn_number_features(fh, bbh, ih) + + +def extract_insn_bytes_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: + fhi: FunctionContext = fh.inner + ii: InstructionContext = ih.inner + + ctx: AnalysisContext = fhi.ctx + be2: BinExport2 = ctx.be2 + idx: BinExport2Index = ctx.idx + address_space: AddressSpace = ctx.address_space + + instruction_index: int = ii.instruction_index + + if instruction_index in idx.string_reference_index_by_source_instruction_index: + # disassembler already identified string reference from instruction + return + + reference_addresses: List[int] = [] + + if instruction_index in idx.data_reference_index_by_source_instruction_index: + for data_reference_index in idx.data_reference_index_by_source_instruction_index[instruction_index]: + data_reference: BinExport2.DataReference = be2.data_reference[data_reference_index] + data_reference_address: int = data_reference.address + + if data_reference_address in idx.insn_address_by_index: + # appears to be code + continue + + reference_addresses.append(data_reference_address) + + for reference_address in reference_addresses: + try: + # if at end of segment then there might be an overrun here. 
+ buf: bytes = address_space.read_memory(reference_address, 0x100) + except ReadMemoryError: + logger.debug("failed to read memory: 0x%x", reference_address) + continue + + if capa.features.extractors.helpers.all_zeros(buf): + continue + + is_string: bool = False + + # note: we *always* break after the first iteration + for s in capa.features.extractors.strings.extract_ascii_strings(buf): + if s.offset != 0: + break + + yield String(s.s), ih.address + is_string = True + break + + # note: we *always* break after the first iteration + for s in capa.features.extractors.strings.extract_unicode_strings(buf): + if s.offset != 0: + break + + yield String(s.s), ih.address + is_string = True + break + + if not is_string: + yield Bytes(buf), ih.address + + +def extract_insn_string_features( + fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle +) -> Iterator[Tuple[Feature, Address]]: + fhi: FunctionContext = fh.inner + ii: InstructionContext = ih.inner + + be2: BinExport2 = fhi.ctx.be2 + idx: BinExport2Index = fhi.ctx.idx + + instruction_index: int = ii.instruction_index + + if instruction_index in idx.string_reference_index_by_source_instruction_index: + for string_reference_index in idx.string_reference_index_by_source_instruction_index[instruction_index]: + string_reference: BinExport2.Reference = be2.string_reference[string_reference_index] + string_index: int = string_reference.string_table_index + string: str = be2.string_table[string_index] + yield String(string), ih.address + + +def extract_insn_offset_features( + fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle +) -> Iterator[Tuple[Feature, Address]]: + fhi: FunctionContext = fh.inner + + if fhi.arch & HAS_ARCH_INTEL: + yield from capa.features.extractors.binexport2.arch.intel.insn.extract_insn_offset_features(fh, bbh, ih) + elif fhi.arch & HAS_ARCH_ARM: + yield from capa.features.extractors.binexport2.arch.arm.insn.extract_insn_offset_features(fh, bbh, ih) + + +def extract_insn_nzxor_characteristic_features( + fh: 
FunctionHandle, bbh: BBHandle, ih: InsnHandle
+) -> Iterator[Tuple[Feature, Address]]:
+    fhi: FunctionContext = fh.inner
+
+    if fhi.arch & HAS_ARCH_INTEL:
+        yield from capa.features.extractors.binexport2.arch.intel.insn.extract_insn_nzxor_characteristic_features(
+            fh, bbh, ih
+        )
+    elif fhi.arch & HAS_ARCH_ARM:
+        yield from capa.features.extractors.binexport2.arch.arm.insn.extract_insn_nzxor_characteristic_features(
+            fh, bbh, ih
+        )
+
+
+def extract_insn_mnemonic_features(
+    fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
+) -> Iterator[Tuple[Feature, Address]]:
+    fhi: FunctionContext = fh.inner
+    ii: InstructionContext = ih.inner
+
+    be2: BinExport2 = fhi.ctx.be2
+
+    instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index]
+    mnemonic: BinExport2.Mnemonic = be2.mnemonic[instruction.mnemonic_index]
+    mnemonic_name: str = mnemonic.name.lower()
+    yield Mnemonic(mnemonic_name), ih.address
+
+
+def extract_function_calls_from(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]:
+    """extract function calls from features
+
+    most relevant at the function scope;
+    however, it's most efficient to extract at the instruction scope.
+ """ + fhi: FunctionContext = fh.inner + ii: InstructionContext = ih.inner + + be2: BinExport2 = fhi.ctx.be2 + + instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index] + for call_target_address in instruction.call_target: + addr: AbsoluteVirtualAddress = AbsoluteVirtualAddress(call_target_address) + yield Characteristic("calls from"), addr + + if fh.address == addr: + yield Characteristic("recursive call"), addr + + +def extract_function_indirect_call_characteristic_features( + fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle +) -> Iterator[Tuple[Feature, Address]]: + fhi: FunctionContext = fh.inner + + if fhi.arch & HAS_ARCH_INTEL: + yield from capa.features.extractors.binexport2.arch.intel.insn.extract_function_indirect_call_characteristic_features( + fh, bbh, ih + ) + elif fhi.arch & HAS_ARCH_ARM: + yield from capa.features.extractors.binexport2.arch.arm.insn.extract_function_indirect_call_characteristic_features( + fh, bbh, ih + ) + + +def extract_features(f: FunctionHandle, bbh: BBHandle, insn: InsnHandle) -> Iterator[Tuple[Feature, Address]]: + """extract instruction features""" + for inst_handler in INSTRUCTION_HANDLERS: + for feature, ea in inst_handler(f, bbh, insn): + yield feature, ea + + +INSTRUCTION_HANDLERS = ( + extract_insn_api_features, + extract_insn_number_features, + extract_insn_bytes_features, + extract_insn_string_features, + extract_insn_offset_features, + extract_insn_nzxor_characteristic_features, + extract_insn_mnemonic_features, + extract_function_calls_from, + extract_function_indirect_call_characteristic_features, +) diff --git a/capa/features/extractors/common.py b/capa/features/extractors/common.py index ab25c00da..aa2144c73 100644 --- a/capa/features/extractors/common.py +++ b/capa/features/extractors/common.py @@ -75,7 +75,7 @@ def extract_format(buf: bytes) -> Iterator[Tuple[Feature, Address]]: # 1. handling a file format (e.g. 
macho) # # for (1), this logic will need to be updated as the format is implemented. - logger.debug("unsupported file format: %s", binascii.hexlify(buf[:4]).decode("ascii")) + logger.debug("unknown file format: %s", buf[:4].hex()) return diff --git a/capa/features/extractors/elf.py b/capa/features/extractors/elf.py index 0cbabda11..82c8c3da9 100644 --- a/capa/features/extractors/elf.py +++ b/capa/features/extractors/elf.py @@ -310,6 +310,9 @@ def ei_osabi(self) -> Optional[OS]: 98: "TPC", 99: "SNP1K", 100: "ST200", + # https://www.sco.com/developers/gabi/latest/ch4.eheader.html + 183: "aarch64", + 243: "riscv", } @property diff --git a/capa/features/extractors/elffile.py b/capa/features/extractors/elffile.py index b75c938e9..630f8024e 100644 --- a/capa/features/extractors/elffile.py +++ b/capa/features/extractors/elffile.py @@ -158,6 +158,10 @@ def extract_file_arch(elf: ELFFile, **kwargs): yield Arch("i386"), NO_ADDRESS elif arch == "x64": yield Arch("amd64"), NO_ADDRESS + elif arch == "ARM": + yield Arch("arm"), NO_ADDRESS + elif arch == "AArch64": + yield Arch("aarch64"), NO_ADDRESS else: logger.warning("unsupported architecture: %s", arch) diff --git a/capa/features/extractors/helpers.py b/capa/features/extractors/helpers.py index 541a6eae5..09f76f589 100644 --- a/capa/features/extractors/helpers.py +++ b/capa/features/extractors/helpers.py @@ -63,6 +63,7 @@ def generate_symbols(dll: str, symbol: str, include_dll=False) -> Iterator[str]: # trim extensions observed in dynamic traces dll = dll[0:-4] if dll.endswith(".dll") else dll dll = dll[0:-4] if dll.endswith(".drv") else dll + dll = dll[0:-3] if dll.endswith(".so") else dll if include_dll or is_ordinal(symbol): # ws2_32.#1 diff --git a/capa/features/insn.py b/capa/features/insn.py index f4be23c87..47f18dfc3 100644 --- a/capa/features/insn.py +++ b/capa/features/insn.py @@ -100,9 +100,10 @@ def __init__(self, value: str, description=None): # max number of operands to consider for a given instruction. 
-# since we only support Intel and .NET, we can assume this is 3 +# for Intel and .NET, this is 3 # which covers cases up to e.g. "vinserti128 ymm0,ymm0,ymm5,1" -MAX_OPERAND_COUNT = 4 +# for ARM/aarch64, we assume 4 +MAX_OPERAND_COUNT = 5 MAX_OPERAND_INDEX = MAX_OPERAND_COUNT - 1 diff --git a/capa/helpers.py b/capa/helpers.py index ec0a99716..237a67f62 100644 --- a/capa/helpers.py +++ b/capa/helpers.py @@ -31,6 +31,7 @@ FORMAT_FREEZE, FORMAT_DRAKVUF, FORMAT_UNKNOWN, + FORMAT_BINEXPORT2, Format, ) @@ -40,6 +41,7 @@ # DRAKVUF (.log, .log.gz) # VMRay (.zip) EXTENSIONS_DYNAMIC = ("json", "json_", "json.gz", "log", ".log.gz", ".zip") +EXTENSIONS_BINEXPORT2 = ("BinExport", "BinExport2") EXTENSIONS_ELF = "elf_" EXTENSIONS_FREEZE = "frz" @@ -156,6 +158,8 @@ def get_format_from_extension(sample: Path) -> str: format_ = get_format_from_report(sample) elif sample.name.endswith(EXTENSIONS_FREEZE): format_ = FORMAT_FREEZE + elif sample.name.endswith(EXTENSIONS_BINEXPORT2): + format_ = FORMAT_BINEXPORT2 return format_ @@ -225,7 +229,8 @@ def log_unsupported_format_error(): logger.error(" Input file does not appear to be a supported file.") logger.error(" ") logger.error(" See all supported file formats via capa's help output (-h).") - logger.error(" If you don't know the input file type, you can try using the `file` utility to guess it.") + logger.error(" If you don't know the input file type,") + logger.error(" you can try using the `file` utility to guess it.") logger.error("-" * 80) @@ -274,9 +279,8 @@ def log_unsupported_os_error(): logger.error("-" * 80) logger.error(" Input file does not appear to target a supported OS.") logger.error(" ") - logger.error( - " capa currently only supports analyzing executables for some operating systems (including Windows and Linux)." 
- ) + logger.error(" capa currently only analyzes executables for some operating systems") + logger.error(" (including Windows, Linux, and Android).") logger.error("-" * 80) @@ -294,9 +298,8 @@ def log_unsupported_runtime_error(): logger.error(" ") logger.error(" capa supports running under Python 3.8 and higher.") logger.error(" ") - logger.error( - " If you're seeing this message on the command line, please ensure you're running a supported Python version." - ) + logger.error(" If you're seeing this message on the command line,") + logger.error(" please ensure you're running a supported Python version.") logger.error("-" * 80) diff --git a/capa/loader.py b/capa/loader.py index 949308c5e..818198710 100644 --- a/capa/loader.py +++ b/capa/loader.py @@ -5,6 +5,7 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. 
+import os import sys import logging import datetime @@ -47,6 +48,7 @@ FORMAT_VMRAY, FORMAT_DOTNET, FORMAT_DRAKVUF, + FORMAT_BINEXPORT2, ) from capa.features.address import Address from capa.features.extractors.base_extractor import ( @@ -66,6 +68,7 @@ BACKEND_DRAKVUF = "drakvuf" BACKEND_VMRAY = "vmray" BACKEND_FREEZE = "freeze" +BACKEND_BINEXPORT2 = "binexport2" class CorruptFile(ValueError): @@ -308,10 +311,42 @@ def get_extractor( elif backend == BACKEND_FREEZE: return frz.load(input_path.read_bytes()) + elif backend == BACKEND_BINEXPORT2: + import capa.features.extractors.binexport2 + import capa.features.extractors.binexport2.extractor + + be2 = capa.features.extractors.binexport2.get_binexport2(input_path) + assert sample_path is not None + buf = sample_path.read_bytes() + + return capa.features.extractors.binexport2.extractor.BinExport2FeatureExtractor(be2, buf) + else: raise ValueError("unexpected backend: " + backend) +def _get_binexport2_file_extractors(input_file: Path) -> List[FeatureExtractor]: + # I'm not sure this is where this logic should live, but it works for now. + # we'll keep this a "private" routine until we're sure. 
+ import capa.features.extractors.binexport2 + + be2 = capa.features.extractors.binexport2.get_binexport2(input_file) + sample_path = capa.features.extractors.binexport2.get_sample_from_binexport2( + input_file, be2, [Path(os.environ.get("CAPA_SAMPLES_DIR", "."))] + ) + + with sample_path.open("rb") as f: + taste = f.read() + + if taste.startswith(capa.features.extractors.common.MATCH_PE): + return get_file_extractors(sample_path, FORMAT_PE) + elif taste.startswith(capa.features.extractors.common.MATCH_ELF): + return get_file_extractors(sample_path, FORMAT_ELF) + else: + logger.warning("unsupported format") + return [] + + def get_file_extractors(input_file: Path, input_format: str) -> List[FeatureExtractor]: file_extractors: List[FeatureExtractor] = [] @@ -354,6 +389,9 @@ def get_file_extractors(input_file: Path, input_format: str) -> List[FeatureExtr file_extractors.append(capa.features.extractors.vmray.extractor.VMRayExtractor.from_zipfile(input_file)) + elif input_format == FORMAT_BINEXPORT2: + file_extractors = _get_binexport2_file_extractors(input_file) + return file_extractors diff --git a/capa/main.py b/capa/main.py index 80002b086..8035eafa2 100644 --- a/capa/main.py +++ b/capa/main.py @@ -51,6 +51,7 @@ BACKEND_FREEZE, BACKEND_PEFILE, BACKEND_DRAKVUF, + BACKEND_BINEXPORT2, ) from capa.helpers import ( get_file_taste, @@ -89,6 +90,7 @@ FORMAT_DRAKVUF, STATIC_FORMATS, DYNAMIC_FORMATS, + FORMAT_BINEXPORT2, ) from capa.capabilities.common import find_capabilities, has_file_limitation, find_file_capabilities from capa.features.extractors.base_extractor import ( @@ -193,12 +195,13 @@ def simple_message_exception_handler(exctype, value: BaseException, traceback: T """ if exctype is KeyboardInterrupt: - print("KeyboardInterrupt detected, program terminated") + print("KeyboardInterrupt detected, program terminated", file=sys.stderr) else: print( f"Unexpected exception raised: {exctype}. Please run capa in debug mode (-d/--debug) " + "to see the stack trace. 
Please also report your issue on the capa GitHub page so we " - + "can improve the code! (https://github.com/mandiant/capa/issues)" + + "can improve the code! (https://github.com/mandiant/capa/issues)", + file=sys.stderr, ) @@ -264,6 +267,7 @@ def install_common_args(parser, wanted=None): (FORMAT_DRAKVUF, "DRAKVUF sandbox report"), (FORMAT_VMRAY, "VMRay sandbox report"), (FORMAT_FREEZE, "features previously frozen by capa"), + (FORMAT_BINEXPORT2, "BinExport2"), ] format_help = ", ".join([f"{f[0]}: {f[1]}" for f in formats]) @@ -282,6 +286,7 @@ def install_common_args(parser, wanted=None): (BACKEND_PEFILE, "pefile (file features only)"), (BACKEND_BINJA, "Binary Ninja"), (BACKEND_DOTNET, ".NET"), + (BACKEND_BINEXPORT2, "BinExport2"), (BACKEND_FREEZE, "capa freeze"), (BACKEND_CAPE, "CAPE"), (BACKEND_DRAKVUF, "DRAKVUF"), @@ -450,8 +455,12 @@ def handle_common_args(args): if args.rules == [RULES_PATH_DEFAULT_STRING]: logger.debug("-" * 80) logger.debug(" Using default embedded rules.") - logger.debug(" To provide your own rules, use the form `capa.exe -r ./path/to/rules/ /path/to/mal.exe`.") + logger.debug(" To provide your own rules, use the form:") + logger.debug("") + logger.debug(" `capa.exe -r ./path/to/rules/ /path/to/mal.exe`.") + logger.debug("") logger.debug(" You can see the current default rule set here:") + logger.debug("") logger.debug(" https://github.com/mandiant/capa-rules") logger.debug("-" * 80) @@ -566,6 +575,9 @@ def get_backend_from_cli(args, input_format: str) -> str: elif input_format == FORMAT_FREEZE: return BACKEND_FREEZE + elif input_format == FORMAT_BINEXPORT2: + return BACKEND_BINEXPORT2 + else: return BACKEND_VIV @@ -586,6 +598,13 @@ def get_sample_path_from_cli(args, backend: str) -> Optional[Path]: """ if backend in (BACKEND_CAPE, BACKEND_DRAKVUF, BACKEND_VMRAY): return None + elif backend == BACKEND_BINEXPORT2: + import capa.features.extractors.binexport2 + + be2 = capa.features.extractors.binexport2.get_binexport2(args.input_file) + 
return capa.features.extractors.binexport2.get_sample_from_binexport2( + args.input_file, be2, [Path(os.environ.get("CAPA_SAMPLES_DIR", "."))] + ) else: return args.input_file @@ -802,6 +821,9 @@ def get_extractor_from_cli(args, input_format: str, backend: str) -> FeatureExtr sample_path = get_sample_path_from_cli(args, backend) extractor_filters = get_extractor_filters_from_cli(args, input_format) + logger.debug("format: %s", input_format) + logger.debug("backend: %s", backend) + try: extractor = capa.loader.get_extractor( args.input_file, diff --git a/scripts/detect-binexport2-capabilities.py b/scripts/detect-binexport2-capabilities.py new file mode 100644 index 000000000..3c914de2c --- /dev/null +++ b/scripts/detect-binexport2-capabilities.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 +""" +Copyright (C) 2023 Mandiant, Inc. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. +You may obtain a copy of the License at: [package root]/LICENSE.txt +Unless required by applicable law or agreed to in writing, software distributed under the License + is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and limitations under the License. + +detect-binexport2-capabilities.py + +Detect capabilities in a BinExport2 file and write the results into the protobuf format. 
+ +Example: + + $ python detect-binexport2-capabilities.py suspicious.BinExport2 | xxd | head + ┌────────┬─────────────────────────┬─────────────────────────┬────────┬────────┐ + │00000000│ 0a d4 05 0a 1a 32 30 32 ┊ 33 2d 30 32 2d 31 30 20 │_.•_•202┊3-02-10 │ + │00000010│ 31 31 3a 34 39 3a 35 32 ┊ 2e 36 39 33 34 30 30 12 │11:49:52┊.693400•│ + │00000020│ 05 35 2e 30 2e 30 1a 34 ┊ 74 65 73 74 73 2f 64 61 │•5.0.0•4┊tests/da│ + │00000030│ 74 61 2f 50 72 61 63 74 ┊ 69 63 61 6c 20 4d 61 6c │ta/Pract┊ical Mal│ + │00000040│ 77 61 72 65 20 41 6e 61 ┊ 6c 79 73 69 73 20 4c 61 │ware Ana┊lysis La│ + │00000050│ 62 20 30 31 2d 30 31 2e ┊ 64 6c 6c 5f 1a 02 2d 6a │b 01-01.┊dll_••-j│ + │00000060│ 22 c4 01 0a 20 32 39 30 ┊ 39 33 34 63 36 31 64 65 │".•_ 290┊934c61de│ + │00000070│ 39 31 37 36 61 64 36 38 ┊ 32 66 66 64 64 36 35 66 │9176ad68┊2ffdd65f│ + │00000080│ 30 61 36 36 39 12 28 61 ┊ 34 62 33 35 64 65 37 31 │0a669•(a┊4b35de71│ +""" +import sys +import logging +import argparse + +import capa.main +import capa.rules +import capa.engine +import capa.loader +import capa.helpers +import capa.features +import capa.exceptions +import capa.render.proto +import capa.render.verbose +import capa.features.freeze +import capa.capabilities.common +import capa.render.result_document as rd +from capa.loader import FORMAT_BINEXPORT2, BACKEND_BINEXPORT2 + +logger = logging.getLogger("capa.detect-binexport2-capabilities") + + +def main(argv=None): + if argv is None: + argv = sys.argv[1:] + + parser = argparse.ArgumentParser(description="detect capabilities in programs.") + capa.main.install_common_args( + parser, + wanted={"format", "os", "backend", "input_file", "signatures", "rules", "tag"}, + ) + args = parser.parse_args(args=argv) + + try: + capa.main.handle_common_args(args) + capa.main.ensure_input_exists_from_cli(args) + + input_format = capa.main.get_input_format_from_cli(args) + assert input_format == FORMAT_BINEXPORT2 + + backend = capa.main.get_backend_from_cli(args, input_format) + assert 
backend == BACKEND_BINEXPORT2 + + sample_path = capa.main.get_sample_path_from_cli(args, backend) + assert sample_path is not None + os_ = capa.loader.get_os(sample_path) + + rules = capa.main.get_rules_from_cli(args) + + extractor = capa.main.get_extractor_from_cli(args, input_format, backend) + # alternatively, if you have all this handy in your library code: + # + # extractor = capa.loader.get_extractor( + # args.input_file, + # FORMAT_BINEXPORT2, + # os_, + # BACKEND_BINEXPORT2, + # sig_paths=[], + # sample_path=sample_path, + # ) + # + # or even more concisely: + # + # be2 = capa.features.extractors.binexport2.get_binexport2(input_path) + # buf = sample_path.read_bytes() + # extractor = capa.features.extractors.binexport2.extractor.BinExport2FeatureExtractor(be2, buf) + except capa.main.ShouldExitError as e: + return e.status_code + + capabilities, counts = capa.capabilities.common.find_capabilities(rules, extractor) + + meta = capa.loader.collect_metadata(argv, args.input_file, input_format, os_, args.rules, extractor, counts) + meta.analysis.layout = capa.loader.compute_layout(rules, extractor, capabilities) + + doc = rd.ResultDocument.from_capa(meta, rules, capabilities) + pb = capa.render.proto.doc_to_pb2(doc) + + sys.stdout.buffer.write(pb.SerializeToString(deterministic=True)) + sys.stdout.flush() + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/inspect-binexport2.py b/scripts/inspect-binexport2.py new file mode 100644 index 000000000..de2c82d86 --- /dev/null +++ b/scripts/inspect-binexport2.py @@ -0,0 +1,463 @@ +#!/usr/bin/env python +""" +Copyright (C) 2023 Mandiant, Inc. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at: [package root]/LICENSE.txt +Unless required by applicable law or agreed to in writing, software distributed under the License + is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and limitations under the License. +""" +import io +import sys +import time +import logging +import argparse +import contextlib +from typing import Dict, List, Optional + +import capa.main +import capa.features.extractors.binexport2 +from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 + +logger = logging.getLogger("inspect-binexport2") + + +@contextlib.contextmanager +def timing(msg: str): + t0 = time.time() + yield + t1 = time.time() + logger.debug("perf: %s: %0.2fs", msg, t1 - t0) + + +class Renderer: + def __init__(self, o: io.StringIO): + self.o = o + self.indent = 0 + + @contextlib.contextmanager + def indenting(self): + self.indent += 1 + try: + yield + finally: + self.indent -= 1 + + def write(self, s): + self.o.write(s) + + def writeln(self, s): + self.o.write(" " * self.indent) + self.o.write(s) + self.o.write("\n") + + @contextlib.contextmanager + def section(self, name): + self.writeln(name) + with self.indenting(): + try: + yield + finally: + pass + self.writeln("/" + name) + self.writeln("") + + def getvalue(self): + return self.o.getvalue() + + +# internal to `render_operand` +def _render_expression_tree( + be2: BinExport2, + operand: BinExport2.Operand, + expression_tree: List[List[int]], + tree_index: int, + o: io.StringIO, +): + + expression_index = operand.expression_index[tree_index] + expression = be2.expression[expression_index] + children_tree_indexes: List[int] = expression_tree[tree_index] + + if expression.type == BinExport2.Expression.REGISTER: + o.write(expression.symbol) + assert len(children_tree_indexes) == 0 + return + + elif expression.type == BinExport2.Expression.SYMBOL: + 
o.write(expression.symbol) + assert len(children_tree_indexes) <= 1 + + if len(children_tree_indexes) == 0: + return + elif len(children_tree_indexes) == 1: + # like: v + # from: mov v0.D[0x1], x9 + # | + # 0 + # . + # | + # D + child_index = children_tree_indexes[0] + _render_expression_tree(be2, operand, expression_tree, child_index, o) + return + else: + raise NotImplementedError(len(children_tree_indexes)) + + elif expression.type == BinExport2.Expression.IMMEDIATE_INT: + o.write(f"0x{expression.immediate:X}") + assert len(children_tree_indexes) == 0 + return + + elif expression.type == BinExport2.Expression.SIZE_PREFIX: + # like: b4 + # + # We might want to use this occasionally, such as to disambiguate the + # size of MOVs into/out of memory. But I'm not sure when/where we need that yet. + # + # IDA spams this size prefix hint *everywhere*, so we can't rely on the exporter + # to provide it only when necessary. + assert len(children_tree_indexes) == 1 + child_index = children_tree_indexes[0] + _render_expression_tree(be2, operand, expression_tree, child_index, o) + return + + elif expression.type == BinExport2.Expression.OPERATOR: + + if len(children_tree_indexes) == 1: + # prefix operator, like "ds:" + if expression.symbol != "!": + o.write(expression.symbol) + + child_index = children_tree_indexes[0] + _render_expression_tree(be2, operand, expression_tree, child_index, o) + + # postfix operator, like "!" in aarch operand "[x1, 8]!" 
+ if expression.symbol == "!": + o.write(expression.symbol) + return + + elif len(children_tree_indexes) == 2: + # infix operator: like "+" in "ebp+10" + child_a = children_tree_indexes[0] + child_b = children_tree_indexes[1] + _render_expression_tree(be2, operand, expression_tree, child_a, o) + o.write(expression.symbol) + _render_expression_tree(be2, operand, expression_tree, child_b, o) + return + + elif len(children_tree_indexes) == 3: + # infix operator: like "+" in "ebp+ecx+10" + child_a = children_tree_indexes[0] + child_b = children_tree_indexes[1] + child_c = children_tree_indexes[2] + _render_expression_tree(be2, operand, expression_tree, child_a, o) + o.write(expression.symbol) + _render_expression_tree(be2, operand, expression_tree, child_b, o) + o.write(expression.symbol) + _render_expression_tree(be2, operand, expression_tree, child_c, o) + return + + else: + raise NotImplementedError(len(children_tree_indexes)) + + elif expression.type == BinExport2.Expression.DEREFERENCE: + o.write("[") + assert len(children_tree_indexes) == 1 + child_index = children_tree_indexes[0] + _render_expression_tree(be2, operand, expression_tree, child_index, o) + o.write("]") + return + + elif expression.type == BinExport2.Expression.IMMEDIATE_FLOAT: + raise NotImplementedError(expression.type) + + else: + raise NotImplementedError(expression.type) + + +_OPERAND_CACHE: Dict[int, str] = {} + + +def render_operand(be2: BinExport2, operand: BinExport2.Operand, index: Optional[int] = None) -> str: + # For the mimikatz example file, there are 138k distinct operands. + # Of those, only 11k are unique, which is less than 10% of the total. + # The most common operands are seen 37k, 24k, 17k, 15k, 11k, ... times. + # In other words, the most common five operands account for 100k instances, + # which is around 75% of operand instances. + # Therefore, we expect caching to be fruitful, trading memory for CPU time. 
+ # + # No caching: 6.045 s ± 0.164 s [User: 5.916 s, System: 0.129 s] + # With caching: 4.259 s ± 0.161 s [User: 4.141 s, System: 0.117 s] + # + # So we can save 30% of CPU time by caching operand rendering. + # + # Other measurements: + # + # perf: loading BinExport2: 0.06s + # perf: indexing BinExport2: 0.34s + # perf: rendering BinExport2: 1.96s + # perf: writing BinExport2: 1.13s + # ________________________________________________________ + # Executed in 4.40 secs fish external + # usr time 4.22 secs 0.00 micros 4.22 secs + # sys time 0.18 secs 842.00 micros 0.18 secs + if index and index in _OPERAND_CACHE: + return _OPERAND_CACHE[index] + + o = io.StringIO() + tree = capa.features.extractors.binexport2.helpers._build_expression_tree(be2, operand) + _render_expression_tree(be2, operand, tree, 0, o) + s = o.getvalue() + + if index: + _OPERAND_CACHE[index] = s + + return s + + +def inspect_operand(be2: BinExport2, operand: BinExport2.Operand): + expression_tree = capa.features.extractors.binexport2.helpers._build_expression_tree(be2, operand) + + def rec(tree_index, indent=0): + expression_index = operand.expression_index[tree_index] + expression = be2.expression[expression_index] + children_tree_indexes: List[int] = expression_tree[tree_index] + + NEWLINE = "\n" + print(f" {' ' * indent}expression: {str(expression).replace(NEWLINE, ', ')}") + for child_index in children_tree_indexes: + rec(child_index, indent + 1) + + rec(0) + + +def inspect_instruction(be2: BinExport2, instruction: BinExport2.Instruction, address: int): + mnemonic = be2.mnemonic[instruction.mnemonic_index] + print("instruction:") + print(f" address: {hex(address)}") + print(f" mnemonic: {mnemonic.name}") + + print(" operands:") + for i, operand_index in enumerate(instruction.operand_index): + print(f" - operand {i}: [{operand_index}]") + operand = be2.operand[operand_index] + # Ghidra bug where empty operands (no expressions) may + # exist so we skip those for now (see 
https://github.com/NationalSecurityAgency/ghidra/issues/6817) + if len(operand.expression_index) > 0: + inspect_operand(be2, operand) + + +def main(argv=None): + + if argv is None: + argv = sys.argv[1:] + + parser = argparse.ArgumentParser(description="Inspect BinExport2 files") + capa.main.install_common_args(parser, wanted={"input_file"}) + parser.add_argument("--instruction", type=lambda v: int(v, 0)) + args = parser.parse_args(args=argv) + + try: + capa.main.handle_common_args(args) + except capa.main.ShouldExitError as e: + return e.status_code + + o = Renderer(io.StringIO()) + with timing("loading BinExport2"): + be2: BinExport2 = capa.features.extractors.binexport2.get_binexport2(args.input_file) + + with timing("indexing BinExport2"): + idx = capa.features.extractors.binexport2.BinExport2Index(be2) + + t0 = time.time() + + with o.section("meta"): + o.writeln(f"name: {be2.meta_information.executable_name}") + o.writeln(f"sha256: {be2.meta_information.executable_id}") + o.writeln(f"arch: {be2.meta_information.architecture_name}") + o.writeln(f"ts: {be2.meta_information.timestamp}") + + with o.section("modules"): + for module in be2.module: + o.writeln(f"- {module.name}") + if not be2.module: + o.writeln("(none)") + + with o.section("sections"): + for section in be2.section: + perms = "" + perms += "r" if section.flag_r else "-" + perms += "w" if section.flag_w else "-" + perms += "x" if section.flag_x else "-" + o.writeln(f"- {hex(section.address)} {perms} {hex(section.size)}") + + with o.section("libraries"): + for library in be2.library: + o.writeln( + f"- {library.name:<12s} {'(static)' if library.is_static else ''}{(' at ' + hex(library.load_address)) if library.HasField('load_address') else ''}" + ) + if not be2.library: + o.writeln("(none)") + + with o.section("functions"): + for vertex_index, vertex in enumerate(be2.call_graph.vertex): + if not vertex.HasField("address"): + continue + + with o.section(f"function 
{idx.get_function_name_by_vertex(vertex_index)} @ {hex(vertex.address)}"): + o.writeln(f"type: {vertex.Type.Name(vertex.type)}") + + if vertex.HasField("mangled_name"): + o.writeln(f"name: {vertex.mangled_name}") + + if vertex.HasField("demangled_name"): + o.writeln(f"demangled: {vertex.demangled_name}") + + if vertex.HasField("library_index"): + # TODO(williballenthin): this seems to be incorrect for Ghidra exporter + # https://github.com/mandiant/capa/issues/1755 + library = be2.library[vertex.library_index] + o.writeln(f"library: [{vertex.library_index}] {library.name}") + + if vertex.HasField("module_index"): + module = be2.module[vertex.module_index] + o.writeln(f"module: [{vertex.module_index}] {module.name}") + + if idx.callees_by_vertex_index[vertex_index] or idx.callers_by_vertex_index[vertex_index]: + o.writeln("xrefs:") + + for caller_index in idx.callers_by_vertex_index[vertex_index]: + o.writeln(f" ← {idx.get_function_name_by_vertex(caller_index)}") + + for callee_index in idx.callees_by_vertex_index[vertex_index]: + o.writeln(f" → {idx.get_function_name_by_vertex(callee_index)}") + + if vertex.address not in idx.flow_graph_index_by_address: + o.writeln("(no flow graph)") + else: + flow_graph_index = idx.flow_graph_index_by_address[vertex.address] + flow_graph = be2.flow_graph[flow_graph_index] + + o.writeln("") + for basic_block_index in flow_graph.basic_block_index: + basic_block = be2.basic_block[basic_block_index] + basic_block_address = idx.get_basic_block_address(basic_block_index) + + with o.section(f"basic block {hex(basic_block_address)}"): + for edge in idx.target_edges_by_basic_block_index[basic_block_index]: + if edge.type == BinExport2.FlowGraph.Edge.Type.CONDITION_FALSE: + continue + + source_basic_block_index = edge.source_basic_block_index + source_basic_block_address = idx.get_basic_block_address(source_basic_block_index) + + o.writeln( + f"↓ {BinExport2.FlowGraph.Edge.Type.Name(edge.type)} basic block 
{hex(source_basic_block_address)}" + ) + + for instruction_index, instruction, instruction_address in idx.basic_block_instructions( + basic_block + ): + mnemonic = be2.mnemonic[instruction.mnemonic_index] + + operands = [] + for operand_index in instruction.operand_index: + operand = be2.operand[operand_index] + # Ghidra bug where empty operands (no expressions) may + # exist so we skip those for now (see https://github.com/NationalSecurityAgency/ghidra/issues/6817) + if len(operand.expression_index) > 0: + operands.append(render_operand(be2, operand, index=operand_index)) + + call_targets = "" + if instruction.call_target: + call_targets = " " + for call_target_address in instruction.call_target: + call_target_name = idx.get_function_name_by_address(call_target_address) + call_targets += f"→ function {call_target_name} @ {hex(call_target_address)} " + + data_references = "" + if instruction_index in idx.data_reference_index_by_source_instruction_index: + data_references = " " + for data_reference_index in idx.data_reference_index_by_source_instruction_index[ + instruction_index + ]: + data_reference = be2.data_reference[data_reference_index] + data_reference_address = data_reference.address + data_references += f"⇥ data {hex(data_reference_address)} " + + string_references = "" + if instruction_index in idx.string_reference_index_by_source_instruction_index: + string_references = " " + for ( + string_reference_index + ) in idx.string_reference_index_by_source_instruction_index[instruction_index]: + string_reference = be2.string_reference[string_reference_index] + string_index = string_reference.string_table_index + string = be2.string_table[string_index] + string_references += f'⇥ string "{string.rstrip()}" ' + + comments = "" + if instruction.comment_index: + comments = " " + for comment_index in instruction.comment_index: + comment = be2.comment[comment_index] + comment_string = be2.string_table[comment.string_table_index] + comments += f"; 
{BinExport2.Comment.Type.Name(comment.type)} {comment_string} " + + o.writeln( + f"{hex(instruction_address)} {mnemonic.name:<12s}{', '.join(operands):<14s}{call_targets}{data_references}{string_references}{comments}" + ) + + does_fallthrough = False + for edge in idx.source_edges_by_basic_block_index[basic_block_index]: + if edge.type == BinExport2.FlowGraph.Edge.Type.CONDITION_FALSE: + does_fallthrough = True + continue + + back_edge = "" + if edge.HasField("is_back_edge") and edge.is_back_edge: + back_edge = "↑" + + target_basic_block_index = edge.target_basic_block_index + target_basic_block_address = idx.get_basic_block_address(target_basic_block_index) + o.writeln( + f"→ {BinExport2.FlowGraph.Edge.Type.Name(edge.type)} basic block {hex(target_basic_block_address)} {back_edge}" + ) + + if does_fallthrough: + o.writeln("↓ CONDITION_FALSE") + + with o.section("data"): + for data_address in sorted(idx.data_reference_index_by_target_address.keys()): + if data_address in idx.insn_address_by_index: + # appears to be code + continue + + data_xrefs: List[int] = [] + for data_reference_index in idx.data_reference_index_by_target_address[data_address]: + data_reference = be2.data_reference[data_reference_index] + instruction_address = idx.get_insn_address(data_reference.instruction_index) + data_xrefs.append(instruction_address) + + if not data_xrefs: + continue + + o.writeln(f"{hex(data_address)} ⇤ {hex(data_xrefs[0])}") + for data_xref in data_xrefs[1:]: + o.writeln(f"{' ' * len(hex(data_address))} ↖ {hex(data_xref)}") + + t1 = time.time() + logger.debug("perf: rendering BinExport2: %0.2fs", t1 - t0) + + with timing("writing to STDOUT"): + print(o.getvalue()) + + if args.instruction: + insn = idx.insn_by_address[args.instruction] + inspect_instruction(be2, insn, args.instruction) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/fixtures.py b/tests/fixtures.py index 41a656dd9..e4d0a6fa0 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ 
-226,6 +226,19 @@ def get_ghidra_extractor(path: Path): return extractor +@lru_cache(maxsize=1) +def get_binexport_extractor(path): + import capa.features.extractors.binexport2 + import capa.features.extractors.binexport2.extractor + + be2 = capa.features.extractors.binexport2.get_binexport2(path) + search_paths = [CD / "data", CD / "data" / "aarch64"] + path = capa.features.extractors.binexport2.get_sample_from_binexport2(path, be2, search_paths) + buf = path.read_bytes() + + return capa.features.extractors.binexport2.extractor.BinExport2FeatureExtractor(be2, buf) + + def extract_global_features(extractor): features = collections.defaultdict(set) for feature, va in extractor.extract_global_features(): @@ -428,6 +441,20 @@ def get_data_path_by_name(name) -> Path: return CD / "data" / "dotnet" / "dd9098ff91717f4906afe9dafdfa2f52.exe_" elif name.startswith("nested_typeref"): return CD / "data" / "dotnet" / "2c7d60f77812607dec5085973ff76cea.dll_" + elif name.startswith("687e79.ghidra.be2"): + return ( + CD + / "data" + / "binexport2" + / "687e79cde5b0ced75ac229465835054931f9ec438816f2827a8be5f3bd474929.elf_.ghidra.BinExport" + ) + elif name.startswith("d1e650.ghidra.be2"): + return ( + CD + / "data" + / "binexport2" + / "d1e6506964edbfffb08c0dd32e1486b11fbced7a4bd870ffe79f110298f0efb8.elf_.ghidra.BinExport" + ) else: raise ValueError(f"unexpected sample fixture: {name}") @@ -791,7 +818,9 @@ def parametrize(params, values, **kwargs): ("mimikatz", "function=0x40105D", capa.features.insn.Offset(0x8), False), ("mimikatz", "function=0x40105D", capa.features.insn.Offset(0x10), False), # insn/offset: negative + # 0x4012b4 MOVZX ECX, [EAX+0xFFFFFFFFFFFFFFFF] ("mimikatz", "function=0x4011FB", capa.features.insn.Offset(-0x1), True), + # 0x4012b8 MOVZX EAX, [EAX+0xFFFFFFFFFFFFFFFE] ("mimikatz", "function=0x4011FB", capa.features.insn.Offset(-0x2), True), # # insn/offset from mnemonic: add @@ -814,7 +843,7 @@ def parametrize(params, values, **kwargs): # should not be considered, 
lea operand invalid encoding # .text:004717B1 8D 4C 31 D0 lea ecx, [ecx+esi-30h] ("mimikatz", "function=0x47153B,bb=0x4717AB,insn=0x4717B1", capa.features.insn.Number(-0x30), False), - # yes, this is also a number (imagine edx is zero): + # yes, this is also a number (imagine ebx is zero): # .text:004018C0 8D 4B 02 lea ecx, [ebx+2] ("mimikatz", "function=0x401873,bb=0x4018B2,insn=0x4018C0", capa.features.insn.Number(0x2), True), # insn/api @@ -1368,9 +1397,9 @@ def parametrize(params, values, **kwargs): FEATURE_COUNT_TESTS_GHIDRA = [ # Ghidra may render functions as labels, as well as provide differing amounts of call references - # (Colton) TODO: Add more test cases ("mimikatz", "function=0x4702FD", capa.features.common.Characteristic("calls from"), 0), - ("mimikatz", "function=0x4556E5", capa.features.common.Characteristic("calls to"), 0), + ("mimikatz", "function=0x401bf1", capa.features.common.Characteristic("calls to"), 2), + ("mimikatz", "function=0x401000", capa.features.basicblock.BasicBlock(), 3), ] diff --git a/tests/test_binexport_accessors.py b/tests/test_binexport_accessors.py new file mode 100644 index 000000000..bc9ea6db1 --- /dev/null +++ b/tests/test_binexport_accessors.py @@ -0,0 +1,602 @@ +# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+ +import re +import logging +from typing import Any, Dict +from pathlib import Path + +import pytest +import fixtures +from google.protobuf.json_format import ParseDict + +import capa.features.extractors.binexport2.helpers +from capa.features.extractors.binexport2.helpers import ( + BinExport2InstructionPattern, + BinExport2InstructionPatternMatcher, + split_with_delimiters, + get_operand_expressions, + get_instruction_mnemonic, + get_instruction_operands, + get_operand_register_expression, + get_operand_immediate_expression, +) +from capa.features.extractors.binexport2.extractor import BinExport2FeatureExtractor +from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 +from capa.features.extractors.binexport2.arch.arm.helpers import is_stack_register_expression + +logger = logging.getLogger(__name__) + +CD = Path(__file__).resolve().parent + + +# found via https://www.virustotal.com/gui/search/type%253Aelf%2520and%2520size%253A1.2kb%252B%2520and%2520size%253A1.4kb-%2520and%2520tag%253Aarm%2520and%2520not%2520tag%253Arelocatable%2520and%2520tag%253A64bits/files +# Ghidra disassembly of c7f38027552a3eca84e2bfc846ac1307fbf98657545426bb93a2d63555cbb486 +GHIDRA_DISASSEMBLY = """ + // + // segment_1 + // Loadable segment [0x200000 - 0x200157] + // ram:00200000-ram:00200157 + // + 00200000 7f 45 4c Elf64_Ehdr +... 
+ // + // .text + // SHT_PROGBITS [0x210158 - 0x2101c7] + // ram:00210158-ram:002101c7 + // + ************************************************************** + * FUNCTION * + ************************************************************** + undefined entry() + undefined w0:1 + _start XREF[4]: Entry Point(*), 00200018(*), + entry 002000c0(*), + _elfSectionHeaders::00000050(*) + 00210158 20 00 80 d2 mov x0,#0x1 + 0021015c a1 02 00 58 ldr x1=>helloWorldStr,DAT_002101b0 = "Hello World!\n" + = 00000000002201C8h + 00210160 c2 02 00 58 ldr x2,DAT_002101b8 = 000000000000000Eh + 00210164 08 08 80 d2 mov x8,#0x40 + 00210168 01 00 00 d4 svc 0x0 + 0021016c a0 02 00 58 ldr x0=>$stringWith_Weird_Name,DAT_002101c0 = "This string has a very strang + = 00000000002201D6h + 00210170 04 00 00 94 bl printString undefined printString() + 00210174 60 0f 80 d2 mov x0,#0x7b + 00210178 a8 0b 80 d2 mov x8,#0x5d + 0021017c 01 00 00 d4 svc 0x0 + ************************************************************** + * FUNCTION * + ************************************************************** + undefined printString() + undefined w0:1 + printString XREF[1]: entry:00210170(c) + 00210180 01 00 80 d2 mov x1,#0x0 + strlenLoop XREF[1]: 00210194(j) + 00210184 02 68 61 38 ldrb w2,[x0, x1, LSL ] + 00210188 5f 00 00 71 cmp w2,#0x0 + 0021018c 60 00 00 54 b.eq strlenDone + 00210190 21 04 00 91 add x1,x1,#0x1 + 00210194 fc ff ff 17 b strlenLoop + strlenDone XREF[1]: 0021018c(j) + 00210198 e2 03 01 aa mov x2,x1 + 0021019c e1 03 00 aa mov x1,x0 + 002101a0 20 00 80 d2 mov x0,#0x1 + 002101a4 08 08 80 d2 mov x8,#0x40 + 002101a8 01 00 00 d4 svc 0x0 + 002101ac c0 03 5f d6 ret + DAT_002101b0 XREF[1]: entry:0021015c(R) + 002101b0 c8 01 22 undefined8 00000000002201C8h ? -> 002201c8 + 00 00 00 + 00 00 + DAT_002101b8 XREF[1]: entry:00210160(R) + 002101b8 0e 00 00 undefined8 000000000000000Eh + 00 00 00 + 00 00 + DAT_002101c0 XREF[1]: entry:0021016c(R) + 002101c0 d6 01 22 undefined8 00000000002201D6h ? 
-> 002201d6 + 00 00 00 + 00 00 + // + // .data + // SHT_PROGBITS [0x2201c8 - 0x2201fb] + // ram:002201c8-ram:002201fb + // + helloWorldStr XREF[3]: 002000f8(*), entry:0021015c(*), + _elfSectionHeaders::00000090(*) + 002201c8 48 65 6c ds "Hello World!\n" + 6c 6f 20 + 57 6f 72 + $stringWith_Weird_Name XREF[1]: entry:0021016c(*) + 002201d6 54 68 69 ds "This string has a very strange label\n" + 73 20 73 + 74 72 69 +... +""" + + +def _parse_ghidra_disassembly(disasm: str) -> dict: + dd = {} + # 00210158 20 00 80 d2 mov x0,#0x1 + # ^^^^^^^^ ^^^^^^^^^^^ ^^^ ^^ ^^^^ + # address bytes mnemonic o1,o2 (,o3) + pattern = re.compile( + r"^( ){8}(?P
[0-9a-f]+) " + + r"(?P([0-9a-f]{2}[ ]){4})\s+" + + r"(?P[\w\.]+)\s*" + + r"(?P[\w#$=>]+)?,?" + + r"((?P[\w#$=>]+))?,?" + + r"((?P[\w#$=>]+))?" + ) + for line in disasm.splitlines()[20:]: + m = pattern.match(line) + if m: + logger.debug("Match found\t%s\n\t\t\t\t%s", line, m.groupdict()) + dd[int(m["address"], 0x10)] = { + "bytes": m["bytes"].strip(), + "mnemonic": m["mnemonic"], + "operands": [e for e in [m["operand1"], m["operand2"], m["operand3"]] if e is not None], + } + else: + logger.debug("No match\t%s", line) + return dd + + +BE2_EXTRACTOR = fixtures.get_binexport_extractor( + CD + / "data" + / "binexport2" + / "c7f38027552a3eca84e2bfc846ac1307fbf98657545426bb93a2d63555cbb486.elf_.ghidra.BinExport" +) +PARSED_DISASM = _parse_ghidra_disassembly(GHIDRA_DISASSEMBLY) + + +def test_instruction_bytes(): + # more a data sanity check here as we don't test our code + for addr, de in PARSED_DISASM.items(): + insn = BE2_EXTRACTOR.idx.get_instruction_by_address(addr) + assert insn.raw_bytes == bytes.fromhex(de["bytes"]) + + +def test_get_instruction_mnemonic(): + for addr, de in PARSED_DISASM.items(): + insn = BE2_EXTRACTOR.idx.get_instruction_by_address(addr) + assert get_instruction_mnemonic(BE2_EXTRACTOR.be2, insn) == de["mnemonic"] + + +def test_get_instruction_operands_count(): + for addr, de in PARSED_DISASM.items(): + insn = BE2_EXTRACTOR.idx.get_instruction_by_address(addr) + ops = get_instruction_operands(BE2_EXTRACTOR.be2, insn) + # this line is not properly parsed from the Ghidra disassembly using the current regex + # 00210184 02 68 61 38 ldrb w2,[x0, x1, LSL ] + if addr == 0x210184: + assert len(ops) == 2 + else: + assert len(ops) == len(de["operands"]) + + +@pytest.mark.parametrize( + "addr,expressions", + [ + # 00210158 20 00 80 d2 mov x0,#0x1 + ( + 0x210158, + ( + BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="x0"), + BinExport2.Expression(type=BinExport2.Expression.IMMEDIATE_INT, immediate=0x1), + ), + ), + # 0021015c a1 02 00 58 ldr 
x1=>helloWorldStr,DAT_002101b0 + ( + 0x21015C, + ( + BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="x1"), + BinExport2.Expression( + type=BinExport2.Expression.IMMEDIATE_INT, symbol="PTR_helloWorldStr_002101b0", immediate=0x2101B0 + ), + ), + ), + # 00210184 02 68 61 38 ldrb w2,[x0, x1, LSL ] + # ^^^ issue in Ghidra? + # IDA gives LDRB W2, [X0,X1] + ( + 0x210184, + ( + BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="w2"), + ( + BinExport2.Expression(type=BinExport2.Expression.DEREFERENCE, symbol="["), + BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="x0"), + BinExport2.Expression(type=BinExport2.Expression.OPERATOR, symbol=","), + BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="x1"), + BinExport2.Expression(type=BinExport2.Expression.DEREFERENCE, symbol="]"), + ), + ), + ), + # 00210190 21 04 00 91 add x1,x1,#0x1 + ( + 0x210190, + ( + BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="x1"), + BinExport2.Expression(type=BinExport2.Expression.REGISTER, symbol="x1"), + BinExport2.Expression(type=BinExport2.Expression.IMMEDIATE_INT, immediate=0x1), + ), + ), + ], +) +def test_get_operand_expressions(addr, expressions): + insn = BE2_EXTRACTOR.idx.get_instruction_by_address(addr) + ops = get_instruction_operands(BE2_EXTRACTOR.be2, insn) + for i, op in enumerate(ops): + op_expression = expressions[i] + exps = get_operand_expressions(BE2_EXTRACTOR.be2, op) + if len(exps) > 1: + for j, exp in enumerate(exps): + assert exp.type == op_expression[j].type + assert exp.symbol == op_expression[j].symbol + else: + assert len(exps) == 1 + assert exps[0] == op_expression + + +@pytest.mark.parametrize( + "addr,expressions", + [ + # 00210158 20 00 80 d2 mov x0,#0x1 + (0x210158, ("x0", None)), + # 0021015c a1 02 00 58 ldr x1=>helloWorldStr,DAT_002101b0 + (0x21015C, ("x1", None)), + # 0021019c e1 03 00 aa mov x1,x0 + (0x21019C, ("x1", "x0")), + # 00210190 21 04 00 91 add x1,x1,#0x1 + (0x210190, 
("x1", "x1", None)), + ], +) +def test_get_operand_register_expression(addr, expressions): + insn = BE2_EXTRACTOR.idx.get_instruction_by_address(addr) + ops = get_instruction_operands(BE2_EXTRACTOR.be2, insn) + for i, op in enumerate(ops): + reg_exp = get_operand_register_expression(BE2_EXTRACTOR.be2, op) + if reg_exp is None: + assert reg_exp == expressions[i] + else: + assert reg_exp.symbol == expressions[i] + + +@pytest.mark.parametrize( + "addr,expressions", + [ + # 00210158 20 00 80 d2 mov x0,#0x1 + (0x210158, (None, 0x1)), + # 0021015c a1 02 00 58 ldr x1=>helloWorldStr,DAT_002101b0 + (0x21015C, (None, 0x2101B0)), + # 002101a8 01 00 00 d4 svc 0x0 + (0x2101A8, (0x0,)), + # 00210190 21 04 00 91 add x1,x1,#0x1 + (0x210190, (None, None, 0x1)), + ], +) +def test_get_operand_immediate_expression(addr, expressions): + insn = BE2_EXTRACTOR.idx.get_instruction_by_address(addr) + ops = get_instruction_operands(BE2_EXTRACTOR.be2, insn) + for i, op in enumerate(ops): + reg_exp = get_operand_immediate_expression(BE2_EXTRACTOR.be2, op) + if reg_exp is None: + assert reg_exp == expressions[i] + else: + assert reg_exp.immediate == expressions[i] + + +""" +mov x0, 0x20 +bl 0x100 +add x0, sp, 0x10 +""" +BE2_DICT: Dict[str, Any] = { + "expression": [ + {"type": BinExport2.Expression.REGISTER, "symbol": "x0"}, + {"type": BinExport2.Expression.IMMEDIATE_INT, "immediate": 0x20}, + {"type": BinExport2.Expression.IMMEDIATE_INT, "immediate": 0x100}, + {"type": BinExport2.Expression.REGISTER, "symbol": "sp"}, + {"type": BinExport2.Expression.IMMEDIATE_INT, "immediate": 0x10}, + ], + # operand consists of 1 or more expressions, linked together as a tree + "operand": [ + {"expression_index": [0]}, + {"expression_index": [1]}, + {"expression_index": [2]}, + {"expression_index": [3]}, + {"expression_index": [4]}, + ], + "mnemonic": [ + {"name": "mov"}, # mnem 0 + {"name": "bl"}, # mnem 1 + {"name": "add"}, # mnem 2 + ], + # instruction may have 0 or more operands + "instruction": [ + 
{"mnemonic_index": 0, "operand_index": [0, 1]}, + {"mnemonic_index": 1, "operand_index": [2]}, + {"mnemonic_index": 2, "operand_index": [0, 3, 4]}, + ], +} +BE2 = ParseDict( + BE2_DICT, + BinExport2(), +) + + +def test_is_stack_register_expression(): + mov = ParseDict(BE2_DICT["instruction"][0], BinExport2.Instruction()) + add = ParseDict(BE2_DICT["instruction"][2], BinExport2.Instruction()) + + mov_op0, mov_op1 = get_instruction_operands(BE2, mov) + op0_exp0 = get_operand_expressions(BE2, mov_op0)[0] + assert is_stack_register_expression(BE2, op0_exp0) is False + op0_exp1 = get_operand_expressions(BE2, mov_op1)[0] + assert is_stack_register_expression(BE2, op0_exp1) is False + + add_op0, add_op1, add_op2 = get_instruction_operands(BE2, add) + op0_exp0 = get_operand_expressions(BE2, add_op0)[0] + assert is_stack_register_expression(BE2, op0_exp0) is False + op1_exp0 = get_operand_expressions(BE2, add_op1)[0] + assert is_stack_register_expression(BE2, op1_exp0) is True + op2_exp0 = get_operand_expressions(BE2, add_op2)[0] + assert is_stack_register_expression(BE2, op2_exp0) is False + + +def test_split_with_delimiters(): + assert tuple(split_with_delimiters("abc|def", ("|",))) == ("abc", "|", "def") + assert tuple(split_with_delimiters("abc|def|", ("|",))) == ("abc", "|", "def", "|") + assert tuple(split_with_delimiters("abc||def", ("|",))) == ("abc", "|", "", "|", "def") + assert tuple(split_with_delimiters("abc|def-ghi", ("|", "-"))) == ("abc", "|", "def", "-", "ghi") + + +def test_pattern_parsing(): + assert BinExport2InstructionPattern.from_str( + "br reg ; capture reg" + ) == BinExport2InstructionPattern(mnemonics=("br",), operands=("reg",), capture="reg") + + assert BinExport2InstructionPattern.from_str( + "mov reg0, reg1 ; capture reg0" + ) == BinExport2InstructionPattern(mnemonics=("mov",), operands=("reg0", "reg1"), capture="reg0") + + assert BinExport2InstructionPattern.from_str( + "adrp reg, #int ; capture #int" + ) == 
BinExport2InstructionPattern(mnemonics=("adrp",), operands=("reg", "#int"), capture="#int") + + assert BinExport2InstructionPattern.from_str( + "add reg, reg, #int ; capture #int" + ) == BinExport2InstructionPattern(mnemonics=("add",), operands=("reg", "reg", "#int"), capture="#int") + + assert BinExport2InstructionPattern.from_str( + "ldr reg0, [reg1] ; capture reg1" + ) == BinExport2InstructionPattern(mnemonics=("ldr",), operands=("reg0", ("[", "reg1")), capture="reg1") + + assert BinExport2InstructionPattern.from_str( + "ldr|str reg, [reg, #int] ; capture #int" + ) == BinExport2InstructionPattern( + mnemonics=( + "ldr", + "str", + ), + operands=("reg", ("[", "reg", ",", "#int")), + capture="#int", + ) + + assert BinExport2InstructionPattern.from_str( + "ldr|str reg, [reg, #int]! ; capture #int" + ) == BinExport2InstructionPattern( + mnemonics=( + "ldr", + "str", + ), + operands=("reg", ("!", "[", "reg", ",", "#int")), + capture="#int", + ) + + assert BinExport2InstructionPattern.from_str( + "ldr|str reg, [reg], #int ; capture #int" + ) == BinExport2InstructionPattern( + mnemonics=( + "ldr", + "str", + ), + operands=( + "reg", + ( + "[", + "reg", + ), + "#int", + ), + capture="#int", + ) + + assert BinExport2InstructionPattern.from_str( + "ldp|stp reg, reg, [reg, #int] ; capture #int" + ) == BinExport2InstructionPattern( + mnemonics=( + "ldp", + "stp", + ), + operands=("reg", "reg", ("[", "reg", ",", "#int")), + capture="#int", + ) + + assert BinExport2InstructionPattern.from_str( + "ldp|stp reg, reg, [reg, #int]! 
; capture #int" + ) == BinExport2InstructionPattern( + mnemonics=( + "ldp", + "stp", + ), + operands=("reg", "reg", ("!", "[", "reg", ",", "#int")), + capture="#int", + ) + + assert BinExport2InstructionPattern.from_str( + "ldp|stp reg, reg, [reg], #int ; capture #int" + ) == BinExport2InstructionPattern( + mnemonics=( + "ldp", + "stp", + ), + operands=("reg", "reg", ("[", "reg"), "#int"), + capture="#int", + ) + + assert ( + BinExport2InstructionPatternMatcher.from_str( + """ + # comment + br reg + br reg(not-stack) + br reg ; capture reg + mov reg0, reg1 ; capture reg0 + adrp reg, #int ; capture #int + add reg, reg, #int ; capture #int + ldr reg0, [reg1] ; capture reg1 + ldr|str reg, [reg, #int] ; capture #int + ldr|str reg, [reg, #int]! ; capture #int + ldr|str reg, [reg], #int ; capture #int + ldp|stp reg, reg, [reg, #int] ; capture #int + ldp|stp reg, reg, [reg, #int]! ; capture #int + ldp|stp reg, reg, [reg], #int ; capture #int + ldrb reg0, [reg1, reg2] ; capture reg2 + call [reg + reg * #int + #int] + call [reg + reg * #int] + call [reg * #int + #int] + call [reg + reg + #int] + call [reg + #int] + """ + ).queries + is not None + ) + + +def match_address(extractor: BinExport2FeatureExtractor, queries: BinExport2InstructionPatternMatcher, address: int): + instruction = extractor.idx.insn_by_address[address] + mnemonic: str = get_instruction_mnemonic(extractor.be2, instruction) + + operands = [] + for operand_index in instruction.operand_index: + operand = extractor.be2.operand[operand_index] + operands.append(capa.features.extractors.binexport2.helpers.get_operand_expressions(extractor.be2, operand)) + + return queries.match(mnemonic, operands) + + +def match_address_with_be2( + extractor: BinExport2FeatureExtractor, queries: BinExport2InstructionPatternMatcher, address: int +): + instruction_index = extractor.idx.insn_index_by_address[address] + return queries.match_with_be2(extractor.be2, instruction_index) + + +def test_pattern_matching(): + queries = 
BinExport2InstructionPatternMatcher.from_str( + """ + br reg(stack) ; capture reg + br reg(not-stack) ; capture reg + mov reg0, reg1 ; capture reg0 + adrp reg, #int ; capture #int + add reg, reg, #int ; capture #int + ldr reg0, [reg1] ; capture reg1 + ldr|str reg, [reg, #int] ; capture #int + ldr|str reg, [reg, #int]! ; capture #int + ldr|str reg, [reg], #int ; capture #int + ldp|stp reg, reg, [reg, #int] ; capture #int + ldp|stp reg, reg, [reg, #int]! ; capture #int + ldp|stp reg, reg, [reg], #int ; capture #int + ldrb reg0, [reg1(not-stack), reg2] ; capture reg2 + """ + ) + + # 0x210184: ldrb w2, [x0, x1] + # query: ldrb reg0, [reg1(not-stack), reg2] ; capture reg2" + assert match_address(BE2_EXTRACTOR, queries, 0x210184).expression.symbol == "x1" + assert match_address_with_be2(BE2_EXTRACTOR, queries, 0x210184).expression.symbol == "x1" + + # 0x210198: mov x2, x1 + # query: mov reg0, reg1 ; capture reg0"), + assert match_address(BE2_EXTRACTOR, queries, 0x210198).expression.symbol == "x2" + assert match_address_with_be2(BE2_EXTRACTOR, queries, 0x210198).expression.symbol == "x2" + + # 0x210190: add x1, x1, 0x1 + # query: add reg, reg, #int ; capture #int + assert match_address(BE2_EXTRACTOR, queries, 0x210190).expression.immediate == 1 + assert match_address_with_be2(BE2_EXTRACTOR, queries, 0x210190).expression.immediate == 1 + + +BE2_EXTRACTOR_687 = fixtures.get_binexport_extractor( + CD + / "data" + / "binexport2" + / "687e79cde5b0ced75ac229465835054931f9ec438816f2827a8be5f3bd474929.elf_.ghidra.BinExport" +) + + +def test_pattern_matching_exclamation(): + queries = BinExport2InstructionPatternMatcher.from_str( + """ + stp reg, reg, [reg, #int]! ; capture #int + """ + ) + + # note this captures the sp + # 0x107918: stp x20, x19, [sp,0xFFFFFFFFFFFFFFE0]! + # query: stp reg, reg, [reg, #int]! 
; capture #int + assert match_address(BE2_EXTRACTOR_687, queries, 0x107918).expression.immediate == 0xFFFFFFFFFFFFFFE0 + assert match_address_with_be2(BE2_EXTRACTOR_687, queries, 0x107918).expression.immediate == 0xFFFFFFFFFFFFFFE0 + + +def test_pattern_matching_stack(): + queries = BinExport2InstructionPatternMatcher.from_str( + """ + stp reg, reg, [reg(stack), #int]! ; capture #int + """ + ) + + # note this does capture the sp + # compare this with the test above (exclamation) + # 0x107918: stp x20, x19, [sp, 0xFFFFFFFFFFFFFFE0]! + # query: stp reg, reg, [reg(stack), #int]! ; capture #int + assert match_address(BE2_EXTRACTOR_687, queries, 0x107918).expression.immediate == 0xFFFFFFFFFFFFFFE0 + assert match_address_with_be2(BE2_EXTRACTOR_687, queries, 0x107918).expression.immediate == 0xFFFFFFFFFFFFFFE0 + + +def test_pattern_matching_not_stack(): + queries = BinExport2InstructionPatternMatcher.from_str( + """ + stp reg, reg, [reg(not-stack), #int]! ; capture #int + """ + ) + + # note this does not capture the sp + # compare this with the test above (exclamation) + # 0x107918: stp x20, x19, [sp, 0xFFFFFFFFFFFFFFE0]! + # query: stp reg, reg, [reg(not-stack), #int]! 
; capture #int + assert match_address(BE2_EXTRACTOR_687, queries, 0x107918) is None + assert match_address_with_be2(BE2_EXTRACTOR_687, queries, 0x107918) is None + + +BE2_EXTRACTOR_MIMI = fixtures.get_binexport_extractor(CD / "data" / "binexport2" / "mimikatz.exe_.ghidra.BinExport") + + +def test_pattern_matching_x86(): + queries = BinExport2InstructionPatternMatcher.from_str( + """ + cmp|lea reg, [reg(not-stack) + #int0] ; capture #int0 + """ + ) + + # 0x4018c0: LEA ECX, [EBX+0x2] + # query: cmp|lea reg, [reg(not-stack) + #int0] ; capture #int0 + assert match_address(BE2_EXTRACTOR_MIMI, queries, 0x4018C0).expression.immediate == 2 + assert match_address_with_be2(BE2_EXTRACTOR_MIMI, queries, 0x4018C0).expression.immediate == 2 diff --git a/tests/test_binexport_features.py b/tests/test_binexport_features.py new file mode 100644 index 000000000..3bf6d56d7 --- /dev/null +++ b/tests/test_binexport_features.py @@ -0,0 +1,442 @@ +# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+import binascii +from typing import cast + +import pytest +import fixtures + +import capa.features.file +import capa.features.insn +import capa.features.common +import capa.features.basicblock +from capa.features.common import ( + OS, + OS_LINUX, + ARCH_I386, + FORMAT_PE, + ARCH_AMD64, + FORMAT_ELF, + OS_ANDROID, + OS_WINDOWS, + ARCH_AARCH64, + Arch, + Format, +) + +FEATURE_PRESENCE_TESTS_BE2_ELF_AARCH64 = sorted( + [ + # file/string + ( + "687e79.ghidra.be2", + "file", + capa.features.common.String("AppDataService start"), + True, + ), + ("687e79.ghidra.be2", "file", capa.features.common.String("nope"), False), + # file/sections + ("687e79.ghidra.be2", "file", capa.features.file.Section(".text"), True), + ("687e79.ghidra.be2", "file", capa.features.file.Section(".nope"), False), + # file/exports + ( + "687e79.ghidra.be2", + "file", + capa.features.file.Export("android::clearDir"), + "xfail: name demangling is not implemented", + ), + ("687e79.ghidra.be2", "file", capa.features.file.Export("nope"), False), + # file/imports + ("687e79.ghidra.be2", "file", capa.features.file.Import("fopen"), True), + ("687e79.ghidra.be2", "file", capa.features.file.Import("exit"), True), + ( + "687e79.ghidra.be2", + "file", + capa.features.file.Import("_ZN7android10IInterfaceD0Ev"), + True, + ), + ("687e79.ghidra.be2", "file", capa.features.file.Import("nope"), False), + # function/characteristic(loop) + ( + "687e79.ghidra.be2", + "function=0x1056c0", + capa.features.common.Characteristic("loop"), + True, + ), + ( + "687e79.ghidra.be2", + "function=0x1075c0", + capa.features.common.Characteristic("loop"), + False, + ), + # bb/characteristic(tight loop) + ( + "d1e650.ghidra.be2", + "function=0x114af4", + capa.features.common.Characteristic("tight loop"), + True, + ), + ( + "d1e650.ghidra.be2", + "function=0x118F1C", + capa.features.common.Characteristic("tight loop"), + True, + ), + ( + "d1e650.ghidra.be2", + "function=0x11464c", + capa.features.common.Characteristic("tight loop"), + 
False, + ), + # bb/characteristic(stack string) + ( + "687e79.ghidra.be2", + "function=0x0", + capa.features.common.Characteristic("stack string"), + "xfail: not implemented yet", + ), + ( + "687e79.ghidra.be2", + "function=0x0", + capa.features.common.Characteristic("stack string"), + "xfail: not implemented yet", + ), + # insn/mnemonic + ("687e79.ghidra.be2", "function=0x107588", capa.features.insn.Mnemonic("stp"), True), + ("687e79.ghidra.be2", "function=0x107588", capa.features.insn.Mnemonic("adrp"), True), + ("687e79.ghidra.be2", "function=0x107588", capa.features.insn.Mnemonic("bl"), True), + ("687e79.ghidra.be2", "function=0x107588", capa.features.insn.Mnemonic("in"), False), + ("687e79.ghidra.be2", "function=0x107588", capa.features.insn.Mnemonic("adrl"), False), + # insn/number + # 00114524 add x29,sp,#0x10 + ( + "d1e650.ghidra.be2", + "function=0x11451c", + capa.features.insn.Number(0x10), + False, + ), + # 00105128 sub sp,sp,#0xE0 + ( + "687e79.ghidra.be2", + "function=0x105128", + capa.features.insn.Number(0xE0), + False, + ), + # insn/operand.number + ( + "687e79.ghidra.be2", + "function=0x105128,bb=0x1051e4", + capa.features.insn.OperandNumber(1, 0xFFFFFFFF), + True, + ), + ( + "687e79.ghidra.be2", + "function=0x107588,bb=0x107588", + capa.features.insn.OperandNumber(1, 0x8), + True, + ), + ( + "687e79.ghidra.be2", + "function=0x107588,bb=0x107588,insn=0x1075a4", + capa.features.insn.OperandNumber(1, 0x8), + True, + ), + # insn/operand.offset + ( + "687e79.ghidra.be2", + "function=0x105128,bb=0x105450", + capa.features.insn.OperandOffset(2, 0x10), + True, + ), + ( + "d1e650.ghidra.be2", + "function=0x124854,bb=0x1248AC,insn=0x1248B4", + capa.features.insn.OperandOffset(2, -0x48), + True, + ), + ( + "d1e650.ghidra.be2", + "function=0x13347c,bb=0x133548,insn=0x133554", + capa.features.insn.OperandOffset(2, 0x20), + False, + ), + ("687e79.ghidra.be2", "function=0x105C88", capa.features.insn.Number(0xF000), True), + # insn/number: negative + ( + 
"687e79.ghidra.be2", + "function=0x1057f8,bb=0x1057f8", + capa.features.insn.Number(0xFFFFFFFFFFFFFFFF), + True, + ), + ( + "687e79.ghidra.be2", + "function=0x1057f8,bb=0x1057f8", + capa.features.insn.Number(0xFFFFFFFFFFFFFFFF), + True, + ), + ( + "687e79.ghidra.be2", + "function=0x1066e0,bb=0x1068c4", + capa.features.insn.Number(0xFFFFFFFF), + True, + ), + # insn/offset + ( + "687e79.ghidra.be2", + "function=0x105128,bb=0x105450", + capa.features.insn.Offset(0x10), + True, + ), + # ldp x29,x30,[sp, #0x20] + ( + "d1e650.ghidra.be2", + "function=0x13347c,bb=0x133548,insn=0x133554", + capa.features.insn.Offset(0x20), + False, + ), + # stp x20,x0,[x19, #0x8] + ( + "d1e650.ghidra.be2", + "function=0x1183e0,bb=0x11849c,insn=0x1184b0", + capa.features.insn.Offset(0x8), + True, + ), + # str xzr,[x8, #0x8]! + ( + "d1e650.ghidra.be2", + "function=0x138688,bb=0x138994,insn=0x1389a8", + capa.features.insn.Offset(0x8), + True, + ), + # ldr x9,[x8, #0x8]! + ( + "d1e650.ghidra.be2", + "function=0x138688,bb=0x138978,insn=0x138984", + capa.features.insn.Offset(0x8), + True, + ), + # ldr x19,[sp], #0x20 + ( + "d1e650.ghidra.be2", + "function=0x11451c", + capa.features.insn.Offset(0x20), + False, + ), + # ldrb w9,[x8, #0x1] + ( + "d1e650.ghidra.be2", + "function=0x138a9c,bb=0x138b00,insn=0x138b00", + capa.features.insn.Offset(0x1), + True, + ), + # insn/offset: negative + ( + "d1e650.ghidra.be2", + "function=0x124854,bb=0x1248AC,insn=0x1248B4", + capa.features.insn.Offset(-0x48), + True, + ), + # insn/offset from mnemonic: add + # 0010514c add x23,param_1,#0x8 + ( + "687e79.ghidra.be2", + "function=0x105128,bb=0x105128,insn=0x10514c", + capa.features.insn.Offset(0x8), + True, + ), + # insn/api + # not extracting dll name + ("687e79.ghidra.be2", "function=0x105c88", capa.features.insn.API("memset"), True), + ("687e79.ghidra.be2", "function=0x105c88", capa.features.insn.API("Nope"), False), + # insn/string + ( + "687e79.ghidra.be2", + "function=0x107588", + 
capa.features.common.String("AppDataService start"), + True, + ), + ( + "687e79.ghidra.be2", + "function=0x1075c0", + capa.features.common.String("AppDataService"), + True, + ), + ("687e79.ghidra.be2", "function=0x107588", capa.features.common.String("nope"), False), + ( + "687e79.ghidra.be2", + "function=0x106d58", + capa.features.common.String("/data/misc/wifi/wpa_supplicant.conf"), + True, + ), + # insn/regex + ( + "687e79.ghidra.be2", + "function=0x105c88", + capa.features.common.Regex("innerRename"), + True, + ), + ( + "687e79.ghidra.be2", + "function=0x106d58", + capa.features.common.Regex("/data/misc"), + True, + ), + ( + "687e79.ghidra.be2", + "function=0x106d58", + capa.features.common.Substring("/data/misc"), + True, + ), + # insn/bytes + ( + "d1e650.ghidra.be2", + "function=0x1165a4", + capa.features.common.Bytes(binascii.unhexlify("E405B89370BA6B419CD7925275BF6FCC1E8360CC")), + True, + ), + # # don't extract byte features for obvious strings + ( + "687e79.ghidra.be2", + "function=0x1057f8", + capa.features.common.Bytes("/system/xbin/busybox".encode("utf-16le")), + False, + ), + # insn/characteristic(nzxor) + ( + "d1e650.ghidra.be2", + "function=0x114af4", + capa.features.common.Characteristic("nzxor"), + True, + ), + ( + "d1e650.ghidra.be2", + "function=0x117988", + capa.features.common.Characteristic("nzxor"), + True, + ), + # # insn/characteristic(cross section flow) + # ("a1982...", "function=0x4014D0", capa.features.common.Characteristic("cross section flow"), True), + # # insn/characteristic(cross section flow): imports don't count + # ("mimikatz", "function=0x4556E5", capa.features.common.Characteristic("cross section flow"), False), + # insn/characteristic(recursive call) + ( + "687e79.ghidra.be2", + "function=0x105b38", + capa.features.common.Characteristic("recursive call"), + True, + ), + ( + "687e79.ghidra.be2", + "function=0x106530", + capa.features.common.Characteristic("recursive call"), + True, + ), + # insn/characteristic(indirect call) 
+ ("d1e650.ghidra.be2", "function=0x118620", capa.features.common.Characteristic("indirect call"), True), + ( + "d1e650.ghidra.be2", + "function=0x118500", + capa.features.common.Characteristic("indirect call"), + False, + ), + ("d1e650.ghidra.be2", "function=0x118620", capa.features.common.Characteristic("indirect call"), True), + ( + "d1e650.ghidra.be2", + "function=0x11451c", + capa.features.common.Characteristic("indirect call"), + True, + ), + # insn/characteristic(calls from) + ( + "687e79.ghidra.be2", + "function=0x105080", + capa.features.common.Characteristic("calls from"), + True, + ), + ( + "687e79.ghidra.be2", + "function=0x1070e8", + capa.features.common.Characteristic("calls from"), + False, + ), + # function/characteristic(calls to) + ( + "687e79.ghidra.be2", + "function=0x1075c0", + capa.features.common.Characteristic("calls to"), + True, + ), + # file/function-name + ( + "687e79.ghidra.be2", + "file", + capa.features.file.FunctionName("__libc_init"), + "xfail: TODO should this be a function-name?", + ), + # os & format & arch + ("687e79.ghidra.be2", "file", OS(OS_ANDROID), True), + ("687e79.ghidra.be2", "file", OS(OS_LINUX), False), + ("687e79.ghidra.be2", "file", OS(OS_WINDOWS), False), + # os & format & arch are also global features + ("687e79.ghidra.be2", "function=0x107588", OS(OS_ANDROID), True), + ("687e79.ghidra.be2", "function=0x1075c0,bb=0x1076c0", OS(OS_ANDROID), True), + ("687e79.ghidra.be2", "file", Arch(ARCH_I386), False), + ("687e79.ghidra.be2", "file", Arch(ARCH_AMD64), False), + ("687e79.ghidra.be2", "file", Arch(ARCH_AARCH64), True), + ("687e79.ghidra.be2", "function=0x107588", Arch(ARCH_AARCH64), True), + ("687e79.ghidra.be2", "function=0x1075c0,bb=0x1076c0", Arch(ARCH_AARCH64), True), + ("687e79.ghidra.be2", "file", Format(FORMAT_ELF), True), + ("687e79.ghidra.be2", "file", Format(FORMAT_PE), False), + ("687e79.ghidra.be2", "function=0x107588", Format(FORMAT_ELF), True), + ("687e79.ghidra.be2", "function=0x107588", 
Format(FORMAT_PE), False), + ], + # order tests by (file, item) + # so that our LRU cache is most effective. + key=lambda t: (t[0], t[1]), +) + + +@fixtures.parametrize( + "sample,scope,feature,expected", + FEATURE_PRESENCE_TESTS_BE2_ELF_AARCH64, + indirect=["sample", "scope"], +) +def test_binexport_features_elf_aarch64(sample, scope, feature, expected): + if not isinstance(expected, bool): + # (for now) xfails indicates using string like: "xfail: not implemented yet" + pytest.xfail(expected) + fixtures.do_test_feature_presence(fixtures.get_binexport_extractor, sample, scope, feature, expected) + + +@fixtures.parametrize( + "sample,scope,feature,expected", + fixtures.FEATURE_PRESENCE_TESTS, + indirect=["sample", "scope"], +) +def test_binexport_features_pe_x86(sample, scope, feature, expected): + if "mimikatz.exe_" not in sample.name: + pytest.skip("for now only testing mimikatz.exe_ Ghidra BinExport file") + + if isinstance(feature, capa.features.common.Characteristic) and "stack string" in cast(str, feature.value): + pytest.skip("for now only testing basic features") + + sample = sample.parent / "binexport2" / (sample.name + ".ghidra.BinExport") + assert sample.exists() + fixtures.do_test_feature_presence(fixtures.get_binexport_extractor, sample, scope, feature, expected) + + +@fixtures.parametrize( + "sample,scope,feature,expected", + fixtures.FEATURE_COUNT_TESTS_GHIDRA, + indirect=["sample", "scope"], +) +def test_binexport_feature_counts_ghidra(sample, scope, feature, expected): + if "mimikatz.exe_" not in sample.name: + pytest.skip("for now only testing mimikatz.exe_ Ghidra BinExport file") + sample = sample.parent / "binexport2" / (sample.name + ".ghidra.BinExport") + assert sample.exists() + fixtures.do_test_feature_count(fixtures.get_binexport_extractor, sample, scope, feature, expected) diff --git a/tests/test_scripts.py b/tests/test_scripts.py index 06a6e9fef..31d079b86 100644 --- a/tests/test_scripts.py +++ b/tests/test_scripts.py @@ -6,6 +6,7 @@ # is 
distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. +import os import sys import logging import textwrap @@ -38,6 +39,10 @@ def get_cape_report_file_path(): ) +def get_binexport2_file_path(): + return str(CD / "data" / "binexport2" / "mimikatz.exe_.ghidra.BinExport") + + def get_rules_path(): return str(CD / ".." / "rules") @@ -75,6 +80,22 @@ def test_scripts(script, args): assert p.returncode == 0 +@pytest.mark.parametrize( + "script,args", + [ + pytest.param("inspect-binexport2.py", [get_binexport2_file_path()]), + pytest.param("detect-binexport2-capabilities.py", [get_binexport2_file_path()]), + ], +) +def test_binexport_scripts(script, args): + # define sample bytes location + os.environ["CAPA_SAMPLES_DIR"] = str(Path(CD / "data")) + + script_path = get_script_path(script) + p = run_program(script_path, args) + assert p.returncode == 0 + + def test_bulk_process(tmp_path): # create test directory to recursively analyze t = tmp_path / "test"