From 76eed3d9fc3a1b7bab9b4295ea7da6ecd8fe2aad Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 9 Sep 2024 12:13:36 +0000 Subject: [PATCH] binexport: introduce instruction pattern matching Introduce instruction pattern matching to declaratively describe the instructions and operands that we want to extract. While there's a bit more code, it's much more thoroughly tested, and is less brittle than the prior if/else/if/else/if/else implementation. --- .../extractors/binexport2/arch/arm/insn.py | 154 +++------ .../extractors/binexport2/arch/intel/insn.py | 252 +++++++------- .../features/extractors/binexport2/helpers.py | 316 +++++++++++++++++- scripts/inspect-binexport2.py | 16 +- tests/fixtures.py | 4 +- tests/test_binexport_accessors.py | 257 ++++++++++++++ 6 files changed, 749 insertions(+), 250 deletions(-) diff --git a/capa/features/extractors/binexport2/arch/arm/insn.py b/capa/features/extractors/binexport2/arch/arm/insn.py index 2f2fc0f52..72cfd4ae1 100644 --- a/capa/features/extractors/binexport2/arch/arm/insn.py +++ b/capa/features/extractors/binexport2/arch/arm/insn.py @@ -15,11 +15,10 @@ from capa.features.extractors.binexport2 import FunctionContext, InstructionContext from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle from capa.features.extractors.binexport2.helpers import ( + BinExport2InstructionPatternMatcher, mask_immediate, is_address_mapped, - get_operand_expressions, get_instruction_mnemonic, - get_instruction_operands, get_operand_register_expression, get_operand_immediate_expression, ) @@ -50,10 +49,10 @@ def extract_insn_number_features( if mnemonic in ("add", "sub"): assert len(instruction.operand_index) == 3 - expression1: Optional[BinExport2.Expression] = get_operand_register_expression( + operand1_expression: Optional[BinExport2.Expression] = get_operand_register_expression( be2, be2.operand[instruction.operand_index[1]] ) - if expression1 and is_stack_register_expression(be2, expression1): + if 
operand1_expression and is_stack_register_expression(be2, operand1_expression): # skip things like: # add x0,sp,#0x8 return @@ -78,6 +77,18 @@ def extract_insn_number_features( yield OperandOffset(i, value), ih.address +OFFSET_PATTERNS = BinExport2InstructionPatternMatcher.from_str( + """ + ldr|ldrb|ldrh|ldrsb|ldrsh|ldrex|ldrd|str|strb|strh|strex|strd reg, [reg(not-stack), #int] ; capture #int + ldr|ldrb|ldrh|ldrsb|ldrsh|ldrex|ldrd|str|strb|strh|strex|strd reg, [reg(not-stack), #int]! ; capture #int + ldr|ldrb|ldrh|ldrsb|ldrsh|ldrex|ldrd|str|strb|strh|strex|strd reg, [reg(not-stack)], #int ; capture #int + ldp|ldpd|stp|stpd reg, reg, [reg(not-stack), #int] ; capture #int + ldp|ldpd|stp|stpd reg, reg, [reg(not-stack), #int]! ; capture #int + ldp|ldpd|stp|stpd reg, reg, [reg(not-stack)], #int ; capture #int + """ +) + + def extract_insn_offset_features( fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle ) -> Iterator[Tuple[Feature, Address]]: @@ -85,104 +96,26 @@ def extract_insn_offset_features( ii: InstructionContext = ih.inner be2: BinExport2 = fhi.ctx.be2 - instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index] - - if len(instruction.operand_index) == 0: - # skip things like: - # .text:0040116e leave - return - mnemonic: str = get_instruction_mnemonic(be2, instruction) - value: Optional[int] = None - value_index: Optional[int] = None - - operands: List[BinExport2.Operand] - immediate_expression: Optional[BinExport2.Expression] - - if mnemonic.startswith(("ldr", "str")): - operands = get_instruction_operands(be2, instruction) - expressions1: List[BinExport2.Expression] - - if len(operands) == 2: - # like: - # ldr x0, [x1, 8] - expressions1 = get_operand_expressions(be2, operands[1]) - - if len(expressions1) == 4: - # like: - # ldr x0, [x1, 8] - if not is_stack_register_expression(be2, expressions1[1]): - if expressions1[3].type == BinExport2.Expression.IMMEDIATE_INT: - value = expressions1[3].immediate - value_index = 1 - - elif 
len(expressions1) == 5: - # like - # ldr x0, [x1, 8]! - if not is_stack_register_expression(be2, expressions1[2]): - if expressions1[4].type == BinExport2.Expression.IMMEDIATE_INT: - value = expressions1[4].immediate - value_index = 1 - - elif len(operands) == 3: - # like: - # ldr x0, [x1], 8 - expressions1 = get_operand_expressions(be2, operands[1]) - if not is_stack_register_expression(be2, expressions1[1]): - immediate_expression = get_operand_immediate_expression(be2, operands[2]) - - if immediate_expression: - value = immediate_expression.immediate - value_index = 2 - - elif mnemonic in ("ldp", "stp"): - operands = get_instruction_operands(be2, instruction) - expressions2: List[BinExport2.Expression] - - if len(operands) == 3: - # like: - # ldp x0, x1, [x3, 8]! - expressions2 = get_operand_expressions(be2, operands[2]) - - if len(expressions2) == 4: - # like: - # ldp x0, x1, [x3, 8] - if not is_stack_register_expression(be2, expressions2[1]): - if expressions2[3].type == BinExport2.Expression.IMMEDIATE_INT: - value = expressions2[3].immediate - value_index = 2 - - elif len(expressions2) == 5: - # like: - # ldp x0, x1, [x3, 8]! 
- if not is_stack_register_expression(be2, expressions2[2]): - if expressions2[4].type == BinExport2.Expression.IMMEDIATE_INT: - value = expressions2[4].immediate - value_index = 2 - - elif len(operands) == 4: - # like - # ldp x0, x1, [x3], 8 - expressions2 = get_operand_expressions(be2, operands[2]) - - if not is_stack_register_expression(be2, expressions2[1]): - immediate_expression = get_operand_immediate_expression(be2, operands[3]) - - if immediate_expression: - value = immediate_expression.immediate - value_index = 3 - - if value is None: + match = OFFSET_PATTERNS.match_with_be2(be2, ii.instruction_index) + if not match: return - # we shouldn't make it here if index is not set - assert value_index is not None + value = match.expression.immediate value = mask_immediate(fhi.arch, value) if not is_address_mapped(be2, value): value = capa.features.extractors.binexport2.helpers.twos_complement(fhi.arch, value) yield Offset(value), ih.address - yield OperandOffset(value_index, value), ih.address + yield OperandOffset(match.operand_index, value), ih.address + + +NZXOR_PATTERNS = BinExport2InstructionPatternMatcher.from_str( + """ + eor reg, reg, reg + eor reg, reg, #int + """ +) def extract_insn_nzxor_characteristic_features( @@ -190,42 +123,33 @@ def extract_insn_nzxor_characteristic_features( ) -> Iterator[Tuple[Feature, Address]]: fhi: FunctionContext = fh.inner ii: InstructionContext = ih.inner - be2: BinExport2 = fhi.ctx.be2 - instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index] - mnemonic: str = get_instruction_mnemonic(be2, instruction) - - if mnemonic != "eor": + if NZXOR_PATTERNS.match_with_be2(be2, ii.instruction_index) is None: return + instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index] + # guaranteed to be simple int/reg operands + # so we don't have to realize the tree/list. 
operands: List[BinExport2.Operand] = [be2.operand[operand_index] for operand_index in instruction.operand_index] - assert len(operands) == 3 - if operands[1] != operands[2]: yield Characteristic("nzxor"), ih.address +INDIRECT_CALL_PATTERNS = BinExport2InstructionPatternMatcher.from_str( + """ + blx|bx|blr reg0 ; capture reg0 + """ +) + + def extract_function_indirect_call_characteristic_features( fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle ) -> Iterator[Tuple[Feature, Address]]: fhi: FunctionContext = fh.inner ii: InstructionContext = ih.inner - be2: BinExport2 = fhi.ctx.be2 - instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index] - mnemonic: str = get_instruction_mnemonic(be2, instruction) - - if mnemonic not in ("blx", "bx", "blr"): - return - - assert len(instruction.operand_index) == 1 - - expressions: List[BinExport2.Expression] = get_operand_expressions(be2, be2.operand[instruction.operand_index[0]]) - - assert len(expressions) == 1 - - if expressions[0].type == BinExport2.Expression.REGISTER: + if INDIRECT_CALL_PATTERNS.match_with_be2(be2, ii.instruction_index) is not None: yield Characteristic("indirect call"), ih.address diff --git a/capa/features/extractors/binexport2/arch/intel/insn.py b/capa/features/extractors/binexport2/arch/intel/insn.py index 3b4621acd..3c3c0b767 100644 --- a/capa/features/extractors/binexport2/arch/intel/insn.py +++ b/capa/features/extractors/binexport2/arch/intel/insn.py @@ -13,25 +13,56 @@ from capa.features.insn import MAX_STRUCTURE_SIZE, Number, Offset, OperandNumber, OperandOffset from capa.features.common import Feature, Characteristic from capa.features.address import Address -from capa.features.extractors.binexport2 import FunctionContext, BasicBlockContext, InstructionContext, BinExport2Index +from capa.features.extractors.binexport2 import BinExport2Index, FunctionContext, BasicBlockContext, InstructionContext from capa.features.extractors.base_extractor import BBHandle, InsnHandle, 
FunctionHandle from capa.features.extractors.binexport2.helpers import ( + BinExport2InstructionPatternMatcher, mask_immediate, is_address_mapped, get_instruction_mnemonic, - get_operand_register_expression, get_operand_immediate_expression, ) from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 from capa.features.extractors.binexport2.arch.intel.helpers import ( SECURITY_COOKIE_BYTES_DELTA, - OperandPhraseInfo, - get_operand_phrase_info, ) logger = logging.getLogger(__name__) +IGNORE_NUMBER_PATTERNS = BinExport2InstructionPatternMatcher.from_str( + """ + ret #int + retn #int + add reg(stack), #int + sub reg(stack), #int + """ +) + +NUMBER_PATTERNS = BinExport2InstructionPatternMatcher.from_str( + """ + push #int0 ; capture #int0 + + # its a little tedious to enumerate all the address forms + # but at least we are explicit + cmp|and|or|test|mov|add|adc|sub|shl|shr|sal|sar reg, #int0 ; capture #int0 + cmp|and|or|test|mov|add|adc|sub|shl|shr|sal|sar [reg], #int0 ; capture #int0 + cmp|and|or|test|mov|add|adc|sub|shl|shr|sal|sar [#int], #int0 ; capture #int0 + cmp|and|or|test|mov|add|adc|sub|shl|shr|sal|sar [reg + #int], #int0 ; capture #int0 + cmp|and|or|test|mov|add|adc|sub|shl|shr|sal|sar [reg + reg + #int], #int0 ; capture #int0 + cmp|and|or|test|mov|add|adc|sub|shl|shr|sal|sar [reg + reg * #int], #int0 ; capture #int0 + cmp|and|or|test|mov|add|adc|sub|shl|shr|sal|sar [reg + reg * #int + #int], #int0 ; capture #int0 + + imul reg, reg, #int ; capture #int + # note that int is first + cmp|test #int0, reg ; capture #int0 + + # imagine reg is zero'd out, then this is like `mov reg, #int` + # which is not uncommon. 
+ lea reg, [reg + #int] ; capture #int + """ +) + def extract_insn_number_features( fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle ) -> Iterator[Tuple[Feature, Address]]: @@ -40,48 +71,60 @@ def extract_insn_number_features( be2: BinExport2 = fhi.ctx.be2 - instruction_index: int = ii.instruction_index - instruction: BinExport2.Instruction = be2.instruction[instruction_index] - - if len(instruction.operand_index) == 0: - # skip things like: - # .text:0040116e leave + if IGNORE_NUMBER_PATTERNS.match_with_be2(be2, ii.instruction_index): return - mnemonic: str = get_instruction_mnemonic(be2, instruction) + match = NUMBER_PATTERNS.match_with_be2(be2, ii.instruction_index) + if not match: + return - if mnemonic.startswith("ret"): - # skip things like: - # .text:0042250E retn 8 + value: int = mask_immediate(fhi.arch, match.expression.immediate) + if is_address_mapped(be2, value): return - if mnemonic.startswith(("add", "sub")): - register_expression: Optional[BinExport2.Expression] = get_operand_register_expression( - be2, be2.operand[instruction.operand_index[0]] - ) - if register_expression and register_expression.symbol.lower().endswith(("sp", "bp")): - # skip things like: - # 0x415bbc ADD ESP, 0xC - return + yield Number(value), ih.address + yield OperandNumber(match.operand_index, value), ih.address - for i, operand_index in enumerate(instruction.operand_index): - operand: BinExport2.Operand = be2.operand[operand_index] + instruction_index: int = ii.instruction_index + instruction: BinExport2.Instruction = be2.instruction[instruction_index] - immediate_expression: Optional[BinExport2.Expression] = get_operand_immediate_expression(be2, operand) - if not immediate_expression: - continue + mnemonic: str = get_instruction_mnemonic(be2, instruction) + if mnemonic.startswith("add"): + if 0 < value < MAX_STRUCTURE_SIZE: + yield Offset(value), ih.address + yield OperandOffset(match.operand_index, value), ih.address - value: int = mask_immediate(fhi.arch, 
immediate_expression.immediate) - if is_address_mapped(be2, value): - continue - yield Number(value), ih.address - yield OperandNumber(i, value), ih.address +OFFSET_PATTERNS = BinExport2InstructionPatternMatcher.from_str( + """ + mov|movzx|movsb|cmp [reg + reg * #int + #int0], #int ; capture #int0 + mov|movzx|movsb|cmp [reg * #int + #int0], #int ; capture #int0 + mov|movzx|movsb|cmp [reg + reg + #int0], #int ; capture #int0 + mov|movzx|movsb|cmp [reg(not-stack) + #int0], #int ; capture #int0 + mov|movzx|movsb|cmp [reg + reg * #int + #int0], reg ; capture #int0 + mov|movzx|movsb|cmp [reg * #int + #int0], reg ; capture #int0 + mov|movzx|movsb|cmp [reg + reg + #int0], reg ; capture #int0 + mov|movzx|movsb|cmp [reg(not-stack) + #int0], reg ; capture #int0 + mov|movzx|movsb|cmp|lea reg, [reg + reg * #int + #int0] ; capture #int0 + mov|movzx|movsb|cmp|lea reg, [reg * #int + #int0] ; capture #int0 + mov|movzx|movsb|cmp|lea reg, [reg + reg + #int0] ; capture #int0 + mov|movzx|movsb|cmp|lea reg, [reg(not-stack) + #int0] ; capture #int0 + """ + # TODO: maybe also have to add REPxx MOVS/STOS/CMPS/LODS/SCAS + # but how are these encoded/passed from BinExport2? +) - if mnemonic.startswith("add"): - if 0 < value < MAX_STRUCTURE_SIZE: - yield Offset(value), ih.address - yield OperandOffset(i, value), ih.address +# these are patterns that access offset 0 from some pointer +# (pointer is not the stack pointer). +OFFSET_ZERO_PATTERNS = BinExport2InstructionPatternMatcher.from_str( + """ + mov|movzx|movsb [reg(not-stack)], reg + mov|movzx|movsb [reg(not-stack)], #int + lea reg, [reg(not-stack)] + """ + # TODO: maybe also have to add REPxx MOVS/STOS/CMPS/LODS/SCAS + # but how are these encoded/passed from BinExport2? 
+) def extract_insn_offset_features( @@ -91,55 +134,23 @@ def extract_insn_offset_features( ii: InstructionContext = ih.inner be2: BinExport2 = fhi.ctx.be2 - instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index] - - if len(instruction.operand_index) == 0: - # skip things like: - # .text:0040116e leave - return - - mnemonic: str = get_instruction_mnemonic(be2, instruction) - value: int - - for i, operand_index in enumerate(instruction.operand_index): - operand: BinExport2.Operand = be2.operand[operand_index] - - is_dereference = False - for expression_index in operand.expression_index: - if be2.expression[expression_index].type == BinExport2.Expression.DEREFERENCE: - is_dereference = True - break - if not is_dereference: - continue - - phrase_info: Optional[OperandPhraseInfo] = get_operand_phrase_info(be2, operand) - if not phrase_info: - continue - - if phrase_info.displacement: - if phrase_info.base and phrase_info.base.symbol.lower().endswith(("bp", "sp")): - # skips things like: - # 00401068 MOV dword ptr [EBP + local_8],EAX - continue - - value = mask_immediate(fhi.arch, phrase_info.displacement.immediate) - if not is_address_mapped(be2, value): - value = capa.features.extractors.binexport2.helpers.twos_complement(fhi.arch, value, 32) + match = OFFSET_PATTERNS.match_with_be2(be2, ii.instruction_index) + if not match: + match = OFFSET_ZERO_PATTERNS.match_with_be2(be2, ii.instruction_index) + if not match: + return - yield Offset(value), ih.address - yield OperandOffset(i, value), ih.address + yield Offset(0), ih.address + yield OperandOffset(match.operand_index, 0), ih.address - if mnemonic == "lea" and i == 1: - if phrase_info.base and not any((phrase_info.scale, phrase_info.index)): - yield Number(value), ih.address - yield OperandNumber(i, value), ih.address + value = mask_immediate(fhi.arch, match.expression.immediate) + if is_address_mapped(be2, value): + return - elif phrase_info.base and not any((phrase_info.index, 
phrase_info.scale)): - # like: - # 00401062 MOVZX EAX,word ptr [EDI] - yield Offset(0), ih.address - yield OperandOffset(i, 0), ih.address + value = capa.features.extractors.binexport2.helpers.twos_complement(fhi.arch, value, 32) + yield Offset(value), ih.address + yield OperandOffset(match.operand_index, value), ih.address def is_security_cookie( @@ -177,6 +188,14 @@ def is_security_cookie( return False +NZXOR_PATTERNS = BinExport2InstructionPatternMatcher.from_str( + """ + xor|xorpd|xorps|pxor reg, reg + xor|xorpd|xorps|pxor reg, #int + """ +) + + def extract_insn_nzxor_characteristic_features( fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle ) -> Iterator[Tuple[Feature, Address]]: @@ -188,65 +207,48 @@ def extract_insn_nzxor_characteristic_features( ii: InstructionContext = ih.inner be2: BinExport2 = fhi.ctx.be2 + idx: BinExport2Index = fhi.ctx.idx - instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index] - mnemonic: str = get_instruction_mnemonic(be2, instruction) - if mnemonic not in ( - "xor", - "xorpd", - "xorps", - "pxor", - ): + if NZXOR_PATTERNS.match_with_be2(be2, ii.instruction_index) is None: return + instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index] + # guaranteed to be simple int/reg operands + # so we don't have to realize the tree/list. 
operands: List[BinExport2.Operand] = [be2.operand[operand_index] for operand_index in instruction.operand_index] - if mnemonic in ("xor", "xorpd", "xorps", "pxor"): - if operands[0] == operands[1]: - return - instruction_address: int = idx.insn_address_by_index[ii.instruction_index] - if is_security_cookie(fhi, bbh.inner, instruction_address, instruction): - return + if operands[0] == operands[1]: + return + + instruction_address: int = idx.insn_address_by_index[ii.instruction_index] + if is_security_cookie(fhi, bbh.inner, instruction_address, instruction): + return yield Characteristic("nzxor"), ih.address +INDIRECT_CALL_PATTERNS = BinExport2InstructionPatternMatcher.from_str( + """ + call|jmp reg0 + call|jmp [reg + reg * #int + #int] + call|jmp [reg + reg * #int] + call|jmp [reg * #int + #int] + call|jmp [reg + reg + #int] + call|jmp [reg + #int] + call|jmp [reg] + """ +) + + def extract_function_indirect_call_characteristic_features( fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle ) -> Iterator[Tuple[Feature, Address]]: fhi: FunctionContext = fh.inner ii: InstructionContext = ih.inner - be2: BinExport2 = fhi.ctx.be2 - instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index] - if len(instruction.operand_index) == 0: - # skip things like: - # .text:0040116e leave - return - - mnemonic: str = get_instruction_mnemonic(be2, instruction) - if mnemonic not in ("call", "jmp"): + match = INDIRECT_CALL_PATTERNS.match_with_be2(be2, ii.instruction_index) + if match is None: return - assert len(instruction.operand_index) == 1 - - operand: BinExport2.Operand = be2.operand[instruction.operand_index[0]] - - if len(operand.expression_index) == 1: - expression0: BinExport2.Expression = be2.expression[operand.expression_index[0]] - # call edx - if expression0.type == BinExport2.Expression.REGISTER: - yield Characteristic("indirect call"), ih.address - else: - is_dereference = False - for expression_index in operand.expression_index: - if 
be2.expression[expression_index].type == BinExport2.Expression.DEREFERENCE: - is_dereference = True - break - - if is_dereference: - phrase_info: Optional[OperandPhraseInfo] = get_operand_phrase_info(be2, operand) - if phrase_info and phrase_info.base: - # call dword ptr [eax+50h] - yield Characteristic("indirect call"), ih.address + yield Characteristic("indirect call"), ih.address diff --git a/capa/features/extractors/binexport2/helpers.py b/capa/features/extractors/binexport2/helpers.py index 4de4623dd..447f62f69 100644 --- a/capa/features/extractors/binexport2/helpers.py +++ b/capa/features/extractors/binexport2/helpers.py @@ -5,9 +5,13 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. -from typing import Set, List, Iterator, Optional +import re +from typing import Set, Dict, List, Tuple, Union, Iterator, Optional +from collections import defaultdict +from dataclasses import dataclass import capa.features.extractors.helpers +import capa.features.extractors.binexport2.helpers from capa.features.common import ARCH_I386, ARCH_AMD64, ARCH_AARCH64 from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 @@ -329,3 +333,313 @@ def get_instruction_mnemonic(be2: BinExport2, instruction: BinExport2.Instructio def get_instruction_operands(be2: BinExport2, instruction: BinExport2.Instruction) -> List[BinExport2.Operand]: return [be2.operand[operand_index] for operand_index in instruction.operand_index] + + +def split_with_delimiters(s: str, delimiters: Tuple[str, ...]) -> Iterator[str]: + """ + Splits a string by any of the provided delimiter characters, + including the delimiters in the results. + + Args: + string: The string to split. 
+ delimiters: A tuple of single-character strings to use as delimiters. + """ + start = 0 + for i, char in enumerate(s): + if char in delimiters: + yield s[start:i] + yield char + start = i + 1 + + if start < len(s): + yield s[start:] + + +BinExport2OperandPattern = Union[str, Tuple[str, ...]] + + +@dataclass +class BinExport2InstructionPattern: + """ + This describes a way to match disassembled instructions, with mnemonics and operands. + + You can specify constraints on the instruction, via: + - the mnemonics, like "mov", + - number of operands, and + - format of each operand, "[reg, reg, #int]". + + During matching, you can also capture a single element, to see its concrete value. + For example, given the pattern: + + mov reg0, #int0 ; capture #int0 + + and the instruction: + + mov eax, 1 + + Then the capture will contain the immediate integer 1. + + This matcher uses the BinExport2 data layout under the hood. + """ + + mnemonics: Tuple[str, ...] + operands: Tuple[Union[str, BinExport2OperandPattern], ...] + capture: Optional[str] + + @classmethod + def from_str(cls, query: str): + """ + Parse a pattern string into a Pattern instance. + The supported syntax is like this: + + br reg + br reg ; capture reg + br reg(stack) ; capture reg + br reg(not-stack) ; capture reg + mov reg0, reg1 ; capture reg0 + adrp reg, #int ; capture #int + add reg, reg, #int ; capture #int + ldr reg0, [reg1] ; capture reg1 + ldr|str reg, [reg, #int] ; capture #int + ldr|str reg, [reg(stack), #int] ; capture #int + ldr|str reg, [reg(not-stack), #int] ; capture #int + ldr|str reg, [reg, #int]! ; capture #int + ldr|str reg, [reg], #int ; capture #int + ldp|stp reg, reg, [reg, #int] ; capture #int + ldp|stp reg, reg, [reg, #int]! ; capture #int + ldp|stp reg, reg, [reg], #int ; capture #int + """ + # + # The implementation of the parser here is obviously ugly. + # It's handwritten and probably fragile. But since we don't + # expect this to be widely used, it's probably ok. 
+ # Don't hesitate to rewrite this if it becomes more important. + # + # Note that this doesn't have to be very performant. + # We expect these patterns to be parsed once upfront and then reused + # (globally at the module level?) rather than within any loop. + # + + pattern, _, comment = query.strip().partition(";") + + # we don't support fs: yet + assert ":" not in pattern + + # from "capture #int" to "#int" + if comment: + comment = comment.strip() + assert comment.startswith("capture ") + capture = comment[len("capture ") :] + else: + capture = None + + # from "ldr|str ..." to ["ldr", "str"] + pattern = pattern.strip() + mnemonic, _, rest = pattern.partition(" ") + mnemonics = mnemonic.split("|") + + operands: List[Union[str, Tuple[str, ...]]] = [] + while rest: + rest = rest.strip() + if not rest.startswith("["): + # If it's not a dereference, which looks like `[op, op, op, ...]`, + # then it's a simple operand, which we can split by the next comma. + operand, _, rest = rest.partition(", ") + rest = rest.strip() + operands.append(operand) + + else: + # This looks like a dereference, something like `[op, op, op, ...]`. + # Since these can't be nested, look for the next ] and then parse backwards. + deref_end = rest.index("]") + try: + deref_end = rest.index(", ", deref_end) + deref_end += len(", ") + except ValueError: + deref = rest + rest = "" + else: + deref = rest[:deref_end] + rest = rest[deref_end:] + rest = rest.strip() + deref = deref.rstrip(" ") + deref = deref.rstrip(",") + + # like: [reg, #int]! + has_postindex_writeback = deref.endswith("!") + + deref = deref.rstrip("!") + deref = deref.rstrip("]") + deref = deref.lstrip("[") + + parts = tuple(split_with_delimiters(deref, (",", "+", "*"))) + parts = tuple(map(lambda s: s.strip(), parts)) + + if has_postindex_writeback: + operands.append(("!", "[") + parts) + else: + operands.append(("[",) + parts) + + for operand in operands: # type: ignore + # Try to ensure we've parsed the operands correctly. 
+ # This is just sanity checking. + for o in (operand,) if isinstance(operand, str) else operand: + # operands can look like: + # - reg + # - reg0 + # - reg(stack) + # - reg0(stack) + # - reg(not-stack) + # - reg0(not-stack) + # - #int + # - #int0 + # and a limited set of supported operators. + assert re.match(r"^(reg|#int)[0-9]?(\(stack\)|\(not-stack\))?$", o) or o in ("[", ",", "!", "+", "*") + + return cls(tuple(mnemonics), tuple(operands), capture) + + @dataclass + class MatchResult: + operand_index: int + expression_index: int + expression: BinExport2.Expression + + def match( + self, mnemonic: str, operand_expressions: List[List[BinExport2.Expression]] + ) -> Optional["BinExport2InstructionPattern.MatchResult"]: + """ + Match the given BinExport2 data against this pattern. + + The BinExport2 expression tree must have been flattened, such as with + capa.features.extractors.binexport2.helpers.get_operand_expressions. + + If there's a match, the captured Expression instance is returned. + Otherwise, you get None back. 
+ """ + if mnemonic not in self.mnemonics: + return None + + if len(self.operands) != len(operand_expressions): + return None + + captured = None + + for operand_index, found_expressions in enumerate(operand_expressions): + wanted_expressions = self.operands[operand_index] + + # from `"reg"` to `("reg", )` + if isinstance(wanted_expressions, str): + wanted_expressions = (wanted_expressions,) + assert isinstance(wanted_expressions, tuple) + + if len(wanted_expressions) != len(found_expressions): + return None + + for expression_index, (wanted_expression, found_expression) in enumerate( + zip(wanted_expressions, found_expressions) + ): + if wanted_expression.startswith("reg"): + if found_expression.type != BinExport2.Expression.REGISTER: + return None + + if wanted_expression.endswith(")"): + if wanted_expression.endswith("(not-stack)"): + # intel stack pointer: rsp, esp, sp + # intel frame pointer: rbp, ebp, bp + # arm: sp + register_name = found_expression.symbol.lower() + if register_name in ("rsp", "esp", "sp", "rbp", "ebp", "bp"): + return None + + elif wanted_expression.endswith("(stack)"): + register_name = found_expression.symbol.lower() + if register_name not in ("rsp", "esp", "sp", "rbp", "ebp", "bp"): + return None + + else: + raise ValueError("unexpected expression suffix", wanted_expression) + + if self.capture == wanted_expression: + captured = BinExport2InstructionPattern.MatchResult( + operand_index, expression_index, found_expression + ) + + elif wanted_expression.startswith("#int"): + if found_expression.type != BinExport2.Expression.IMMEDIATE_INT: + return None + + if self.capture == wanted_expression: + captured = BinExport2InstructionPattern.MatchResult( + operand_index, expression_index, found_expression + ) + + elif wanted_expression == "[": + if found_expression.type != BinExport2.Expression.DEREFERENCE: + return None + + elif wanted_expression in (",", "!", "+", "*"): + if found_expression.type != BinExport2.Expression.OPERATOR: + return None + + if 
found_expression.symbol != wanted_expression: + return None + + else: + raise ValueError(found_expression) + + if captured: + return captured + else: + # There were no captures, so + # return arbitrary non-None expression + return BinExport2InstructionPattern.MatchResult(operand_index, expression_index, found_expression) + + +class BinExport2InstructionPatternMatcher: + """Index and match a collection of instruction patterns.""" + + def __init__(self, queries: List[BinExport2InstructionPattern]): + self.queries = queries + # shard the patterns by (mnemonic, #operands) + self._index: Dict[Tuple[str, int], List[BinExport2InstructionPattern]] = defaultdict(list) + + for query in queries: + for mnemonic in query.mnemonics: + self._index[(mnemonic.lower(), len(query.operands))].append(query) + + @classmethod + def from_str(cls, patterns: str): + return cls([ + BinExport2InstructionPattern.from_str(line) + for line in + filter(lambda line: not line.startswith("#"), + map(lambda line: line.strip(), + patterns.split("\n"))) + ]) + + def match( + self, mnemonic: str, operand_expressions: List[List[BinExport2.Expression]] + ) -> Optional[BinExport2InstructionPattern.MatchResult]: + queries = self._index.get((mnemonic.lower(), len(operand_expressions)), []) + for query in queries: + captured = query.match(mnemonic.lower(), operand_expressions) + if captured: + return captured + + return None + + def match_with_be2( + self, be2: BinExport2, instruction_index: int + ) -> Optional[BinExport2InstructionPattern.MatchResult]: + instruction: BinExport2.Instruction = be2.instruction[instruction_index] + mnemonic: str = get_instruction_mnemonic(be2, instruction) + + if (mnemonic, len(instruction.operand_index)) not in self._index: + # verify that we might have a hit before we realize the operand expression list + return None + + operands = [] + for operand_index in instruction.operand_index: + operands.append(get_operand_expressions(be2, be2.operand[operand_index])) + + return 
self.match(mnemonic, operands) diff --git a/scripts/inspect-binexport2.py b/scripts/inspect-binexport2.py index b579fabd0..4ed353eb4 100644 --- a/scripts/inspect-binexport2.py +++ b/scripts/inspect-binexport2.py @@ -127,12 +127,15 @@ def _render_expression_tree( if len(children_tree_indexes) == 1: # prefix operator, like "ds:" - if expression.symbol != ",": - # or there's a binary operator, like ",", that's missing a child, - # such as when we prune "lsl" branches. + if expression.symbol != "!": o.write(expression.symbol) + child_index = children_tree_indexes[0] _render_expression_tree(be2, operand, expression_tree, child_index, o) + + # postfix operator, like "!" in aarch operand "[x1, 8]!" + if expression.symbol == "!": + o.write(expression.symbol) return elif len(children_tree_indexes) == 2: @@ -177,9 +180,7 @@ def _render_expression_tree( _OPERAND_CACHE: Dict[int, str] = {} -def render_operand( - be2: BinExport2, operand: BinExport2.Operand, index: Optional[int] = None -) -> str: +def render_operand(be2: BinExport2, operand: BinExport2.Operand, index: Optional[int] = None) -> str: # For the mimikatz example file, there are 138k distinct operands. # Of those, only 11k are unique, which is less than 10% of the total. # The most common operands are seen 37k, 24k, 17k, 15k, 11k, ... times. 
@@ -226,7 +227,7 @@ def rec(tree_index, indent=0): print(f" {' ' * indent}expression: {str(expression).replace('\n', ', ')}") for child_index in children_tree_indexes: - rec(child_index, indent+1) + rec(child_index, indent + 1) rec(0) @@ -238,7 +239,6 @@ def inspect_instruction(be2: BinExport2, instruction: BinExport2.Instruction, ad print(f" mnemonic: {mnemonic.name}") print(" operands:") - operands = [] for i, operand_index in enumerate(instruction.operand_index): print(f" - operand {i}: [{operand_index}]") operand = be2.operand[operand_index] diff --git a/tests/fixtures.py b/tests/fixtures.py index a2c16ef5e..e4d0a6fa0 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -818,7 +818,9 @@ def parametrize(params, values, **kwargs): ("mimikatz", "function=0x40105D", capa.features.insn.Offset(0x8), False), ("mimikatz", "function=0x40105D", capa.features.insn.Offset(0x10), False), # insn/offset: negative + # 0x4012b4 MOVZX ECX, [EAX+0xFFFFFFFFFFFFFFFF] ("mimikatz", "function=0x4011FB", capa.features.insn.Offset(-0x1), True), + # 0x4012b8 MOVZX EAX, [EAX+0xFFFFFFFFFFFFFFFE] ("mimikatz", "function=0x4011FB", capa.features.insn.Offset(-0x2), True), # # insn/offset from mnemonic: add @@ -841,7 +843,7 @@ def parametrize(params, values, **kwargs): # should not be considered, lea operand invalid encoding # .text:004717B1 8D 4C 31 D0 lea ecx, [ecx+esi-30h] ("mimikatz", "function=0x47153B,bb=0x4717AB,insn=0x4717B1", capa.features.insn.Number(-0x30), False), - # yes, this is also a number (imagine edx is zero): + # yes, this is also a number (imagine ebx is zero): # .text:004018C0 8D 4B 02 lea ecx, [ebx+2] ("mimikatz", "function=0x401873,bb=0x4018B2,insn=0x4018C0", capa.features.insn.Number(0x2), True), # insn/api diff --git a/tests/test_binexport_accessors.py b/tests/test_binexport_accessors.py index ba63b89c5..3aa78982f 100644 --- a/tests/test_binexport_accessors.py +++ b/tests/test_binexport_accessors.py @@ -15,13 +15,18 @@ import fixtures from 
google.protobuf.json_format import ParseDict +import capa.features.extractors.binexport2.helpers from capa.features.extractors.binexport2.helpers import ( + BinExport2InstructionPattern, + BinExport2InstructionPatternMatcher, + split_with_delimiters, get_operand_expressions, get_instruction_mnemonic, get_instruction_operands, get_operand_register_expression, get_operand_immediate_expression, ) +from capa.features.extractors.binexport2.extractor import BinExport2FeatureExtractor from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 from capa.features.extractors.binexport2.arch.arm.helpers import is_stack_register_expression @@ -343,3 +348,255 @@ def test_is_stack_register_expression(): assert is_stack_register_expression(BE2, op1_exp0) is True op2_exp0 = get_operand_expressions(BE2, add_op2)[0] assert is_stack_register_expression(BE2, op2_exp0) is False + + +def test_split_with_delimiters(): + assert tuple(split_with_delimiters("abc|def", ("|",))) == ("abc", "|", "def") + assert tuple(split_with_delimiters("abc|def|", ("|",))) == ("abc", "|", "def", "|") + assert tuple(split_with_delimiters("abc||def", ("|",))) == ("abc", "|", "", "|", "def") + assert tuple(split_with_delimiters("abc|def-ghi", ("|", "-"))) == ("abc", "|", "def", "-", "ghi") + + +def test_pattern_parsing(): + assert BinExport2InstructionPattern.from_str( + "br reg ; capture reg" + ) == BinExport2InstructionPattern(mnemonics=("br",), operands=("reg",), capture="reg") + + assert BinExport2InstructionPattern.from_str( + "mov reg0, reg1 ; capture reg0" + ) == BinExport2InstructionPattern(mnemonics=("mov",), operands=("reg0", "reg1"), capture="reg0") + + assert BinExport2InstructionPattern.from_str( + "adrp reg, #int ; capture #int" + ) == BinExport2InstructionPattern(mnemonics=("adrp",), operands=("reg", "#int"), capture="#int") + + assert BinExport2InstructionPattern.from_str( + "add reg, reg, #int ; capture #int" + ) == BinExport2InstructionPattern(mnemonics=("add",), 
operands=("reg", "reg", "#int"), capture="#int") + + assert BinExport2InstructionPattern.from_str( + "ldr reg0, [reg1] ; capture reg1" + ) == BinExport2InstructionPattern(mnemonics=("ldr",), operands=("reg0", ("[", "reg1")), capture="reg1") + + assert BinExport2InstructionPattern.from_str( + "ldr|str reg, [reg, #int] ; capture #int" + ) == BinExport2InstructionPattern( + mnemonics=( + "ldr", + "str", + ), + operands=("reg", ("[", "reg", ",", "#int")), + capture="#int", + ) + + assert BinExport2InstructionPattern.from_str( + "ldr|str reg, [reg, #int]! ; capture #int" + ) == BinExport2InstructionPattern( + mnemonics=( + "ldr", + "str", + ), + operands=("reg", ("!", "[", "reg", ",", "#int")), + capture="#int", + ) + + assert BinExport2InstructionPattern.from_str( + "ldr|str reg, [reg], #int ; capture #int" + ) == BinExport2InstructionPattern( + mnemonics=( + "ldr", + "str", + ), + operands=( + "reg", + ( + "[", + "reg", + ), + "#int", + ), + capture="#int", + ) + + assert BinExport2InstructionPattern.from_str( + "ldp|stp reg, reg, [reg, #int] ; capture #int" + ) == BinExport2InstructionPattern( + mnemonics=( + "ldp", + "stp", + ), + operands=("reg", "reg", ("[", "reg", ",", "#int")), + capture="#int", + ) + + assert BinExport2InstructionPattern.from_str( + "ldp|stp reg, reg, [reg, #int]! 
; capture #int" + ) == BinExport2InstructionPattern( + mnemonics=( + "ldp", + "stp", + ), + operands=("reg", "reg", ("!", "[", "reg", ",", "#int")), + capture="#int", + ) + + assert BinExport2InstructionPattern.from_str( + "ldp|stp reg, reg, [reg], #int ; capture #int" + ) == BinExport2InstructionPattern( + mnemonics=( + "ldp", + "stp", + ), + operands=("reg", "reg", ("[", "reg"), "#int"), + capture="#int", + ) + + assert ( + BinExport2InstructionPatternMatcher.from_str( + """ + # comment + br reg + br reg(not-stack) + br reg ; capture reg + mov reg0, reg1 ; capture reg0 + adrp reg, #int ; capture #int + add reg, reg, #int ; capture #int + ldr reg0, [reg1] ; capture reg1 + ldr|str reg, [reg, #int] ; capture #int + ldr|str reg, [reg, #int]! ; capture #int + ldr|str reg, [reg], #int ; capture #int + ldp|stp reg, reg, [reg, #int] ; capture #int + ldp|stp reg, reg, [reg, #int]! ; capture #int + ldp|stp reg, reg, [reg], #int ; capture #int + ldrb reg0, [reg1, reg2] ; capture reg2 + call [reg + reg * #int + #int] + call [reg + reg * #int] + call [reg * #int + #int] + call [reg + reg + #int] + call [reg + #int] + """ + ).queries + is not None + ) + + +def match_address(extractor: BinExport2FeatureExtractor, queries: BinExport2InstructionPatternMatcher, address: int): + instruction = extractor.idx.insn_by_address[address] + mnemonic: str = get_instruction_mnemonic(extractor.be2, instruction) + + operands = [] + for operand_index in instruction.operand_index: + operand = extractor.be2.operand[operand_index] + operands.append(capa.features.extractors.binexport2.helpers.get_operand_expressions(extractor.be2, operand)) + + return queries.match(mnemonic, operands) + + +def match_address_with_be2( + extractor: BinExport2FeatureExtractor, queries: BinExport2InstructionPatternMatcher, address: int +): + instruction_index = extractor.idx.insn_index_by_address[address] + return queries.match_with_be2(extractor.be2, instruction_index) + + +def test_pattern_matching(): + queries = 
BinExport2InstructionPatternMatcher.from_str( + """ + br reg(stack) ; capture reg + br reg(not-stack) ; capture reg + mov reg0, reg1 ; capture reg0 + adrp reg, #int ; capture #int + add reg, reg, #int ; capture #int + ldr reg0, [reg1] ; capture reg1 + ldr|str reg, [reg, #int] ; capture #int + ldr|str reg, [reg, #int]! ; capture #int + ldr|str reg, [reg], #int ; capture #int + ldp|stp reg, reg, [reg, #int] ; capture #int + ldp|stp reg, reg, [reg, #int]! ; capture #int + ldp|stp reg, reg, [reg], #int ; capture #int + ldrb reg0, [reg1(not-stack), reg2] ; capture reg2 + """ + ) + + # 0x210184: ldrb w2, [x0, x1] + # query: ldrb reg0, [reg1(not-stack), reg2] ; capture reg2" + assert match_address(BE2_EXTRACTOR, queries, 0x210184).expression.symbol == "x1" + assert match_address_with_be2(BE2_EXTRACTOR, queries, 0x210184).expression.symbol == "x1" + + # 0x210198: mov x2, x1 + # query: mov reg0, reg1 ; capture reg0"), + assert match_address(BE2_EXTRACTOR, queries, 0x210198).expression.symbol == "x2" + assert match_address_with_be2(BE2_EXTRACTOR, queries, 0x210198).expression.symbol == "x2" + + # 0x210190: add x1, x1, 0x1 + # query: add reg, reg, #int ; capture #int + assert match_address(BE2_EXTRACTOR, queries, 0x210190).expression.immediate == 1 + assert match_address_with_be2(BE2_EXTRACTOR, queries, 0x210190).expression.immediate == 1 + + +BE2_EXTRACTOR_687 = fixtures.get_binexport_extractor( + CD + / "data" + / "binexport2" + / "687e79cde5b0ced75ac229465835054931f9ec438816f2827a8be5f3bd474929.elf_.ghidra.BinExport" +) + + +def test_pattern_matching_exclamation(): + queries = BinExport2InstructionPatternMatcher.from_str( + """ + stp reg, reg, [reg, #int]! ; capture #int + """ + ) + + # note this captures the sp + # 0x107918: stp x20, x19, [sp,0xFFFFFFFFFFFFFFE0]! + # query: stp reg, reg, [reg, #int]! 
; capture #int + assert match_address(BE2_EXTRACTOR_687, queries, 0x107918).expression.immediate == 0xFFFFFFFFFFFFFFE0 + assert match_address_with_be2(BE2_EXTRACTOR_687, queries, 0x107918).expression.immediate == 0xFFFFFFFFFFFFFFE0 + + +def test_pattern_matching_stack(): + queries = BinExport2InstructionPatternMatcher.from_str( + """ + stp reg, reg, [reg(stack), #int]! ; capture #int + """ + ) + + # note this does capture the sp + # compare this with the test above (exclamation) + # 0x107918: stp x20, x19, [sp, 0xFFFFFFFFFFFFFFE0]! + # query: stp reg, reg, [reg(stack), #int]! ; capture #int + assert match_address(BE2_EXTRACTOR_687, queries, 0x107918).expression.immediate == 0xFFFFFFFFFFFFFFE0 + assert match_address_with_be2(BE2_EXTRACTOR_687, queries, 0x107918).expression.immediate == 0xFFFFFFFFFFFFFFE0 + + +def test_pattern_matching_not_stack(): + queries = BinExport2InstructionPatternMatcher.from_str( + """ + stp reg, reg, [reg(not-stack), #int]! ; capture #int + """ + ) + + # note this does not capture the sp + # compare this with the test above (exclamation) + # 0x107918: stp x20, x19, [sp, 0xFFFFFFFFFFFFFFE0]! + # query: stp reg, reg, [reg(not-stack), #int]! ; capture #int + assert match_address(BE2_EXTRACTOR_687, queries, 0x107918) is None + assert match_address_with_be2(BE2_EXTRACTOR_687, queries, 0x107918) is None + + +BE2_EXTRACTOR_MIMI = fixtures.get_binexport_extractor(CD / "data" / "binexport2" / "mimikatz.exe_.ghidra.BinExport") + + +def test_pattern_matching_x86(): + queries = BinExport2InstructionPatternMatcher.from_str( + """ + cmp|lea reg, [reg(not-stack) + #int0] ; capture #int0 + """ + ) + + # 0x4018c0: LEA ECX, [EBX+0x2] + # query: cmp|lea reg, [reg(not-stack) + #int0] ; capture #int0 + assert match_address(BE2_EXTRACTOR_MIMI, queries, 0x4018C0).expression.immediate == 2 + assert match_address_with_be2(BE2_EXTRACTOR_MIMI, queries, 0x4018C0).expression.immediate == 2