diff --git a/.github/mypy/mypy.ini b/.github/mypy/mypy.ini index 603f2e42f..b7d06e15e 100644 --- a/.github/mypy/mypy.ini +++ b/.github/mypy/mypy.ini @@ -1,5 +1,12 @@ [mypy] +# TODO(yelhamer): remove this once proto has been added +# for the dynamic rendering +exclude = (?x)( + ^capa/render/proto/__init__.py$ + | ^tests/_test_proto.py$ + ) + [mypy-halo.*] ignore_missing_imports = True diff --git a/CHANGELOG.md b/CHANGELOG.md index af4baea4f..6f85a13b9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ - Add a new thread scope for the dynamic analysis flavor #1517 @yelhamer - Add support for flavor-based rule scopes @yelhamer - Add ProcessesAddress and ThreadAddress #1612 @yelhamer +- Add dynamic capability extraction @yelhamer ### Breaking Changes diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index 82fc3515e..c45722316 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -7,6 +7,7 @@ # See the License for the specific language governing permissions and limitations under the License. import abc +import hashlib import dataclasses from typing import Any, Dict, Tuple, Union, Iterator from dataclasses import dataclass @@ -24,6 +25,24 @@ # the feature extractor from which they were created. +@dataclass +class SampleHashes: + md5: str + sha1: str + sha256: str + + @classmethod + def from_bytes(cls, buf: bytes) -> "SampleHashes": + md5 = hashlib.md5() + sha1 = hashlib.sha1() + sha256 = hashlib.sha256() + md5.update(buf) + sha1.update(buf) + sha256.update(buf) + + return cls(md5=md5.hexdigest(), sha1=sha1.hexdigest(), sha256=sha256.hexdigest()) + + @dataclass class FunctionHandle: """reference to a function recognized by a feature extractor. @@ -104,6 +123,13 @@ def get_base_address(self) -> Union[AbsoluteVirtualAddress, capa.features.addres """ raise NotImplementedError() + @abc.abstractmethod + def get_sample_hashes(self) -> SampleHashes: + """ + fetch the hashes for the sample contained within the extractor. + """ + raise NotImplementedError() + @abc.abstractmethod def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]: """ @@ -309,6 +335,23 @@ class DynamicFeatureExtractor: This class is not instantiated directly; it is the base class for other implementations. """ + __metaclass__ = abc.ABCMeta + + def __init__(self): + # + # note: a subclass should define ctor parameters for its own use. + # for example, the Vivisect feature extractor might require the vw and/or path. + # this base class doesn't know what to do with that info, though. + # + super().__init__() + + @abc.abstractmethod + def get_sample_hashes(self) -> SampleHashes: + """ + fetch the hashes for the sample contained within the extractor. + """ + raise NotImplementedError() + + @abc.abstractmethod def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]: """ diff --git a/capa/features/extractors/binja/extractor.py b/capa/features/extractors/binja/extractor.py index e8ce97502..9f63aebb1 100644 --- a/capa/features/extractors/binja/extractor.py +++ b/capa/features/extractors/binja/extractor.py @@ -6,6 +6,7 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. 
from typing import List, Tuple, Iterator +from pathlib import Path import binaryninja as binja @@ -17,7 +18,13 @@ import capa.features.extractors.binja.basicblock from capa.features.common import Feature from capa.features.address import Address, AbsoluteVirtualAddress -from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, StaticFeatureExtractor +from capa.features.extractors.base_extractor import ( + BBHandle, + InsnHandle, + SampleHashes, + FunctionHandle, + StaticFeatureExtractor, +) class BinjaFeatureExtractor(StaticFeatureExtractor): @@ -28,10 +35,14 @@ def __init__(self, bv: binja.BinaryView): self.global_features.extend(capa.features.extractors.binja.file.extract_file_format(self.bv)) self.global_features.extend(capa.features.extractors.binja.global_.extract_os(self.bv)) self.global_features.extend(capa.features.extractors.binja.global_.extract_arch(self.bv)) + self.sample_hashes = SampleHashes.from_bytes(Path(bv.file.original_filename).read_bytes()) def get_base_address(self): return AbsoluteVirtualAddress(self.bv.start) + def get_sample_hashes(self) -> SampleHashes: + return self.sample_hashes + def extract_global_features(self): yield from self.global_features diff --git a/capa/features/extractors/cape/extractor.py b/capa/features/extractors/cape/extractor.py index 5758d0bd0..881802d4b 100644 --- a/capa/features/extractors/cape/extractor.py +++ b/capa/features/extractors/cape/extractor.py @@ -15,7 +15,7 @@ import capa.features.extractors.cape.process from capa.features.common import Feature from capa.features.address import Address, AbsoluteVirtualAddress, _NoAddress -from capa.features.extractors.base_extractor import ThreadHandle, ProcessHandle, DynamicFeatureExtractor +from capa.features.extractors.base_extractor import SampleHashes, ThreadHandle, ProcessHandle, DynamicFeatureExtractor logger = logging.getLogger(__name__) @@ -28,6 +28,11 @@ def __init__(self, cape_version: str, static: Dict, behavior: Dict): self.cape_version = cape_version self.static = static self.behavior = behavior + self.sample_hashes = SampleHashes( + md5=static["file"]["md5"].lower(), + sha1=static["file"]["sha1"].lower(), + sha256=static["file"]["sha256"].lower(), + ) self.global_features = capa.features.extractors.cape.global_.extract_features(self.static) @@ -35,6 +40,9 @@ def get_base_address(self) -> Union[AbsoluteVirtualAddress, _NoAddress, None]: # value according to the PE header, the actual trace may use a different imagebase return AbsoluteVirtualAddress(self.static["pe"]["imagebase"]) + def get_sample_hashes(self) -> SampleHashes: + return self.sample_hashes + def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]: yield from self.global_features diff --git a/capa/features/extractors/dnfile/extractor.py b/capa/features/extractors/dnfile/extractor.py index 805dd497b..5d34b7cf4 100644 --- a/capa/features/extractors/dnfile/extractor.py +++ b/capa/features/extractors/dnfile/extractor.py @@ -22,7 +22,13 @@ from capa.features.common import Feature from capa.features.address import NO_ADDRESS, Address, DNTokenAddress, DNTokenOffsetAddress from capa.features.extractors.dnfile.types import DnType, DnUnmanagedMethod -from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, StaticFeatureExtractor +from capa.features.extractors.base_extractor import ( + BBHandle, + InsnHandle, + SampleHashes, + FunctionHandle, + StaticFeatureExtractor, +) from capa.features.extractors.dnfile.helpers import ( get_dotnet_types, 
get_dotnet_fields, @@ -72,6 +78,7 @@ class DnfileFeatureExtractor(StaticFeatureExtractor): def __init__(self, path: Path): super().__init__() self.pe: dnfile.dnPE = dnfile.dnPE(str(path)) + self.sample_hashes = SampleHashes.from_bytes(path.read_bytes()) # pre-compute .NET token lookup tables; each .NET method has access to this cache for feature extraction # most relevant at instruction scope @@ -86,6 +93,9 @@ def __init__(self, path: Path): def get_base_address(self): return NO_ADDRESS + def get_sample_hashes(self) -> SampleHashes: + return self.sample_hashes + def extract_global_features(self): yield from self.global_features diff --git a/capa/features/extractors/dnfile_.py b/capa/features/extractors/dnfile_.py index 733fabde2..d18c325de 100644 --- a/capa/features/extractors/dnfile_.py +++ b/capa/features/extractors/dnfile_.py @@ -25,7 +25,7 @@ Feature, ) from capa.features.address import NO_ADDRESS, Address, AbsoluteVirtualAddress -from capa.features.extractors.base_extractor import StaticFeatureExtractor +from capa.features.extractors.base_extractor import SampleHashes, StaticFeatureExtractor logger = logging.getLogger(__name__) @@ -86,10 +86,14 @@ def __init__(self, path: Path): super().__init__() self.path: Path = path self.pe: dnfile.dnPE = dnfile.dnPE(str(path)) + self.sample_hashes = SampleHashes.from_bytes(self.path.read_bytes()) def get_base_address(self) -> AbsoluteVirtualAddress: return AbsoluteVirtualAddress(0x0) + def get_sample_hashes(self) -> SampleHashes: + return self.sample_hashes + def get_entry_point(self) -> int: # self.pe.net.Flags.CLT_NATIVE_ENTRYPOINT # True: native EP: Token diff --git a/capa/features/extractors/dotnetfile.py b/capa/features/extractors/dotnetfile.py index 823d9e229..70789598a 100644 --- a/capa/features/extractors/dotnetfile.py +++ b/capa/features/extractors/dotnetfile.py @@ -31,7 +31,7 @@ Characteristic, ) from capa.features.address import NO_ADDRESS, Address, DNTokenAddress -from capa.features.extractors.base_extractor import StaticFeatureExtractor +from capa.features.extractors.base_extractor import SampleHashes, StaticFeatureExtractor from capa.features.extractors.dnfile.helpers import ( DnType, iter_dotnet_table, @@ -170,10 +170,14 @@ def __init__(self, path: Path): super().__init__() self.path: Path = path self.pe: dnfile.dnPE = dnfile.dnPE(str(path)) + self.sample_hashes = SampleHashes.from_bytes(self.path.read_bytes()) def get_base_address(self): return NO_ADDRESS + def get_sample_hashes(self) -> SampleHashes: + return self.sample_hashes + def get_entry_point(self) -> int: # self.pe.net.Flags.CLT_NATIVE_ENTRYPOINT # True: native EP: Token diff --git a/capa/features/extractors/elffile.py b/capa/features/extractors/elffile.py index dbe9475b8..7e2249e08 100644 --- a/capa/features/extractors/elffile.py +++ b/capa/features/extractors/elffile.py @@ -16,7 +16,7 @@ from capa.features.file import Import, Section from capa.features.common import OS, FORMAT_ELF, Arch, Format, Feature from capa.features.address import NO_ADDRESS, FileOffsetAddress, AbsoluteVirtualAddress -from capa.features.extractors.base_extractor import StaticFeatureExtractor +from capa.features.extractors.base_extractor import SampleHashes, StaticFeatureExtractor logger = logging.getLogger(__name__) @@ -112,6 +112,7 @@ def __init__(self, path: Path): super().__init__() self.path: Path = path self.elf = ELFFile(io.BytesIO(path.read_bytes())) + self.sample_hashes = SampleHashes.from_bytes(self.path.read_bytes()) def get_base_address(self): # virtual address of the first segment with 
type LOAD @@ -119,6 +120,9 @@ def get_base_address(self): if segment.header.p_type == "PT_LOAD": return AbsoluteVirtualAddress(segment.header.p_vaddr) + def get_sample_hashes(self) -> SampleHashes: + return self.sample_hashes + def extract_global_features(self): buf = self.path.read_bytes() diff --git a/capa/features/extractors/ida/extractor.py b/capa/features/extractors/ida/extractor.py index 9865dd9a8..62b047c44 100644 --- a/capa/features/extractors/ida/extractor.py +++ b/capa/features/extractors/ida/extractor.py @@ -18,7 +18,13 @@ import capa.features.extractors.ida.basicblock from capa.features.common import Feature from capa.features.address import Address, AbsoluteVirtualAddress -from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, StaticFeatureExtractor +from capa.features.extractors.base_extractor import ( + BBHandle, + InsnHandle, + SampleHashes, + FunctionHandle, + StaticFeatureExtractor, +) class IdaFeatureExtractor(StaticFeatureExtractor): @@ -28,10 +34,16 @@ def __init__(self): self.global_features.extend(capa.features.extractors.ida.file.extract_file_format()) self.global_features.extend(capa.features.extractors.ida.global_.extract_os()) self.global_features.extend(capa.features.extractors.ida.global_.extract_arch()) + self.sample_hashes = SampleHashes( + md5=idaapi.get_input_file_md5(), sha1="(unknown)", sha256=idaapi.get_input_file_sha256() + ) def get_base_address(self): return AbsoluteVirtualAddress(idaapi.get_imagebase()) + def get_sample_hashes(self) -> SampleHashes: + return self.sample_hashes + def extract_global_features(self): yield from self.global_features diff --git a/capa/features/extractors/null.py b/capa/features/extractors/null.py index 65c3f6ac9..800fb7030 100644 --- a/capa/features/extractors/null.py +++ b/capa/features/extractors/null.py @@ -15,6 +15,7 @@ from capa.features.extractors.base_extractor import ( BBHandle, InsnHandle, + SampleHashes, ThreadHandle, ProcessHandle, FunctionHandle, @@ -49,6 +50,7 @@ class NullStaticFeatureExtractor(StaticFeatureExtractor): """ base_address: Address + sample_hashes: SampleHashes global_features: List[Feature] file_features: List[Tuple[Address, Feature]] functions: Dict[Address, FunctionFeatures] @@ -60,6 +62,9 @@ def extract_global_features(self): for feature in self.global_features: yield feature, NO_ADDRESS + def get_sample_hashes(self) -> SampleHashes: + return self.sample_hashes + def extract_file_features(self): for address, feature in self.file_features: yield feature, address @@ -103,6 +108,7 @@ class ProcessFeatures: @dataclass class NullDynamicFeatureExtractor(DynamicFeatureExtractor): base_address: Address + sample_hashes: SampleHashes global_features: List[Feature] file_features: List[Tuple[Address, Feature]] processes: Dict[Address, ProcessFeatures] @@ -111,6 +117,9 @@ def extract_global_features(self): for feature in self.global_features: yield feature, NO_ADDRESS + def get_sample_hashes(self) -> SampleHashes: + return self.sample_hashes + def extract_file_features(self): for address, feature in self.file_features: yield feature, address diff --git a/capa/features/extractors/pefile.py b/capa/features/extractors/pefile.py index 9418955ff..e79134401 100644 --- a/capa/features/extractors/pefile.py +++ b/capa/features/extractors/pefile.py @@ -19,7 +19,7 @@ from capa.features.file import Export, Import, Section from capa.features.common import OS, ARCH_I386, FORMAT_PE, ARCH_AMD64, OS_WINDOWS, Arch, Format, Characteristic from capa.features.address import NO_ADDRESS, 
FileOffsetAddress, AbsoluteVirtualAddress -from capa.features.extractors.base_extractor import StaticFeatureExtractor +from capa.features.extractors.base_extractor import SampleHashes, StaticFeatureExtractor logger = logging.getLogger(__name__) @@ -190,10 +190,14 @@ def __init__(self, path: Path): super().__init__() self.path: Path = path self.pe = pefile.PE(str(path)) + self.sample_hashes = SampleHashes.from_bytes(self.path.read_bytes()) def get_base_address(self): return AbsoluteVirtualAddress(self.pe.OPTIONAL_HEADER.ImageBase) + def get_sample_hashes(self) -> SampleHashes: + return self.sample_hashes + def extract_global_features(self): buf = Path(self.path).read_bytes() diff --git a/capa/features/extractors/viv/extractor.py b/capa/features/extractors/viv/extractor.py index 8bef8949c..a4f9c748e 100644 --- a/capa/features/extractors/viv/extractor.py +++ b/capa/features/extractors/viv/extractor.py @@ -20,7 +20,13 @@ import capa.features.extractors.viv.basicblock from capa.features.common import Feature from capa.features.address import Address, AbsoluteVirtualAddress -from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, StaticFeatureExtractor +from capa.features.extractors.base_extractor import ( + BBHandle, + InsnHandle, + SampleHashes, + FunctionHandle, + StaticFeatureExtractor, +) logger = logging.getLogger(__name__) @@ -31,6 +37,7 @@ def __init__(self, vw, path: Path, os): self.vw = vw self.path = path self.buf = path.read_bytes() + self.sample_hashes = SampleHashes.from_bytes(self.buf) # pre-compute these because we'll yield them at *every* scope. self.global_features: List[Tuple[Feature, Address]] = [] @@ -42,6 +49,9 @@ def get_base_address(self): # assume there is only one file loaded into the vw return AbsoluteVirtualAddress(list(self.vw.filemeta.values())[0]["imagebase"]) + def get_sample_hashes(self) -> SampleHashes: + return self.sample_hashes + def extract_global_features(self): yield from self.global_features diff --git a/capa/features/freeze/__init__.py b/capa/features/freeze/__init__.py index b2dd3cc25..5c606f665 100644 --- a/capa/features/freeze/__init__.py +++ b/capa/features/freeze/__init__.py @@ -27,7 +27,12 @@ import capa.features.extractors.null as null from capa.helpers import assert_never from capa.features.freeze.features import Feature, feature_from_capa -from capa.features.extractors.base_extractor import FeatureExtractor, StaticFeatureExtractor, DynamicFeatureExtractor +from capa.features.extractors.base_extractor import ( + SampleHashes, + FeatureExtractor, + StaticFeatureExtractor, + DynamicFeatureExtractor, +) logger = logging.getLogger(__name__) @@ -300,6 +305,7 @@ class Config: class Freeze(BaseModel): version: int = 2 base_address: Address = Field(alias="base address") + sample_hashes: SampleHashes extractor: Extractor features: Features @@ -400,6 +406,7 @@ def dumps_static(extractor: StaticFeatureExtractor) -> str: freeze = Freeze( version=2, base_address=Address.from_capa(extractor.get_base_address()), + sample_hashes=extractor.get_sample_hashes(), extractor=Extractor(name=extractor.__class__.__name__), features=features, ) # type: ignore @@ -484,6 +491,7 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str: freeze = Freeze( version=2, base_address=Address.from_capa(base_addr), + sample_hashes=extractor.get_sample_hashes(), extractor=Extractor(name=extractor.__class__.__name__), features=features, ) # type: ignore @@ -501,6 +509,7 @@ def loads_static(s: str) -> StaticFeatureExtractor: assert 
isinstance(freeze.features, StaticFeatures) return null.NullStaticFeatureExtractor( base_address=freeze.base_address.to_capa(), + sample_hashes=freeze.sample_hashes, global_features=[f.feature.to_capa() for f in freeze.features.global_], file_features=[(f.address.to_capa(), f.feature.to_capa()) for f in freeze.features.file], functions={ @@ -533,6 +542,7 @@ def loads_dynamic(s: str) -> DynamicFeatureExtractor: assert isinstance(freeze.features, DynamicFeatures) return null.NullDynamicFeatureExtractor( base_address=freeze.base_address.to_capa(), + sample_hashes=freeze.sample_hashes, global_features=[f.feature.to_capa() for f in freeze.features.global_], file_features=[(f.address.to_capa(), f.feature.to_capa()) for f in freeze.features.file], processes={ diff --git a/capa/ida/helpers.py b/capa/ida/helpers.py index 89e12c60e..f03ba444b 100644 --- a/capa/ida/helpers.py +++ b/capa/ida/helpers.py @@ -153,14 +153,14 @@ def collect_metadata(rules: List[Path]): sha256=sha256, path=idaapi.get_input_file_path(), ), - analysis=rdoc.Analysis( + analysis=rdoc.StaticAnalysis( format=idaapi.get_file_type_name(), arch=arch, os=os, extractor="ida", rules=tuple(r.resolve().absolute().as_posix() for r in rules), base_address=capa.features.freeze.Address.from_capa(idaapi.get_imagebase()), - layout=rdoc.Layout( + layout=rdoc.StaticLayout( functions=(), # this is updated after capabilities have been collected. # will look like: @@ -168,7 +168,7 @@ def collect_metadata(rules: List[Path]): # "functions": { 0x401000: { "matched_basic_blocks": [ 0x401000, 0x401005, ... ] }, ... } ), # ignore these for now - not used by IDA plugin. - feature_counts=rdoc.FeatureCounts(file=0, functions=()), + feature_counts=rdoc.StaticFeatureCounts(file=0, functions=()), library_functions=(), ), ) diff --git a/capa/ida/plugin/model.py b/capa/ida/plugin/model.py index 47a6e7f75..2d88afb1c 100644 --- a/capa/ida/plugin/model.py +++ b/capa/ida/plugin/model.py @@ -500,16 +500,16 @@ def render_capa_doc_by_program(self, doc: rd.ResultDocument): location = location_.to_capa() parent2: CapaExplorerDataItem - if rule.meta.scope == capa.rules.FILE_SCOPE: + if capa.rules.FILE_SCOPE in rule.meta.scopes: parent2 = parent - elif rule.meta.scope == capa.rules.FUNCTION_SCOPE: + elif capa.rules.FUNCTION_SCOPE in rule.meta.scopes: parent2 = CapaExplorerFunctionItem(parent, location) - elif rule.meta.scope == capa.rules.BASIC_BLOCK_SCOPE: + elif capa.rules.BASIC_BLOCK_SCOPE in rule.meta.scopes: parent2 = CapaExplorerBlockItem(parent, location) - elif rule.meta.scope == capa.rules.INSTRUCTION_SCOPE: + elif capa.rules.INSTRUCTION_SCOPE in rule.meta.scopes: parent2 = CapaExplorerInstructionItem(parent, location) else: - raise RuntimeError("unexpected rule scope: " + str(rule.meta.scope)) + raise RuntimeError("unexpected rule scope: " + str(rule.meta.scopes.static)) self.render_capa_doc_match(parent2, match, doc) diff --git a/capa/main.py b/capa/main.py index 5a0a67c4c..9ce97bc94 100644 --- a/capa/main.py +++ b/capa/main.py @@ -13,7 +13,6 @@ import sys import json import time -import hashlib import logging import argparse import datetime @@ -21,7 +20,7 @@ import itertools import contextlib import collections -from typing import Any, Dict, List, Tuple, Callable, Optional, cast +from typing import Any, Dict, List, Tuple, Callable, Optional from pathlib import Path import halo @@ -84,6 +83,9 @@ from capa.features.extractors.base_extractor import ( BBHandle, InsnHandle, + SampleHashes, + ThreadHandle, + ProcessHandle, FunctionHandle, FeatureExtractor, 
StaticFeatureExtractor, @@ -262,9 +264,10 @@ def find_static_capabilities( all_bb_matches = collections.defaultdict(list) # type: MatchResults all_insn_matches = collections.defaultdict(list) # type: MatchResults - feature_counts = rdoc.FeatureCounts(file=0, functions=()) + feature_counts = rdoc.StaticFeatureCounts(file=0, functions=()) library_functions: Tuple[rdoc.LibraryFunction, ...] = () + assert isinstance(extractor, StaticFeatureExtractor) with redirecting_print_to_tqdm(disable_progress): with tqdm.contrib.logging.logging_redirect_tqdm(): pbar = tqdm.tqdm @@ -320,7 +323,7 @@ def pbar(s, *args, **kwargs): # collection of features that captures the rule matches within function, BB, and instruction scopes. # mapping from feature (matched rule) to set of addresses at which it matched. - function_and_lower_features: FeatureSet = collections.defaultdict(set) + function_and_lower_features = collections.defaultdict(set) # type: FeatureSet for rule_name, results in itertools.chain( all_function_matches.items(), all_bb_matches.items(), all_insn_matches.items() ): @@ -351,13 +354,131 @@ def pbar(s, *args, **kwargs): return matches, meta +def find_thread_capabilities( + ruleset: RuleSet, extractor: DynamicFeatureExtractor, ph: ProcessHandle, th: ThreadHandle +) -> Tuple[FeatureSet, MatchResults]: + """ + find matches for the given rules for the given thread. + + returns: tuple containing (features for thread, match results for thread) + """ + # all features found for the thread. + features = collections.defaultdict(set) # type: FeatureSet + + for feature, addr in itertools.chain( + extractor.extract_thread_features(ph, th), extractor.extract_global_features() + ): + features[feature].add(addr) + + # matches found at this thread. + _, matches = ruleset.match(Scope.THREAD, features, th.address) + + for rule_name, res in matches.items(): + rule = ruleset[rule_name] + for addr, _ in res: + capa.engine.index_rule_matches(features, rule, [addr]) + + return features, matches + + +def find_process_capabilities( + ruleset: RuleSet, extractor: DynamicFeatureExtractor, ph: ProcessHandle +) -> Tuple[MatchResults, MatchResults, int]: + """ + find matches for the given rules within the given process. + + returns: tuple containing (match results for process, match results for threads, number of features) + """ + # all features found within this process, + # includes features found within threads. + process_features = collections.defaultdict(set) # type: FeatureSet + + # matches found at the thread scope. + # might be found at different threads, that's ok. 
+ thread_matches = collections.defaultdict(list) # type: MatchResults + + for th in extractor.get_threads(ph): + features, tmatches = find_thread_capabilities(ruleset, extractor, ph, th) + for feature, vas in features.items(): + process_features[feature].update(vas) + + for rule_name, res in tmatches.items(): + thread_matches[rule_name].extend(res) + + for feature, va in itertools.chain(extractor.extract_process_features(ph), extractor.extract_global_features()): + process_features[feature].add(va) + + _, process_matches = ruleset.match(Scope.PROCESS, process_features, ph.address) + return process_matches, thread_matches, len(process_features) + + +def find_dynamic_capabilities( + ruleset: RuleSet, extractor: DynamicFeatureExtractor, disable_progress=None +) -> Tuple[MatchResults, Any]: + all_process_matches = collections.defaultdict(list) # type: MatchResults + all_thread_matches = collections.defaultdict(list) # type: MatchResults + + feature_counts = rdoc.DynamicFeatureCounts(file=0, processes=()) + + assert isinstance(extractor, DynamicFeatureExtractor) + with redirecting_print_to_tqdm(disable_progress): + with tqdm.contrib.logging.logging_redirect_tqdm(): + pbar = tqdm.tqdm + if disable_progress: + # do not use tqdm to avoid unnecessary side effects when caller intends + # to disable progress completely + def pbar(s, *args, **kwargs): + return s + + processes = list(extractor.get_processes()) + + pb = pbar(processes, desc="matching", unit=" processes", leave=False) + for p in pb: + process_matches, thread_matches, feature_count = find_process_capabilities(ruleset, extractor, p) + feature_counts.processes += ( + rdoc.ProcessFeatureCount(address=frz.Address.from_capa(p.address), count=feature_count), + ) + logger.debug("analyzed process 0x%x and extracted %d features", p.address, feature_count) + + for rule_name, res in process_matches.items(): + all_process_matches[rule_name].extend(res) + for rule_name, res in thread_matches.items(): + all_thread_matches[rule_name].extend(res) + + # collection of features that captures the rule matches within process and thread scopes. + # mapping from feature (matched rule) to set of addresses at which it matched. + process_and_lower_features = collections.defaultdict(set) # type: FeatureSet + for rule_name, results in itertools.chain(all_process_matches.items(), all_thread_matches.items()): + locations = {p[0] for p in results} + rule = ruleset[rule_name] + capa.engine.index_rule_matches(process_and_lower_features, rule, locations) + + all_file_matches, feature_count = find_file_capabilities(ruleset, extractor, process_and_lower_features) + feature_counts.file = feature_count + + matches = dict( + itertools.chain( + # each rule exists in exactly one scope, + # so there won't be any overlap among these following MatchResults, + # and we can merge the dictionaries naively. 
+ all_thread_matches.items(), + all_process_matches.items(), + all_file_matches.items(), + ) + ) + + meta = { + "feature_counts": feature_counts, + } + + return matches, meta + + def find_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, **kwargs) -> Tuple[MatchResults, Any]: if isinstance(extractor, StaticFeatureExtractor): - extractor_: StaticFeatureExtractor = cast(StaticFeatureExtractor, extractor) - return find_static_capabilities(ruleset, extractor_, kwargs) + return find_static_capabilities(ruleset, extractor, kwargs) elif isinstance(extractor, DynamicFeatureExtractor): - # extractor_ = cast(DynamicFeatureExtractor, extractor) - raise NotImplementedError() + return find_dynamic_capabilities(ruleset, extractor, kwargs) else: raise ValueError(f"unexpected extractor type: {extractor.__class__.__name__}") @@ -773,6 +894,41 @@ def get_signatures(sigs_path: Path) -> List[Path]: return paths +def get_sample_analysis(format_, arch, os_, extractor, rules_path, counts): + if isinstance(extractor, StaticFeatureExtractor): + return rdoc.StaticAnalysis( + format=format_, + arch=arch, + os=os_, + extractor=extractor.__class__.__name__, + rules=tuple(rules_path), + base_address=frz.Address.from_capa(extractor.get_base_address()), + layout=rdoc.StaticLayout( + functions=(), + # this is updated after capabilities have been collected. + # will look like: + # + # "functions": { 0x401000: { "matched_basic_blocks": [ 0x401000, 0x401005, ... ] }, ... } + ), + feature_counts=counts["feature_counts"], + library_functions=counts["library_functions"], + ) + elif isinstance(extractor, DynamicFeatureExtractor): + return rdoc.DynamicAnalysis( + format=format_, + arch=arch, + os=os_, + extractor=extractor.__class__.__name__, + rules=tuple(rules_path), + layout=rdoc.DynamicLayout( + processes=(), + ), + feature_counts=counts["feature_counts"], + ) + else: + raise ValueError("invalid extractor type") + + def collect_metadata( argv: List[str], sample_path: Path, @@ -780,55 +936,83 @@ os_: str, rules_path: List[Path], extractor: FeatureExtractor, + counts: dict, ) -> rdoc.Metadata: - md5 = hashlib.md5() - sha1 = hashlib.sha1() - sha256 = hashlib.sha256() - - assert isinstance(extractor, StaticFeatureExtractor) - buf = sample_path.read_bytes() - - md5.update(buf) - sha1.update(buf) - sha256.update(buf) + # if it's a binary sample, we hash it; if it's a report, + # we fetch the hashes from the report + sample_hashes: SampleHashes = extractor.get_sample_hashes() + md5, sha1, sha256 = sample_hashes.md5, sample_hashes.sha1, sample_hashes.sha256 rules = tuple(r.resolve().absolute().as_posix() for r in rules_path) format_ = get_format(sample_path) if format_ == FORMAT_AUTO else format_ arch = get_arch(sample_path) os_ = get_os(sample_path) if os_ == OS_AUTO else os_ - base_addr = extractor.get_base_address() if hasattr(extractor, "get_base_address") else NO_ADDRESS return rdoc.Metadata( timestamp=datetime.datetime.now(), version=capa.version.__version__, argv=tuple(argv) if argv else None, sample=rdoc.Sample( - md5=md5.hexdigest(), - sha1=sha1.hexdigest(), - sha256=sha256.hexdigest(), - path=sample_path.resolve().absolute().as_posix(), + md5=md5, + sha1=sha1, + sha256=sha256, + path=str(Path(sample_path).resolve()), ), - analysis=rdoc.Analysis( - format=format_, - arch=arch, - os=os_, - extractor=extractor.__class__.__name__, - rules=rules, - base_address=frz.Address.from_capa(base_addr), - layout=rdoc.Layout( - functions=(), - # this is updated after capabilities have been collected. 
- # will look like: - # - # "functions": { 0x401000: { "matched_basic_blocks": [ 0x401000, 0x401005, ... ] }, ... } - ), - feature_counts=rdoc.FeatureCounts(file=0, functions=()), - library_functions=(), + analysis=get_sample_analysis( + format_, + arch, + os_, + extractor, + rules, + counts, ), ) -def compute_layout(rules, extractor, capabilities) -> rdoc.Layout: +def compute_dynamic_layout(rules, extractor: DynamicFeatureExtractor, capabilities) -> rdoc.DynamicLayout: + """ + compute a metadata structure that links threads + to the processes in which they're found. + + only collect the threads at which some rule matched. + otherwise, we may pollute the json document with + a large amount of un-referenced data. + """ + assert isinstance(extractor, DynamicFeatureExtractor) + processes_by_thread: Dict[Address, Address] = {} + threads_by_processes: Dict[Address, List[Address]] = {} + for p in extractor.get_processes(): + threads_by_processes[p.address] = [] + for t in extractor.get_threads(p): + processes_by_thread[t.address] = p.address + threads_by_processes[p.address].append(t.address) + + matched_threads = set() + for rule_name, matches in capabilities.items(): + rule = rules[rule_name] + if capa.rules.THREAD_SCOPE in rule.meta.get("scopes")["dynamic"]: + for addr, _ in matches: + assert addr in processes_by_thread + matched_threads.add(addr) + + layout = rdoc.DynamicLayout( + processes=tuple( + rdoc.ProcessLayout( + address=frz.Address.from_capa(p), + matched_threads=tuple( + rdoc.ThreadLayout(address=frz.Address.from_capa(t)) for t in threads if t in matched_threads + ) # this object is open to extension in the future, + # such as with the thread name, etc. + ) + for p, threads in threads_by_processes.items() + if len([t for t in threads if t in matched_threads]) > 0 + ) + ) + + return layout + + +def compute_static_layout(rules, extractor: StaticFeatureExtractor, capabilities) -> rdoc.StaticLayout: """ compute a metadata structure that links basic blocks to the functions in which they're found. @@ -837,6 +1021,7 @@ def compute_layout(rules, extractor, capabilities) -> rdoc.Layout: otherwise, we may pollute the json document with a large amount of un-referenced data. """ + assert isinstance(extractor, StaticFeatureExtractor) functions_by_bb: Dict[Address, Address] = {} bbs_by_function: Dict[Address, List[Address]] = {} for f in extractor.get_functions(): @@ -848,12 +1033,12 @@ def compute_layout(rules, extractor, capabilities) -> rdoc.Layout: matched_bbs = set() for rule_name, matches in capabilities.items(): rule = rules[rule_name] - if rule.meta.get("scope") == capa.rules.BASIC_BLOCK_SCOPE: + if capa.rules.BASIC_BLOCK_SCOPE in rule.meta.get("scopes")["static"]: for addr, _ in matches: assert addr in functions_by_bb matched_bbs.add(addr) - layout = rdoc.Layout( + layout = rdoc.StaticLayout( functions=tuple( rdoc.FunctionLayout( address=frz.Address.from_capa(f), @@ -870,6 +1055,15 @@ def compute_layout(rules, extractor, capabilities) -> rdoc.Layout: return layout +def compute_layout(rules, extractor, capabilities) -> rdoc.Layout: + if isinstance(extractor, StaticFeatureExtractor): + return compute_static_layout(rules, extractor, capabilities) + elif isinstance(extractor, DynamicFeatureExtractor): + return compute_dynamic_layout(rules, extractor, capabilities) + else: + raise ValueError("extractor must be either a static or dynamic extractor") + + def install_common_args(parser, wanted=None): """ register a common set of command line arguments for re-use by main & scripts. 
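# ---- editor's note (not part of the patch) -----------------------------------
# compute_dynamic_layout() above first inverts the process-to-thread mapping,
# then keeps only the processes that contain at least one matched thread. a
# minimal, runnable sketch of that filtering step follows; plain ints stand in
# for capa's Address and layout types (hypothetical, for illustration only).
from typing import Dict, List, Set, Tuple

def filter_matched_threads(
    threads_by_process: Dict[int, List[int]], matched_threads: Set[int]
) -> Dict[int, Tuple[int, ...]]:
    # keep only processes with at least one matched thread,
    # and within each process, only the threads that matched.
    return {
        process: tuple(t for t in threads if t in matched_threads)
        for process, threads in threads_by_process.items()
        if any(t in matched_threads for t in threads)
    }

# example: pid 8 has no matched threads, so it is dropped from the layout.
assert filter_matched_threads({4: [10, 11], 8: [12]}, {11}) == {4: (11,)}
# ---- end editor's note --------------------------------------------------------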
@@ -1307,12 +1501,9 @@ def main(argv: Optional[List[str]] = None): log_unsupported_os_error() return E_INVALID_FILE_OS - meta = collect_metadata(argv, args.sample, args.format, args.os, args.rules, extractor) - capabilities, counts = find_capabilities(rules, extractor, disable_progress=args.quiet) - meta.analysis.feature_counts = counts["feature_counts"] - meta.analysis.library_functions = counts["library_functions"] + meta = collect_metadata(argv, args.sample, args.format, args.os, args.rules, extractor, counts) meta.analysis.layout = compute_layout(rules, extractor, capabilities) if has_file_limitation(rules, capabilities): diff --git a/capa/render/proto/__init__.py b/capa/render/proto/__init__.py index 2457b7ecd..94f977ab5 100644 --- a/capa/render/proto/__init__.py +++ b/capa/render/proto/__init__.py @@ -122,6 +122,7 @@ def scope_to_pb2(scope: capa.rules.Scope) -> capa_pb2.Scope.ValueType: def metadata_to_pb2(meta: rd.Metadata) -> capa_pb2.Metadata: + assert isinstance(meta.analysis, rd.StaticAnalysis) return capa_pb2.Metadata( timestamp=str(meta.timestamp), version=meta.version, @@ -490,14 +491,14 @@ def metadata_from_pb2(meta: capa_pb2.Metadata) -> rd.Metadata: sha256=meta.sample.sha256, path=meta.sample.path, ), - analysis=rd.Analysis( + analysis=rd.StaticAnalysis( format=meta.analysis.format, arch=meta.analysis.arch, os=meta.analysis.os, extractor=meta.analysis.extractor, rules=tuple(meta.analysis.rules), base_address=addr_from_pb2(meta.analysis.base_address), - layout=rd.Layout( + layout=rd.StaticLayout( functions=tuple( [ rd.FunctionLayout( @@ -513,7 +514,7 @@ def metadata_from_pb2(meta: capa_pb2.Metadata) -> rd.Metadata: ] ) ), - feature_counts=rd.FeatureCounts( + feature_counts=rd.StaticFeatureCounts( file=meta.analysis.feature_counts.file, functions=tuple( [ diff --git a/capa/render/result_document.py b/capa/render/result_document.py index 0919207c5..2cbea71f6 100644 --- a/capa/render/result_document.py +++ b/capa/render/result_document.py @@ -10,6 +10,7 @@ from typing import Dict, List, Tuple, Union, Optional from pydantic import Field, BaseModel +from typing_extensions import TypeAlias import capa.rules import capa.engine @@ -49,10 +50,26 @@ class FunctionLayout(Model): matched_basic_blocks: Tuple[BasicBlockLayout, ...] -class Layout(Model): +class ThreadLayout(Model): + address: frz.Address + + +class ProcessLayout(Model): + address: frz.Address + matched_threads: Tuple[ThreadLayout, ...] + + +class StaticLayout(Model): functions: Tuple[FunctionLayout, ...] +class DynamicLayout(Model): + processes: Tuple[ProcessLayout, ...] + + +Layout: TypeAlias = Union[StaticLayout, DynamicLayout] + + class LibraryFunction(Model): address: frz.Address name: str @@ -63,23 +80,49 @@ class FunctionFeatureCount(Model): count: int -class FeatureCounts(Model): +class ProcessFeatureCount(Model): + address: frz.Address + count: int + + +class StaticFeatureCounts(Model): file: int functions: Tuple[FunctionFeatureCount, ...] -class Analysis(Model): +class DynamicFeatureCounts(Model): + file: int + processes: Tuple[ProcessFeatureCount, ...] + + +FeatureCounts: TypeAlias = Union[StaticFeatureCounts, DynamicFeatureCounts] + + +class StaticAnalysis(Model): format: str arch: str os: str extractor: str rules: Tuple[str, ...] base_address: frz.Address - layout: Layout - feature_counts: FeatureCounts + layout: StaticLayout + feature_counts: StaticFeatureCounts library_functions: Tuple[LibraryFunction, ...] 
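# ---- editor's note (not part of the patch) -----------------------------------
# StaticAnalysis above and DynamicAnalysis below (like the StaticLayout/
# DynamicLayout and StaticFeatureCounts/DynamicFeatureCounts pairs) are consumed
# through Union aliases and narrowed with isinstance() checks rather than a tag
# field. a minimal sketch of that pattern, using simplified stand-in dataclasses
# (hypothetical names, not the pydantic models in this file):
from dataclasses import dataclass
from typing import Union
from typing_extensions import TypeAlias

@dataclass
class StaticMeta:
    base_address: int

@dataclass
class DynamicMeta:
    process_count: int

AnalysisMeta: TypeAlias = Union[StaticMeta, DynamicMeta]

def describe(meta: AnalysisMeta) -> str:
    # type checkers narrow the union inside each branch.
    if isinstance(meta, StaticMeta):
        return f"static, base address: 0x{meta.base_address:x}"
    elif isinstance(meta, DynamicMeta):
        return f"dynamic, processes: {meta.process_count}"
    else:
        raise ValueError("unexpected analysis type")

assert describe(StaticMeta(0x400000)) == "static, base address: 0x400000"
# ---- end editor's note --------------------------------------------------------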
+class DynamicAnalysis(Model): + format: str + arch: str + os: str + extractor: str + rules: Tuple[str, ...] + layout: DynamicLayout + feature_counts: DynamicFeatureCounts + + +Analysis: TypeAlias = Union[StaticAnalysis, DynamicAnalysis] + + class Metadata(Model): timestamp: datetime.datetime version: str @@ -510,7 +553,7 @@ class RuleMetadata(FrozenModel): name: str namespace: Optional[str] authors: Tuple[str, ...] - scope: capa.rules.Scope + scopes: capa.rules.Scopes attack: Tuple[AttackSpec, ...] = Field(alias="att&ck") mbc: Tuple[MBCSpec, ...] references: Tuple[str, ...] @@ -527,7 +570,7 @@ def from_capa(cls, rule: capa.rules.Rule) -> "RuleMetadata": name=rule.meta.get("name"), namespace=rule.meta.get("namespace"), authors=rule.meta.get("authors"), - scope=capa.rules.Scope(rule.meta.get("scope")), + scopes=capa.rules.Scopes.from_dict(rule.meta.get("scopes")), attack=tuple(map(AttackSpec.from_str, rule.meta.get("att&ck", []))), mbc=tuple(map(MBCSpec.from_str, rule.meta.get("mbc", []))), references=rule.meta.get("references", []), diff --git a/capa/render/verbose.py b/capa/render/verbose.py index baf87364e..4be810edd 100644 --- a/capa/render/verbose.py +++ b/capa/render/verbose.py @@ -60,13 +60,26 @@ def format_address(address: frz.Address) -> str: assert isinstance(id_, int) assert isinstance(return_address, int) return f"event: {id_}, retaddr: 0x{return_address:x}" + elif address.type == frz.AddressType.PROCESS: + assert isinstance(address.value, tuple) + ppid, pid = address.value + assert isinstance(ppid, int) + assert isinstance(pid, int) + return f"process ppid: {ppid}, process pid: {pid}" + elif address.type == frz.AddressType.THREAD: + assert isinstance(address.value, tuple) + ppid, pid, tid = address.value + assert isinstance(ppid, int) + assert isinstance(pid, int) + assert isinstance(tid, int) + return f"process ppid: {ppid}, process pid: {pid}, thread id: {tid}" elif address.type == frz.AddressType.NO_ADDRESS: return "global" else: raise ValueError("unexpected address type") -def render_meta(ostream, doc: rd.ResultDocument): +def render_static_meta(ostream, doc: rd.ResultDocument): """ like: @@ -85,6 +98,8 @@ def render_meta(ostream, doc: rd.ResultDocument): function count 42 total feature count 1918 """ + + assert isinstance(doc.meta.analysis, rd.StaticAnalysis) rows = [ ("md5", doc.meta.sample.md5), ("sha1", doc.meta.sample.sha1), ("sha256", doc.meta.sample.sha256), @@ -109,6 +124,57 @@ def render_meta(ostream, doc: rd.ResultDocument): ostream.writeln(tabulate.tabulate(rows, tablefmt="plain")) +def render_dynamic_meta(ostream, doc: rd.ResultDocument): + """ + like: + + md5 84882c9d43e23d63b82004fae74ebb61 + sha1 c6fb3b50d946bec6f391aefa4e54478cf8607211 + sha256 5eced7367ed63354b4ed5c556e2363514293f614c2c2eb187273381b2ef5f0f9 + path /tmp/packed-report.json + timestamp 2023-07-17T10:17:05.796933 + capa version 0.0.0 + os windows + format pe + arch amd64 + extractor CAPEFeatureExtractor + rules (embedded rules) + process count 42 + total feature count 1918 + """ + + assert isinstance(doc.meta.analysis, rd.DynamicAnalysis) + rows = [ + ("md5", doc.meta.sample.md5), + ("sha1", doc.meta.sample.sha1), + ("sha256", doc.meta.sample.sha256), + ("path", doc.meta.sample.path), + ("timestamp", doc.meta.timestamp), + ("capa version", doc.meta.version), + ("os", doc.meta.analysis.os), + ("format", doc.meta.analysis.format), + ("arch", doc.meta.analysis.arch), + ("extractor", doc.meta.analysis.extractor), + ("rules", "\n".join(doc.meta.analysis.rules)), + ("process count", len(doc.meta.analysis.feature_counts.processes)), + ( 
+ "total feature count", + doc.meta.analysis.feature_counts.file + sum(p.count for p in doc.meta.analysis.feature_counts.processes), + ), + ] + + ostream.writeln(tabulate.tabulate(rows, tablefmt="plain")) + + +def render_meta(osstream, doc: rd.ResultDocument): + if isinstance(doc.meta.analysis, rd.StaticAnalysis): + render_static_meta(osstream, doc) + elif isinstance(doc.meta.analysis, rd.DynamicAnalysis): + render_dynamic_meta(osstream, doc) + else: + raise ValueError("invalid meta analysis") + + def render_rules(ostream, doc: rd.ResultDocument): """ like: @@ -132,7 +198,7 @@ def render_rules(ostream, doc: rd.ResultDocument): had_match = True rows = [] - for key in ("namespace", "description", "scope"): + for key in ("namespace", "description", "scopes"): v = getattr(rule.meta, key) if not v: continue @@ -145,7 +211,7 @@ def render_rules(ostream, doc: rd.ResultDocument): rows.append((key, v)) - if rule.meta.scope != capa.rules.FILE_SCOPE: + if capa.rules.FILE_SCOPE not in rule.meta.scopes: locations = [m[0] for m in doc.rules[rule.meta.name].matches] rows.append(("matches", "\n".join(map(format_address, locations)))) diff --git a/capa/render/vverbose.py b/capa/render/vverbose.py index 59189833e..205b2e94d 100644 --- a/capa/render/vverbose.py +++ b/capa/render/vverbose.py @@ -260,7 +260,8 @@ def render_rules(ostream, doc: rd.ResultDocument): check for OutputDebugString error namespace anti-analysis/anti-debugging/debugger-detection author michael.hunhoff@mandiant.com - scope function + static scope: function + dynamic scope: process mbc Anti-Behavioral Analysis::Detect Debugger::OutputDebugString function @ 0x10004706 and: @@ -268,13 +269,25 @@ def render_rules(ostream, doc: rd.ResultDocument): api: kernel32.GetLastError @ 0x10004A87 api: kernel32.OutputDebugString @ 0x10004767, 0x10004787, 0x10004816, 0x10004895 """ - functions_by_bb: Dict[capa.features.address.Address, capa.features.address.Address] = {} - for finfo in doc.meta.analysis.layout.functions: - faddress = finfo.address.to_capa() - for bb in finfo.matched_basic_blocks: - bbaddress = bb.address.to_capa() - functions_by_bb[bbaddress] = faddress + functions_by_bb: Dict[capa.features.address.Address, capa.features.address.Address] = {} + processes_by_thread: Dict[capa.features.address.Address, capa.features.address.Address] = {} + if isinstance(doc.meta.analysis, rd.StaticAnalysis): + for finfo in doc.meta.analysis.layout.functions: + faddress = finfo.address.to_capa() + + for bb in finfo.matched_basic_blocks: + bbaddress = bb.address.to_capa() + functions_by_bb[bbaddress] = faddress + elif isinstance(doc.meta.analysis, rd.DynamicAnalysis): + for pinfo in doc.meta.analysis.layout.processes: + paddress = pinfo.address.to_capa() + + for thread in pinfo.matched_threads: + taddress = thread.address.to_capa() + processes_by_thread[taddress] = paddress + else: + raise ValueError("invalid analysis field in the document's meta") had_match = False @@ -323,7 +336,11 @@ def render_rules(ostream, doc: rd.ResultDocument): rows.append(("author", ", ".join(rule.meta.authors))) - rows.append(("scope", rule.meta.scope.value)) + if rule.meta.scopes.static: + rows.append(("static scope:", str(rule.meta.scopes.static))) + + if rule.meta.scopes.dynamic: + rows.append(("dynamic scope:", str(rule.meta.scopes.dynamic))) if rule.meta.attack: rows.append(("att&ck", ", ".join([rutils.format_parts_id(v) for v in rule.meta.attack]))) @@ -339,7 +356,7 @@ def render_rules(ostream, doc: rd.ResultDocument): ostream.writeln(tabulate.tabulate(rows, 
tablefmt="plain")) - if rule.meta.scope == capa.rules.FILE_SCOPE: + if capa.rules.FILE_SCOPE in rule.meta.scopes: matches = doc.rules[rule.meta.name].matches if len(matches) != 1: # i think there should only ever be one match per file-scope rule, @@ -351,16 +368,25 @@ def render_rules(ostream, doc: rd.ResultDocument): render_match(ostream, first_match, indent=0) else: for location, match in sorted(doc.rules[rule.meta.name].matches): - ostream.write(rule.meta.scope) + ostream.write(f"static scope: {rule.meta.scopes.static}") + ostream.write(f"dynamic scope: {rule.meta.scopes.dynamic}") ostream.write(" @ ") ostream.write(capa.render.verbose.format_address(location)) - if rule.meta.scope == capa.rules.BASIC_BLOCK_SCOPE: + if capa.rules.BASIC_BLOCK_SCOPE in rule.meta.scopes: ostream.write( " in function " + capa.render.verbose.format_address(frz.Address.from_capa(functions_by_bb[location.to_capa()])) ) + if capa.rules.THREAD_SCOPE in rule.meta.scopes: + ostream.write( + " in process " + + capa.render.verbose.format_address( + frz.Address.from_capa(processes_by_thread[location.to_capa()]) + ) + ) + ostream.write("\n") render_match(ostream, match, indent=1) if rule.meta.lib: diff --git a/scripts/bulk-process.py b/scripts/bulk-process.py index fb6ecdd35..2ecae8989 100644 --- a/scripts/bulk-process.py +++ b/scripts/bulk-process.py @@ -136,11 +136,9 @@ def get_capa_results(args): "error": f"unexpected error: {e}", } - meta = capa.main.collect_metadata([], path, format, os_, [], extractor) capabilities, counts = capa.main.find_capabilities(rules, extractor, disable_progress=True) - meta.analysis.feature_counts = counts["feature_counts"] - meta.analysis.library_functions = counts["library_functions"] + meta = capa.main.collect_metadata([], path, format, os_, [], extractor, counts) meta.analysis.layout = capa.main.compute_layout(rules, extractor, capabilities) doc = rd.ResultDocument.from_capa(meta, rules, capabilities) diff --git a/scripts/capa_as_library.py b/scripts/capa_as_library.py index 06613dcbd..7311107a9 100644 --- a/scripts/capa_as_library.py +++ b/scripts/capa_as_library.py @@ -178,10 +178,7 @@ def capa_details(rules_path: Path, file_path: Path, output_format="dictionary"): capabilities, counts = capa.main.find_capabilities(rules, extractor, disable_progress=True) # collect metadata (used only to make rendering more complete) - meta = capa.main.collect_metadata([], file_path, FORMAT_AUTO, OS_AUTO, [rules_path], extractor) - - meta.analysis.feature_counts = counts["feature_counts"] - meta.analysis.library_functions = counts["library_functions"] + meta = capa.main.collect_metadata([], file_path, FORMAT_AUTO, OS_AUTO, [rules_path], extractor, counts) meta.analysis.layout = capa.main.compute_layout(rules, extractor, capabilities) capa_output: Any = False diff --git a/scripts/import-to-ida.py b/scripts/import-to-ida.py index 121d81580..5a8ed1893 100644 --- a/scripts/import-to-ida.py +++ b/scripts/import-to-ida.py @@ -89,7 +89,7 @@ def main(): continue if rule.meta.is_subscope_rule: continue - if rule.meta.scope != capa.rules.Scope.FUNCTION: + if capa.rules.Scope.FUNCTION in rule.meta.scopes: continue ns = rule.meta.namespace diff --git a/scripts/show-capabilities-by-function.py b/scripts/show-capabilities-by-function.py index 7bb2a8eba..d9e33183d 100644 --- a/scripts/show-capabilities-by-function.py +++ b/scripts/show-capabilities-by-function.py @@ -102,6 +102,7 @@ def render_matches_by_function(doc: rd.ResultDocument): - send HTTP request - connect to HTTP server """ + assert 
isinstance(doc.meta.analysis, rd.StaticAnalysis) functions_by_bb: Dict[Address, Address] = {} for finfo in doc.meta.analysis.layout.functions: faddress = finfo.address @@ -114,10 +115,10 @@ def render_matches_by_function(doc: rd.ResultDocument): matches_by_function = collections.defaultdict(set) for rule in rutils.capability_rules(doc): - if rule.meta.scope == capa.rules.FUNCTION_SCOPE: + if capa.rules.FUNCTION_SCOPE in rule.meta.scopes: for addr, _ in rule.matches: matches_by_function[addr].add(rule.meta.name) - elif rule.meta.scope == capa.rules.BASIC_BLOCK_SCOPE: + elif capa.rules.BASIC_BLOCK_SCOPE in rule.meta.scopes: for addr, _ in rule.matches: function = functions_by_bb[addr] matches_by_function[function].add(rule.meta.name) @@ -185,11 +186,9 @@ def main(argv=None): capa.helpers.log_unsupported_runtime_error() return -1 - meta = capa.main.collect_metadata(argv, args.sample, format_, args.os, args.rules, extractor) capabilities, counts = capa.main.find_capabilities(rules, extractor) - meta.analysis.feature_counts = counts["feature_counts"] - meta.analysis.library_functions = counts["library_functions"] + meta = capa.main.collect_metadata(argv, args.sample, format_, args.os, args.rules, extractor, counts) meta.analysis.layout = capa.main.compute_layout(rules, extractor, capabilities) if capa.main.has_file_limitation(rules, capabilities): diff --git a/tests/_test_proto.py b/tests/_test_proto.py index 6f0137fef..c75ed3da1 100644 --- a/tests/_test_proto.py +++ b/tests/_test_proto.py @@ -129,6 +129,7 @@ def cmp_optional(a: Any, b: Any) -> bool: def assert_meta(meta: rd.Metadata, dst: capa_pb2.Metadata): + assert isinstance(meta.analysis, rd.StaticAnalysis) assert str(meta.timestamp) == dst.timestamp assert meta.version == dst.version if meta.argv is None: @@ -148,6 +149,7 @@ def assert_meta(meta: rd.Metadata, dst: capa_pb2.Metadata): assert list(meta.analysis.rules) == dst.analysis.rules assert capa.render.proto.addr_to_pb2(meta.analysis.base_address) == dst.analysis.base_address + assert isinstance(meta.analysis.layout, rd.StaticLayout) assert len(meta.analysis.layout.functions) == len(dst.analysis.layout.functions) for rd_f, proto_f in zip(meta.analysis.layout.functions, dst.analysis.layout.functions): assert capa.render.proto.addr_to_pb2(rd_f.address) == proto_f.address diff --git a/tests/fixtures.py b/tests/fixtures.py index f9a36041c..6ed04d6e0 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -38,7 +38,14 @@ FeatureAccess, ) from capa.features.address import Address -from capa.features.extractors.base_extractor import BBHandle, InsnHandle, ThreadHandle, ProcessHandle, FunctionHandle +from capa.features.extractors.base_extractor import ( + BBHandle, + InsnHandle, + SampleHashes, + ThreadHandle, + ProcessHandle, + FunctionHandle, +) from capa.features.extractors.dnfile.extractor import DnfileFeatureExtractor CD = Path(__file__).resolve().parent @@ -602,6 +609,54 @@ def parametrize(params, values, **kwargs): return pytest.mark.parametrize(params, values, ids=ids, **kwargs) +EXTRACTOR_HASHING_TESTS = [ + # viv extractor + ( + get_viv_extractor(get_data_path_by_name("mimikatz")), + SampleHashes( + md5="5f66b82558ca92e54e77f216ef4c066c", + sha1="e4f82e4d7f22938dc0a0ff8a4a7ad2a763643d38", + sha256="131314a6f6d1d263c75b9909586b3e1bd837036329ace5e69241749e861ac01d", + ), + ), + # PE extractor + ( + get_pefile_extractor(get_data_path_by_name("mimikatz")), + SampleHashes( + md5="5f66b82558ca92e54e77f216ef4c066c", + sha1="e4f82e4d7f22938dc0a0ff8a4a7ad2a763643d38", 
sha256="131314a6f6d1d263c75b9909586b3e1bd837036329ace5e69241749e861ac01d", + ), + ), + # dnFile extractor + ( + get_dnfile_extractor(get_data_path_by_name("b9f5b")), + SampleHashes( + md5="b9f5bd514485fb06da39beff051b9fdc", + sha1="c72a2e50410475a51d897d29ffbbaf2103754d53", + sha256="34acc4c0b61b5ce0b37c3589f97d1f23e6d84011a241e6f85683ee517ce786f1", + ), + ), + # dotnet File + ( + get_dotnetfile_extractor(get_data_path_by_name("b9f5b")), + SampleHashes( + md5="b9f5bd514485fb06da39beff051b9fdc", + sha1="c72a2e50410475a51d897d29ffbbaf2103754d53", + sha256="34acc4c0b61b5ce0b37c3589f97d1f23e6d84011a241e6f85683ee517ce786f1", + ), + ), + # cape extractor + ( + get_cape_extractor(get_data_path_by_name("0000a657")), + SampleHashes( + md5="e2147b5333879f98d515cd9aa905d489", + sha1="ad4d520fb7792b4a5701df973d6bd8a6cbfbb57f", + sha256="0000a65749f5902c4d82ffa701198038f0b4870b00a27cfca109f8f933476d82", + ), + ), +] + DYNAMIC_FEATURE_PRESENCE_TESTS = sorted( [ # file/string diff --git a/tests/test_binja_features.py b/tests/test_binja_features.py index 4daaa7901..f0f137783 100644 --- a/tests/test_binja_features.py +++ b/tests/test_binja_features.py @@ -63,6 +63,7 @@ def test_binja_feature_counts(sample, scope, feature, expected): @pytest.mark.skipif(binja_present is False, reason="Skip binja tests if the binaryninja Python API is not installed") +@pytest.mark.xfail(reason="relies on the legacy ruleset which hasn't been updated yet") def test_standalone_binja_backend(): CD = Path(__file__).resolve().parent test_path = CD / ".." / "tests" / "data" / "Practical Malware Analysis Lab 01-01.exe_" diff --git a/tests/test_extractor_hashing.py b/tests/test_extractor_hashing.py new file mode 100644 index 000000000..9bb2fe5e1 --- /dev/null +++ b/tests/test_extractor_hashing.py @@ -0,0 +1,50 @@ +# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. + +import logging + +import pytest +import fixtures + +from capa.features.extractors.base_extractor import SampleHashes + +logger = logging.getLogger(__name__) + + +@fixtures.parametrize( + "extractor,hashes", + fixtures.EXTRACTOR_HASHING_TESTS, +) +def test_hash_extraction(extractor, hashes): + assert extractor.get_sample_hashes() == hashes + + +# We need to skip the binja test if we cannot import binaryninja, e.g., in GitHub CI. 
+binja_present: bool = False +try: + import binaryninja + + try: + binaryninja.load(source=b"\x90") + except RuntimeError: + logger.warning("Binary Ninja license is not valid, provide via $BN_LICENSE or license.dat") + else: + binja_present = True +except ImportError: + pass + + +@pytest.mark.skipif(binja_present is False, reason="Skip binja tests if the binaryninja Python API is not installed") +def test_binja_hash_extraction(): + extractor = fixtures.get_binja_extractor(fixtures.get_data_path_by_name("mimikatz")) + hashes = SampleHashes( + md5="5f66b82558ca92e54e77f216ef4c066c", + sha1="e4f82e4d7f22938dc0a0ff8a4a7ad2a763643d38", + sha256="131314a6f6d1d263c75b9909586b3e1bd837036329ace5e69241749e861ac01d", + ) + assert extractor.get_sample_hashes() == hashes diff --git a/tests/test_main.py b/tests/test_main.py index c4c27d326..94401667f 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -61,6 +61,7 @@ def test_main_single_rule(z9324d_extractor, tmpdir): ) +@pytest.mark.xfail(reason="relies on the legacy ruleset. scopes keyword hasn't been added there") def test_main_non_ascii_filename(pingtaest_extractor, tmpdir, capsys): # here we print a string with unicode characters in it # (specifically, a byte string with utf-8 bytes in it, see file encoding) @@ -531,7 +532,7 @@ def test_main_dotnet4(_039a6_dotnetfile_extractor): assert capa.main.main([path, "-vv"]) == 0 -@pytest.mark.xfail(reason="ResultDocument hasn't been updated yet") +@pytest.mark.xfail(reason="relies on the legacy ruleset. scopes keyword hasn't been added there") def test_main_rd(): path = str(fixtures.get_data_path_by_name("pma01-01-rd")) assert capa.main.main([path, "-vv"]) == 0 diff --git a/tests/_test_render.py b/tests/test_render.py similarity index 100% rename from tests/_test_render.py rename to tests/test_render.py diff --git a/tests/_test_result_document.py b/tests/test_result_document.py similarity index 88% rename from tests/_test_result_document.py rename to tests/test_result_document.py index 27a1dbb29..bcaf03fb7 100644 --- a/tests/_test_result_document.py +++ b/tests/test_result_document.py @@ -257,24 +257,42 @@ def assert_round_trip(rd: rdoc.ResultDocument): @pytest.mark.parametrize( "rd_file", [ - pytest.param("a3f3bbc_rd"), - pytest.param("al_khaserx86_rd"), - pytest.param("al_khaserx64_rd"), - pytest.param("a076114_rd"), + pytest.param( + "a3f3bbc_rd", + marks=pytest.mark.xfail(reason="document needs to be updated to the final scopes syntax once that's added"), + ), + pytest.param( + "al_khaserx86_rd", + marks=pytest.mark.xfail(reason="document needs to be updated to the final scopes syntax once that's added"), + ), + pytest.param( + "al_khaserx64_rd", + marks=pytest.mark.xfail(reason="document needs to be updated to the final scopes syntax once that's added"), + ), + pytest.param( + "a076114_rd", + marks=pytest.mark.xfail(reason="document needs to be updated to the final scopes syntax once that's added"), + ), pytest.param("pma0101_rd"), - pytest.param("dotnet_1c444e_rd"), + pytest.param( + "dotnet_1c444e_rd", + marks=pytest.mark.xfail(reason="document needs to be updated to the final scopes syntax once that's added"), + ), ], ) +@pytest.mark.xfail(reason="samples haven't been updated to use the scopes keyword") def test_round_trip(request, rd_file): rd: rdoc.ResultDocument = request.getfixturevalue(rd_file) assert_round_trip(rd) +@pytest.mark.xfail(reason="samples haven't been updated to use the scopes keyword") def test_json_to_rdoc(): path = fixtures.get_data_path_by_name("pma01-01-rd") assert 
isinstance(rdoc.ResultDocument.parse_file(path), rdoc.ResultDocument) +@pytest.mark.xfail(reason="samples haven't been updated to use the scopes keyword") def test_rdoc_to_capa(): path = fixtures.get_data_path_by_name("pma01-01-rd") diff --git a/tests/test_static_freeze.py b/tests/test_static_freeze.py index 2a5765299..16dde31d6 100644 --- a/tests/test_static_freeze.py +++ b/tests/test_static_freeze.py @@ -22,10 +22,15 @@ import capa.features.extractors.null import capa.features.extractors.base_extractor from capa.features.address import Address, AbsoluteVirtualAddress -from capa.features.extractors.base_extractor import BBHandle, FunctionHandle +from capa.features.extractors.base_extractor import BBHandle, SampleHashes, FunctionHandle EXTRACTOR = capa.features.extractors.null.NullStaticFeatureExtractor( base_address=AbsoluteVirtualAddress(0x401000), + sample_hashes=SampleHashes( + md5="6eb7ee7babf913d75df3f86c229df9e7", + sha1="2a082494519acd5130d5120fa48786df7275fdd7", + sha256="0c7d1a34eb9fd55bedbf37ba16e3d5dd8c1dd1d002479cc4af27ef0f82bb4792", + ), global_features=[], file_features=[ (AbsoluteVirtualAddress(0x402345), capa.features.common.Characteristic("embedded pe")),
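# ---- editor's note (not part of the patch) -----------------------------------
# for reference, the contract the hashing tests above rely on:
# SampleHashes.from_bytes() returns the lowercase hexdigests of the raw sample
# bytes, so any two extractors opened on the same file should compare equal.
# a minimal sketch; the sample bytes below are hypothetical:
import hashlib

buf = b"MZ\x90\x00"  # hypothetical sample bytes
md5, sha1, sha256 = (
    hashlib.md5(buf).hexdigest(),
    hashlib.sha1(buf).hexdigest(),
    hashlib.sha256(buf).hexdigest(),
)
# SampleHashes.from_bytes(buf) is expected to equal
# SampleHashes(md5=md5, sha1=sha1, sha256=sha256),
# which is what test_hash_extraction asserts for each extractor in
# EXTRACTOR_HASHING_TESTS.
# ---- end editor's note --------------------------------------------------------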