From e3f60ea0fbb0c6431ed2e93ce3e8ade108bdbb09 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Mon, 17 Jul 2023 11:50:49 +0100 Subject: [PATCH 01/37] initial commit --- CHANGELOG.md | 1 + capa/main.py | 306 +++++++++++++++++++---- capa/render/result_document.py | 57 ++++- capa/render/verbose.py | 72 +++++- capa/render/vverbose.py | 10 +- scripts/bulk-process.py | 4 +- scripts/capa_as_library.py | 5 +- scripts/import-to-ida.py | 2 +- scripts/show-capabilities-by-function.py | 9 +- 9 files changed, 396 insertions(+), 70 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index caebb42f4..6b2db7612 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ - publish via PyPI trusted publishing #1491 @williballenthin - migrate to pyproject.toml #1301 @williballenthin - Add ProcessesAddress and ThreadAddress #1612 @yelhamer +- Add dynamic capability extraction @yelhamer ### Breaking Changes - Update Metadata type in capa main [#1411](https://github.com/mandiant/capa/issues/1411) [@Aayush-Goel-04](https://github.com/aayush-goel-04) @manasghandat diff --git a/capa/main.py b/capa/main.py index ea460e366..79296c040 100644 --- a/capa/main.py +++ b/capa/main.py @@ -22,7 +22,7 @@ import itertools import contextlib import collections -from typing import Any, Dict, List, Tuple, Callable, cast +from typing import Any, Dict, List, Tuple, Callable import halo import tqdm @@ -84,6 +84,8 @@ from capa.features.extractors.base_extractor import ( BBHandle, InsnHandle, + ThreadHandle, + ProcessHandle, FunctionHandle, FeatureExtractor, StaticFeatureExtractor, @@ -264,6 +266,7 @@ def find_static_capabilities( feature_counts = rdoc.FeatureCounts(file=0, functions=()) library_functions: Tuple[rdoc.LibraryFunction, ...] = () + assert isinstance(extractor, StaticFeatureExtractor) with redirecting_print_to_tqdm(disable_progress): with tqdm.contrib.logging.logging_redirect_tqdm(): pbar = tqdm.tqdm @@ -338,13 +341,131 @@ def pbar(s, *args, **kwargs): return matches, meta +def find_thread_capabilities( + ruleset: RuleSet, extractor: DynamicFeatureExtractor, ph: ProcessHandle, th: ThreadHandle +) -> Tuple[FeatureSet, MatchResults]: + """ + find matches for the given rules for the given thread. + + returns: tuple containing (features for thread, match results for thread) + """ + # all features found for the instruction. + features = collections.defaultdict(set) # type: FeatureSet + + for feature, addr in itertools.chain( + extractor.extract_thread_features(ph, th), extractor.extract_global_features() + ): + features[feature].add(addr) + + # matches found at this instruction. + _, matches = ruleset.match(Scope.THREAD, features, th.address) + + for rule_name, res in matches.items(): + rule = ruleset[rule_name] + for addr, _ in res: + capa.engine.index_rule_matches(features, rule, [addr]) + + return features, matches + + +def find_process_capabilities( + ruleset: RuleSet, extractor: DynamicFeatureExtractor, ph: ProcessHandle +) -> Tuple[MatchResults, MatchResults, int]: + """ + find matches for the given rules within the given process. + + returns: tuple containing (match results for process, match results for threads, number of features) + """ + # all features found within this process, + # includes features found within threads. + process_features = collections.defaultdict(set) # type: FeatureSet + + # matches found at the thread scope. + # might be found at different threads, thats ok. 
+ thread_matches = collections.defaultdict(list) # type: MatchResults + + for th in extractor.get_threads(ph): + features, tmatches = find_thread_capabilities(ruleset, extractor, ph, th) + for feature, vas in features.items(): + process_features[feature].update(vas) + + for rule_name, res in tmatches.items(): + thread_matches[rule_name].extend(res) + + for feature, va in itertools.chain(extractor.extract_process_features(ph), extractor.extract_global_features()): + process_features[feature].add(va) + + _, process_matches = ruleset.match(Scope.PROCESS, process_features, ph.address) + return process_matches, thread_matches, len(process_features) + + +def find_dynamic_capabilities( + ruleset: RuleSet, extractor: DynamicFeatureExtractor, disable_progress=None +) -> Tuple[MatchResults, Any]: + all_process_matches = collections.defaultdict(list) # type: MatchResults + all_thread_matches = collections.defaultdict(list) # type: MatchResults + + feature_counts = rdoc.DynamicFeatureCounts(file=0, processes=()) + + assert isinstance(extractor, DynamicFeatureExtractor) + with redirecting_print_to_tqdm(disable_progress): + with tqdm.contrib.logging.logging_redirect_tqdm(): + pbar = tqdm.tqdm + if disable_progress: + # do not use tqdm to avoid unnecessary side effects when caller intends + # to disable progress completely + def pbar(s, *args, **kwargs): + return s + + processes = list(extractor.get_processes()) + + pb = pbar(processes, desc="matching", unit=" processes", leave=False) + for p in pb: + process_matches, thread_matches, feature_count = find_process_capabilities(ruleset, extractor, p) + feature_counts.processes += ( + rdoc.ProcessFeatureCount(address=frz.Address.from_capa(p.address), count=feature_count), + ) + logger.debug("analyzed process 0x%x and extracted %d features", p.address, feature_count) + + for rule_name, res in process_matches.items(): + all_process_matches[rule_name].extend(res) + for rule_name, res in thread_matches.items(): + all_thread_matches[rule_name].extend(res) + + # collection of features that captures the rule matches within process and thread scopes. + # mapping from feature (matched rule) to set of addresses at which it matched. + process_and_lower_features: FeatureSet = collections.defaultdict(set) + for rule_name, results in itertools.chain(all_process_matches.items(), all_thread_matches.items()): + locations = {p[0] for p in results} + rule = ruleset[rule_name] + capa.engine.index_rule_matches(process_and_lower_features, rule, locations) + + all_file_matches, feature_count = find_file_capabilities(ruleset, extractor, process_and_lower_features) + feature_counts.file = feature_count + + matches = dict( + itertools.chain( + # each rule exists in exactly one scope, + # so there won't be any overlap among these following MatchResults, + # and we can merge the dictionaries naively. 
+ all_thread_matches.items(), + all_process_matches.items(), + all_file_matches.items(), + ) + ) + + meta = { + "feature_counts": feature_counts, + } + + return matches, meta + + def find_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, **kwargs) -> Tuple[MatchResults, Any]: if isinstance(extractor, StaticFeatureExtractor): - extractor_: StaticFeatureExtractor = cast(StaticFeatureExtractor, extractor) - return find_static_capabilities(ruleset, extractor_, kwargs) + return find_static_capabilities(ruleset, extractor, kwargs) elif isinstance(extractor, DynamicFeatureExtractor): - # extractor_ = cast(DynamicFeatureExtractor, extractor) - raise NotImplementedError() + return find_dynamic_capabilities(ruleset, extractor, kwargs) else: raise ValueError(f"unexpected extractor type: {extractor.__class__.__name__}") @@ -773,6 +894,72 @@ def get_signatures(sigs_path): return paths +def get_sample_hashes(sample_path, extractor: FeatureExtractor) -> Tuple[str, str, str]: + if isinstance(extractor, StaticFeatureExtractor): + md5_ = hashlib.md5() + sha1_ = hashlib.sha1() + sha256_ = hashlib.sha256() + + with open(sample_path, "rb") as f: + buf = f.read() + + md5_.update(buf) + sha1_.update(buf) + sha256_.update(buf) + + md5, sha1, sha256 = md5_.hexdigest(), sha1_.hexdigest(), sha256_.hexdigest() + elif isinstance(extractor, DynamicFeatureExtractor): + import json + + if isinstance(extractor, capa.features.extractors.cape.extractor.CapeExtractor): + with open(sample_path, "rb") as f: + report = json.load(f) + md5 = report["target"]["file"]["md5"] + sha1 = report["target"]["file"]["sha1"] + sha256 = report["target"]["file"]["sha256"] + else: + md5, sha1, sha256 = "0", "0", "0" + else: + raise ValueError("invalid extractor") + + return md5, sha1, sha256 + + +def get_sample_analysis(format_, arch, os_, extractor, rules_path, counts): + if isinstance(extractor, StaticFeatureExtractor): + return rdoc.StaticAnalysis( + format=format_, + arch=arch, + os=os_, + extractor=extractor.__class__.__name__, + rules=tuple(rules_path), + base_address=frz.Address.from_capa(extractor.get_base_address()), + layout=rdoc.StaticLayout( + functions=(), + # this is updated after capabilities have been collected. + # will look like: + # + # "functions": { 0x401000: { "matched_basic_blocks": [ 0x401000, 0x401005, ... ] }, ... 
} + ), + feature_counts=counts["feature_counts"], + library_functions=counts["library_functions"], + ) + elif isinstance(extractor, DynamicFeatureExtractor): + return rdoc.DynamicAnalysis( + format=format_, + arch=arch, + os=os_, + extractor=extractor.__class__.__name__, + rules=tuple(rules_path), + layout=rdoc.DynamicLayout( + processes=(), + ), + feature_counts=counts["feature_counts"], + ) + else: + raise ValueError("invalid extractor type") + + def collect_metadata( argv: List[str], sample_path: str, @@ -780,18 +967,11 @@ def collect_metadata( os_: str, rules_path: List[str], extractor: FeatureExtractor, + counts: dict, ) -> rdoc.Metadata: - md5 = hashlib.md5() - sha1 = hashlib.sha1() - sha256 = hashlib.sha256() - - assert isinstance(extractor, StaticFeatureExtractor) - with open(sample_path, "rb") as f: - buf = f.read() - - md5.update(buf) - sha1.update(buf) - sha256.update(buf) + # if it's a binary sample we hash it, if it's a report + # we fetch the hashes from the report + md5, sha1, sha256 = get_sample_hashes(sample_path, extractor) if rules_path != [RULES_PATH_DEFAULT_STRING]: rules_path = [os.path.abspath(os.path.normpath(r)) for r in rules_path] @@ -799,39 +979,72 @@ def collect_metadata( format_ = get_format(sample_path) if format_ == FORMAT_AUTO else format_ arch = get_arch(sample_path) os_ = get_os(sample_path) if os_ == OS_AUTO else os_ - base_addr = extractor.get_base_address() if hasattr(extractor, "get_base_address") else NO_ADDRESS return rdoc.Metadata( timestamp=datetime.datetime.now(), version=capa.version.__version__, argv=tuple(argv) if argv else None, sample=rdoc.Sample( - md5=md5.hexdigest(), - sha1=sha1.hexdigest(), - sha256=sha256.hexdigest(), + md5=md5, + sha1=sha1, + sha256=sha256, path=os.path.normpath(sample_path), ), - analysis=rdoc.Analysis( - format=format_, - arch=arch, - os=os_, - extractor=extractor.__class__.__name__, - rules=tuple(rules_path), - base_address=frz.Address.from_capa(base_addr), - layout=rdoc.Layout( - functions=(), - # this is updated after capabilities have been collected. - # will look like: - # - # "functions": { 0x401000: { "matched_basic_blocks": [ 0x401000, 0x401005, ... ] }, ... } - ), - feature_counts=rdoc.FeatureCounts(file=0, functions=()), - library_functions=(), + analysis=get_sample_analysis( + format_, + arch, + os_, + extractor, + rules_path, + counts, ), ) -def compute_layout(rules, extractor, capabilities) -> rdoc.Layout: +def compute_dynamic_layout(rules, extractor, capabilities) -> rdoc.Layout: + """ + compute a metadata structure that links threads + to the processes in which they're found. + + only collect the threads at which some rule matched. + otherwise, we may pollute the json document with + a large amount of un-referenced data. 
+ """ + assert isinstance(extractor, DynamicFeatureExtractor) + processes_by_thread: Dict[Address, Address] = {} + threads_by_processes: Dict[Address, List[Address]] = {} + for p in extractor.get_processes(): + threads_by_processes[p.address] = [] + for t in extractor.get_threads(p): + processes_by_thread[t.address] = p.address + threads_by_processes[p.address].append(t.address) + + matched_threads = set() + for rule_name, matches in capabilities.items(): + rule = rules[rule_name] + if capa.rules.BASIC_BLOCK_SCOPE in rule.meta.get("scopes")["dynamic"]: + for addr, _ in matches: + assert addr in processes_by_thread + matched_threads.add(addr) + + layout = rdoc.DynamicLayout( + processes=tuple( + rdoc.ProcessLayout( + address=frz.Address.from_capa(p), + matched_threads=tuple( + rdoc.ThreadLayout(address=frz.Address.from_capa(t)) for t in threads if t in matched_threads + ) # this object is open to extension in the future, + # such as with the function name, etc. + ) + for p, threads in threads_by_processes.items() + if len([t for t in threads if t in matched_threads]) > 0 + ) + ) + + return layout + + +def compute_static_layout(rules, extractor, capabilities) -> rdoc.Layout: """ compute a metadata structure that links basic blocks to the functions in which they're found. @@ -840,6 +1053,7 @@ def compute_layout(rules, extractor, capabilities) -> rdoc.Layout: otherwise, we may pollute the json document with a large amount of un-referenced data. """ + assert isinstance(extractor, StaticFeatureExtractor) functions_by_bb: Dict[Address, Address] = {} bbs_by_function: Dict[Address, List[Address]] = {} for f in extractor.get_functions(): @@ -851,12 +1065,12 @@ def compute_layout(rules, extractor, capabilities) -> rdoc.Layout: matched_bbs = set() for rule_name, matches in capabilities.items(): rule = rules[rule_name] - if rule.meta.get("scope") == capa.rules.BASIC_BLOCK_SCOPE: + if capa.rules.BASIC_BLOCK_SCOPE in rule.meta.get("scopes")["static"]: for addr, _ in matches: assert addr in functions_by_bb matched_bbs.add(addr) - layout = rdoc.Layout( + layout = rdoc.StaticLayout( functions=tuple( rdoc.FunctionLayout( address=frz.Address.from_capa(f), @@ -873,6 +1087,15 @@ def compute_layout(rules, extractor, capabilities) -> rdoc.Layout: return layout +def compute_layout(rules, extractor, capabilities) -> rdoc.Layout: + if isinstance(extractor, StaticFeatureExtractor): + return compute_static_layout(rules, extractor, capabilities) + elif isinstance(extractor, DynamicFeatureExtractor): + return compute_dynamic_layout(rules, extractor, capabilities) + else: + raise ValueError("extractor must be either a static or dynamic extracotr") + + def install_common_args(parser, wanted=None): """ register a common set of command line arguments for re-use by main & scripts. 
@@ -1308,12 +1531,9 @@ def main(argv=None): log_unsupported_os_error() return E_INVALID_FILE_OS - meta = collect_metadata(argv, args.sample, args.format, args.os, args.rules, extractor) - capabilities, counts = find_capabilities(rules, extractor, disable_progress=args.quiet) - meta.analysis.feature_counts = counts["feature_counts"] - meta.analysis.library_functions = counts["library_functions"] + meta = collect_metadata(argv, args.sample, args.format, args.os, args.rules, extractor, counts) meta.analysis.layout = compute_layout(rules, extractor, capabilities) if has_file_limitation(rules, capabilities): diff --git a/capa/render/result_document.py b/capa/render/result_document.py index 00c3eb9bc..ae7917d02 100644 --- a/capa/render/result_document.py +++ b/capa/render/result_document.py @@ -10,6 +10,7 @@ from typing import Dict, List, Tuple, Union, Optional from pydantic import Field, BaseModel +from typing_extensions import TypeAlias import capa.rules import capa.engine @@ -49,10 +50,26 @@ class FunctionLayout(Model): matched_basic_blocks: Tuple[BasicBlockLayout, ...] -class Layout(Model): +class ThreadLayout(Model): + address: frz.Address + + +class ProcessLayout(Model): + address: frz.Address + matched_threads: Tuple[ThreadLayout, ...] + + +class StaticLayout(Model): functions: Tuple[FunctionLayout, ...] +class DynamicLayout(Model): + processes: Tuple[ProcessLayout, ...] + + +Layout: TypeAlias = Union[StaticLayout, DynamicLayout] + + class LibraryFunction(Model): address: frz.Address name: str @@ -63,23 +80,49 @@ class FunctionFeatureCount(Model): count: int -class FeatureCounts(Model): +class ProcessFeatureCount(Model): + address: frz.Address + count: int + + +class StaticFeatureCounts(Model): file: int functions: Tuple[FunctionFeatureCount, ...] -class Analysis(Model): +class DynamicFeatureCounts(Model): + file: int + processes: Tuple[ProcessFeatureCount, ...] + + +FeatureCounts: TypeAlias = Union[StaticFeatureCounts, DynamicFeatureCounts] + + +class StaticAnalysis(Model): format: str arch: str os: str extractor: str rules: Tuple[str, ...] base_address: frz.Address - layout: Layout - feature_counts: FeatureCounts + layout: StaticLayout + feature_counts: StaticFeatureCounts library_functions: Tuple[LibraryFunction, ...] +class DynamicAnalysis(Model): + format: str + arch: str + os: str + extractor: str + rules: Tuple[str, ...] + layout: DynamicLayout + feature_counts: DynamicFeatureCounts + + +Analysis: TypeAlias = Union[StaticAnalysis, DynamicAnalysis] + + class Metadata(Model): timestamp: datetime.datetime version: str @@ -510,7 +553,7 @@ class RuleMetadata(FrozenModel): name: str namespace: Optional[str] authors: Tuple[str, ...] - scope: capa.rules.Scope + scopes: capa.rules.Scopes attack: Tuple[AttackSpec, ...] = Field(alias="att&ck") mbc: Tuple[MBCSpec, ...] references: Tuple[str, ...] 
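# annotation: illustrative sketch — because Analysis and Layout are now union
# aliases, consumers of a ResultDocument are expected to branch on the concrete
# type before touching layout fields (`doc` is an assumed rd.ResultDocument):
if isinstance(doc.meta.analysis, rd.StaticAnalysis):
    functions = doc.meta.analysis.layout.functions    # StaticLayout
elif isinstance(doc.meta.analysis, rd.DynamicAnalysis):
    processes = doc.meta.analysis.layout.processes    # DynamicLayout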
@@ -527,7 +570,7 @@ def from_capa(cls, rule: capa.rules.Rule) -> "RuleMetadata": name=rule.meta.get("name"), namespace=rule.meta.get("namespace"), authors=rule.meta.get("authors"), - scope=capa.rules.Scope(rule.meta.get("scope")), + scopes=capa.rules.Scopes.from_dict(rule.meta.get("scopes")), attack=tuple(map(AttackSpec.from_str, rule.meta.get("att&ck", []))), mbc=tuple(map(MBCSpec.from_str, rule.meta.get("mbc", []))), references=rule.meta.get("references", []), diff --git a/capa/render/verbose.py b/capa/render/verbose.py index ea8c30d67..ad3085d35 100644 --- a/capa/render/verbose.py +++ b/capa/render/verbose.py @@ -60,13 +60,26 @@ def format_address(address: frz.Address) -> str: assert isinstance(id_, int) assert isinstance(return_address, int) return f"event: {id_}, retaddr: 0x{return_address:x}" + elif address.type == frz.AddressType.PROCESS: + assert isinstance(address.value, tuple) + ppid, pid = address.value + assert isinstance(ppid, int) + assert isinstance(pid, int) + return f"process ppid: {ppid}, process pid: {pid}" + elif address.type == frz.AddressType.THREAD: + assert isinstance(address.value, tuple) + ppid, pid, tid = address.value + assert isinstance(ppid, int) + assert isinstance(pid, int) + assert isinstance(tid, int) + return f"process ppid: {ppid}, process pid: {pid}, thread id: {tid}" elif address.type == frz.AddressType.NO_ADDRESS: return "global" else: raise ValueError("unexpected address type") -def render_meta(ostream, doc: rd.ResultDocument): +def render_static_meta(ostream, doc: rd.ResultDocument): """ like: @@ -85,6 +98,8 @@ def render_meta(ostream, doc: rd.ResultDocument): function count 42 total feature count 1918 """ + + assert isinstance(doc.meta.analysis, rd.StaticAnalysis) rows = [ ("md5", doc.meta.sample.md5), ("sha1", doc.meta.sample.sha1), @@ -109,6 +124,57 @@ def render_meta(ostream, doc: rd.ResultDocument): ostream.writeln(tabulate.tabulate(rows, tablefmt="plain")) +def render_dynamic_meta(ostream, doc: rd.ResultDocument): + """ + like: + + md5 84882c9d43e23d63b82004fae74ebb61 + sha1 c6fb3b50d946bec6f391aefa4e54478cf8607211 + sha256 5eced7367ed63354b4ed5c556e2363514293f614c2c2eb187273381b2ef5f0f9 + path /tmp/packed-report,jspn + timestamp 2023-07-17T10:17:05.796933 + capa version 0.0.0 + os windows + format pe + arch amd64 + extractor CAPEFeatureExtractor + rules (embedded rules) + process count 42 + total feature count 1918 + """ + + assert isinstance(doc.meta.analysis, rd.DynamicAnalysis) + rows = [ + ("md5", doc.meta.sample.md5), + ("sha1", doc.meta.sample.sha1), + ("sha256", doc.meta.sample.sha256), + ("path", doc.meta.sample.path), + ("timestamp", doc.meta.timestamp), + ("capa version", doc.meta.version), + ("os", doc.meta.analysis.os), + ("format", doc.meta.analysis.format), + ("arch", doc.meta.analysis.arch), + ("extractor", doc.meta.analysis.extractor), + ("rules", "\n".join(doc.meta.analysis.rules)), + ("process count", len(doc.meta.analysis.feature_counts.processes)), + ( + "total feature count", + doc.meta.analysis.feature_counts.file + sum(p.count for p in doc.meta.analysis.feature_counts.processes), + ), + ] + + ostream.writeln(tabulate.tabulate(rows, tablefmt="plain")) + + +def render_meta(osstream, doc: rd.ResultDocument): + if isinstance(doc.meta.analysis, rd.StaticAnalysis): + render_static_meta(osstream, doc) + elif isinstance(doc.meta.analysis, rd.DynamicAnalysis): + render_dynamic_meta(osstream, doc) + else: + raise ValueError("invalid meta analysis") + + def render_rules(ostream, doc: rd.ResultDocument): """ like: @@ -132,7 +198,7 
@@ def render_rules(ostream, doc: rd.ResultDocument): had_match = True rows = [] - for key in ("namespace", "description", "scope"): + for key in ("namespace", "description", "scopes"): v = getattr(rule.meta, key) if not v: continue @@ -145,7 +211,7 @@ def render_rules(ostream, doc: rd.ResultDocument): rows.append((key, v)) - if rule.meta.scope != capa.rules.FILE_SCOPE: + if capa.rules.FILE_SCOPE not in rule.meta.scopes: locations = [m[0] for m in doc.rules[rule.meta.name].matches] rows.append(("matches", "\n".join(map(format_address, locations)))) diff --git a/capa/render/vverbose.py b/capa/render/vverbose.py index ba90f76a6..db04ce745 100644 --- a/capa/render/vverbose.py +++ b/capa/render/vverbose.py @@ -267,6 +267,8 @@ def render_rules(ostream, doc: rd.ResultDocument): api: kernel32.GetLastError @ 0x10004A87 api: kernel32.OutputDebugString @ 0x10004767, 0x10004787, 0x10004816, 0x10004895 """ + + assert isinstance(doc.meta.analysis, rd.StaticAnalysis) functions_by_bb: Dict[capa.features.address.Address, capa.features.address.Address] = {} for finfo in doc.meta.analysis.layout.functions: faddress = finfo.address.to_capa() @@ -322,7 +324,7 @@ def render_rules(ostream, doc: rd.ResultDocument): rows.append(("author", ", ".join(rule.meta.authors))) - rows.append(("scope", rule.meta.scope.value)) + rows.append(("scopes", str(rule.meta.scopes))) if rule.meta.attack: rows.append(("att&ck", ", ".join([rutils.format_parts_id(v) for v in rule.meta.attack]))) @@ -338,7 +340,7 @@ def render_rules(ostream, doc: rd.ResultDocument): ostream.writeln(tabulate.tabulate(rows, tablefmt="plain")) - if rule.meta.scope == capa.rules.FILE_SCOPE: + if capa.rules.FILE_SCOPE in rule.meta.scopes: matches = doc.rules[rule.meta.name].matches if len(matches) != 1: # i think there should only ever be one match per file-scope rule, @@ -350,11 +352,11 @@ def render_rules(ostream, doc: rd.ResultDocument): render_match(ostream, first_match, indent=0) else: for location, match in sorted(doc.rules[rule.meta.name].matches): - ostream.write(rule.meta.scope) + ostream.write(rule.meta.scopes) ostream.write(" @ ") ostream.write(capa.render.verbose.format_address(location)) - if rule.meta.scope == capa.rules.BASIC_BLOCK_SCOPE: + if capa.rules.BASIC_BLOCK_SCOPE in rule.meta.scopes: ostream.write( " in function " + capa.render.verbose.format_address(frz.Address.from_capa(functions_by_bb[location.to_capa()])) diff --git a/scripts/bulk-process.py b/scripts/bulk-process.py index b5d486910..e8f59b58f 100644 --- a/scripts/bulk-process.py +++ b/scripts/bulk-process.py @@ -129,11 +129,9 @@ def get_capa_results(args): "error": f"unexpected error: {e}", } - meta = capa.main.collect_metadata([], path, format, os_, [], extractor) capabilities, counts = capa.main.find_capabilities(rules, extractor, disable_progress=True) - meta.analysis.feature_counts = counts["feature_counts"] - meta.analysis.library_functions = counts["library_functions"] + meta = capa.main.collect_metadata([], path, format, os_, [], extractor, counts) meta.analysis.layout = capa.main.compute_layout(rules, extractor, capabilities) doc = rd.ResultDocument.from_capa(meta, rules, capabilities) diff --git a/scripts/capa_as_library.py b/scripts/capa_as_library.py index 8150a1ac7..1aa38cf86 100644 --- a/scripts/capa_as_library.py +++ b/scripts/capa_as_library.py @@ -170,10 +170,7 @@ def capa_details(rules_path, file_path, output_format="dictionary"): capabilities, counts = capa.main.find_capabilities(rules, extractor, disable_progress=True) # collect metadata (used only to make 
rendering more complete) - meta = capa.main.collect_metadata([], file_path, FORMAT_AUTO, OS_AUTO, rules_path, extractor) - - meta.analysis.feature_counts = counts["feature_counts"] - meta.analysis.library_functions = counts["library_functions"] + meta = capa.main.collect_metadata([], file_path, FORMAT_AUTO, OS_AUTO, rules_path, extractor, counts) meta.analysis.layout = capa.main.compute_layout(rules, extractor, capabilities) capa_output: Any = False diff --git a/scripts/import-to-ida.py b/scripts/import-to-ida.py index 42c564456..624091d2a 100644 --- a/scripts/import-to-ida.py +++ b/scripts/import-to-ida.py @@ -89,7 +89,7 @@ def main(): continue if rule.meta.is_subscope_rule: continue - if rule.meta.scope != capa.rules.Scope.FUNCTION: + if capa.rules.Scope.FUNCTION in rule.meta.scopes: continue ns = rule.meta.namespace diff --git a/scripts/show-capabilities-by-function.py b/scripts/show-capabilities-by-function.py index c5bfd5716..bd2ae0827 100644 --- a/scripts/show-capabilities-by-function.py +++ b/scripts/show-capabilities-by-function.py @@ -94,6 +94,7 @@ def render_matches_by_function(doc: rd.ResultDocument): - send HTTP request - connect to HTTP server """ + assert isinstance(doc.meta.analysis, rd.StaticAnalysis) functions_by_bb: Dict[Address, Address] = {} for finfo in doc.meta.analysis.layout.functions: faddress = finfo.address @@ -106,10 +107,10 @@ def render_matches_by_function(doc: rd.ResultDocument): matches_by_function = collections.defaultdict(set) for rule in rutils.capability_rules(doc): - if rule.meta.scope == capa.rules.FUNCTION_SCOPE: + if capa.rules.FUNCTION_SCOPE in rule.meta.scopes: for addr, _ in rule.matches: matches_by_function[addr].add(rule.meta.name) - elif rule.meta.scope == capa.rules.BASIC_BLOCK_SCOPE: + elif capa.rules.BASIC_BLOCK_SCOPE in rule.meta.scopes: for addr, _ in rule.matches: function = functions_by_bb[addr] matches_by_function[function].add(rule.meta.name) @@ -178,11 +179,9 @@ def main(argv=None): capa.helpers.log_unsupported_runtime_error() return -1 - meta = capa.main.collect_metadata(argv, args.sample, format_, args.os, args.rules, extractor) capabilities, counts = capa.main.find_capabilities(rules, extractor) - meta.analysis.feature_counts = counts["feature_counts"] - meta.analysis.library_functions = counts["library_functions"] + meta = capa.main.collect_metadata(argv, args.sample, format_, args.os, args.rules, extractor, counts) meta.analysis.layout = capa.main.compute_layout(rules, extractor, capabilities) if capa.main.has_file_limitation(rules, capabilities): From 4af84e53d5b4782420bd14f0fa208d7d5081947a Mon Sep 17 00:00:00 2001 From: yelhamer <16624109+yelhamer@users.noreply.github.com> Date: Mon, 17 Jul 2023 12:25:12 +0100 Subject: [PATCH 02/37] bugfixes --- capa/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/capa/main.py b/capa/main.py index 79296c040..118069567 100644 --- a/capa/main.py +++ b/capa/main.py @@ -349,7 +349,7 @@ def find_thread_capabilities( returns: tuple containing (features for thread, match results for thread) """ - # all features found for the instruction. + # all features found for the thread. features = collections.defaultdict(set) # type: FeatureSet for feature, addr in itertools.chain( @@ -357,7 +357,7 @@ def find_thread_capabilities( ): features[feature].add(addr) - # matches found at this instruction. + # matches found at this thread. 
_, matches = ruleset.match(Scope.THREAD, features, th.address) for rule_name, res in matches.items(): From bc46bf32029d23ff6e0807eb3c2642367a4c447d Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Tue, 18 Jul 2023 11:25:39 +0100 Subject: [PATCH 03/37] add vverbose rendering --- capa/features/extractors/viv/extractor.py | 14 ++++++-- capa/main.py | 2 +- capa/render/proto/__init__.py | 1 + capa/render/vverbose.py | 44 +++++++++++++++++------ 4 files changed, 48 insertions(+), 13 deletions(-) diff --git a/capa/features/extractors/viv/extractor.py b/capa/features/extractors/viv/extractor.py index 8b2b44156..c9c3a1dba 100644 --- a/capa/features/extractors/viv/extractor.py +++ b/capa/features/extractors/viv/extractor.py @@ -5,6 +5,7 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. +import hashlib import logging from typing import Any, Dict, List, Tuple, Iterator @@ -19,12 +20,12 @@ import capa.features.extractors.viv.basicblock from capa.features.common import Feature from capa.features.address import Address, AbsoluteVirtualAddress -from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, StaticFeatureExtractor +from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, FeatureExtractor logger = logging.getLogger(__name__) -class VivisectFeatureExtractor(StaticFeatureExtractor): +class VivisectFeatureExtractor(FeatureExtractor): def __init__(self, vw, path, os): super().__init__() self.vw = vw @@ -32,6 +33,12 @@ def __init__(self, vw, path, os): with open(self.path, "rb") as f: self.buf = f.read() + self.sample_hashes = ( + hashlib.md5().update(self.buf).hexdigest(), + hashlib.sha1().update(self.buf).hexdigest(), + hashlib.sha256().update(self.buf).hexdigest(), + ) + # pre-compute these because we'll yield them at *every* scope. 
self.global_features: List[Tuple[Feature, Address]] = [] self.global_features.extend(capa.features.extractors.viv.file.extract_file_format(self.buf)) @@ -42,6 +49,9 @@ def get_base_address(self): # assume there is only one file loaded into the vw return AbsoluteVirtualAddress(list(self.vw.filemeta.values())[0]["imagebase"]) + def get_sample_hashes(self) -> Tuple[str, str, str]: + return self.sample_hashes + def extract_global_features(self): yield from self.global_features diff --git a/capa/main.py b/capa/main.py index 118069567..83a7a453e 100644 --- a/capa/main.py +++ b/capa/main.py @@ -1022,7 +1022,7 @@ def compute_dynamic_layout(rules, extractor, capabilities) -> rdoc.Layout: matched_threads = set() for rule_name, matches in capabilities.items(): rule = rules[rule_name] - if capa.rules.BASIC_BLOCK_SCOPE in rule.meta.get("scopes")["dynamic"]: + if capa.rules.THREAD_SCOPE in rule.meta.get("scopes")["dynamic"]: for addr, _ in matches: assert addr in processes_by_thread matched_threads.add(addr) diff --git a/capa/render/proto/__init__.py b/capa/render/proto/__init__.py index b55622a9c..9ed62c124 100644 --- a/capa/render/proto/__init__.py +++ b/capa/render/proto/__init__.py @@ -122,6 +122,7 @@ def scope_to_pb2(scope: capa.rules.Scope) -> capa_pb2.Scope.ValueType: def metadata_to_pb2(meta: rd.Metadata) -> capa_pb2.Metadata: + assert isinstance(meta.analysis, rd.StaticAnalysis) return capa_pb2.Metadata( timestamp=str(meta.timestamp), version=meta.version, diff --git a/capa/render/vverbose.py b/capa/render/vverbose.py index db04ce745..b21499662 100644 --- a/capa/render/vverbose.py +++ b/capa/render/vverbose.py @@ -259,7 +259,8 @@ def render_rules(ostream, doc: rd.ResultDocument): check for OutputDebugString error namespace anti-analysis/anti-debugging/debugger-detection author michael.hunhoff@mandiant.com - scope function + static scope: function + dynamic scope: process mbc Anti-Behavioral Analysis::Detect Debugger::OutputDebugString function @ 0x10004706 and: @@ -268,14 +269,24 @@ def render_rules(ostream, doc: rd.ResultDocument): api: kernel32.OutputDebugString @ 0x10004767, 0x10004787, 0x10004816, 0x10004895 """ - assert isinstance(doc.meta.analysis, rd.StaticAnalysis) functions_by_bb: Dict[capa.features.address.Address, capa.features.address.Address] = {} - for finfo in doc.meta.analysis.layout.functions: - faddress = finfo.address.to_capa() - - for bb in finfo.matched_basic_blocks: - bbaddress = bb.address.to_capa() - functions_by_bb[bbaddress] = faddress + processes_by_thread: Dict[capa.features.address.Address, capa.features.address.Address] = {} + if isinstance(doc.meta.analysis, rd.StaticAnalysis): + for finfo in doc.meta.analysis.layout.functions: + faddress = finfo.address.to_capa() + + for bb in finfo.matched_basic_blocks: + bbaddress = bb.address.to_capa() + functions_by_bb[bbaddress] = faddress + elif isinstance(doc.meta.analysis, rd.DynamicAnalysis): + for pinfo in doc.meta.analysis.layout.processes: + paddress = pinfo.address.to_capa() + + for thread in pinfo.matched_threads: + taddress = thread.address.to_capa() + processes_by_thread[taddress] = paddress + else: + raise ValueError("invalid analysis field in the document's meta") had_match = False @@ -324,7 +335,11 @@ def render_rules(ostream, doc: rd.ResultDocument): rows.append(("author", ", ".join(rule.meta.authors))) - rows.append(("scopes", str(rule.meta.scopes))) + if rule.meta.scopes.static: + rows.append(("static scope:", str(rule.meta.scopes.static))) + + if rule.meta.scopes.dynamic: + rows.append(("dynamic scope:", 
str(rule.meta.scopes.dynamic))) if rule.meta.attack: rows.append(("att&ck", ", ".join([rutils.format_parts_id(v) for v in rule.meta.attack]))) @@ -352,7 +367,8 @@ def render_rules(ostream, doc: rd.ResultDocument): render_match(ostream, first_match, indent=0) else: for location, match in sorted(doc.rules[rule.meta.name].matches): - ostream.write(rule.meta.scopes) + ostream.write(f"static scope: {rule.meta.scopes.static}") + ostream.write(f"dynamic scope: {rule.meta.scopes.dynamic}") ostream.write(" @ ") ostream.write(capa.render.verbose.format_address(location)) @@ -362,6 +378,14 @@ def render_rules(ostream, doc: rd.ResultDocument): + capa.render.verbose.format_address(frz.Address.from_capa(functions_by_bb[location.to_capa()])) ) + if capa.rules.THREAD_SCOPE in rule.meta.scopes: + ostream.write( + " in process " + + capa.render.verbose.format_address( + frz.Address.from_capa(processes_by_thread[location.to_capa()]) + ) + ) + ostream.write("\n") render_match(ostream, match, indent=1) if rule.meta.lib: From e5d7903475e550b72adc11058ad4b4f491f165ab Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Tue, 18 Jul 2023 20:38:54 +0100 Subject: [PATCH 04/37] add removed tests --- capa/features/extractors/base_extractor.py | 40 +++ capa/features/extractors/cape/extractor.py | 10 +- capa/features/extractors/dnfile/extractor.py | 13 +- capa/features/extractors/viv/extractor.py | 24 +- capa/ida/plugin/model.py | 10 +- capa/main.py | 36 +-- tests/_test_proto.py | 1 + tests/test_main.py | 2 +- tests/{_test_render.py => test_render.py} | 0 tests/test_result_document.py | 286 +++++++++++++++++++ 10 files changed, 368 insertions(+), 54 deletions(-) rename tests/{_test_render.py => test_render.py} (100%) create mode 100644 tests/test_result_document.py diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index 836e72160..d381ac2c6 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -7,6 +7,7 @@ # See the License for the specific language governing permissions and limitations under the License. import abc +import hashlib import dataclasses from typing import Any, Dict, Tuple, Union, Iterator from dataclasses import dataclass @@ -24,6 +25,29 @@ # the feature extractor from which they were created. +@dataclass +class SampleHashes: + md5: str + sha1: str + sha256: str + + def __iter__(self) -> Iterator[str]: + yield self.md5 + yield self.sha1 + yield self.sha256 + + @classmethod + def from_sample(cls, buf) -> "SampleHashes": + md5 = hashlib.md5() + sha1 = hashlib.sha1() + sha256 = hashlib.sha256() + md5.update(buf) + sha1.update(buf) + sha256.update(buf) + + return cls(md5=md5.hexdigest(), sha1=sha1.hexdigest(), sha256=sha256.hexdigest()) + + @dataclass class FunctionHandle: """reference to a function recognized by a feature extractor. @@ -104,6 +128,14 @@ def get_base_address(self) -> Union[AbsoluteVirtualAddress, capa.features.addres """ raise NotImplementedError() + def get_sample_hashes(self) -> Tuple[str, str, str]: + """ + fetch the hashes for the sample contained within the extractor. + + the order of the hashes is: md5, sha1, sha256 + """ + raise NotImplementedError() + @abc.abstractmethod def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]: """ @@ -309,6 +341,14 @@ class DynamicFeatureExtractor: This class is not instantiated directly; it is the base class for other implementations. 
""" + def get_sample_hashes(self) -> Tuple[str, str, str]: + """ + fetch the hashes for the sample contained within the extractor. + + the order of the hashes is: md5, sha1, sha256 + """ + raise NotImplementedError() + @abc.abstractmethod def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]: """ diff --git a/capa/features/extractors/cape/extractor.py b/capa/features/extractors/cape/extractor.py index 48bf2a577..21686a37e 100644 --- a/capa/features/extractors/cape/extractor.py +++ b/capa/features/extractors/cape/extractor.py @@ -14,7 +14,7 @@ import capa.features.extractors.cape.process from capa.features.common import Feature from capa.features.address import Address, AbsoluteVirtualAddress, _NoAddress -from capa.features.extractors.base_extractor import ThreadHandle, ProcessHandle, DynamicFeatureExtractor +from capa.features.extractors.base_extractor import SampleHashes, ThreadHandle, ProcessHandle, DynamicFeatureExtractor logger = logging.getLogger(__name__) @@ -27,6 +27,11 @@ def __init__(self, cape_version: str, static: Dict, behavior: Dict): self.cape_version = cape_version self.static = static self.behavior = behavior + self.hashes = SampleHashes( + md5=static["file"]["md5"], + sha1=static["file"]["sha1"], + sha256=static["file"]["sha256"], + ) self.global_features = capa.features.extractors.cape.global_.extract_features(self.static) @@ -34,6 +39,9 @@ def get_base_address(self) -> Union[AbsoluteVirtualAddress, _NoAddress, None]: # value according to the PE header, the actual trace may use a different imagebase return AbsoluteVirtualAddress(self.static["pe"]["imagebase"]) + def get_sample_hashes(self): + return tuple(self.hashes) + def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]: yield from self.global_features diff --git a/capa/features/extractors/dnfile/extractor.py b/capa/features/extractors/dnfile/extractor.py index e5d03462e..fe6a69a97 100644 --- a/capa/features/extractors/dnfile/extractor.py +++ b/capa/features/extractors/dnfile/extractor.py @@ -21,7 +21,13 @@ from capa.features.common import Feature from capa.features.address import NO_ADDRESS, Address, DNTokenAddress, DNTokenOffsetAddress from capa.features.extractors.dnfile.types import DnType, DnUnmanagedMethod -from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, StaticFeatureExtractor +from capa.features.extractors.base_extractor import ( + BBHandle, + InsnHandle, + SampleHashes, + FunctionHandle, + StaticFeatureExtractor, +) from capa.features.extractors.dnfile.helpers import ( get_dotnet_types, get_dotnet_fields, @@ -71,6 +77,8 @@ class DnfileFeatureExtractor(StaticFeatureExtractor): def __init__(self, path: str): super().__init__() self.pe: dnfile.dnPE = dnfile.dnPE(path) + with open(path, "rb") as f: + self.sample_hashes = SampleHashes.from_sample(f.read()) # pre-compute .NET token lookup tables; each .NET method has access to this cache for feature extraction # most relevant at instruction scope @@ -85,6 +93,9 @@ def __init__(self, path: str): def get_base_address(self): return NO_ADDRESS + def get_sample_hashes(self): + return tuple(self.sample_hashes) + def extract_global_features(self): yield from self.global_features diff --git a/capa/features/extractors/viv/extractor.py b/capa/features/extractors/viv/extractor.py index c9c3a1dba..66d244f5a 100644 --- a/capa/features/extractors/viv/extractor.py +++ b/capa/features/extractors/viv/extractor.py @@ -5,7 +5,6 @@ # Unless required by applicable law or agreed to in writing, software distributed 
under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. -import hashlib import logging from typing import Any, Dict, List, Tuple, Iterator @@ -20,24 +19,25 @@ import capa.features.extractors.viv.basicblock from capa.features.common import Feature from capa.features.address import Address, AbsoluteVirtualAddress -from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, FeatureExtractor +from capa.features.extractors.base_extractor import ( + BBHandle, + InsnHandle, + SampleHashes, + FunctionHandle, + StaticFeatureExtractor, +) logger = logging.getLogger(__name__) -class VivisectFeatureExtractor(FeatureExtractor): +class VivisectFeatureExtractor(StaticFeatureExtractor): def __init__(self, vw, path, os): super().__init__() self.vw = vw self.path = path - with open(self.path, "rb") as f: + with open(path, "rb") as f: self.buf = f.read() - - self.sample_hashes = ( - hashlib.md5().update(self.buf).hexdigest(), - hashlib.sha1().update(self.buf).hexdigest(), - hashlib.sha256().update(self.buf).hexdigest(), - ) + self.sample_hashes = SampleHashes.from_sample(self.buf) # pre-compute these because we'll yield them at *every* scope. self.global_features: List[Tuple[Feature, Address]] = [] @@ -49,8 +49,8 @@ def get_base_address(self): # assume there is only one file loaded into the vw return AbsoluteVirtualAddress(list(self.vw.filemeta.values())[0]["imagebase"]) - def get_sample_hashes(self) -> Tuple[str, str, str]: - return self.sample_hashes + def get_sample_hashes(self): + return tuple(self.sample_hashes) def extract_global_features(self): yield from self.global_features diff --git a/capa/ida/plugin/model.py b/capa/ida/plugin/model.py index 547d5349f..87dd70810 100644 --- a/capa/ida/plugin/model.py +++ b/capa/ida/plugin/model.py @@ -500,16 +500,16 @@ def render_capa_doc_by_program(self, doc: rd.ResultDocument): location = location_.to_capa() parent2: CapaExplorerDataItem - if rule.meta.scope == capa.rules.FILE_SCOPE: + if capa.rules.FILE_SCOPE in rule.meta.scopes: parent2 = parent - elif rule.meta.scope == capa.rules.FUNCTION_SCOPE: + elif capa.rules.FUNCTION_SCOPE in rule.meta.scopes: parent2 = CapaExplorerFunctionItem(parent, location) - elif rule.meta.scope == capa.rules.BASIC_BLOCK_SCOPE: + elif capa.rules.BASIC_BLOCK_SCOPE in rule.meta.scopes: parent2 = CapaExplorerBlockItem(parent, location) - elif rule.meta.scope == capa.rules.INSTRUCTION_SCOPE: + elif capa.rules.INSTRUCTION_SCOPE in rule.meta.scopes: parent2 = CapaExplorerInstructionItem(parent, location) else: - raise RuntimeError("unexpected rule scope: " + str(rule.meta.scope)) + raise RuntimeError("unexpected rule scope: " + str(rule.meta.scopes.static)) self.render_capa_doc_match(parent2, match, doc) diff --git a/capa/main.py b/capa/main.py index 83a7a453e..2c404d370 100644 --- a/capa/main.py +++ b/capa/main.py @@ -13,7 +13,6 @@ import sys import json import time -import hashlib import logging import os.path import argparse @@ -263,7 +262,7 @@ def find_static_capabilities( all_bb_matches = collections.defaultdict(list) # type: MatchResults all_insn_matches = collections.defaultdict(list) # type: MatchResults - feature_counts = rdoc.FeatureCounts(file=0, functions=()) + feature_counts = rdoc.StaticFeatureCounts(file=0, functions=()) library_functions: Tuple[rdoc.LibraryFunction, ...] 
= () assert isinstance(extractor, StaticFeatureExtractor) @@ -894,37 +893,6 @@ def get_signatures(sigs_path): return paths -def get_sample_hashes(sample_path, extractor: FeatureExtractor) -> Tuple[str, str, str]: - if isinstance(extractor, StaticFeatureExtractor): - md5_ = hashlib.md5() - sha1_ = hashlib.sha1() - sha256_ = hashlib.sha256() - - with open(sample_path, "rb") as f: - buf = f.read() - - md5_.update(buf) - sha1_.update(buf) - sha256_.update(buf) - - md5, sha1, sha256 = md5_.hexdigest(), sha1_.hexdigest(), sha256_.hexdigest() - elif isinstance(extractor, DynamicFeatureExtractor): - import json - - if isinstance(extractor, capa.features.extractors.cape.extractor.CapeExtractor): - with open(sample_path, "rb") as f: - report = json.load(f) - md5 = report["target"]["file"]["md5"] - sha1 = report["target"]["file"]["sha1"] - sha256 = report["target"]["file"]["sha256"] - else: - md5, sha1, sha256 = "0", "0", "0" - else: - raise ValueError("invalid extractor") - - return md5, sha1, sha256 - - def get_sample_analysis(format_, arch, os_, extractor, rules_path, counts): if isinstance(extractor, StaticFeatureExtractor): return rdoc.StaticAnalysis( @@ -971,7 +939,7 @@ def collect_metadata( ) -> rdoc.Metadata: # if it's a binary sample we hash it, if it's a report # we fetch the hashes from the report - md5, sha1, sha256 = get_sample_hashes(sample_path, extractor) + md5, sha1, sha256 = extractor.get_sample_hashes() if rules_path != [RULES_PATH_DEFAULT_STRING]: rules_path = [os.path.abspath(os.path.normpath(r)) for r in rules_path] diff --git a/tests/_test_proto.py b/tests/_test_proto.py index 8a76ccfc2..f45282b70 100644 --- a/tests/_test_proto.py +++ b/tests/_test_proto.py @@ -130,6 +130,7 @@ def cmp_optional(a: Any, b: Any) -> bool: def assert_meta(meta: rd.Metadata, dst: capa_pb2.Metadata): + assert isinstance(rd.Metadata.analysis, rd.StaticAnalysis) assert str(meta.timestamp) == dst.timestamp assert meta.version == dst.version if meta.argv is None: diff --git a/tests/test_main.py b/tests/test_main.py index a84c6f54c..673a50176 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -541,7 +541,7 @@ def test_main_dotnet4(_039a6_dotnetfile_extractor): assert capa.main.main([path, "-vv"]) == 0 -@pytest.mark.xfail(reason="ResultDocument hasn't been updated yet") +@pytest.mark.xfail(reason="relies on the legeacy ruleset. scopes keyword hasn't been added there") def test_main_rd(): path = fixtures.get_data_path_by_name("pma01-01-rd") assert capa.main.main([path, "-vv"]) == 0 diff --git a/tests/_test_render.py b/tests/test_render.py similarity index 100% rename from tests/_test_render.py rename to tests/test_render.py diff --git a/tests/test_result_document.py b/tests/test_result_document.py new file mode 100644 index 000000000..161628ffa --- /dev/null +++ b/tests/test_result_document.py @@ -0,0 +1,286 @@ +# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+import copy + +import pytest +import fixtures +from fixtures import a3f3bbc_rd, a076114_rd, pma0101_rd, al_khaserx64_rd, al_khaserx86_rd, dotnet_1c444e_rd + +import capa +import capa.engine as ceng +import capa.render.result_document as rdoc +import capa.features.freeze.features as frzf + + +def test_optional_node_from_capa(): + node = rdoc.node_from_capa( + ceng.Some( + 0, + [], + ) + ) + assert isinstance(node, rdoc.StatementNode) + assert isinstance(node.statement, rdoc.CompoundStatement) + assert node.statement.type == rdoc.CompoundStatementType.OPTIONAL + + +def test_some_node_from_capa(): + node = rdoc.node_from_capa( + ceng.Some( + 1, + [ + capa.features.insn.Number(0), + ], + ) + ) + assert isinstance(node, rdoc.StatementNode) + assert isinstance(node.statement, rdoc.SomeStatement) + + +def test_range_node_from_capa(): + node = rdoc.node_from_capa( + ceng.Range( + capa.features.insn.Number(0), + ) + ) + assert isinstance(node, rdoc.StatementNode) + assert isinstance(node.statement, rdoc.RangeStatement) + + +def test_subscope_node_from_capa(): + node = rdoc.node_from_capa( + ceng.Subscope( + capa.rules.Scope.BASIC_BLOCK, + capa.features.insn.Number(0), + ) + ) + assert isinstance(node, rdoc.StatementNode) + assert isinstance(node.statement, rdoc.SubscopeStatement) + + +def test_and_node_from_capa(): + node = rdoc.node_from_capa( + ceng.And( + [ + capa.features.insn.Number(0), + ], + ) + ) + assert isinstance(node, rdoc.StatementNode) + assert isinstance(node.statement, rdoc.CompoundStatement) + assert node.statement.type == rdoc.CompoundStatementType.AND + + +def test_or_node_from_capa(): + node = rdoc.node_from_capa( + ceng.Or( + [ + capa.features.insn.Number(0), + ], + ) + ) + assert isinstance(node, rdoc.StatementNode) + assert isinstance(node.statement, rdoc.CompoundStatement) + assert node.statement.type == rdoc.CompoundStatementType.OR + + +def test_not_node_from_capa(): + node = rdoc.node_from_capa( + ceng.Not( + [ + capa.features.insn.Number(0), + ], + ) + ) + assert isinstance(node, rdoc.StatementNode) + assert isinstance(node.statement, rdoc.CompoundStatement) + assert node.statement.type == rdoc.CompoundStatementType.NOT + + +def test_os_node_from_capa(): + node = rdoc.node_from_capa(capa.features.common.OS("")) + assert isinstance(node, rdoc.FeatureNode) + assert isinstance(node.feature, frzf.OSFeature) + + +def test_arch_node_from_capa(): + node = rdoc.node_from_capa(capa.features.common.Arch("")) + assert isinstance(node, rdoc.FeatureNode) + assert isinstance(node.feature, frzf.ArchFeature) + + +def test_format_node_from_capa(): + node = rdoc.node_from_capa(capa.features.common.Format("")) + assert isinstance(node, rdoc.FeatureNode) + assert isinstance(node.feature, frzf.FormatFeature) + + +def test_match_node_from_capa(): + node = rdoc.node_from_capa(capa.features.common.MatchedRule("")) + assert isinstance(node, rdoc.FeatureNode) + assert isinstance(node.feature, frzf.MatchFeature) + + +def test_characteristic_node_from_capa(): + node = rdoc.node_from_capa(capa.features.common.Characteristic("")) + assert isinstance(node, rdoc.FeatureNode) + assert isinstance(node.feature, frzf.CharacteristicFeature) + + +def test_substring_node_from_capa(): + node = rdoc.node_from_capa(capa.features.common.Substring("")) + assert isinstance(node, rdoc.FeatureNode) + assert isinstance(node.feature, frzf.SubstringFeature) + + +def test_regex_node_from_capa(): + node = rdoc.node_from_capa(capa.features.common.Regex("")) + assert isinstance(node, rdoc.FeatureNode) + assert 
isinstance(node.feature, frzf.RegexFeature) + + +def test_class_node_from_capa(): + node = rdoc.node_from_capa(capa.features.common.Class("")) + assert isinstance(node, rdoc.FeatureNode) + assert isinstance(node.feature, frzf.ClassFeature) + + +def test_namespace_node_from_capa(): + node = rdoc.node_from_capa(capa.features.common.Namespace("")) + assert isinstance(node, rdoc.FeatureNode) + assert isinstance(node.feature, frzf.NamespaceFeature) + + +def test_bytes_node_from_capa(): + node = rdoc.node_from_capa(capa.features.common.Bytes(b"")) + assert isinstance(node, rdoc.FeatureNode) + assert isinstance(node.feature, frzf.BytesFeature) + + +def test_export_node_from_capa(): + node = rdoc.node_from_capa(capa.features.file.Export("")) + assert isinstance(node, rdoc.FeatureNode) + assert isinstance(node.feature, frzf.ExportFeature) + + +def test_import_node_from_capa(): + node = rdoc.node_from_capa(capa.features.file.Import("")) + assert isinstance(node, rdoc.FeatureNode) + assert isinstance(node.feature, frzf.ImportFeature) + + +def test_section_node_from_capa(): + node = rdoc.node_from_capa(capa.features.file.Section("")) + assert isinstance(node, rdoc.FeatureNode) + assert isinstance(node.feature, frzf.SectionFeature) + + +def test_function_name_node_from_capa(): + node = rdoc.node_from_capa(capa.features.file.FunctionName("")) + assert isinstance(node, rdoc.FeatureNode) + assert isinstance(node.feature, frzf.FunctionNameFeature) + + +def test_api_node_from_capa(): + node = rdoc.node_from_capa(capa.features.insn.API("")) + assert isinstance(node, rdoc.FeatureNode) + assert isinstance(node.feature, frzf.APIFeature) + + +def test_property_node_from_capa(): + node = rdoc.node_from_capa(capa.features.insn.Property("")) + assert isinstance(node, rdoc.FeatureNode) + assert isinstance(node.feature, frzf.PropertyFeature) + + +def test_number_node_from_capa(): + node = rdoc.node_from_capa(capa.features.insn.Number(0)) + assert isinstance(node, rdoc.FeatureNode) + assert isinstance(node.feature, frzf.NumberFeature) + + +def test_offset_node_from_capa(): + node = rdoc.node_from_capa(capa.features.insn.Offset(0)) + assert isinstance(node, rdoc.FeatureNode) + assert isinstance(node.feature, frzf.OffsetFeature) + + +def test_mnemonic_node_from_capa(): + node = rdoc.node_from_capa(capa.features.insn.Mnemonic("")) + assert isinstance(node, rdoc.FeatureNode) + assert isinstance(node.feature, frzf.MnemonicFeature) + + +def test_operand_number_node_from_capa(): + node = rdoc.node_from_capa(capa.features.insn.OperandNumber(0, 0)) + assert isinstance(node, rdoc.FeatureNode) + assert isinstance(node.feature, frzf.OperandNumberFeature) + + +def test_operand_offset_node_from_capa(): + node = rdoc.node_from_capa(capa.features.insn.OperandOffset(0, 0)) + assert isinstance(node, rdoc.FeatureNode) + assert isinstance(node.feature, frzf.OperandOffsetFeature) + + +def test_basic_block_node_from_capa(): + node = rdoc.node_from_capa(capa.features.basicblock.BasicBlock("")) + assert isinstance(node, rdoc.FeatureNode) + assert isinstance(node.feature, frzf.BasicBlockFeature) + + +def assert_round_trip(rd: rdoc.ResultDocument): + one = rd + + doc = one.json(exclude_none=True) + two = rdoc.ResultDocument.parse_raw(doc) + + # show the round trip works + # first by comparing the objects directly, + # which works thanks to pydantic model equality. + assert one == two + # second by showing their json representations are the same. 
+ assert one.json(exclude_none=True) == two.json(exclude_none=True) + + # now show that two different versions are not equal. + three = copy.deepcopy(two) + three.meta.__dict__.update({"version": "0.0.0"}) + assert one.meta.version != three.meta.version + assert one != three + assert one.json(exclude_none=True) != three.json(exclude_none=True) + + +@pytest.mark.parametrize( + "rd_file", + [ + pytest.param("a3f3bbc_rd"), + pytest.param("al_khaserx86_rd"), + pytest.param("al_khaserx64_rd"), + pytest.param("a076114_rd"), + pytest.param("pma0101_rd"), + pytest.param("dotnet_1c444e_rd"), + ], +) +def test_round_trip(request, rd_file): + rd: rdoc.ResultDocument = request.getfixturevalue(rd_file) + assert_round_trip(rd) + + +def test_json_to_rdoc(): + path = fixtures.get_data_path_by_name("pma01-01-rd") + assert isinstance(rdoc.ResultDocument.parse_file(path), rdoc.ResultDocument) + + +def test_rdoc_to_capa(): + path = fixtures.get_data_path_by_name("pma01-01-rd") + + rd = rdoc.ResultDocument.parse_file(path) + + meta, capabilites = rd.to_capa() + assert isinstance(meta, rdoc.Metadata) + assert isinstance(capabilites, dict) From 4e4b1235c3d8a92c609d8fc919487a27c22986fa Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Tue, 18 Jul 2023 21:04:51 +0100 Subject: [PATCH 05/37] mypy.ini: ignore proto issues --- .github/mypy/mypy.ini | 6 ++++++ tests/_test_proto.py | 1 + 2 files changed, 7 insertions(+) diff --git a/.github/mypy/mypy.ini b/.github/mypy/mypy.ini index 603f2e42f..81614afe1 100644 --- a/.github/mypy/mypy.ini +++ b/.github/mypy/mypy.ini @@ -1,5 +1,11 @@ [mypy] +exclude = (?x)( + ^capa/render/proto/__init__.py$ + | ^tests/_test_proto.py$ + | ^capa/ida/helpers.py$ + ) + [mypy-halo.*] ignore_missing_imports = True diff --git a/tests/_test_proto.py b/tests/_test_proto.py index f45282b70..412db8f66 100644 --- a/tests/_test_proto.py +++ b/tests/_test_proto.py @@ -150,6 +150,7 @@ def assert_meta(meta: rd.Metadata, dst: capa_pb2.Metadata): assert list(meta.analysis.rules) == dst.analysis.rules assert capa.render.proto.addr_to_pb2(meta.analysis.base_address) == dst.analysis.base_address + assert isinstance(rd.Metadata.analysis.layout, rd.StaticLayout) assert len(meta.analysis.layout.functions) == len(dst.analysis.layout.functions) for rd_f, proto_f in zip(meta.analysis.layout.functions, dst.analysis.layout.functions): assert capa.render.proto.addr_to_pb2(rd_f.address) == proto_f.address From c5d08ec0d1ea68e716c0110eb95347e32cb95044 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Wed, 19 Jul 2023 14:00:00 +0100 Subject: [PATCH 06/37] update extractors and tests --- capa/features/extractors/binja/extractor.py | 13 ++++++++++++- capa/features/extractors/ida/extractor.py | 13 ++++++++++++- tests/test_main.py | 1 + tests/test_result_document.py | 10 +++++----- 4 files changed, 30 insertions(+), 7 deletions(-) diff --git a/capa/features/extractors/binja/extractor.py b/capa/features/extractors/binja/extractor.py index e4ca1d8dd..e8b30db93 100644 --- a/capa/features/extractors/binja/extractor.py +++ b/capa/features/extractors/binja/extractor.py @@ -17,7 +17,13 @@ import capa.features.extractors.binja.basicblock from capa.features.common import Feature from capa.features.address import Address, AbsoluteVirtualAddress -from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, StaticFeatureExtractor +from capa.features.extractors.base_extractor import ( + BBHandle, + InsnHandle, + SampleHashes, + FunctionHandle, + StaticFeatureExtractor, +) class 
BinjaFeatureExtractor(StaticFeatureExtractor): @@ -28,10 +34,15 @@ def __init__(self, bv: binja.BinaryView): self.global_features.extend(capa.features.extractors.binja.file.extract_file_format(self.bv)) self.global_features.extend(capa.features.extractors.binja.global_.extract_os(self.bv)) self.global_features.extend(capa.features.extractors.binja.global_.extract_arch(self.bv)) + with open(self.bv, "rb") as f: + self.sample_hashes = SampleHashes.from_sample(f.read()) def get_base_address(self): return AbsoluteVirtualAddress(self.bv.start) + def get_sample_hashes(self): + return tuple(self.sample_hashes) + def extract_global_features(self): yield from self.global_features diff --git a/capa/features/extractors/ida/extractor.py b/capa/features/extractors/ida/extractor.py index 2fe20ba72..63c396263 100644 --- a/capa/features/extractors/ida/extractor.py +++ b/capa/features/extractors/ida/extractor.py @@ -18,7 +18,13 @@ import capa.features.extractors.ida.basicblock from capa.features.common import Feature from capa.features.address import Address, AbsoluteVirtualAddress -from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, StaticFeatureExtractor +from capa.features.extractors.base_extractor import ( + BBHandle, + InsnHandle, + SampleHashes, + FunctionHandle, + StaticFeatureExtractor, +) class IdaFeatureExtractor(StaticFeatureExtractor): @@ -28,10 +34,15 @@ def __init__(self): self.global_features.extend(capa.features.extractors.ida.file.extract_file_format()) self.global_features.extend(capa.features.extractors.ida.global_.extract_os()) self.global_features.extend(capa.features.extractors.ida.global_.extract_arch()) + with open(idaapi.get_input_file_path, "rb") as f: + self.sample_hashes = SampleHashes(f.read()) def get_base_address(self): return AbsoluteVirtualAddress(idaapi.get_imagebase()) + def get_sample_hashes(self): + return self.sample_hashes + def extract_global_features(self): yield from self.global_features diff --git a/tests/test_main.py b/tests/test_main.py index 673a50176..6087934a4 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -71,6 +71,7 @@ def test_main_single_rule(z9324d_extractor, tmpdir): ) +@pytest.mark.xfail(reason="relies on the legeacy ruleset. 
scopes keyword hasn't been added there") def test_main_non_ascii_filename(pingtaest_extractor, tmpdir, capsys): # here we print a string with unicode characters in it # (specifically, a byte string with utf-8 bytes in it, see file encoding) diff --git a/tests/test_result_document.py b/tests/test_result_document.py index 161628ffa..421e199b9 100644 --- a/tests/test_result_document.py +++ b/tests/test_result_document.py @@ -258,12 +258,12 @@ def assert_round_trip(rd: rdoc.ResultDocument): @pytest.mark.parametrize( "rd_file", [ - pytest.param("a3f3bbc_rd"), - pytest.param("al_khaserx86_rd"), - pytest.param("al_khaserx64_rd"), - pytest.param("a076114_rd"), + pytest.param("a3f3bbc_rd", marks=pytest.mark.xfail(reason="document needs to be updated to the final scopes syntax once that's added")), + pytest.param("al_khaserx86_rd", marks=pytest.mark.xfail(reason="document needs to be updated to the final scopes syntax once that's added")), + pytest.param("al_khaserx64_rd", marks=pytest.mark.xfail(reason="document needs to be updated to the final scopes syntax once that's added")), + pytest.param("a076114_rd", marks=pytest.mark.xfail(reason="document needs to be updated to the final scopes syntax once that's added")), pytest.param("pma0101_rd"), - pytest.param("dotnet_1c444e_rd"), + pytest.param("dotnet_1c444e_rd", marks=pytest.mark.xfail(reason="document needs to be updated to the final scopes syntax once that's added")), ], ) def test_round_trip(request, rd_file): From 7de223f116932caa2b78fd6e437f50ff06c7d2ea Mon Sep 17 00:00:00 2001 From: yelhamer <16624109+yelhamer@users.noreply.github.com> Date: Wed, 19 Jul 2023 15:39:06 +0100 Subject: [PATCH 07/37] Update capa/features/extractors/ida/extractor.py: add call to get_input_file_path() Co-authored-by: Willi Ballenthin --- capa/features/extractors/ida/extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/features/extractors/ida/extractor.py b/capa/features/extractors/ida/extractor.py index 63c396263..d45f0860c 100644 --- a/capa/features/extractors/ida/extractor.py +++ b/capa/features/extractors/ida/extractor.py @@ -34,7 +34,7 @@ def __init__(self): self.global_features.extend(capa.features.extractors.ida.file.extract_file_format()) self.global_features.extend(capa.features.extractors.ida.global_.extract_os()) self.global_features.extend(capa.features.extractors.ida.global_.extract_arch()) - with open(idaapi.get_input_file_path, "rb") as f: + with open(idaapi.get_input_file_path(), "rb") as f: self.sample_hashes = SampleHashes(f.read()) def get_base_address(self): From 8ac9caf45c3a4ef0655646ec983a0939cd8994d0 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Thu, 20 Jul 2023 20:20:33 +0100 Subject: [PATCH 08/37] fix bugs --- capa/main.py | 4 ++-- tests/test_result_document.py | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/capa/main.py b/capa/main.py index 9119f7831..d2dbe4c45 100644 --- a/capa/main.py +++ b/capa/main.py @@ -20,7 +20,7 @@ import itertools import contextlib import collections -from typing import Any, Dict, List, Tuple, Callable +from typing import Any, Dict, List, Tuple, Callable, Optional from pathlib import Path import halo @@ -961,7 +961,7 @@ def collect_metadata( arch, os_, extractor, - rules_path, + rules, counts, ), ) diff --git a/tests/test_result_document.py b/tests/test_result_document.py index 8dc110305..e894a00b5 100644 --- a/tests/test_result_document.py +++ b/tests/test_result_document.py @@ -281,16 +281,19 @@ def assert_round_trip(rd: rdoc.ResultDocument): ), ], ) 
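+# note: xfail (rather than skip) keeps these tests running, so pytest reports an unexpected
+# pass once the fixture documents are regenerated with the new scopes syntax.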
+@pytest.mark.xfail(reason="samples haven't been modified to the scopes keyword") def test_round_trip(request, rd_file): rd: rdoc.ResultDocument = request.getfixturevalue(rd_file) assert_round_trip(rd) +@pytest.mark.xfail(reason="samples haven't been modified to the scopes keyword") def test_json_to_rdoc(): path = fixtures.get_data_path_by_name("pma01-01-rd") assert isinstance(rdoc.ResultDocument.parse_file(path), rdoc.ResultDocument) +@pytest.mark.xfail(reason="samples haven't been modified to the scopes keyword") def test_rdoc_to_capa(): path = fixtures.get_data_path_by_name("pma01-01-rd") From 0a4fe58ac62e80e63b7f5b195bf7176416628d65 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Thu, 20 Jul 2023 20:25:11 +0100 Subject: [PATCH 09/37] fix tests --- capa/features/extractors/ida/extractor.py | 2 +- tests/test_binja_features.py | 2 +- tests/test_result_document.py | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/capa/features/extractors/ida/extractor.py b/capa/features/extractors/ida/extractor.py index bd1ed62ed..3f215f05c 100644 --- a/capa/features/extractors/ida/extractor.py +++ b/capa/features/extractors/ida/extractor.py @@ -35,7 +35,7 @@ def __init__(self): self.global_features.extend(capa.features.extractors.ida.global_.extract_os()) self.global_features.extend(capa.features.extractors.ida.global_.extract_arch()) with open(idaapi.get_input_file_path(), "rb") as f: - self.sample_hashes = SampleHashes(f.read()) + self.sample_hashes = SampleHashes.from_sample(f.read()) def get_base_address(self): return AbsoluteVirtualAddress(idaapi.get_imagebase()) diff --git a/tests/test_binja_features.py b/tests/test_binja_features.py index 4daaa7901..4397cf823 100644 --- a/tests/test_binja_features.py +++ b/tests/test_binja_features.py @@ -62,7 +62,7 @@ def test_binja_feature_counts(sample, scope, feature, expected): fixtures.do_test_feature_count(fixtures.get_binja_extractor, sample, scope, feature, expected) -@pytest.mark.skipif(binja_present is False, reason="Skip binja tests if the binaryninja Python API is not installed") +@pytest.mark.xfail(reason="relies on the legacy ruleset which hasn't been updated yet") def test_standalone_binja_backend(): CD = Path(__file__).resolve().parent test_path = CD / ".." / "tests" / "data" / "Practical Malware Analysis Lab 01-01.exe_" diff --git a/tests/test_result_document.py b/tests/test_result_document.py index e894a00b5..8e3090495 100644 --- a/tests/test_result_document.py +++ b/tests/test_result_document.py @@ -9,7 +9,6 @@ import pytest import fixtures -from fixtures import a3f3bbc_rd, a076114_rd, pma0101_rd, al_khaserx64_rd, al_khaserx86_rd, dotnet_1c444e_rd import capa import capa.engine as ceng From d99b16ed5e80c38249ae496f3991244808b41635 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Thu, 20 Jul 2023 21:41:16 +0100 Subject: [PATCH 10/37] add copyright and remove old test --- tests/_test_result_document.py | 285 --------------------------------- tests/test_result_document.py | 2 +- 2 files changed, 1 insertion(+), 286 deletions(-) delete mode 100644 tests/_test_result_document.py diff --git a/tests/_test_result_document.py b/tests/_test_result_document.py deleted file mode 100644 index 27a1dbb29..000000000 --- a/tests/_test_result_document.py +++ /dev/null @@ -1,285 +0,0 @@ -# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at: [package root]/LICENSE.txt -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and limitations under the License. -import copy - -import pytest -import fixtures - -import capa -import capa.engine as ceng -import capa.render.result_document as rdoc -import capa.features.freeze.features as frzf - - -def test_optional_node_from_capa(): - node = rdoc.node_from_capa( - ceng.Some( - 0, - [], - ) - ) - assert isinstance(node, rdoc.StatementNode) - assert isinstance(node.statement, rdoc.CompoundStatement) - assert node.statement.type == rdoc.CompoundStatementType.OPTIONAL - - -def test_some_node_from_capa(): - node = rdoc.node_from_capa( - ceng.Some( - 1, - [ - capa.features.insn.Number(0), - ], - ) - ) - assert isinstance(node, rdoc.StatementNode) - assert isinstance(node.statement, rdoc.SomeStatement) - - -def test_range_node_from_capa(): - node = rdoc.node_from_capa( - ceng.Range( - capa.features.insn.Number(0), - ) - ) - assert isinstance(node, rdoc.StatementNode) - assert isinstance(node.statement, rdoc.RangeStatement) - - -def test_subscope_node_from_capa(): - node = rdoc.node_from_capa( - ceng.Subscope( - capa.rules.Scope.BASIC_BLOCK, - capa.features.insn.Number(0), - ) - ) - assert isinstance(node, rdoc.StatementNode) - assert isinstance(node.statement, rdoc.SubscopeStatement) - - -def test_and_node_from_capa(): - node = rdoc.node_from_capa( - ceng.And( - [ - capa.features.insn.Number(0), - ], - ) - ) - assert isinstance(node, rdoc.StatementNode) - assert isinstance(node.statement, rdoc.CompoundStatement) - assert node.statement.type == rdoc.CompoundStatementType.AND - - -def test_or_node_from_capa(): - node = rdoc.node_from_capa( - ceng.Or( - [ - capa.features.insn.Number(0), - ], - ) - ) - assert isinstance(node, rdoc.StatementNode) - assert isinstance(node.statement, rdoc.CompoundStatement) - assert node.statement.type == rdoc.CompoundStatementType.OR - - -def test_not_node_from_capa(): - node = rdoc.node_from_capa( - ceng.Not( - [ - capa.features.insn.Number(0), - ], - ) - ) - assert isinstance(node, rdoc.StatementNode) - assert isinstance(node.statement, rdoc.CompoundStatement) - assert node.statement.type == rdoc.CompoundStatementType.NOT - - -def test_os_node_from_capa(): - node = rdoc.node_from_capa(capa.features.common.OS("")) - assert isinstance(node, rdoc.FeatureNode) - assert isinstance(node.feature, frzf.OSFeature) - - -def test_arch_node_from_capa(): - node = rdoc.node_from_capa(capa.features.common.Arch("")) - assert isinstance(node, rdoc.FeatureNode) - assert isinstance(node.feature, frzf.ArchFeature) - - -def test_format_node_from_capa(): - node = rdoc.node_from_capa(capa.features.common.Format("")) - assert isinstance(node, rdoc.FeatureNode) - assert isinstance(node.feature, frzf.FormatFeature) - - -def test_match_node_from_capa(): - node = rdoc.node_from_capa(capa.features.common.MatchedRule("")) - assert isinstance(node, rdoc.FeatureNode) - assert isinstance(node.feature, frzf.MatchFeature) - - -def test_characteristic_node_from_capa(): - node = rdoc.node_from_capa(capa.features.common.Characteristic("")) - assert isinstance(node, rdoc.FeatureNode) - assert isinstance(node.feature, frzf.CharacteristicFeature) - - -def test_substring_node_from_capa(): - node = 
rdoc.node_from_capa(capa.features.common.Substring("")) - assert isinstance(node, rdoc.FeatureNode) - assert isinstance(node.feature, frzf.SubstringFeature) - - -def test_regex_node_from_capa(): - node = rdoc.node_from_capa(capa.features.common.Regex("")) - assert isinstance(node, rdoc.FeatureNode) - assert isinstance(node.feature, frzf.RegexFeature) - - -def test_class_node_from_capa(): - node = rdoc.node_from_capa(capa.features.common.Class("")) - assert isinstance(node, rdoc.FeatureNode) - assert isinstance(node.feature, frzf.ClassFeature) - - -def test_namespace_node_from_capa(): - node = rdoc.node_from_capa(capa.features.common.Namespace("")) - assert isinstance(node, rdoc.FeatureNode) - assert isinstance(node.feature, frzf.NamespaceFeature) - - -def test_bytes_node_from_capa(): - node = rdoc.node_from_capa(capa.features.common.Bytes(b"")) - assert isinstance(node, rdoc.FeatureNode) - assert isinstance(node.feature, frzf.BytesFeature) - - -def test_export_node_from_capa(): - node = rdoc.node_from_capa(capa.features.file.Export("")) - assert isinstance(node, rdoc.FeatureNode) - assert isinstance(node.feature, frzf.ExportFeature) - - -def test_import_node_from_capa(): - node = rdoc.node_from_capa(capa.features.file.Import("")) - assert isinstance(node, rdoc.FeatureNode) - assert isinstance(node.feature, frzf.ImportFeature) - - -def test_section_node_from_capa(): - node = rdoc.node_from_capa(capa.features.file.Section("")) - assert isinstance(node, rdoc.FeatureNode) - assert isinstance(node.feature, frzf.SectionFeature) - - -def test_function_name_node_from_capa(): - node = rdoc.node_from_capa(capa.features.file.FunctionName("")) - assert isinstance(node, rdoc.FeatureNode) - assert isinstance(node.feature, frzf.FunctionNameFeature) - - -def test_api_node_from_capa(): - node = rdoc.node_from_capa(capa.features.insn.API("")) - assert isinstance(node, rdoc.FeatureNode) - assert isinstance(node.feature, frzf.APIFeature) - - -def test_property_node_from_capa(): - node = rdoc.node_from_capa(capa.features.insn.Property("")) - assert isinstance(node, rdoc.FeatureNode) - assert isinstance(node.feature, frzf.PropertyFeature) - - -def test_number_node_from_capa(): - node = rdoc.node_from_capa(capa.features.insn.Number(0)) - assert isinstance(node, rdoc.FeatureNode) - assert isinstance(node.feature, frzf.NumberFeature) - - -def test_offset_node_from_capa(): - node = rdoc.node_from_capa(capa.features.insn.Offset(0)) - assert isinstance(node, rdoc.FeatureNode) - assert isinstance(node.feature, frzf.OffsetFeature) - - -def test_mnemonic_node_from_capa(): - node = rdoc.node_from_capa(capa.features.insn.Mnemonic("")) - assert isinstance(node, rdoc.FeatureNode) - assert isinstance(node.feature, frzf.MnemonicFeature) - - -def test_operand_number_node_from_capa(): - node = rdoc.node_from_capa(capa.features.insn.OperandNumber(0, 0)) - assert isinstance(node, rdoc.FeatureNode) - assert isinstance(node.feature, frzf.OperandNumberFeature) - - -def test_operand_offset_node_from_capa(): - node = rdoc.node_from_capa(capa.features.insn.OperandOffset(0, 0)) - assert isinstance(node, rdoc.FeatureNode) - assert isinstance(node.feature, frzf.OperandOffsetFeature) - - -def test_basic_block_node_from_capa(): - node = rdoc.node_from_capa(capa.features.basicblock.BasicBlock("")) - assert isinstance(node, rdoc.FeatureNode) - assert isinstance(node.feature, frzf.BasicBlockFeature) - - -def assert_round_trip(rd: rdoc.ResultDocument): - one = rd - - doc = one.json(exclude_none=True) - two = rdoc.ResultDocument.parse_raw(doc) - 
- # show the round trip works - # first by comparing the objects directly, - # which works thanks to pydantic model equality. - assert one == two - # second by showing their json representations are the same. - assert one.json(exclude_none=True) == two.json(exclude_none=True) - - # now show that two different versions are not equal. - three = copy.deepcopy(two) - three.meta.__dict__.update({"version": "0.0.0"}) - assert one.meta.version != three.meta.version - assert one != three - assert one.json(exclude_none=True) != three.json(exclude_none=True) - - -@pytest.mark.parametrize( - "rd_file", - [ - pytest.param("a3f3bbc_rd"), - pytest.param("al_khaserx86_rd"), - pytest.param("al_khaserx64_rd"), - pytest.param("a076114_rd"), - pytest.param("pma0101_rd"), - pytest.param("dotnet_1c444e_rd"), - ], -) -def test_round_trip(request, rd_file): - rd: rdoc.ResultDocument = request.getfixturevalue(rd_file) - assert_round_trip(rd) - - -def test_json_to_rdoc(): - path = fixtures.get_data_path_by_name("pma01-01-rd") - assert isinstance(rdoc.ResultDocument.parse_file(path), rdoc.ResultDocument) - - -def test_rdoc_to_capa(): - path = fixtures.get_data_path_by_name("pma01-01-rd") - - rd = rdoc.ResultDocument.parse_file(path) - - meta, capabilites = rd.to_capa() - assert isinstance(meta, rdoc.Metadata) - assert isinstance(capabilites, dict) diff --git a/tests/test_result_document.py b/tests/test_result_document.py index 8e3090495..bcaf03fb7 100644 --- a/tests/test_result_document.py +++ b/tests/test_result_document.py @@ -1,4 +1,4 @@ -# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved. +# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at: [package root]/LICENSE.txt From 482e0d386b5d0caff1c6bb664d98ccdd6c1aff25 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Thu, 20 Jul 2023 21:42:14 +0100 Subject: [PATCH 11/37] use pathlib.Path() in binja and ida extractors --- capa/features/extractors/binja/extractor.py | 4 ++-- capa/features/extractors/ida/extractor.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/capa/features/extractors/binja/extractor.py b/capa/features/extractors/binja/extractor.py index 77a105747..90821b1c5 100644 --- a/capa/features/extractors/binja/extractor.py +++ b/capa/features/extractors/binja/extractor.py @@ -6,6 +6,7 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. 
from typing import List, Tuple, Iterator +from pathlib import Path import binaryninja as binja @@ -34,8 +35,7 @@ def __init__(self, bv: binja.BinaryView): self.global_features.extend(capa.features.extractors.binja.file.extract_file_format(self.bv)) self.global_features.extend(capa.features.extractors.binja.global_.extract_os(self.bv)) self.global_features.extend(capa.features.extractors.binja.global_.extract_arch(self.bv)) - with open(self.bv.name, "rb") as f: - self.sample_hashes = SampleHashes.from_sample(f.read()) + self.sample_hashes = SampleHashes.from_sample(Path(self.bv.name).read_bytes()) def get_base_address(self): return AbsoluteVirtualAddress(self.bv.start) diff --git a/capa/features/extractors/ida/extractor.py b/capa/features/extractors/ida/extractor.py index 3f215f05c..0439d5323 100644 --- a/capa/features/extractors/ida/extractor.py +++ b/capa/features/extractors/ida/extractor.py @@ -6,6 +6,7 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. from typing import List, Tuple, Iterator +from pathlib import Path import idaapi @@ -34,8 +35,7 @@ def __init__(self): self.global_features.extend(capa.features.extractors.ida.file.extract_file_format()) self.global_features.extend(capa.features.extractors.ida.global_.extract_os()) self.global_features.extend(capa.features.extractors.ida.global_.extract_arch()) - with open(idaapi.get_input_file_path(), "rb") as f: - self.sample_hashes = SampleHashes.from_sample(f.read()) + self.sample_hashes = SampleHashes.from_sample(Path(idaapi.get_input_file_path()).read_bytes()) def get_base_address(self): return AbsoluteVirtualAddress(idaapi.get_imagebase()) From fd7b926a3322768f901ca31aff17180c69445d25 Mon Sep 17 00:00:00 2001 From: yelhamer <16624109+yelhamer@users.noreply.github.com> Date: Thu, 20 Jul 2023 21:47:23 +0100 Subject: [PATCH 12/37] Update capa/features/extractors/base_extractor.py Co-authored-by: Willi Ballenthin --- capa/features/extractors/base_extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index f036a5f08..eaab47fd6 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -37,7 +37,7 @@ def __iter__(self) -> Iterator[str]: yield self.sha256 @classmethod - def from_sample(cls, buf) -> "SampleHashes": + def from_bytes(cls, buf: bytes) -> "SampleHashes": md5 = hashlib.md5() sha1 = hashlib.sha1() sha256 = hashlib.sha256() From 2b2b2b6545f79e0218adcef9246364b0e788fb68 Mon Sep 17 00:00:00 2001 From: yelhamer <16624109+yelhamer@users.noreply.github.com> Date: Thu, 20 Jul 2023 21:47:30 +0100 Subject: [PATCH 13/37] Update capa/features/extractors/base_extractor.py Co-authored-by: Willi Ballenthin --- capa/features/extractors/base_extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index eaab47fd6..ec860ccb9 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -128,7 +128,7 @@ def get_base_address(self) -> Union[AbsoluteVirtualAddress, capa.features.addres """ raise NotImplementedError() - def get_sample_hashes(self) -> Tuple[str, str, str]: + def get_sample_hashes(self) -> SampleHashes: """ fetch the hashes for the sample contained within the 
extractor. From b4cf50fb6e88c349fb71d6c91859c09acfed7278 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Thu, 20 Jul 2023 21:46:59 +0100 Subject: [PATCH 14/37] fix mypy issues --- capa/ida/helpers.py | 6 +++--- capa/render/proto/__init__.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/capa/ida/helpers.py b/capa/ida/helpers.py index 89e12c60e..f03ba444b 100644 --- a/capa/ida/helpers.py +++ b/capa/ida/helpers.py @@ -153,14 +153,14 @@ def collect_metadata(rules: List[Path]): sha256=sha256, path=idaapi.get_input_file_path(), ), - analysis=rdoc.Analysis( + analysis=rdoc.StaticAnalysis( format=idaapi.get_file_type_name(), arch=arch, os=os, extractor="ida", rules=tuple(r.resolve().absolute().as_posix() for r in rules), base_address=capa.features.freeze.Address.from_capa(idaapi.get_imagebase()), - layout=rdoc.Layout( + layout=rdoc.StaticLayout( functions=(), # this is updated after capabilities have been collected. # will look like: @@ -168,7 +168,7 @@ def collect_metadata(rules: List[Path]): # "functions": { 0x401000: { "matched_basic_blocks": [ 0x401000, 0x401005, ... ] }, ... } ), # ignore these for now - not used by IDA plugin. - feature_counts=rdoc.FeatureCounts(file=0, functions=()), + feature_counts=rdoc.StaticFeatureCounts(file=0, functions=()), library_functions=(), ), ) diff --git a/capa/render/proto/__init__.py b/capa/render/proto/__init__.py index 4a953e6e8..94f977ab5 100644 --- a/capa/render/proto/__init__.py +++ b/capa/render/proto/__init__.py @@ -491,14 +491,14 @@ def metadata_from_pb2(meta: capa_pb2.Metadata) -> rd.Metadata: sha256=meta.sample.sha256, path=meta.sample.path, ), - analysis=rd.Analysis( + analysis=rd.StaticAnalysis( format=meta.analysis.format, arch=meta.analysis.arch, os=meta.analysis.os, extractor=meta.analysis.extractor, rules=tuple(meta.analysis.rules), base_address=addr_from_pb2(meta.analysis.base_address), - layout=rd.Layout( + layout=rd.StaticLayout( functions=tuple( [ rd.FunctionLayout( @@ -514,7 +514,7 @@ def metadata_from_pb2(meta: capa_pb2.Metadata) -> rd.Metadata: ] ) ), - feature_counts=rd.FeatureCounts( + feature_counts=rd.StaticFeatureCounts( file=meta.analysis.feature_counts.file, functions=tuple( [ From ab092cb53630c71432b04e1b0bee288570ff1e2d Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Thu, 20 Jul 2023 21:51:37 +0100 Subject: [PATCH 15/37] add sample_hashes attribute to the base extractors --- capa/features/extractors/base_extractor.py | 24 +++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index ec860ccb9..184ff0d60 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -116,6 +116,8 @@ def __init__(self): # this base class doesn't know what to do with that info, though. # super().__init__() + # all extractors must be able to provide a samples hashes + self.sample_hashes: SampleHashes @abc.abstractmethod def get_base_address(self) -> Union[AbsoluteVirtualAddress, capa.features.address._NoAddress]: @@ -131,10 +133,8 @@ def get_base_address(self) -> Union[AbsoluteVirtualAddress, capa.features.addres def get_sample_hashes(self) -> SampleHashes: """ fetch the hashes for the sample contained within the extractor. 
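+        (concrete extractors are expected to set self.sample_hashes in their constructors.)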
- - the order of the hashes is: md5, sha1, sha256 """ - raise NotImplementedError() + return self.sample_hashes @abc.abstractmethod def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]: @@ -341,13 +341,23 @@ class DynamicFeatureExtractor: This class is not instantiated directly; it is the base class for other implementations. """ - def get_sample_hashes(self) -> Tuple[str, str, str]: + __metaclass__ = abc.ABCMeta + + def __init__(self): + # + # note: a subclass should define ctor parameters for its own use. + # for example, the Vivisect feature extract might require the vw and/or path. + # this base class doesn't know what to do with that info, though. + # + super().__init__() + # all extractors must be able to provide a samples hashes + self.sample_hashes: SampleHashes + + def get_sample_hashes(self) -> SampleHashes: """ fetch the hashes for the sample contained within the extractor. - - the order of the hashes is: md5, sha1, sha256 """ - raise NotImplementedError() + return self.sample_hashes @abc.abstractmethod def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]: From 6ee1dfd656a0238ace3acfe1568f7b869c13f098 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Thu, 20 Jul 2023 21:53:28 +0100 Subject: [PATCH 16/37] address review comments: rename SampleHashes's from_sample() method to from_bytes() method --- capa/features/extractors/binja/extractor.py | 2 +- capa/features/extractors/dnfile/extractor.py | 2 +- capa/features/extractors/ida/extractor.py | 2 +- capa/features/extractors/viv/extractor.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/capa/features/extractors/binja/extractor.py b/capa/features/extractors/binja/extractor.py index 90821b1c5..e0a024c9c 100644 --- a/capa/features/extractors/binja/extractor.py +++ b/capa/features/extractors/binja/extractor.py @@ -35,7 +35,7 @@ def __init__(self, bv: binja.BinaryView): self.global_features.extend(capa.features.extractors.binja.file.extract_file_format(self.bv)) self.global_features.extend(capa.features.extractors.binja.global_.extract_os(self.bv)) self.global_features.extend(capa.features.extractors.binja.global_.extract_arch(self.bv)) - self.sample_hashes = SampleHashes.from_sample(Path(self.bv.name).read_bytes()) + self.sample_hashes = SampleHashes.from_bytes(Path(self.bv.name).read_bytes()) def get_base_address(self): return AbsoluteVirtualAddress(self.bv.start) diff --git a/capa/features/extractors/dnfile/extractor.py b/capa/features/extractors/dnfile/extractor.py index 1f5b1e71b..e047e2b87 100644 --- a/capa/features/extractors/dnfile/extractor.py +++ b/capa/features/extractors/dnfile/extractor.py @@ -78,7 +78,7 @@ class DnfileFeatureExtractor(StaticFeatureExtractor): def __init__(self, path: Path): super().__init__() self.pe: dnfile.dnPE = dnfile.dnPE(str(path)) - self.sample_hashes = SampleHashes.from_sample(path.read_bytes()) + self.sample_hashes = SampleHashes.from_bytes(path.read_bytes()) # pre-compute .NET token lookup tables; each .NET method has access to this cache for feature extraction # most relevant at instruction scope diff --git a/capa/features/extractors/ida/extractor.py b/capa/features/extractors/ida/extractor.py index 0439d5323..e3b97934f 100644 --- a/capa/features/extractors/ida/extractor.py +++ b/capa/features/extractors/ida/extractor.py @@ -35,7 +35,7 @@ def __init__(self): self.global_features.extend(capa.features.extractors.ida.file.extract_file_format()) self.global_features.extend(capa.features.extractors.ida.global_.extract_os()) 
self.global_features.extend(capa.features.extractors.ida.global_.extract_arch()) - self.sample_hashes = SampleHashes.from_sample(Path(idaapi.get_input_file_path()).read_bytes()) + self.sample_hashes = SampleHashes.from_bytes(Path(idaapi.get_input_file_path()).read_bytes()) def get_base_address(self): return AbsoluteVirtualAddress(idaapi.get_imagebase()) diff --git a/capa/features/extractors/viv/extractor.py b/capa/features/extractors/viv/extractor.py index d556468d7..75a62da2a 100644 --- a/capa/features/extractors/viv/extractor.py +++ b/capa/features/extractors/viv/extractor.py @@ -37,7 +37,7 @@ def __init__(self, vw, path: Path, os): self.vw = vw self.path = path self.buf = path.read_bytes() - self.sample_hashes = SampleHashes.from_sample(self.buf) + self.sample_hashes = SampleHashes.from_bytes(self.buf) # pre-compute these because we'll yield them at *every* scope. self.global_features: List[Tuple[Feature, Address]] = [] From 806bc1853d3067964221f55d55fe6770049dbec1 Mon Sep 17 00:00:00 2001 From: yelhamer <16624109+yelhamer@users.noreply.github.com> Date: Thu, 20 Jul 2023 22:13:06 +0100 Subject: [PATCH 17/37] Update mypy.ini: add TODO comment --- .github/mypy/mypy.ini | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/mypy/mypy.ini b/.github/mypy/mypy.ini index 81614afe1..b7d06e15e 100644 --- a/.github/mypy/mypy.ini +++ b/.github/mypy/mypy.ini @@ -1,9 +1,10 @@ [mypy] +# TODO(yelhamer): remove this once proto has been added +# for the dynamic rendering exclude = (?x)( ^capa/render/proto/__init__.py$ | ^tests/_test_proto.py$ - | ^capa/ida/helpers.py$ ) [mypy-halo.*] From 24b3abd70668249c673fc7e8c0d20a6f9eea5022 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Fri, 21 Jul 2023 08:44:37 +0100 Subject: [PATCH 18/37] add get_sample_hashes() to base extractor --- capa/features/extractors/binja/extractor.py | 3 --- capa/features/extractors/cape/extractor.py | 3 --- capa/features/extractors/dnfile/extractor.py | 3 --- capa/features/extractors/dotnetfile.py | 3 ++- capa/features/extractors/ida/extractor.py | 3 --- capa/features/extractors/null.py | 3 +++ capa/features/extractors/pefile.py | 3 ++- capa/features/extractors/viv/extractor.py | 3 --- capa/features/freeze/__init__.py | 12 +++++++++++- 9 files changed, 18 insertions(+), 18 deletions(-) diff --git a/capa/features/extractors/binja/extractor.py b/capa/features/extractors/binja/extractor.py index e0a024c9c..09858c949 100644 --- a/capa/features/extractors/binja/extractor.py +++ b/capa/features/extractors/binja/extractor.py @@ -40,9 +40,6 @@ def __init__(self, bv: binja.BinaryView): def get_base_address(self): return AbsoluteVirtualAddress(self.bv.start) - def get_sample_hashes(self): - return tuple(self.sample_hashes) - def extract_global_features(self): yield from self.global_features diff --git a/capa/features/extractors/cape/extractor.py b/capa/features/extractors/cape/extractor.py index 588744cc1..e4c474fb3 100644 --- a/capa/features/extractors/cape/extractor.py +++ b/capa/features/extractors/cape/extractor.py @@ -40,9 +40,6 @@ def get_base_address(self) -> Union[AbsoluteVirtualAddress, _NoAddress, None]: # value according to the PE header, the actual trace may use a different imagebase return AbsoluteVirtualAddress(self.static["pe"]["imagebase"]) - def get_sample_hashes(self): - return tuple(self.hashes) - def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]: yield from self.global_features diff --git a/capa/features/extractors/dnfile/extractor.py 
b/capa/features/extractors/dnfile/extractor.py index e047e2b87..7f7faa49b 100644 --- a/capa/features/extractors/dnfile/extractor.py +++ b/capa/features/extractors/dnfile/extractor.py @@ -93,9 +93,6 @@ def __init__(self, path: Path): def get_base_address(self): return NO_ADDRESS - def get_sample_hashes(self): - return tuple(self.sample_hashes) - def extract_global_features(self): yield from self.global_features diff --git a/capa/features/extractors/dotnetfile.py b/capa/features/extractors/dotnetfile.py index 823d9e229..715c10e5b 100644 --- a/capa/features/extractors/dotnetfile.py +++ b/capa/features/extractors/dotnetfile.py @@ -31,7 +31,7 @@ Characteristic, ) from capa.features.address import NO_ADDRESS, Address, DNTokenAddress -from capa.features.extractors.base_extractor import StaticFeatureExtractor +from capa.features.extractors.base_extractor import SampleHashes, StaticFeatureExtractor from capa.features.extractors.dnfile.helpers import ( DnType, iter_dotnet_table, @@ -170,6 +170,7 @@ def __init__(self, path: Path): super().__init__() self.path: Path = path self.pe: dnfile.dnPE = dnfile.dnPE(str(path)) + self.hashes = SampleHashes.from_bytes(self.path.read_bytes()) def get_base_address(self): return NO_ADDRESS diff --git a/capa/features/extractors/ida/extractor.py b/capa/features/extractors/ida/extractor.py index e3b97934f..c80f1e4f6 100644 --- a/capa/features/extractors/ida/extractor.py +++ b/capa/features/extractors/ida/extractor.py @@ -40,9 +40,6 @@ def __init__(self): def get_base_address(self): return AbsoluteVirtualAddress(idaapi.get_imagebase()) - def get_sample_hashes(self): - return self.sample_hashes - def extract_global_features(self): yield from self.global_features diff --git a/capa/features/extractors/null.py b/capa/features/extractors/null.py index 65c3f6ac9..507156c16 100644 --- a/capa/features/extractors/null.py +++ b/capa/features/extractors/null.py @@ -15,6 +15,7 @@ from capa.features.extractors.base_extractor import ( BBHandle, InsnHandle, + SampleHashes, ThreadHandle, ProcessHandle, FunctionHandle, @@ -49,6 +50,7 @@ class NullStaticFeatureExtractor(StaticFeatureExtractor): """ base_address: Address + sample_hashes: SampleHashes global_features: List[Feature] file_features: List[Tuple[Address, Feature]] functions: Dict[Address, FunctionFeatures] @@ -103,6 +105,7 @@ class ProcessFeatures: @dataclass class NullDynamicFeatureExtractor(DynamicFeatureExtractor): base_address: Address + sample_hashes: SampleHashes global_features: List[Feature] file_features: List[Tuple[Address, Feature]] processes: Dict[Address, ProcessFeatures] diff --git a/capa/features/extractors/pefile.py b/capa/features/extractors/pefile.py index 9418955ff..a8748979a 100644 --- a/capa/features/extractors/pefile.py +++ b/capa/features/extractors/pefile.py @@ -19,7 +19,7 @@ from capa.features.file import Export, Import, Section from capa.features.common import OS, ARCH_I386, FORMAT_PE, ARCH_AMD64, OS_WINDOWS, Arch, Format, Characteristic from capa.features.address import NO_ADDRESS, FileOffsetAddress, AbsoluteVirtualAddress -from capa.features.extractors.base_extractor import StaticFeatureExtractor +from capa.features.extractors.base_extractor import SampleHashes, StaticFeatureExtractor logger = logging.getLogger(__name__) @@ -190,6 +190,7 @@ def __init__(self, path: Path): super().__init__() self.path: Path = path self.pe = pefile.PE(str(path)) + self.hashes = SampleHashes.from_bytes(self.path.read_bytes()) def get_base_address(self): return AbsoluteVirtualAddress(self.pe.OPTIONAL_HEADER.ImageBase) 
diff --git a/capa/features/extractors/viv/extractor.py b/capa/features/extractors/viv/extractor.py index 75a62da2a..fde0f7cc6 100644 --- a/capa/features/extractors/viv/extractor.py +++ b/capa/features/extractors/viv/extractor.py @@ -49,9 +49,6 @@ def get_base_address(self): # assume there is only one file loaded into the vw return AbsoluteVirtualAddress(list(self.vw.filemeta.values())[0]["imagebase"]) - def get_sample_hashes(self): - return tuple(self.sample_hashes) - def extract_global_features(self): yield from self.global_features diff --git a/capa/features/freeze/__init__.py b/capa/features/freeze/__init__.py index b2dd3cc25..5c606f665 100644 --- a/capa/features/freeze/__init__.py +++ b/capa/features/freeze/__init__.py @@ -27,7 +27,12 @@ import capa.features.extractors.null as null from capa.helpers import assert_never from capa.features.freeze.features import Feature, feature_from_capa -from capa.features.extractors.base_extractor import FeatureExtractor, StaticFeatureExtractor, DynamicFeatureExtractor +from capa.features.extractors.base_extractor import ( + SampleHashes, + FeatureExtractor, + StaticFeatureExtractor, + DynamicFeatureExtractor, +) logger = logging.getLogger(__name__) @@ -300,6 +305,7 @@ class Config: class Freeze(BaseModel): version: int = 2 base_address: Address = Field(alias="base address") + sample_hashes: SampleHashes extractor: Extractor features: Features @@ -400,6 +406,7 @@ def dumps_static(extractor: StaticFeatureExtractor) -> str: freeze = Freeze( version=2, base_address=Address.from_capa(extractor.get_base_address()), + sample_hashes=extractor.get_sample_hashes(), extractor=Extractor(name=extractor.__class__.__name__), features=features, ) # type: ignore @@ -484,6 +491,7 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str: freeze = Freeze( version=2, base_address=Address.from_capa(base_addr), + sample_hashes=extractor.get_sample_hashes(), extractor=Extractor(name=extractor.__class__.__name__), features=features, ) # type: ignore @@ -501,6 +509,7 @@ def loads_static(s: str) -> StaticFeatureExtractor: assert isinstance(freeze.features, StaticFeatures) return null.NullStaticFeatureExtractor( base_address=freeze.base_address.to_capa(), + sample_hashes=freeze.sample_hashes, global_features=[f.feature.to_capa() for f in freeze.features.global_], file_features=[(f.address.to_capa(), f.feature.to_capa()) for f in freeze.features.file], functions={ @@ -533,6 +542,7 @@ def loads_dynamic(s: str) -> DynamicFeatureExtractor: assert isinstance(freeze.features, DynamicFeatures) return null.NullDynamicFeatureExtractor( base_address=freeze.base_address.to_capa(), + sample_hashes=freeze.sample_hashes, global_features=[f.feature.to_capa() for f in freeze.features.global_], file_features=[(f.address.to_capa(), f.feature.to_capa()) for f in freeze.features.file], processes={ From 6d1a8858640cd638d4af35ea4ebb25399ab59ad0 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Fri, 21 Jul 2023 08:48:18 +0100 Subject: [PATCH 19/37] update static freeze test --- tests/test_static_freeze.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/test_static_freeze.py b/tests/test_static_freeze.py index 2a5765299..16dde31d6 100644 --- a/tests/test_static_freeze.py +++ b/tests/test_static_freeze.py @@ -22,10 +22,15 @@ import capa.features.extractors.null import capa.features.extractors.base_extractor from capa.features.address import Address, AbsoluteVirtualAddress -from capa.features.extractors.base_extractor import BBHandle, FunctionHandle +from 
capa.features.extractors.base_extractor import BBHandle, SampleHashes, FunctionHandle EXTRACTOR = capa.features.extractors.null.NullStaticFeatureExtractor( base_address=AbsoluteVirtualAddress(0x401000), + sample_hashes=SampleHashes( + md5="6eb7ee7babf913d75df3f86c229df9e7", + sha1="2a082494519acd5130d5120fa48786df7275fdd7", + sha256="0c7d1a34eb9fd55bedbf37ba16e3d5dd8c1dd1d002479cc4af27ef0f82bb4792", + ), global_features=[], file_features=[ (AbsoluteVirtualAddress(0x402345), capa.features.common.Characteristic("embedded pe")), From b1e468dae43d6ff8b590b9c9b06b2f9232b8d849 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Fri, 21 Jul 2023 11:04:21 +0100 Subject: [PATCH 20/37] add tests for the get_sample_hashes() method --- capa/features/extractors/cape/extractor.py | 2 +- capa/features/extractors/dnfile_.py | 3 +- capa/features/extractors/dotnetfile.py | 2 +- capa/features/extractors/pefile.py | 2 +- tests/fixtures.py | 57 +++++++++++++++++++++- tests/test_extractor_hashing.py | 50 +++++++++++++++++++ 6 files changed, 111 insertions(+), 5 deletions(-) create mode 100644 tests/test_extractor_hashing.py diff --git a/capa/features/extractors/cape/extractor.py b/capa/features/extractors/cape/extractor.py index e4c474fb3..a6bf1dd3d 100644 --- a/capa/features/extractors/cape/extractor.py +++ b/capa/features/extractors/cape/extractor.py @@ -28,7 +28,7 @@ def __init__(self, cape_version: str, static: Dict, behavior: Dict): self.cape_version = cape_version self.static = static self.behavior = behavior - self.hashes = SampleHashes( + self.sample_hashes = SampleHashes( md5=static["file"]["md5"], sha1=static["file"]["sha1"], sha256=static["file"]["sha256"], diff --git a/capa/features/extractors/dnfile_.py b/capa/features/extractors/dnfile_.py index 733fabde2..38e95b87f 100644 --- a/capa/features/extractors/dnfile_.py +++ b/capa/features/extractors/dnfile_.py @@ -25,7 +25,7 @@ Feature, ) from capa.features.address import NO_ADDRESS, Address, AbsoluteVirtualAddress -from capa.features.extractors.base_extractor import StaticFeatureExtractor +from capa.features.extractors.base_extractor import SampleHashes, StaticFeatureExtractor logger = logging.getLogger(__name__) @@ -86,6 +86,7 @@ def __init__(self, path: Path): super().__init__() self.path: Path = path self.pe: dnfile.dnPE = dnfile.dnPE(str(path)) + self.sample_hashes = SampleHashes.from_bytes(self.path.read_bytes()) def get_base_address(self) -> AbsoluteVirtualAddress: return AbsoluteVirtualAddress(0x0) diff --git a/capa/features/extractors/dotnetfile.py b/capa/features/extractors/dotnetfile.py index 715c10e5b..987fad5bc 100644 --- a/capa/features/extractors/dotnetfile.py +++ b/capa/features/extractors/dotnetfile.py @@ -170,7 +170,7 @@ def __init__(self, path: Path): super().__init__() self.path: Path = path self.pe: dnfile.dnPE = dnfile.dnPE(str(path)) - self.hashes = SampleHashes.from_bytes(self.path.read_bytes()) + self.sample_hashes = SampleHashes.from_bytes(self.path.read_bytes()) def get_base_address(self): return NO_ADDRESS diff --git a/capa/features/extractors/pefile.py b/capa/features/extractors/pefile.py index a8748979a..17808e9ad 100644 --- a/capa/features/extractors/pefile.py +++ b/capa/features/extractors/pefile.py @@ -190,7 +190,7 @@ def __init__(self, path: Path): super().__init__() self.path: Path = path self.pe = pefile.PE(str(path)) - self.hashes = SampleHashes.from_bytes(self.path.read_bytes()) + self.sample_hashes = SampleHashes.from_bytes(self.path.read_bytes()) def get_base_address(self): return 
AbsoluteVirtualAddress(self.pe.OPTIONAL_HEADER.ImageBase) diff --git a/tests/fixtures.py b/tests/fixtures.py index f9a36041c..6ed04d6e0 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -38,7 +38,14 @@ FeatureAccess, ) from capa.features.address import Address -from capa.features.extractors.base_extractor import BBHandle, InsnHandle, ThreadHandle, ProcessHandle, FunctionHandle +from capa.features.extractors.base_extractor import ( + BBHandle, + InsnHandle, + SampleHashes, + ThreadHandle, + ProcessHandle, + FunctionHandle, +) from capa.features.extractors.dnfile.extractor import DnfileFeatureExtractor CD = Path(__file__).resolve().parent @@ -602,6 +609,54 @@ def parametrize(params, values, **kwargs): return pytest.mark.parametrize(params, values, ids=ids, **kwargs) +EXTRACTOR_HASHING_TESTS = [ + # viv extractor + ( + get_viv_extractor(get_data_path_by_name("mimikatz")), + SampleHashes( + md5="5f66b82558ca92e54e77f216ef4c066c", + sha1="e4f82e4d7f22938dc0a0ff8a4a7ad2a763643d38", + sha256="131314a6f6d1d263c75b9909586b3e1bd837036329ace5e69241749e861ac01d", + ), + ), + # PE extractor + ( + get_pefile_extractor(get_data_path_by_name("mimikatz")), + SampleHashes( + md5="5f66b82558ca92e54e77f216ef4c066c", + sha1="e4f82e4d7f22938dc0a0ff8a4a7ad2a763643d38", + sha256="131314a6f6d1d263c75b9909586b3e1bd837036329ace5e69241749e861ac01d", + ), + ), + # dnFile extractor + ( + get_dnfile_extractor(get_data_path_by_name("b9f5b")), + SampleHashes( + md5="b9f5bd514485fb06da39beff051b9fdc", + sha1="c72a2e50410475a51d897d29ffbbaf2103754d53", + sha256="34acc4c0b61b5ce0b37c3589f97d1f23e6d84011a241e6f85683ee517ce786f1", + ), + ), + # dotnet File + ( + get_dotnetfile_extractor(get_data_path_by_name("b9f5b")), + SampleHashes( + md5="b9f5bd514485fb06da39beff051b9fdc", + sha1="c72a2e50410475a51d897d29ffbbaf2103754d53", + sha256="34acc4c0b61b5ce0b37c3589f97d1f23e6d84011a241e6f85683ee517ce786f1", + ), + ), + # cape extractor + ( + get_cape_extractor(get_data_path_by_name("0000a657")), + SampleHashes( + md5="e2147b5333879f98d515cd9aa905d489", + sha1="ad4d520fb7792b4a5701df973d6bd8a6cbfbb57f", + sha256="0000a65749f5902c4d82ffa701198038f0b4870b00a27cfca109f8f933476d82", + ), + ), +] + DYNAMIC_FEATURE_PRESENCE_TESTS = sorted( [ # file/string diff --git a/tests/test_extractor_hashing.py b/tests/test_extractor_hashing.py new file mode 100644 index 000000000..9bb2fe5e1 --- /dev/null +++ b/tests/test_extractor_hashing.py @@ -0,0 +1,50 @@ +# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. + +import logging + +import pytest +import fixtures + +from capa.features.extractors.base_extractor import SampleHashes + +logger = logging.getLogger(__name__) + + +@fixtures.parametrize( + "extractor,hashes", + fixtures.EXTRACTOR_HASHING_TESTS, +) +def test_hash_extraction(extractor, hashes): + assert extractor.get_sample_hashes() == hashes + + +# We need to skip the binja test if we cannot import binaryninja, e.g., in GitHub CI. 
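+# Probing with a tiny in-memory load distinguishes a missing module (ImportError) from an
+# importable but unlicensed install (RuntimeError), so the test only runs against a working setup.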
+binja_present: bool = False +try: + import binaryninja + + try: + binaryninja.load(source=b"\x90") + except RuntimeError: + logger.warning("Binary Ninja license is not valid, provide via $BN_LICENSE or license.dat") + else: + binja_present = True +except ImportError: + pass + + +@pytest.mark.skipif(binja_present is False, reason="Skip binja tests if the binaryninja Python API is not installed") +def test_binja_hash_extraction(): + extractor = fixtures.get_binja_extractor(fixtures.get_data_path_by_name("mimikatz")) + hashes = SampleHashes( + md5="5f66b82558ca92e54e77f216ef4c066c", + sha1="e4f82e4d7f22938dc0a0ff8a4a7ad2a763643d38", + sha256="131314a6f6d1d263c75b9909586b3e1bd837036329ace5e69241749e861ac01d", + ) + assert extractor.get_sample_hashes() == hashes From da4e887aeef0f7a4b365f773e839ba8d72a2b9f1 Mon Sep 17 00:00:00 2001 From: yelhamer <16624109+yelhamer@users.noreply.github.com> Date: Fri, 21 Jul 2023 12:40:02 +0100 Subject: [PATCH 21/37] fix comment typo Co-authored-by: Moritz --- capa/features/extractors/base_extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index 184ff0d60..676074585 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -116,7 +116,7 @@ def __init__(self): # this base class doesn't know what to do with that info, though. # super().__init__() - # all extractors must be able to provide a samples hashes + # all extractors must be able to provide a sample's hashes self.sample_hashes: SampleHashes @abc.abstractmethod From 6f3fb423853925886d3f77a4d2b44f250e433939 Mon Sep 17 00:00:00 2001 From: yelhamer <16624109+yelhamer@users.noreply.github.com> Date: Fri, 21 Jul 2023 13:15:55 +0100 Subject: [PATCH 22/37] update compute_dynamic_layout with the appropriate type Co-authored-by: Willi Ballenthin --- capa/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/main.py b/capa/main.py index d2dbe4c45..561a92d1b 100644 --- a/capa/main.py +++ b/capa/main.py @@ -967,7 +967,7 @@ def collect_metadata( ) -def compute_dynamic_layout(rules, extractor, capabilities) -> rdoc.Layout: +def compute_dynamic_layout(rules, extractor: DynamicFeatureExtractor, capabilities) -> rdoc.DynamicLayout: """ compute a metadata structure that links threads to the processes in which they're found. From bd8331678c8cd90b04e167458c2a5a2cc37e2b05 Mon Sep 17 00:00:00 2001 From: yelhamer <16624109+yelhamer@users.noreply.github.com> Date: Fri, 21 Jul 2023 13:16:51 +0100 Subject: [PATCH 23/37] update compute_static_layout with the appropriate types Co-authored-by: Willi Ballenthin --- capa/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/main.py b/capa/main.py index 561a92d1b..cf59c3eed 100644 --- a/capa/main.py +++ b/capa/main.py @@ -1010,7 +1010,7 @@ def compute_dynamic_layout(rules, extractor: DynamicFeatureExtractor, capabiliti return layout -def compute_static_layout(rules, extractor, capabilities) -> rdoc.Layout: +def compute_static_layout(rules, extractor: StaticFeatureExtractor, capabilities) -> rdoc.StaticLayout: """ compute a metadata structure that links basic blocks to the functions in which they're found. 
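For reference, the SampleHashes.from_bytes() helper that these patches thread through the extractors amounts to hashing the raw sample bytes once per algorithm. A minimal standalone sketch of the idea (field names as used above; not the project's exact implementation):

    import hashlib
    from dataclasses import dataclass

    @dataclass(frozen=True)
    class SampleHashes:
        md5: str
        sha1: str
        sha256: str

        @classmethod
        def from_bytes(cls, buf: bytes) -> "SampleHashes":
            # hash the raw file contents once with each algorithm
            return cls(
                md5=hashlib.md5(buf).hexdigest(),
                sha1=hashlib.sha1(buf).hexdigest(),
                sha256=hashlib.sha256(buf).hexdigest(),
            )

    # e.g. hashes = SampleHashes.from_bytes(Path("sample.exe_").read_bytes())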
From 736b2cd689fa8dffbaf54855d72803d3e51543ba Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Fri, 21 Jul 2023 12:48:03 +0100 Subject: [PATCH 24/37] address @mr-tz main.py review comments --- capa/main.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/capa/main.py b/capa/main.py index cf59c3eed..e336a19f7 100644 --- a/capa/main.py +++ b/capa/main.py @@ -139,7 +139,7 @@ def find_instruction_capabilities( returns: tuple containing (features for instruction, match results for instruction) """ # all features found for the instruction. - features = collections.defaultdict(set) # type: FeatureSet + features: FeatureSet = collections.defaultdict(set) # type: FeatureSet for feature, addr in itertools.chain( extractor.extract_insn_features(f, bb, insn), extractor.extract_global_features() @@ -167,7 +167,7 @@ def find_basic_block_capabilities( """ # all features found within this basic block, # includes features found within instructions. - features = collections.defaultdict(set) # type: FeatureSet + features: FeatureSet = collections.defaultdict(set) # type: FeatureSet # matches found at the instruction scope. # might be found at different instructions, thats ok. @@ -207,7 +207,7 @@ def find_code_capabilities( """ # all features found within this function, # includes features found within basic blocks (and instructions). - function_features = collections.defaultdict(set) # type: FeatureSet + function_features: FeatureSet = collections.defaultdict(set) # type: FeatureSet # matches found at the basic block scope. # might be found at different basic blocks, thats ok. @@ -236,7 +236,7 @@ def find_code_capabilities( def find_file_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, function_features: FeatureSet): - file_features = collections.defaultdict(set) # type: FeatureSet + file_features: FeatureSet = collections.defaultdict(set) # type: FeatureSet for feature, va in itertools.chain(extractor.extract_file_features(), extractor.extract_global_features()): # not all file features may have virtual addresses. @@ -362,7 +362,7 @@ def find_thread_capabilities( returns: tuple containing (features for thread, match results for thread) """ # all features found for the thread. - features = collections.defaultdict(set) # type: FeatureSet + features: FeatureSet = collections.defaultdict(set) # type: FeatureSet for feature, addr in itertools.chain( extractor.extract_thread_features(ph, th), extractor.extract_global_features() @@ -390,7 +390,7 @@ def find_process_capabilities( """ # all features found within this process, # includes features found within threads. - process_features = collections.defaultdict(set) # type: FeatureSet + process_features: FeatureSet = collections.defaultdict(set) # type: FeatureSet # matches found at the thread scope. # might be found at different threads, thats ok. 
@@ -954,7 +954,7 @@ def collect_metadata( md5=md5, sha1=sha1, sha256=sha256, - path=os.path.normpath(sample_path), + path=str(Path(sample_path).resolve()), ), analysis=get_sample_analysis( format_, From 3ab3c61d5ec51639cb5712326d37fda06e46d0c4 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Fri, 21 Jul 2023 13:14:35 +0100 Subject: [PATCH 25/37] use ida's hash-extraction functions --- capa/features/extractors/ida/extractor.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/capa/features/extractors/ida/extractor.py b/capa/features/extractors/ida/extractor.py index c80f1e4f6..7ac8ec4d6 100644 --- a/capa/features/extractors/ida/extractor.py +++ b/capa/features/extractors/ida/extractor.py @@ -35,7 +35,9 @@ def __init__(self): self.global_features.extend(capa.features.extractors.ida.file.extract_file_format()) self.global_features.extend(capa.features.extractors.ida.global_.extract_os()) self.global_features.extend(capa.features.extractors.ida.global_.extract_arch()) - self.sample_hashes = SampleHashes.from_bytes(Path(idaapi.get_input_file_path()).read_bytes()) + self.sample_hashes = SampleHashes( + md5=idaapi.get_input_file_md5(), sha1=idaapi.get_input_file_sha1(), sha256=idaapi.get_input_file_sha256() + ) def get_base_address(self): return AbsoluteVirtualAddress(idaapi.get_imagebase()) From 8085caef35a62377ac016b95cf1861e863e7b3e4 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Fri, 21 Jul 2023 13:48:28 +0100 Subject: [PATCH 26/37] remove the usage of SampleHashes's __iter__() method --- capa/features/extractors/binja/extractor.py | 2 +- capa/features/extractors/cape/extractor.py | 6 +++--- capa/main.py | 4 +++- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/capa/features/extractors/binja/extractor.py b/capa/features/extractors/binja/extractor.py index 09858c949..76ee40974 100644 --- a/capa/features/extractors/binja/extractor.py +++ b/capa/features/extractors/binja/extractor.py @@ -35,7 +35,7 @@ def __init__(self, bv: binja.BinaryView): self.global_features.extend(capa.features.extractors.binja.file.extract_file_format(self.bv)) self.global_features.extend(capa.features.extractors.binja.global_.extract_os(self.bv)) self.global_features.extend(capa.features.extractors.binja.global_.extract_arch(self.bv)) - self.sample_hashes = SampleHashes.from_bytes(Path(self.bv.name).read_bytes()) + self.sample_hashes = SampleHashes.from_bytes(Path(bv.file.original_filename).read_bytes()) def get_base_address(self): return AbsoluteVirtualAddress(self.bv.start) diff --git a/capa/features/extractors/cape/extractor.py b/capa/features/extractors/cape/extractor.py index a6bf1dd3d..2e91c7dbf 100644 --- a/capa/features/extractors/cape/extractor.py +++ b/capa/features/extractors/cape/extractor.py @@ -29,9 +29,9 @@ def __init__(self, cape_version: str, static: Dict, behavior: Dict): self.static = static self.behavior = behavior self.sample_hashes = SampleHashes( - md5=static["file"]["md5"], - sha1=static["file"]["sha1"], - sha256=static["file"]["sha256"], + md5=static["file"]["md5"].lower(), + sha1=static["file"]["sha1"].lower(), + sha256=static["file"]["sha256"].lower(), ) self.global_features = capa.features.extractors.cape.global_.extract_features(self.static) diff --git a/capa/main.py b/capa/main.py index e336a19f7..461e45067 100644 --- a/capa/main.py +++ b/capa/main.py @@ -83,6 +83,7 @@ from capa.features.extractors.base_extractor import ( BBHandle, InsnHandle, + SampleHashes, ThreadHandle, ProcessHandle, FunctionHandle, @@ -939,7 +940,8 @@ def collect_metadata( ) -> 
rdoc.Metadata: # if it's a binary sample we hash it, if it's a report # we fetch the hashes from the report - md5, sha1, sha256 = extractor.get_sample_hashes() + sample_hashes: SampleHashes = extractor.get_sample_hashes() + md5, sha1, sha256 = sample_hashes.md5, sample_hashes.sha1, sample_hashes.sha256 rules = tuple(r.resolve().absolute().as_posix() for r in rules_path) format_ = get_format(sample_path) if format_ == FORMAT_AUTO else format_ From 674122999fc0d70d9aec8b48b7a0695ddbb6a9c7 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Fri, 21 Jul 2023 13:59:29 +0100 Subject: [PATCH 27/37] migrate the `get_sample_hashes()` function to each individual extractor --- capa/features/extractors/base_extractor.py | 8 ++------ capa/features/extractors/binja/extractor.py | 3 +++ capa/features/extractors/cape/extractor.py | 3 +++ capa/features/extractors/ida/extractor.py | 6 ++++++ capa/features/extractors/pefile.py | 3 +++ capa/features/extractors/viv/extractor.py | 3 +++ 6 files changed, 20 insertions(+), 6 deletions(-) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index 676074585..93115ca44 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -116,8 +116,6 @@ def __init__(self): # this base class doesn't know what to do with that info, though. # super().__init__() - # all extractors must be able to provide a sample's hashes - self.sample_hashes: SampleHashes @abc.abstractmethod def get_base_address(self) -> Union[AbsoluteVirtualAddress, capa.features.address._NoAddress]: @@ -134,7 +132,7 @@ def get_sample_hashes(self) -> SampleHashes: """ fetch the hashes for the sample contained within the extractor. """ - return self.sample_hashes + raise NotImplementedError() @abc.abstractmethod def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]: @@ -350,14 +348,12 @@ def __init__(self): # this base class doesn't know what to do with that info, though. # super().__init__() - # all extractors must be able to provide a samples hashes - self.sample_hashes: SampleHashes def get_sample_hashes(self) -> SampleHashes: """ fetch the hashes for the sample contained within the extractor. 
""" - return self.sample_hashes + raise NotImplementedError() @abc.abstractmethod def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]: diff --git a/capa/features/extractors/binja/extractor.py b/capa/features/extractors/binja/extractor.py index 76ee40974..9f63aebb1 100644 --- a/capa/features/extractors/binja/extractor.py +++ b/capa/features/extractors/binja/extractor.py @@ -40,6 +40,9 @@ def __init__(self, bv: binja.BinaryView): def get_base_address(self): return AbsoluteVirtualAddress(self.bv.start) + def get_sample_hashes(self) -> SampleHashes: + return self.sample_hashes + def extract_global_features(self): yield from self.global_features diff --git a/capa/features/extractors/cape/extractor.py b/capa/features/extractors/cape/extractor.py index 2e91c7dbf..881802d4b 100644 --- a/capa/features/extractors/cape/extractor.py +++ b/capa/features/extractors/cape/extractor.py @@ -40,6 +40,9 @@ def get_base_address(self) -> Union[AbsoluteVirtualAddress, _NoAddress, None]: # value according to the PE header, the actual trace may use a different imagebase return AbsoluteVirtualAddress(self.static["pe"]["imagebase"]) + def get_sample_hashes(self) -> SampleHashes: + return self.sample_hashes + def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]: yield from self.global_features diff --git a/capa/features/extractors/ida/extractor.py b/capa/features/extractors/ida/extractor.py index 7ac8ec4d6..99ffe02c2 100644 --- a/capa/features/extractors/ida/extractor.py +++ b/capa/features/extractors/ida/extractor.py @@ -42,6 +42,12 @@ def __init__(self): def get_base_address(self): return AbsoluteVirtualAddress(idaapi.get_imagebase()) + def get_sample_hashes(self) -> SampleHashes: + return self.sample_hashes + + def get_sample_hashes(self) -> SampleHashes: + return self.sample_hashes + def extract_global_features(self): yield from self.global_features diff --git a/capa/features/extractors/pefile.py b/capa/features/extractors/pefile.py index 17808e9ad..e79134401 100644 --- a/capa/features/extractors/pefile.py +++ b/capa/features/extractors/pefile.py @@ -195,6 +195,9 @@ def __init__(self, path: Path): def get_base_address(self): return AbsoluteVirtualAddress(self.pe.OPTIONAL_HEADER.ImageBase) + def get_sample_hashes(self) -> SampleHashes: + return self.sample_hashes + def extract_global_features(self): buf = Path(self.path).read_bytes() diff --git a/capa/features/extractors/viv/extractor.py b/capa/features/extractors/viv/extractor.py index fde0f7cc6..a4f9c748e 100644 --- a/capa/features/extractors/viv/extractor.py +++ b/capa/features/extractors/viv/extractor.py @@ -49,6 +49,9 @@ def get_base_address(self): # assume there is only one file loaded into the vw return AbsoluteVirtualAddress(list(self.vw.filemeta.values())[0]["imagebase"]) + def get_sample_hashes(self) -> SampleHashes: + return self.sample_hashes + def extract_global_features(self): yield from self.global_features From ab585ef951d5153a3adb38b0ca6847b2cd4d5044 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Fri, 21 Jul 2023 14:00:58 +0100 Subject: [PATCH 28/37] add the `skipif` mark back --- tests/test_binja_features.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_binja_features.py b/tests/test_binja_features.py index 4397cf823..f0f137783 100644 --- a/tests/test_binja_features.py +++ b/tests/test_binja_features.py @@ -62,6 +62,7 @@ def test_binja_feature_counts(sample, scope, feature, expected): fixtures.do_test_feature_count(fixtures.get_binja_extractor, sample, scope, feature, expected) 
+@pytest.mark.skipif(binja_present is False, reason="Skip binja tests if the binaryninja Python API is not installed") @pytest.mark.xfail(reason="relies on the legacy ruleset which hasn't been updated yet") def test_standalone_binja_backend(): CD = Path(__file__).resolve().parent From 4ec39d49aaa501263d904850b2d2dcac62c70d70 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Fri, 21 Jul 2023 14:03:57 +0100 Subject: [PATCH 29/37] fix linting issues --- capa/features/extractors/ida/extractor.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/capa/features/extractors/ida/extractor.py b/capa/features/extractors/ida/extractor.py index 99ffe02c2..c13bed076 100644 --- a/capa/features/extractors/ida/extractor.py +++ b/capa/features/extractors/ida/extractor.py @@ -6,7 +6,6 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. from typing import List, Tuple, Iterator -from pathlib import Path import idaapi @@ -45,9 +44,6 @@ def get_base_address(self): def get_sample_hashes(self) -> SampleHashes: return self.sample_hashes - def get_sample_hashes(self) -> SampleHashes: - return self.sample_hashes - def extract_global_features(self): yield from self.global_features From c4ba5afe6b42903028f06b004e5036f11881cfd9 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Fri, 21 Jul 2023 14:32:42 +0100 Subject: [PATCH 30/37] replace `: FeatureSet` annotations with a comment type annotation --- capa/main.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/capa/main.py b/capa/main.py index 461e45067..9ce97bc94 100644 --- a/capa/main.py +++ b/capa/main.py @@ -140,7 +140,7 @@ def find_instruction_capabilities( returns: tuple containing (features for instruction, match results for instruction) """ # all features found for the instruction. - features: FeatureSet = collections.defaultdict(set) # type: FeatureSet + features = collections.defaultdict(set) # type: FeatureSet for feature, addr in itertools.chain( extractor.extract_insn_features(f, bb, insn), extractor.extract_global_features() @@ -168,7 +168,7 @@ def find_basic_block_capabilities( """ # all features found within this basic block, # includes features found within instructions. - features: FeatureSet = collections.defaultdict(set) # type: FeatureSet + features = collections.defaultdict(set) # type: FeatureSet # matches found at the instruction scope. # might be found at different instructions, thats ok. @@ -208,7 +208,7 @@ def find_code_capabilities( """ # all features found within this function, # includes features found within basic blocks (and instructions). - function_features: FeatureSet = collections.defaultdict(set) # type: FeatureSet + function_features = collections.defaultdict(set) # type: FeatureSet # matches found at the basic block scope. # might be found at different basic blocks, thats ok. @@ -237,7 +237,7 @@ def find_code_capabilities( def find_file_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, function_features: FeatureSet): - file_features: FeatureSet = collections.defaultdict(set) # type: FeatureSet + file_features = collections.defaultdict(set) # type: FeatureSet for feature, va in itertools.chain(extractor.extract_file_features(), extractor.extract_global_features()): # not all file features may have virtual addresses. 
@@ -323,7 +323,7 @@ def pbar(s, *args, **kwargs): # collection of features that captures the rule matches within function, BB, and instruction scopes. # mapping from feature (matched rule) to set of addresses at which it matched. - function_and_lower_features: FeatureSet = collections.defaultdict(set) + function_and_lower_features = collections.defaultdict(set) # type: FeatureSet for rule_name, results in itertools.chain( all_function_matches.items(), all_bb_matches.items(), all_insn_matches.items() ): @@ -363,7 +363,7 @@ def find_thread_capabilities( returns: tuple containing (features for thread, match results for thread) """ # all features found for the thread. - features: FeatureSet = collections.defaultdict(set) # type: FeatureSet + features = collections.defaultdict(set) # type: FeatureSet for feature, addr in itertools.chain( extractor.extract_thread_features(ph, th), extractor.extract_global_features() @@ -391,7 +391,7 @@ def find_process_capabilities( """ # all features found within this process, # includes features found within threads. - process_features: FeatureSet = collections.defaultdict(set) # type: FeatureSet + process_features = collections.defaultdict(set) # type: FeatureSet # matches found at the thread scope. # might be found at different threads, thats ok. @@ -447,7 +447,7 @@ def pbar(s, *args, **kwargs): # collection of features that captures the rule matches within process and thread scopes. # mapping from feature (matched rule) to set of addresses at which it matched. - process_and_lower_features: FeatureSet = collections.defaultdict(set) + process_and_lower_features = collections.defaultdict(set) # type: FeatureSet for rule_name, results in itertools.chain(all_process_matches.items(), all_thread_matches.items()): locations = {p[0] for p in results} rule = ruleset[rule_name] From 830bad54bd135267dd89bf8d7cfb477f137ba16e Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Fri, 21 Jul 2023 14:41:07 +0100 Subject: [PATCH 31/37] fix bugs --- capa/features/extractors/dnfile/extractor.py | 3 +++ capa/features/extractors/dnfile_.py | 3 +++ capa/features/extractors/dotnetfile.py | 3 +++ 3 files changed, 9 insertions(+) diff --git a/capa/features/extractors/dnfile/extractor.py b/capa/features/extractors/dnfile/extractor.py index 7f7faa49b..5d34b7cf4 100644 --- a/capa/features/extractors/dnfile/extractor.py +++ b/capa/features/extractors/dnfile/extractor.py @@ -93,6 +93,9 @@ def __init__(self, path: Path): def get_base_address(self): return NO_ADDRESS + def get_sample_hashes(self) -> SampleHashes: + return self.sample_hashes + def extract_global_features(self): yield from self.global_features diff --git a/capa/features/extractors/dnfile_.py b/capa/features/extractors/dnfile_.py index 38e95b87f..d18c325de 100644 --- a/capa/features/extractors/dnfile_.py +++ b/capa/features/extractors/dnfile_.py @@ -91,6 +91,9 @@ def __init__(self, path: Path): def get_base_address(self) -> AbsoluteVirtualAddress: return AbsoluteVirtualAddress(0x0) + def get_sample_hashes(self) -> SampleHashes: + return self.sample_hashes + def get_entry_point(self) -> int: # self.pe.net.Flags.CLT_NATIVE_ENTRYPOINT # True: native EP: Token diff --git a/capa/features/extractors/dotnetfile.py b/capa/features/extractors/dotnetfile.py index 987fad5bc..70789598a 100644 --- a/capa/features/extractors/dotnetfile.py +++ b/capa/features/extractors/dotnetfile.py @@ -175,6 +175,9 @@ def __init__(self, path: Path): def get_base_address(self): return NO_ADDRESS + def get_sample_hashes(self) -> SampleHashes: + return 
self.sample_hashes + def get_entry_point(self) -> int: # self.pe.net.Flags.CLT_NATIVE_ENTRYPOINT # True: native EP: Token From 3d1a1fb9fa50142227b1ea9c009403651cd7240b Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Fri, 21 Jul 2023 14:54:54 +0100 Subject: [PATCH 32/37] add get_sample_hashes() to NullFeatureExtractor --- capa/features/extractors/null.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/capa/features/extractors/null.py b/capa/features/extractors/null.py index 507156c16..800fb7030 100644 --- a/capa/features/extractors/null.py +++ b/capa/features/extractors/null.py @@ -62,6 +62,9 @@ def extract_global_features(self): for feature in self.global_features: yield feature, NO_ADDRESS + def get_sample_hashes(self) -> SampleHashes: + return self.sample_hashes + def extract_file_features(self): for address, feature in self.file_features: yield feature, address @@ -114,6 +117,9 @@ def extract_global_features(self): for feature in self.global_features: yield feature, NO_ADDRESS + def get_sample_hashes(self) -> SampleHashes: + return self.sample_hashes + def extract_file_features(self): for address, feature in self.file_features: yield feature, address From 90298fe2c84636c8d34ad96ee18bd149a15e8325 Mon Sep 17 00:00:00 2001 From: yelhamer <16624109+yelhamer@users.noreply.github.com> Date: Fri, 21 Jul 2023 15:39:30 +0100 Subject: [PATCH 33/37] Update capa/features/extractors/base_extractor.py Co-authored-by: Willi Ballenthin --- capa/features/extractors/base_extractor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index 93115ca44..07a408462 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -128,6 +128,7 @@ def get_base_address(self) -> Union[AbsoluteVirtualAddress, capa.features.addres """ raise NotImplementedError() + @abc.abstractmethod def get_sample_hashes(self) -> SampleHashes: """ fetch the hashes for the sample contained within the extractor. 
From d13114e9078a5491e1332ceb8adb4960ef5b0898 Mon Sep 17 00:00:00 2001 From: yelhamer <16624109+yelhamer@users.noreply.github.com> Date: Fri, 21 Jul 2023 15:43:22 +0100 Subject: [PATCH 34/37] remove SampleHashes __iter__method Co-authored-by: Willi Ballenthin --- capa/features/extractors/base_extractor.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index 07a408462..b67488c67 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -31,11 +31,6 @@ class SampleHashes: sha1: str sha256: str - def __iter__(self) -> Iterator[str]: - yield self.md5 - yield self.sha1 - yield self.sha256 - @classmethod def from_bytes(cls, buf: bytes) -> "SampleHashes": md5 = hashlib.md5() From c32ac19c0d1e3aad63d9d0f845083a8c55d0212c Mon Sep 17 00:00:00 2001 From: yelhamer <16624109+yelhamer@users.noreply.github.com> Date: Fri, 21 Jul 2023 15:43:41 +0100 Subject: [PATCH 35/37] Update capa/features/extractors/ida/extractor.py Co-authored-by: Willi Ballenthin --- capa/features/extractors/ida/extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/features/extractors/ida/extractor.py b/capa/features/extractors/ida/extractor.py index c13bed076..62b047c44 100644 --- a/capa/features/extractors/ida/extractor.py +++ b/capa/features/extractors/ida/extractor.py @@ -35,7 +35,7 @@ def __init__(self): self.global_features.extend(capa.features.extractors.ida.global_.extract_os()) self.global_features.extend(capa.features.extractors.ida.global_.extract_arch()) self.sample_hashes = SampleHashes( - md5=idaapi.get_input_file_md5(), sha1=idaapi.get_input_file_sha1(), sha256=idaapi.get_input_file_sha256() + md5=idaapi.get_input_file_md5(), sha1="(unknown)", sha256=idaapi.get_input_file_sha256() ) def get_base_address(self): From 344b3e993137ec032c623c37dbe07a7efed2aa88 Mon Sep 17 00:00:00 2001 From: yelhamer <16624109+yelhamer@users.noreply.github.com> Date: Fri, 21 Jul 2023 15:43:56 +0100 Subject: [PATCH 36/37] Update capa/features/extractors/base_extractor.py Co-authored-by: Willi Ballenthin --- capa/features/extractors/base_extractor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index b67488c67..c45722316 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -345,6 +345,7 @@ def __init__(self): # super().__init__() + @abc.abstractmethod def get_sample_hashes(self) -> SampleHashes: """ fetch the hashes for the sample contained within the extractor. 
From d8c28e80eb159d68454f0c3ba4a166d97ac08cbd Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Fri, 21 Jul 2023 15:50:09 +0100 Subject: [PATCH 37/37] add get_sample_hashes() to elf extractor --- capa/features/extractors/elffile.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/capa/features/extractors/elffile.py b/capa/features/extractors/elffile.py index dbe9475b8..7e2249e08 100644 --- a/capa/features/extractors/elffile.py +++ b/capa/features/extractors/elffile.py @@ -16,7 +16,7 @@ from capa.features.file import Import, Section from capa.features.common import OS, FORMAT_ELF, Arch, Format, Feature from capa.features.address import NO_ADDRESS, FileOffsetAddress, AbsoluteVirtualAddress -from capa.features.extractors.base_extractor import StaticFeatureExtractor +from capa.features.extractors.base_extractor import SampleHashes, StaticFeatureExtractor logger = logging.getLogger(__name__) @@ -112,6 +112,7 @@ def __init__(self, path: Path): super().__init__() self.path: Path = path self.elf = ELFFile(io.BytesIO(path.read_bytes())) + self.sample_hashes = SampleHashes.from_bytes(self.path.read_bytes()) def get_base_address(self): # virtual address of the first segment with type LOAD @@ -119,6 +120,9 @@ def get_base_address(self): if segment.header.p_type == "PT_LOAD": return AbsoluteVirtualAddress(segment.header.p_vaddr) + def get_sample_hashes(self) -> SampleHashes: + return self.sample_hashes + def extract_global_features(self): buf = self.path.read_bytes()
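
For reference, a minimal sketch of the hashing contract behind the get_sample_hashes() implementations added throughout this series: SampleHashes.from_bytes() digests the raw input bytes once, and callers read the md5/sha1/sha256 attributes instead of the removed __iter__() unpacking. This is an illustrative sketch, not one of the patches; it assumes capa is importable, and "/tmp/sample.exe" is a placeholder path.

# illustrative sketch -- assumes capa is importable; the input path is a placeholder
from pathlib import Path

from capa.features.extractors.base_extractor import SampleHashes

# hash the raw file bytes once, the same way the static extractors do in __init__()
buf = Path("/tmp/sample.exe").read_bytes()  # placeholder sample path
hashes = SampleHashes.from_bytes(buf)

# attribute access replaces the removed SampleHashes.__iter__() tuple unpacking
print(hashes.md5, hashes.sha1, hashes.sha256)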